BlosSOM
Interactive dimensionality reduction on large datasets (EmbedSOM and FLOWER combined)
scaled_data.cpp
Go to the documentation of this file.
1/* This file is part of BlosSOM.
2 *
3 * Copyright (C) 2021 Mirek Kratochvil
4 * Sona Molnarova
5 *
6 * BlosSOM is free software: you can redistribute it and/or modify it under
7 * the terms of the GNU General Public License as published by the Free
8 * Software Foundation, either version 3 of the License, or (at your option)
9 * any later version.
10 *
11 * BlosSOM is distributed in the hope that it will be useful, but WITHOUT ANY
12 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 * details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * BlosSOM. If not, see <https://www.gnu.org/licenses/>.
18 */
19
20#include "scaled_data.h"
21
22#include <cmath>
23
24void
25ScaledData::update(const TransData &td, FrameStats &frame_stats)
26{
27 if (dirty(td) && (td.dim() != dim() || td.n != n)) {
28 n = td.n;
29 config.resize(td.dim());
31 data.resize(n * dim());
32 clean(td);
33 frame_stats.reset(frame_stats.scaled_t);
35 }
36
37 auto [ri, rn] = dirty_range(td);
38 if (!rn) {
39 frame_stats.reset(frame_stats.scaled_t);
40 return;
41 }
42
43 const size_t max_points =
44 batch_size_gen.next(frame_stats.scaled_t, frame_stats.scaled_duration);
45
46 if (rn > max_points)
47 rn = max_points;
48 clean_range(td, rn);
49
50 std::vector<float> means = td.sums;
51 std::vector<float> isds = td.sqsums;
52 size_t d = dim();
53
54 frame_stats.add_const_time();
55
56 for (size_t di = 0; di < d; ++di) {
57 means[di] /= n;
58 isds[di] /= n;
59 isds[di] = 1 / sqrt(isds[di] - means[di] * means[di]);
60 if (isds[di] > 10000)
61 isds[di] = 10000;
62 }
63
64 for (; rn-- > 0; ++ri) {
65 if (ri >= n)
66 ri = 0;
67 for (size_t di = 0; di < d; ++di)
68 data[ri * d + di] =
69 (td.data[ri * d + di] - means[di]) *
70 (config[di].scale ? config[di].sdev * isds[di] : 1);
71 }
72
73 frame_stats.store_time(frame_stats.scaled_t);
74
75 touch();
76}
77
78void
80{
81 for (auto &c : config)
82 c = ScaleConfig();
84}
size_t next(float T, float t)
Computes size of the next batch.
void clean(const Dirt &d)
Call this when the cache is refreshed.
Definition: dirty.h:67
bool dirty(const Dirt &d)
Returns true if the cache needs to be refreshed.
Definition: dirty.h:60
void touch()
Make the cache dirty.
Definition: dirty.h:43
size_t n
Definition: dirty.h:83
float scaled_duration
Definition: frame_stats.h:42
void add_const_time()
Definition: frame_stats.h:73
void store_time(float &to)
Definition: frame_stats.h:79
void reset(float &t)
Definition: frame_stats.h:85
float scaled_t
Definition: frame_stats.h:32
Configuration of the single-dimension scaling.
Definition: scaled_data.h:33
std::vector< ScaleConfig > config
Separate configurations for each dimension.
Definition: scaled_data.h:57
void update(const TransData &td, FrameStats &frame_stats)
Recomputes the data if any of the config has been touched.
Definition: scaled_data.cpp:25
void touch_config()
Notifies Sweeper that the config has been modified and that the data has to be recomputed.
Definition: scaled_data.h:73
std::vector< float > data
Scaled data in the same format as DataModel::data.
Definition: scaled_data.h:55
size_t dim() const
Returns dimension of the scaled data.
Definition: scaled_data.h:66
void reset()
Resets configurations to their initial values.
Definition: scaled_data.cpp:79
BatchSizeGen batch_size_gen
Definition: scaled_data.h:59
std::tuple< size_t, size_t > dirty_range(const Dirts &d)
Find the range to refresh.
Definition: dirty.h:113
void clean_range(const Dirts &d, size_t n)
Clean a range of the cache.
Definition: dirty.h:130
Storage of the transformed data.
Definition: trans_data.h:74
size_t dim() const
Returns dimension of the transformed data.
Definition: trans_data.h:93
std::vector< float > data
Transformed data in the same format as DataModel::data.
Definition: trans_data.h:76
std::vector< float > sums
Array representing sums for each dimension.
Definition: trans_data.h:79
std::vector< float > sqsums
Array representing square sums for each dimension.
Definition: trans_data.h:81