BlosSOM
Interactive dimensionality reduction on large datasets (EmbedSOM and FLOWER combined)
trans_data.cpp
Go to the documentation of this file.
1/* This file is part of BlosSOM.
2 *
3 * Copyright (C) 2021 Mirek Kratochvil
4 * Sona Molnarova
5 *
6 * BlosSOM is free software: you can redistribute it and/or modify it under
7 * the terms of the GNU General Public License as published by the Free
8 * Software Foundation, either version 3 of the License, or (at your option)
9 * any later version.
10 *
11 * BlosSOM is distributed in the hope that it will be useful, but WITHOUT ANY
12 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 * details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * BlosSOM. If not, see <https://www.gnu.org/licenses/>.
18 */
19
20#include "trans_data.h"
21#include <cmath>
22
23void
25{
26 if (!dirty(dm))
27 return;
28
29 means.clear();
30 means.resize(dm.d, 0);
31 sds = means;
32
33 size_t d = dm.d;
34 for (size_t ni = 0; ni < dm.n; ++ni) {
35 for (size_t di = 0; di < d; ++di) {
36 float tmp = dm.data[ni * d + di];
37 means[di] += tmp;
38 sds[di] += tmp * tmp;
39 }
40 }
41
42 for (size_t di = 0; di < d; ++di) {
43 means[di] /= dm.n;
44 sds[di] /= dm.n;
45 sds[di] = sqrt(sds[di] - means[di] * means[di]);
46 }
47
48 clean(dm);
49 touch();
50}
51
// Re-applies the per-dimension transformation (affine shift, optional asinh)
// to a bounded batch of points and keeps the running `sums` / `sqsums` in
// step with the values it rewrites in `data`.
//
// s:           raw-data statistics; only watched for changes via stat_watch.
// frame_stats: timing record; trans_t and trans_duration are read to size the
//              batch, and trans_t is reset or stored by this function.
//
// NOTE(review): the embedded numbering of this listing jumps over original
// lines 53, 69 and 74, so the qualified function name with its first
// parameter (`dm`, used throughout the body) and two statements are not
// visible here -- confirm against the real trans_data.cpp before editing.
52void
54 const RawDataStats &s,
55 FrameStats &frame_stats)
56{
 // Input data changed: resize the config and buffers to the new shape, zero
 // the transformed data and both running sums, and reset the timing slot.
57 if (dirty(dm)) {
58 config.resize(dm.d);
59 n = dm.n;
60 data.clear(); // TODO this needs to be updated if rolling stats should
61 // work
62 data.resize(n * dim(), 0);
63 sums.clear();
64 sums.resize(dim(), 0);
65 sqsums = sums;
66 touch();
67 clean(dm);
68 frame_stats.reset(frame_stats.trans_t);
70 }
71
 // Raw statistics changed: force the whole range to be refreshed.
72 if (stat_watch.dirty(s)) {
73 refresh(dm);
75 }
76
77 // make sure we're the right size
78 auto [ri, rn] = dirty_range(dm);
 // Nothing left to refresh: reset frame_stats.trans_t and leave.
79 if (!rn) {
80 frame_stats.reset(frame_stats.trans_t);
81 return;
82 }
83
 // Upper bound on the number of points handled in this call, as computed by
 // the batch size generator from the transformation timing.
84 const size_t max_points =
85 batch_size_gen.next(frame_stats.trans_t, frame_stats.trans_duration);
86
87 if (rn > max_points)
88 rn = max_points;
89
90 clean_range(dm, rn);
91 const size_t d = dim();
 // Per-dimension deltas collected over the batch; applied to sums/sqsums
 // once after the loop.
92 std::vector<float> sums_adjust(d, 0), sqsums_adjust(d, 0);
93
94 frame_stats.add_const_time();
95
 // Process rn points starting at index ri, wrapping back to 0 once ri
 // reaches n.
96 for (; rn-- > 0; ++ri) {
97 if (ri >= n)
98 ri = 0;
99 for (size_t di = 0; di < d; ++di) {
100 const auto &c = config[di];
101
 // Remove the contribution of the previously stored transformed value.
102 float tmp = data[ri * d + di];
103 sums_adjust[di] -= tmp;
104 sqsums_adjust[di] -= tmp * tmp;
105 tmp = dm.data[ri * d + di];
106
 // Transform the raw value: shift first, then asinh(x / cofactor) if the
 // dimension has asinh enabled.
107 tmp += c.affine_adjust;
108 if (c.asinh)
109 tmp = asinhf(tmp / c.asinh_cofactor);
110
 // Store the new value and add its contribution.
111 data[ri * d + di] = tmp;
112 sums_adjust[di] += tmp;
113 sqsums_adjust[di] += tmp * tmp;
114 }
115 }
116
 // Fold the batch deltas into the running totals.
117 for (size_t di = 0; di < d; ++di) {
118 sums[di] += sums_adjust[di];
119 sqsums[di] += sqsums_adjust[di];
120 }
121
122 frame_stats.store_time(frame_stats.trans_t);
123
124 touch();
125}
126
127void
129{
130 for (auto &c : config)
131 c = TransConfig();
132 touch_config();
133}
134
// Compiled out by `#if 0`: placeholder stubs for disabling and re-enabling a
// single column. Neither body contains code yet, only TODO notes, and the
// parameter `c` is not used by either stub.
135#if 0
136void
137TransData::disable_col(size_t c)
138{
139 // TODO update config, remove the column from output if needed, reduce `d`,
140 // ...
141}
142
143void
144TransData::enable_col(size_t c)
145{
146 // TODO reverse of disable_col
147}
148#endif
size_t next(float T, float t)
Computes the size of the next batch.
void clean(const Dirt &d)
Call this when the cache is refreshed.
Definition: dirty.h:67
bool dirty(const Dirt &d)
Returns true if the cache needs to be refreshed.
Definition: dirty.h:60
Storage of data from the loaded input file.
Definition: data_model.h:32
size_t d
Dimension size.
Definition: data_model.h:39
std::vector< float > data
One-dimensional array storing d-dimensional input data in row-major order.
Definition: data_model.h:35
void touch()
Make the cache dirty.
Definition: dirty.h:43
size_t n
Definition: dirty.h:83
void add_const_time()
Definition: frame_stats.h:73
void store_time(float &to)
Definition: frame_stats.h:79
void reset(float &t)
Definition: frame_stats.h:85
float trans_duration
Definition: frame_stats.h:40
float trans_t
Definition: frame_stats.h:30
Statistics from the untransformed dataset.
Definition: trans_data.h:38
void update(const DataModel &dm)
Recomputes the statistics if the input data changed.
Definition: trans_data.cpp:24
std::vector< float > means
Array containing means for each dimension.
Definition: trans_data.h:40
std::vector< float > sds
Array containing standard deviations for each dimension.
Definition: trans_data.h:42
std::tuple< size_t, size_t > dirty_range(const Dirts &d)
Find the range to refresh.
Definition: dirty.h:113
void clean_range(const Dirts &d, size_t n)
Clean a range of the cache.
Definition: dirty.h:130
void refresh(const Dirts &d)
Force-refresh the whole range.
Definition: dirty.h:105
Configuration of single-dimension transformation.
Definition: trans_data.h:54
void update(const DataModel &dm, const RawDataStats &s, FrameStats &frame_stats)
Recomputes the data if any part of the config has been touched.
Definition: trans_data.cpp:53
void touch_config()
Notifies Sweeper that the config has been modified and that the data has to be recomputed.
Definition: trans_data.h:99
size_t dim() const
Returns dimension of the transformed data.
Definition: trans_data.h:93
Cleaner stat_watch
Definition: trans_data.h:101
void reset()
Resets configurations to their initial values.
Definition: trans_data.cpp:128
std::vector< float > data
Transformed data in the same format as DataModel::data.
Definition: trans_data.h:76
std::vector< float > sums
Array representing sums for each dimension.
Definition: trans_data.h:79
BatchSizeGen batch_size_gen
Definition: trans_data.h:86
std::vector< TransConfig > config
Separate configurations for each dimension.
Definition: trans_data.h:84
std::vector< float > sqsums
Array representing square sums for each dimension.
Definition: trans_data.h:81