BlosSOM
Interactive dimensionality reduction on large datasets (EmbedSOM and FLOWER combined)
embedsom.cpp
Go to the documentation of this file.
1
2#include "embedsom.h"
3
4/* This file is originally a part of EmbedSOM.
5 *
6 * Copyright (C) 2018-2021 Mirek Kratochvil <exa.exa@gmail.com>
7 *
8 * EmbedSOM is free software: you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License as published by the Free
10 * Software Foundation, either version 3 of the License, or (at your option)
11 * any later version.
12 *
13 * EmbedSOM is distributed in the hope that it will be useful, but WITHOUT ANY
14 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License along with
19 * EmbedSOM. If not, see <https://www.gnu.org/licenses/>.
20 */
21
22#include <algorithm>
23#include <cfloat>
24#include <cmath>
25#include <vector>
26
27using namespace std;
28
29/*
30 * Distance computation tools
31 */
32
33static inline float
34sqrf(float n)
35{
36 return n * n;
37}
38
39namespace distfs {
40
41struct sqeucl
42{
43 inline static float back(float x) { return sqrt(x); }
44 inline static float comp(const float *p1, const float *p2, const size_t dim)
45 {
46#ifndef USE_INTRINS
47 float sqdist = 0;
48 for (size_t i = 0; i < dim; ++i) {
49 float tmp = p1[i] - p2[i];
50 sqdist += tmp * tmp;
51 }
52 return sqdist;
53#else
54 const float *p1e = p1 + dim, *p1ie = p1e - 3;
55
56 __m128 s = _mm_setzero_ps();
57 for (; p1 < p1ie; p1 += 4, p2 += 4) {
58 __m128 tmp = _mm_sub_ps(_mm_loadu_ps(p1), _mm_loadu_ps(p2));
59 s = _mm_add_ps(_mm_mul_ps(tmp, tmp), s);
60 }
61 float sqdist = s[0] + s[1] + s[2] + s[3];
62 for (; p1 < p1e; ++p1, ++p2) {
63 float tmp = *p1 - *p2;
64 sqdist += tmp * tmp;
65 }
66 return sqdist;
67#endif
68 }
69
70 inline static void proj(const float *la,
71 const float *lb,
72 const float *p,
73 const size_t dim,
74 float &o_scalar,
75 float &o_sqdist)
76 {
77
78#ifndef USE_INTRINS
79 float scalar = 0, sqdist = 0;
80 for (size_t k = 0; k < dim; ++k) {
81 float tmp = lb[k] - la[k];
82 sqdist += tmp * tmp;
83 scalar += tmp * (p[k] - la[k]);
84 }
85#else
86 float scalar, sqdist;
87 {
88 const float *ke = la + dim, *kie = ke - 3;
89
90 __m128 sca = _mm_setzero_ps(), sqd = _mm_setzero_ps();
91 for (; la < kie; la += 4, lb += 4, p += 4) {
92 __m128 ti = _mm_loadu_ps(la);
93 __m128 tmp = _mm_sub_ps(_mm_loadu_ps(lb), ti);
94 sqd = _mm_add_ps(sqd, _mm_mul_ps(tmp, tmp));
95 sca = _mm_add_ps(
96 sca, _mm_mul_ps(tmp, _mm_sub_ps(_mm_loadu_ps(p), ti)));
97 }
98 scalar = sca[0] + sca[1] + sca[2] + sca[3];
99 sqdist = sqd[0] + sqd[1] + sqd[2] + sqd[3];
100 for (; la < ke; ++la, ++lb, ++p) {
101 float tmp = *lb - *la;
102 sqdist += tmp * tmp;
103 scalar += tmp * (*p - *la);
104 }
105 }
106#endif
107 o_scalar = scalar;
108 o_sqdist = sqdist;
109 }
110};
111
112} // namespace distfs
113
114/*
115 * some required constants
116 */
117
118// small numbers first!
119static const float min_boost = 1e-5; // lower limit for the parameter
120
121// this affects how steeply the score decreases for near-farthest codes
122static const float max_avoidance = 10;
123
124// this is added before normalizing the distances
125static const float zero_avoidance = 1e-10;
126
127// a tiny epsilon for preventing singularities
128static const float koho_gravity = 1e-5;
129
130/*
131 * KNN computation (mainly heap)
132 */
134{
135 float dist;
136 size_t id;
137};
138
139static void
140heap_down(dist_id *heap, size_t start, size_t lim)
141{
142 for (;;) {
143 size_t L = 2 * start + 1;
144 size_t R = L + 1;
145 if (R < lim) {
146 float dl = heap[L].dist;
147 float dr = heap[R].dist;
148
149 if (dl > dr) {
150 if (heap[start].dist >= dl)
151 break;
152 swap(heap[L], heap[start]);
153 start = L;
154 } else {
155 if (heap[start].dist >= dr)
156 break;
157 swap(heap[R], heap[start]);
158 start = R;
159 }
160 } else if (L < lim) {
161 if (heap[start].dist < heap[L].dist)
162 swap(heap[L], heap[start]);
163 break; // exit safely!
164 } else
165 break;
166 }
167}
168
169template<class distf>
170static void
171knn(const float *point,
172 const float *hidim_lm,
173 size_t n_landmarks,
174 size_t dim,
175 size_t topnn,
176 vector<dist_id> &dists)
177{
178 size_t i;
179
180 // push first topnn kohos
181 for (i = 0; i < topnn; ++i) {
182 dists[i].dist = distf::comp(point, hidim_lm + i * dim, dim);
183 dists[i].id = i;
184 }
185
186 // make a heap
187 for (i = 0; i < topnn; ++i)
188 heap_down(dists.data(), topnn - i - 1, topnn);
189
190 // insert the rest
191 for (i = topnn; i < n_landmarks; ++i) {
192 float s = distf::comp(point, hidim_lm + i * dim, dim);
193 if (dists[0].dist < s)
194 continue;
195 dists[0].dist = s;
196 dists[0].id = i;
197 heap_down(dists.data(), 0, topnn);
198 }
199
200 // heapsort the NNs
201 for (i = topnn - 1; i > 0; --i) {
202 swap(dists[0], dists[i]);
203 heap_down(dists.data(), 0, i);
204 }
205}
206
207/*
208 * Projection- and fitting-related helpers
209 */
210
211template<int embed_dim>
212static void
213add_gravity(const float *lodim_lm, float score, float *mtx)
214{
215 float gs = score * koho_gravity;
216 if (embed_dim == 2) {
217 mtx[0] += gs;
218 mtx[3] += gs;
219 mtx[4] += gs * lodim_lm[0];
220 mtx[5] += gs * lodim_lm[1];
221 }
222 if (embed_dim == 3) {
223 mtx[0] += gs;
224 mtx[4] += gs;
225 mtx[8] += gs;
226 mtx[9] += gs * lodim_lm[0];
227 mtx[10] += gs * lodim_lm[1];
228 mtx[11] += gs * lodim_lm[2];
229 }
230}
231
232template<int embed_dim>
233inline static float
234dotp_ec(const float *a, const float *b)
235{
236 float r = 0;
237 for (size_t i = 0; i < embed_dim; ++i)
238 r += a[i] * b[i];
239 return r;
240}
241
242template<int embed_dim>
243static void
244add_approximation(float score_i,
245 float score_j,
246 const float *ilm,
247 const float *jlm,
248 float scalar_proj,
249 float adjust,
250 float *mtx)
251{
252 float h[embed_dim], hp = 0;
253 for (size_t i = 0; i < embed_dim; ++i)
254 hp += sqrf(h[i] = jlm[i] - ilm[i]);
255 if (hp < zero_avoidance)
256 return;
257
258 const float s =
259 score_i * score_j * powf(1 + hp, -adjust) * expf(-sqrf(scalar_proj - .5));
260 const float sihp = s / hp;
261 const float rhsc = s * (scalar_proj + dotp_ec<embed_dim>(h, ilm) / hp);
262
263 if (embed_dim == 2) {
264
265 mtx[0] += h[0] * h[0] * sihp;
266 mtx[1] += h[0] * h[1] * sihp;
267 mtx[2] += h[1] * h[0] * sihp;
268 mtx[3] += h[1] * h[1] * sihp;
269 mtx[4] += h[0] * rhsc;
270 mtx[5] += h[1] * rhsc;
271 }
272
273 if (embed_dim == 3) {
274 mtx[0] += h[0] * h[0] * sihp;
275 mtx[1] += h[0] * h[1] * sihp;
276 mtx[2] += h[0] * h[2] * sihp;
277 mtx[3] += h[1] * h[0] * sihp;
278 mtx[4] += h[1] * h[1] * sihp;
279 mtx[5] += h[1] * h[2] * sihp;
280 mtx[6] += h[2] * h[0] * sihp;
281 mtx[7] += h[2] * h[1] * sihp;
282 mtx[8] += h[2] * h[2] * sihp;
283 mtx[9] += h[0] * rhsc;
284 mtx[10] += h[1] * rhsc;
285 mtx[11] += h[2] * rhsc;
286 }
287}
288
289template<int embed_dim>
290static void
291solve_lin_eq(const float *mtx, float *embedding)
292{
293 if (embed_dim == 2) {
294 float det = mtx[0] * mtx[3] - mtx[1] * mtx[2];
295 embedding[0] = (mtx[4] * mtx[3] - mtx[5] * mtx[2]) / det;
296 embedding[1] = (mtx[0] * mtx[5] - mtx[1] * mtx[4]) / det;
297 }
298 if (embed_dim == 3) {
299 // this looks ugly
300 float det = mtx[0] * mtx[4] * mtx[8] + mtx[1] * mtx[5] * mtx[6] +
301 mtx[2] * mtx[3] * mtx[7] - mtx[0] * mtx[5] * mtx[7] -
302 mtx[1] * mtx[3] * mtx[8] - mtx[2] * mtx[4] * mtx[6];
303 embedding[0] = (mtx[9] * mtx[4] * mtx[8] + mtx[10] * mtx[5] * mtx[6] +
304 mtx[11] * mtx[3] * mtx[7] - mtx[9] * mtx[5] * mtx[7] -
305 mtx[10] * mtx[3] * mtx[8] - mtx[11] * mtx[4] * mtx[6]) /
306 det;
307 embedding[1] = (mtx[0] * mtx[10] * mtx[8] + mtx[1] * mtx[11] * mtx[6] +
308 mtx[2] * mtx[9] * mtx[7] - mtx[0] * mtx[11] * mtx[7] -
309 mtx[1] * mtx[9] * mtx[8] - mtx[2] * mtx[10] * mtx[6]) /
310 det;
311 embedding[2] = (mtx[0] * mtx[4] * mtx[11] + mtx[1] * mtx[5] * mtx[9] +
312 mtx[2] * mtx[3] * mtx[10] - mtx[0] * mtx[5] * mtx[10] -
313 mtx[1] * mtx[3] * mtx[11] - mtx[2] * mtx[4] * mtx[9]) /
314 det;
315 }
316}
317
318/*
319 * scoring
320 */
321template<class distf>
322static void
323sorted_dists_to_scores(vector<dist_id> &dists,
324 const size_t topn,
325 const size_t topnn,
326 const float boost)
327{
328 // compute the distance distribution for the scores
329 float mean = 0, sd = 0, wsum = 0;
330 for (size_t i = 0; i < topnn; ++i) {
331 const float tmp = distf::back(dists[i].dist);
332 const float w = 1 / float(i + 1);
333 mean += tmp * w;
334 sd += tmp * tmp * w;
335 wsum += w;
336 dists[i].dist = tmp;
337 }
338
339 mean /= wsum;
340 sd = boost / sqrtf(sd / wsum - sqrf(mean));
341 const float nmax = max_avoidance / dists[topnn - 1].dist;
342
343 // convert the stuff to scores
344 for (size_t i = 0; i < topn; ++i)
345 if (topn < topnn)
346 dists[i].dist = expf((mean - dists[i].dist) * sd) *
347 (1 - expf(dists[i].dist * nmax - max_avoidance));
348 else
349 dists[i].dist = expf((mean - dists[i].dist) * sd);
350}
351
352/*
353 * EmbedSOM function for a single point
354 */
355template<class distf, int embed_dim>
356static void
357embedsom_point(const size_t n_landmarks,
358 const size_t dim,
359 const float boost,
360 const size_t topn,
361 const float adjust,
362 const float *point,
363 const float *hidim_lm,
364 const float *lodim_lm,
365 float *embedding,
366 vector<dist_id> &dists)
367{
368 const size_t topnn = topn < n_landmarks ? topn + 1 : topn;
369
370 knn<distf>(point, hidim_lm, n_landmarks, dim, topnn, dists);
371
372 sorted_dists_to_scores<distf>(dists, topn, topnn, boost);
373
374 // create the empty equation matrix
375 float mtx[embed_dim * (1 + embed_dim)];
376 fill(mtx, mtx + embed_dim * (1 + embed_dim), 0);
377
378 // for all points in the neighborhood
379 for (size_t i = 0; i < topn; ++i) {
380 size_t idx = dists[i].id;
381 float score_i = dists[i].dist; // score of 'i'
382
383 float ilm[embed_dim]; // lodim_lm for point 'i'
384 copy_n(lodim_lm + embed_dim * idx, embed_dim, ilm);
385
386 /* this adds a really tiny influence of the point to
387 * prevent singularities */
388 add_gravity<embed_dim>(ilm, score_i, mtx);
389
390 // for all combinations of point 'i' with points in the
391 // neighborhood
392 for (size_t j = i + 1; j < topn; ++j) {
393
394 size_t jdx = dists[j].id;
395 float score_j = dists[j].dist; // score of 'j'
396
397 float jlm[embed_dim]; // lodim_lm for point 'j'
398 copy_n(lodim_lm + embed_dim * jdx, embed_dim, jlm);
399
400 float scalar, sqdist;
401 distf::proj(hidim_lm + dim * idx,
402 hidim_lm + dim * jdx,
403 point,
404 dim,
405 scalar,
406 sqdist);
407
408 if (sqdist == 0)
409 continue;
410 else
411 scalar /= sqdist;
412
413 add_approximation<embed_dim>(
414 score_i, score_j, ilm, jlm, scalar, adjust, mtx);
415 }
416 }
417
418 solve_lin_eq<embed_dim>(mtx, embedding);
419}
420
421void
422embedsom(size_t n,
423 size_t n_landmarks,
424 size_t dim,
425 float boost,
426 size_t topn,
427 float adjust,
428 const float *points,
429 const float *hidim_lm,
430 const float *lodim_lm,
431 float *embedding)
432{
433 if (topn > n_landmarks)
434 topn = n_landmarks;
435
436 const size_t topnn = topn < n_landmarks ? topn + 1 : topn;
437
438 if (n_landmarks < 3)
439 return;
440
441 vector<dist_id> dists;
442 dists.resize(topnn);
443
444 // TODO constkill the lowdim=2 template parameter
445 for (size_t i = 0; i < n; ++i)
446 embedsom_point<distfs::sqeucl, 2>(n_landmarks,
447 dim,
448 boost,
449 topn,
450 adjust,
451 points + dim * i,
452 hidim_lm,
453 lodim_lm,
454 embedding + 2 * i,
455 dists);
456}
static const float koho_gravity
Definition: embedsom.cpp:128
static void solve_lin_eq(const float *mtx, float *embedding)
Definition: embedsom.cpp:291
static void heap_down(dist_id *heap, size_t start, size_t lim)
Definition: embedsom.cpp:140
static const float min_boost
Definition: embedsom.cpp:119
static void embedsom_point(const size_t n_landmarks, const size_t dim, const float boost, const size_t topn, const float adjust, const float *point, const float *hidim_lm, const float *lodim_lm, float *embedding, vector< dist_id > &dists)
Definition: embedsom.cpp:357
void embedsom(size_t n, size_t n_landmarks, size_t dim, float boost, size_t topn, float adjust, const float *points, const float *hidim_lm, const float *lodim_lm, float *embedding)
Definition: embedsom.cpp:422
static float dotp_ec(const float *a, const float *b)
Definition: embedsom.cpp:234
static void add_gravity(const float *lodim_lm, float score, float *mtx)
Definition: embedsom.cpp:213
static float sqrf(float n)
Definition: embedsom.cpp:34
static void add_approximation(float score_i, float score_j, const float *ilm, const float *jlm, float scalar_proj, float adjust, float *mtx)
Definition: embedsom.cpp:244
static const float max_avoidance
Definition: embedsom.cpp:122
static void knn(const float *point, const float *hidim_lm, size_t n_landmarks, size_t dim, size_t topnn, vector< dist_id > &dists)
Definition: embedsom.cpp:171
static void sorted_dists_to_scores(vector< dist_id > &dists, const size_t topn, const size_t topnn, const float boost)
Definition: embedsom.cpp:323
static const float zero_avoidance
Definition: embedsom.cpp:125
float dist
Definition: embedsom.cpp:135
size_t id
Definition: embedsom.cpp:136
static void proj(const float *la, const float *lb, const float *p, const size_t dim, float &o_scalar, float &o_sqdist)
Definition: embedsom.cpp:70
static float back(float x)
Definition: embedsom.cpp:43
static float comp(const float *p1, const float *p2, const size_t dim)
Definition: embedsom.cpp:44