BlosSOM
Interactive dimensionality reduction on large datasets (EmbedSOM and FLOWER combined)
embedsom_cuda_projection.cu

/*
The MIT License

Copyright (c) 2021 Adam Smelko
                   Martin Krulis
                   Mirek Kratochvil

Permission is hereby granted, free of charge,
to any person obtaining a copy of this software and
associated documentation files (the "Software"), to
deal in the Software without restriction, including
without limitation the rights to use, copy, modify,
merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom
the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#include "embedsom_cuda.h"

#define _CG_ABI_EXPERIMENTAL

#include "cooperative_groups.h"
#include "cooperative_groups/reduce.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

namespace cg = cooperative_groups;

template<typename F = float>
class EmbedSOMConstants
{
public:
    static constexpr F minBoost = F(1e-5);
    static constexpr F maxAvoidance = F(10);
    static constexpr F zeroAvoidance = F(1e-10);
    static constexpr F gridGravity = F(1e-5);
};
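
/* Rough roles of the constants above, as inferred from their uses in this
 * file: `gridGravity` scales a weak pull of each point towards its nearby
 * landmarks (which also keeps the 2x2 projection system well-conditioned),
 * `zeroAvoidance` guards divisions by (near-)zero squared landmark distances,
 * and `maxAvoidance` caps the exponent of the avoidance term in the score
 * computation. `minBoost` is not referenced in this file. */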

/** Helper structure to transform neighbor distances to scores.
 *
 * Shares the `distance` field for storing scores and loading distances.
 */
template<typename F>
struct SharedNeighborStorage
{
    knn_entry<F> *const __restrict__ neighbors;

    __forceinline__ __device__ F getNeighborDistance(const uint32_t idx) const
    {
        return neighbors[idx].distance;
    }

    __forceinline__ __device__ void storeScore(const uint32_t idx,
                                               const F score)
    {
        neighbors[idx].distance = score;
    }
};

/** Helper structure for transforming neighbor distances to scores.
 *
 * Keeps separate arrays for distances and scores.
 */
template<typename F>
struct NeighborScoreStorage
{
    const knn_entry<F> *const __restrict__ neighbors;
    F *const __restrict__ scores;

    __forceinline__ __device__ F getNeighborDistance(const uint32_t idx) const
    {
        return neighbors[idx].distance;
    }

    __forceinline__ __device__ void storeScore(const uint32_t idx,
                                               const F score)
    {
        scores[idx] = score;
    }
};

/** Sum a value across the warp */
template<typename F>
__inline__ __device__ F
warpReduceSum(F val)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}
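
/* A note on the loop above: each iteration adds the value held by the lane
 * `offset` positions higher, halving the stride every step, so after the last
 * iteration lane 0 holds the sum of all 32 lanes (the other lanes only hold
 * partial sums). */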

/** Read aligned data entries for EmbedSOM projection */
template<typename F, typename ArrayF>
__inline__ __device__ void
readAligned(F *const __restrict__ dst,
            const F *const __restrict__ src,
            const knn_entry<F> *const __restrict__ neighbors,
            const uint32_t n,
            const uint32_t dim,
            const uint32_t groupRank,
            const uint32_t groupSize)
{
    constexpr auto size = sizeof(ArrayF) / sizeof(F);

    const uint32_t loadsCount = dim / size;
    const uint32_t dimX = dim / size;

    ArrayF *const dstX = reinterpret_cast<ArrayF *>(dst);
    const ArrayF *const srcX = reinterpret_cast<const ArrayF *>(src);

    for (size_t i = groupRank; i < n * loadsCount; i += groupSize) {
        const auto idx = i / loadsCount;
        const auto off = i % loadsCount;

        dstX[idx * dimX + off] = srcX[neighbors[idx].index * dimX + off];
    }
}

/** Version of readAligned() for the 2D grid */
template<typename F>
__inline__ __device__ void
readAlignedGrid2D(F *const __restrict__ dst,
                  const F *const __restrict__ src,
                  const knn_entry<F> *const __restrict__ neighbors,
                  const uint32_t n,
                  const uint32_t groupRank,
                  const uint32_t groupSize)
{
    using ArrayT = typename Vec<2, F>::Type;

    ArrayT *const dstX = reinterpret_cast<ArrayT *>(dst);
    const ArrayT *const srcX = reinterpret_cast<const ArrayT *>(src);

    for (size_t i = groupRank; i < n; i += groupSize)
        dstX[i] = srcX[neighbors[i].index];
}

/** Load data into the shared memory cache */
template<typename F>
__inline__ __device__ void
storeToCache(const uint32_t groupRank,
             const uint32_t groupSize,
             const F *const __restrict__ points,
             F *const __restrict__ pointsCache,
             const F *const __restrict__ grid,
             F *const __restrict__ gridCache,
             const F *const __restrict__ grid2d,
             F *const __restrict__ grid2dCache,
             const knn_entry<F> *const __restrict__ neighbors,
             const uint32_t dim,
             const uint32_t gridCacheLeadingDim,
             const uint32_t k)
{
    auto copyIdx = groupRank;
    for (; copyIdx < dim; copyIdx += groupSize)
        pointsCache[copyIdx] = points[copyIdx];

    copyIdx -= dim;

    for (; copyIdx < k * dim; copyIdx += groupSize) {
        auto globIdx = copyIdx / dim;
        auto globOff = copyIdx % dim;
        gridCache[globIdx * gridCacheLeadingDim + globOff] =
          grid[neighbors[globIdx].index * dim + globOff];
    }

    readAlignedGrid2D<F>(
      grid2dCache, grid2d, neighbors, k, groupRank, groupSize);
}

/** Compute scores for a vector of sorted distances */
template<typename F>
__inline__ __device__ void
sortedDistsToScores(knn_entry<F> *const __restrict__ neighbors,
                    const std::size_t adjustedK,
                    const std::size_t k,
                    const F boost)
{
    // compute the distance distribution for the scores
    F mean = 0, sd = 0, wsum = 0;
    for (uint32_t i = 0; i < adjustedK; ++i) {
        const F tmp = sqrt(neighbors[i].distance);
        const F w = 1 / F(i + 1);
        mean += tmp * w;
        sd += tmp * tmp * w;
        wsum += w;
        neighbors[i].distance = tmp;
    }

    mean /= wsum;
    sd = boost / sqrt(sd / wsum - mean * mean);
    const F nmax =
      EmbedSOMConstants<F>::maxAvoidance / neighbors[adjustedK - 1].distance;

    // convert the stuff to scores
    for (uint32_t i = 0; i < k; ++i) {
        if (k < adjustedK)
            neighbors[i].distance =
              exp((mean - neighbors[i].distance) * sd) *
              (1 - exp(neighbors[i].distance * nmax -
                       EmbedSOMConstants<F>::maxAvoidance));
        else
            neighbors[i].distance = exp((mean - neighbors[i].distance) * sd);
    }
}
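
/* In math form, a sketch of what sortedDistsToScores() computes: with weights
 * w_i = 1/(i+1) over the square-rooted kNN distances d_i,
 *
 *   mean = (sum_i w_i d_i) / (sum_i w_i)
 *   sd   = boost / sqrt((sum_i w_i d_i^2) / (sum_i w_i) - mean^2)
 *   nmax = maxAvoidance / d_(adjustedK-1)
 *
 * and each of the first k distances is replaced by the score
 *
 *   score_i = exp((mean - d_i) * sd) * (1 - exp(d_i * nmax - maxAvoidance)),
 *
 * where the trailing avoidance factor is only applied if an extra (k+1)-th
 * neighbor was available (i.e., k < adjustedK). */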

/** Convert the distances to scores using thread_block_tile and its built-in
 * functions for reduction and shuffle.
 */
template<typename F, class SCORE_STORAGE, class TILE>
__inline__ __device__ void
sortedDistsToScoresGroup(const TILE &tile,
                         SCORE_STORAGE storage,
                         const std::size_t adjustedK,
                         const std::size_t k,
                         const F boost)
{
    // TRICKY: if k is big enough and the tile is small enough, this array may
    // overflow. Should be MAX_K / tile.size()
    F tmpScores[10];
    F lastScore;

    // compute the distance distribution for the scores
    F mean = 0, sd = 0, wsum = 0;
    for (uint32_t i = tile.thread_rank(); i < adjustedK; i += tile.size()) {
        const F tmp = sqrt(storage.getNeighborDistance(i));
        const F w = 1 / F(i + 1);
        mean += tmp * w;
        wsum += w;
        tmpScores[i / tile.size()] = tmp;
    }

    {
        mean = cg::reduce(tile, mean, cg::plus<F>());
        wsum = cg::reduce(tile, wsum, cg::plus<F>());
    }

    mean /= wsum;

    for (uint32_t i = tile.thread_rank(); i < adjustedK; i += tile.size()) {
        const F tmp = tmpScores[i / tile.size()] - mean;
        const F w = 1 / F(i + 1);
        sd += tmp * tmp * w;
    }

    {
        sd = cg::reduce(tile, sd, cg::plus<F>());

        const auto lastScoreThreadIdx = (adjustedK - 1) % tile.size();
        const auto lastScoreIdx = (adjustedK - 1) / tile.size();
        lastScore = tile.shfl(tmpScores[lastScoreIdx], lastScoreThreadIdx);
    }

    sd = boost / sqrt(sd / wsum);
    const F nmax = EmbedSOMConstants<F>::maxAvoidance / lastScore;

    // convert the stuff to scores
    if (k < adjustedK)
        for (uint32_t i = tile.thread_rank(); i < k; i += tile.size()) {
            const auto scoreIdx = i / tile.size();
            const F score = exp((mean - tmpScores[scoreIdx]) * sd) *
                            (1 - exp(tmpScores[scoreIdx] * nmax -
                                     EmbedSOMConstants<F>::maxAvoidance));
            storage.storeScore(i, score);
        }
    else
        for (uint32_t i = tile.thread_rank(); i < k; i += tile.size())
            storage.storeScore(i,
                               exp((mean - tmpScores[i / tile.size()]) * sd));
}
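
/* Storage note for the tile-parallel version above: neighbor i is handled by
 * tile thread i % tile.size(), which keeps the intermediate value in its
 * private tmpScores[i / tile.size()]. The sequence is thus striped across the
 * registers of the tile, which is why the last distance must be fetched with
 * a shuffle before nmax can be computed. */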

/** Add single-point approximation to the matrix */
template<typename F>
__inline__ __device__ void
addGravity(const F score,
           const F *const __restrict__ grid2DPoint,
           F *const __restrict__ mtx)
{
    const F gs = score * EmbedSOMConstants<F>::gridGravity;

    mtx[0] += gs;
    mtx[3] += gs;
    mtx[4] += gs * grid2DPoint[0];
    mtx[5] += gs * grid2DPoint[1];
}

/** Add single-point approximation to the matrix, on 2-vectors */
template<typename F>
__inline__ __device__ void
addGravity2Wise(const F score,
                const F *const __restrict__ grid2DPoint,
                F *const __restrict__ mtx)
{
    const F gs = score * EmbedSOMConstants<F>::gridGravity;

    const typename Vec<2, F>::Type tmpGrid2d =
      reinterpret_cast<const typename Vec<2, F>::Type *>(grid2DPoint)[0];

    mtx[0] += gs;
    mtx[3] += gs;
    mtx[4] += gs * tmpGrid2d.x;
    mtx[5] += gs * tmpGrid2d.y;
}

/** Compute a projection of the point to a line defined by points I and J */
template<typename F>
__inline__ __device__ typename Vec<2, F>::Type
euclideanProjection(const F *const __restrict__ point,
                    const F *const __restrict__ gridPointI,
                    const F *const __restrict__ gridPointJ,
                    const uint32_t dim)
{
    typename Vec<2, F>::Type result{ 0.0, 0.0 };
    for (uint32_t k = 0; k < dim; ++k) {
        const F tmp = gridPointJ[k] - gridPointI[k];
        result.y += tmp * tmp;
        result.x += tmp * (point[k] - gridPointI[k]);
    }
    return result;
}
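
/* In vector terms, the value returned by euclideanProjection() is
 *
 *   result.x = (p - g_I) . (g_J - g_I)    (unnormalized scalar projection)
 *   result.y = ||g_J - g_I||^2
 *
 * so the caller recovers the relative position t of point p on the line from
 * landmark I to landmark J as t = result.x / result.y (t = 0 at I, t = 1 at
 * J). */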

/** Run the projection on 4-vectors */
template<typename F>
__inline__ __device__ typename Vec<2, F>::Type
euclideanProjection4Wise(const F *const __restrict__ point,
                         const F *const __restrict__ gridPointI,
                         const F *const __restrict__ gridPointJ,
                         const uint32_t dim)
{
    const auto *const __restrict__ gridPointI4 =
      reinterpret_cast<const typename Vec<4, F>::Type *>(gridPointI);
    const auto *const __restrict__ gridPointJ4 =
      reinterpret_cast<const typename Vec<4, F>::Type *>(gridPointJ);
    const auto *const __restrict__ point4 =
      reinterpret_cast<const typename Vec<4, F>::Type *>(point);

    typename Vec<2, F>::Type result{ 0.0, 0.0 };

#define DOIT(X)                                                                \
    tmp = tmpGridJ.X - tmpGridI.X;                                             \
    result.y += tmp * tmp;                                                     \
    result.x += tmp * (tmpPoint.X - tmpGridI.X)

    for (uint32_t k = 0; k < dim / 4; ++k) {
        const auto tmpGridI = gridPointI4[k];
        const auto tmpGridJ = gridPointJ4[k];
        const auto tmpPoint = point4[k];

        F tmp;
        DOIT(x);
        DOIT(y);
        DOIT(z);
        DOIT(w);
    }
#undef DOIT

    for (uint32_t k = dim - (dim % 4); k < dim; ++k) {
        const F tmp = gridPointJ[k] - gridPointI[k];
        result.y += tmp * tmp;
        result.x += tmp * (point[k] - gridPointI[k]);
    }

    return result;
}
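
/* Note that the reinterpret_casts above assume that the cached vectors are
 * 16-byte aligned; the host code at the bottom of this file rounds the cache
 * leading dimension up to a multiple of 16 bytes for exactly this reason.
 * Dimensions that are not a multiple of 4 are finished by the scalar tail
 * loop. */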

/** Add the result of projections to the approximation matrix */
template<typename F>
__inline__ __device__ void
addApproximation(const F scoreI,
                 const F scoreJ,
                 const F *const __restrict__ grid2DPointI,
                 const F *const __restrict__ grid2DPointJ,
                 const F adjust,
                 const F scalarProjection,
                 F *const __restrict__ mtx)
{
    F h[2], hp = 0;
#pragma unroll
    for (uint32_t i = 0; i < 2; ++i) {
        h[i] = grid2DPointJ[i] - grid2DPointI[i];
        hp += h[i] * h[i];
    }

    if (hp < EmbedSOMConstants<F>::zeroAvoidance)
        return;

    const F exponent = scalarProjection - .5;
    const F s =
      scoreI * scoreJ * pow(1 + hp, adjust) * exp(-exponent * exponent);
    const F sihp = s / hp;
    const F rhsc = s * (scalarProjection +
                        (h[0] * grid2DPointI[0] + h[1] * grid2DPointI[1]) / hp);

    mtx[0] += h[0] * h[0] * sihp;
    mtx[1] += h[0] * h[1] * sihp;
    mtx[2] += h[1] * h[0] * sihp;
    mtx[3] += h[1] * h[1] * sihp;
    mtx[4] += h[0] * rhsc;
    mtx[5] += h[1] * rhsc;
}
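
/* Each addApproximation() call accumulates one neighbor pair's contribution
 * into the symmetric 2x2 system
 *
 *   [ mtx[0] mtx[1] ] [ e_x ]   [ mtx[4] ]
 *   [ mtx[2] mtx[3] ] [ e_y ] = [ mtx[5] ]
 *
 * which the kernels below finally solve by Cramer's rule; the solution e is
 * the embedded 2D position of the point. */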
411
412/** Same as addApproximation(), on 2-vectors. */
413template<typename F>
414__inline__ __device__ void
416 const F scoreJ,
417 const F *const __restrict__ grid2DPointI,
418 const F *const __restrict__ grid2DPointJ,
419 const F adjust,
420 const F scalarProjection,
421 F *const __restrict__ mtx)
422{
423 const typename Vec<2, F>::Type tmpGrid2dI =
424 reinterpret_cast<const typename Vec<2, F>::Type *>(grid2DPointI)[0];
425 const typename Vec<2, F>::Type tmpGrid2dJ =
426 reinterpret_cast<const typename Vec<2, F>::Type *>(grid2DPointJ)[0];
427
428 const F h[2]{ tmpGrid2dJ.x - tmpGrid2dI.x, tmpGrid2dJ.y - tmpGrid2dI.y };
429 const F hp = h[0] * h[0] + h[1] * h[1];
430
432 return;
433
434 const F exponent = scalarProjection - .5;
435 const F s =
436 scoreI * scoreJ * pow(1 + hp, adjust) * exp(-exponent * exponent);
437 const F sihp = s / hp;
438 const F rhsc =
439 s * (scalarProjection + (h[0] * tmpGrid2dI.x + h[1] * tmpGrid2dI.y) / hp);
440
441 mtx[0] += h[0] * h[0] * sihp;
442 mtx[1] += h[0] * h[1] * sihp;
443 mtx[2] += h[1] * h[0] * sihp;
444 mtx[3] += h[1] * h[1] * sihp;
445 mtx[4] += h[0] * rhsc;
446 mtx[5] += h[1] * rhsc;
447}

/** Type tag for indexing by small rectangles.
 *
 * "Concatenates" 2 columns into one (1st and (k-1)th, 2nd and (k-2)th, ...)
 * and creates indexing on top of a k * k/2 rectangle.
 * No branch divergence.
 * Thread index assignments:
 *  k| 0 1 2 3 4
 *  -+--------------------
 *  0|   0 1 2 3
 *  1|     5 6 7
 *  2|       8 9
 *  3|         4
 *  4|
 */
struct RectangleIndexer
{};

/** Function template for getting indexes */
template<typename INDEXER>
__inline__ __device__ uint2
getIndices(uint32_t plainIndex, uint32_t k)
{
    // intentionally left empty; only the specializations below are used
}

/** Specialization of getIndices for the rectangle indexing. */
template<>
__inline__ __device__ uint2
getIndices<RectangleIndexer>(uint32_t plainIndex, uint32_t k)
{
    uint2 indices;
    const uint32_t tempI = plainIndex / k;
    const uint32_t tempJ = plainIndex % k;
    const auto invertedI = k - 1 - tempI;
    indices.x = tempJ < invertedI ? tempI : invertedI - 1;
    indices.y = (tempJ < invertedI ? tempJ : tempJ - invertedI) + indices.x + 1;
    return indices;
}
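
/* A minimal host-side sketch (illustrative only, not part of the build) of
 * why the rectangle indexing works: for any k, the k*(k-1)/2 plain indices
 * map onto every unordered pair {x, y} with x < y < k exactly once. */
#if 0
#include <cassert>
#include <cstdint>
#include <set>
#include <utility>

static void
checkRectangleIndexer(uint32_t k)
{
    std::set<std::pair<uint32_t, uint32_t>> seen;
    const uint32_t pairs = k * (k - 1) / 2;
    for (uint32_t p = 0; p < pairs; ++p) {
        // same arithmetic as getIndices<RectangleIndexer>()
        const uint32_t tempI = p / k;
        const uint32_t tempJ = p % k;
        const uint32_t invertedI = k - 1 - tempI;
        const uint32_t x = tempJ < invertedI ? tempI : invertedI - 1;
        const uint32_t y =
          (tempJ < invertedI ? tempJ : tempJ - invertedI) + x + 1;
        assert(x < y && y < k);
        seen.insert({ x, y });
    }
    assert(seen.size() == pairs); // every pair produced exactly once
}
#endif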

/** Base projection kernel (each thread does everything for a single point). */
template<typename F>
__global__ void
projectionBaseKernel(const F *__restrict__ points,
                     const F *const __restrict__ grid,
                     const F *const __restrict__ grid2d,
                     knn_entry<F> *__restrict__ neighbors,
                     F *__restrict__ projections,
                     const uint32_t dim,
                     const uint32_t n,
                     const uint32_t gridSize,
                     const uint32_t k,
                     const F adjust,
                     const F boost)
{
    // assign defaults and generate scores
    {
        const uint32_t adjustedK = k < gridSize ? k + 1 : k;
        const uint32_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x;
        if (pointIdx >= n)
            return;
        points = points + pointIdx * dim;
        neighbors = neighbors + pointIdx * adjustedK;
        projections = projections + pointIdx * 2;
        sortedDistsToScores<F>(neighbors, adjustedK, k, boost);
    }

    F mtx[6];
    memset(mtx, 0, 6 * sizeof(F));
    for (uint32_t i = 0; i < k; ++i) {
        const uint32_t idxI = neighbors[i].index;
        const F scoreI = neighbors[i].distance;
        addGravity(scoreI, grid2d + idxI * 2, mtx);
        for (uint32_t j = i + 1; j < k; ++j) {
            const uint32_t idxJ = neighbors[j].index;
            const F scoreJ = neighbors[j].distance;
            const auto result = euclideanProjection<F>(
              points, grid + idxI * dim, grid + idxJ * dim, dim);
            F scalarProjection = result.x;
            const F squaredGridPointsDistance = result.y;
            if (squaredGridPointsDistance == F(0))
                continue;
            scalarProjection /= squaredGridPointsDistance;
            addApproximation(scoreI,
                             scoreJ,
                             grid2d + idxI * 2,
                             grid2d + idxJ * 2,
                             adjust,
                             scalarProjection,
                             mtx);
        }
    }
    // solve linear equation
    const F det = mtx[0] * mtx[3] - mtx[1] * mtx[2];
    projections[0] = (mtx[4] * mtx[3] - mtx[5] * mtx[2]) / det;
    projections[1] = (mtx[0] * mtx[5] - mtx[1] * mtx[4]) / det;
}

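/* Per-group shared memory layout used by the kernel below, in floats, with
 * lead = cacheLeadingDim (every group's slice starts 16-byte aligned):
 *
 *   [0, lead)                  pointCache   (the projected point)
 *   [lead, (k+1)*lead)         gridCache    (k landmark vectors, padded rows)
 *   [(k+1)*lead, .. + 2*k)     grid2dCache  (k low-dimensional landmarks)
 *   [.. + 2*k, .. + 3*k)       scoreCache   (k neighbor scores)
 *
 * plus padding to the next multiple of lead; after the slices of all groups
 * follows one float per warp of scratch space for finishing the block-wide
 * matrix reduction. */
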
/** Aligned-shared-memory projection kernel.
 *
 * One thread group computes the embedding of one point (a block runs several
 * groups at once); the per-thread partial matrices are reduced with
 * cooperative-groups tile reductions and finished through shared memory.
 */
template<typename F, typename INDEXER, size_t tileSize>
__global__ void
projectionAlignedShMemoryKernel(const F *__restrict__ points,
                                const F *const __restrict__ grid,
                                const F *const __restrict__ grid2d,
                                knn_entry<F> *__restrict__ neighbors,
                                F *__restrict__ projections,
                                const uint32_t dim,
                                const uint32_t n,
                                const uint32_t gridSize,
                                const uint32_t k,
                                const F adjust,
                                const F boost,
                                const uint32_t groupSize,
                                const uint32_t cacheLeadingDim)
{
    extern __shared__ char sharedMemory[];

    const uint32_t groupRank = threadIdx.x % groupSize;
    const uint32_t groupIdx = threadIdx.x / groupSize;
    const uint32_t groupsCount = blockDim.x / groupSize;

    const auto grid2dPadding =
      (k * 3) % cacheLeadingDim == 0
        ? 0
        : cacheLeadingDim - ((k * 3) % cacheLeadingDim);
    auto sharedMemoryoff =
      reinterpret_cast<F *>(sharedMemory) +
      ((k + 1) * cacheLeadingDim + k * 3 + grid2dPadding) * groupIdx;

    F *const __restrict__ pointCache = sharedMemoryoff;
    F *const __restrict__ gridCache = sharedMemoryoff + cacheLeadingDim;
    F *const __restrict__ grid2dCache =
      sharedMemoryoff + (k + 1) * cacheLeadingDim;
    F *const __restrict__ scoreCache = grid2dCache + k * 2;

    F *const __restrict__ reduceFinishStorage =
      reinterpret_cast<F *>(sharedMemory) +
      ((k + 1) * cacheLeadingDim + k * 3 + grid2dPadding) * groupsCount;

    auto tile = cg::tiled_partition<tileSize>(cg::this_thread_block());

    // assign defaults and generate scores
    {
        const uint32_t adjustedK = k < gridSize ? k + 1 : k;

        const auto workIdx = blockIdx.x * groupsCount + groupIdx;

        if (workIdx >= n)
            return;

        points = points + workIdx * dim;
        neighbors = neighbors + workIdx * adjustedK;
        projections = projections + workIdx * 2;

        if (groupRank < tile.size())
            sortedDistsToScoresGroup<F>(
              tile,
              NeighborScoreStorage<F>{ neighbors, scoreCache },
              adjustedK,
              k,
              boost);
        else
            storeToCache(groupRank - tile.size(),
                         groupSize - tile.size(),
                         points,
                         pointCache,
                         grid,
                         gridCache,
                         grid2d,
                         grid2dCache,
                         neighbors,
                         dim,
                         cacheLeadingDim,
                         k);

        if (groupSize == tile.size())
            storeToCache(groupRank,
                         groupSize,
                         points,
                         pointCache,
                         grid,
                         gridCache,
                         grid2d,
                         grid2dCache,
                         neighbors,
                         dim,
                         cacheLeadingDim,
                         k);

        __syncthreads();
    }

    F mtx[6];
    memset(mtx, 0, 6 * sizeof(F));

    for (uint32_t i = groupRank; i < k; i += groupSize)
        addGravity2Wise(scoreCache[i], grid2dCache + i * 2, mtx);

    const uint32_t neighborPairs = (k * (k - 1)) / 2;
    for (uint32_t i = groupRank; i < neighborPairs; i += groupSize) {
        const auto indices = getIndices<INDEXER>(i, k);

        const auto I = indices.x;
        const auto J = indices.y;

        const auto result =
          euclideanProjection4Wise<F>(pointCache,
                                      gridCache + I * cacheLeadingDim,
                                      gridCache + J * cacheLeadingDim,
                                      dim);
        F scalarProjection = result.x;
        const F squaredGridPointsDistance = result.y;

        if (squaredGridPointsDistance == F(0))
            continue;

        scalarProjection /= squaredGridPointsDistance;

        addApproximation2Wise(scoreCache[I],
                              scoreCache[J],
                              grid2dCache + I * 2,
                              grid2dCache + J * 2,
                              adjust,
                              scalarProjection,
                              mtx);
    }

#pragma unroll
    for (size_t i = 0; i < 6; ++i) {
        mtx[i] = cg::reduce(tile, mtx[i], cg::plus<F>());

        const auto warpId = threadIdx.x / warpSize;

        if (threadIdx.x % warpSize == 0 && groupRank != 0)
            reduceFinishStorage[warpId] = mtx[i];

        __syncthreads();

        if (groupRank == 0) {
            for (uint32_t j = 1; j < groupSize / warpSize; ++j) {
                mtx[i] += reduceFinishStorage[warpId + j];
            }
        }
    }

    if (groupRank == 0) {
        const F det = mtx[0] * mtx[3] - mtx[1] * mtx[2];
        projections[0] = (mtx[4] * mtx[3] - mtx[5] * mtx[2]) / det;
        projections[1] = (mtx[0] * mtx[5] - mtx[1] * mtx[4]) / det;
    }
}

// TODO this might be used as a backup solution for extreme params
#if 0
void
EsomCuda::runProjectionBaseKernel(float boost, float adjust)
{
    unsigned int blockSize = 256;
    unsigned int blockCount = (mPointsCount + blockSize - 1) / blockSize;
    projectionBaseKernel<float><<<blockCount, blockSize>>>(
      mCuPoints,
      mCuLandmarksHighDim,
      mCuLandmarksLowDim,
      reinterpret_cast<::knn_entry<float> *>(mCuknn_entry),
      mCuEmbedding,
      mDim,
      mPointsCount,
      mLandmarksCount,
      mTopK,
      adjust,
      boost);

    CUCH(cudaGetLastError());
}
#endif

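/* The dynamic shared memory size requested below mirrors the layout sketched
 * above the kernel: one float per warp of reduction scratch, plus, per group,
 * (k + 1) padded cache rows (the point and the k landmark vectors) and the
 * 3*k floats of 2D-landmark and score caches rounded up to the cache leading
 * dimension. */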
void
EmbedSOMCUDAContext::runProjectionKernel(size_t d,
                                         size_t n,
                                         size_t g,
                                         size_t k,
                                         float boost,
                                         float adjust)
{
    constexpr size_t alignment = 16;
    auto pointBytes = d * sizeof(float);
    auto dimCacheResidual = pointBytes % alignment;
    auto gridCacheLeadingDim =
      pointBytes + (dimCacheResidual == 0 ? 0 : alignment - dimCacheResidual);
    gridCacheLeadingDim /= sizeof(float);

    auto blockSize = 128;
    auto groupSize = 32;
    auto groupsPerBlock = blockSize / groupSize;
    unsigned int blockCount = (n + groupsPerBlock - 1) / groupsPerBlock;
    auto warpCount = blockSize / 32;
    auto grid2dPadding =
      (k * 3) % gridCacheLeadingDim == 0
        ? 0
        : gridCacheLeadingDim - ((k * 3) % gridCacheLeadingDim);
    auto sharedMem =
      sizeof(float) * warpCount +
      sizeof(float) * (k + 1) * gridCacheLeadingDim * groupsPerBlock +
      sizeof(float) * (k * 3 + grid2dPadding) * groupsPerBlock;

    projectionAlignedShMemoryKernel<float, RectangleIndexer, 32>
      <<<blockCount, blockSize, sharedMem>>>(data,
                                             lm_hi,
                                             lm_lo,
                                             knns,
                                             points,
                                             d,
                                             n,
                                             g,
                                             k,
                                             adjust,
                                             boost,
                                             groupSize,
                                             gridCacheLeadingDim);

    CUCH(cudaGetLastError());
}