#define _CG_ABI_EXPERIMENTAL

#include "cooperative_groups.h"
#include "cooperative_groups/reduce.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

namespace cg = cooperative_groups;
template <typename F = float>

// storeScore() members of the two neighbor-score storage helpers
// (struct and body definitions elided in this listing):
__forceinline__ __device__ void storeScore(const uint32_t idx, const F score);

__forceinline__ __device__ void storeScore(const uint32_t idx, const F score);
// Sum a value across the warp.
template <typename F>
__inline__ __device__ F warpReduceSum(F val)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}
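// Hypothetical usage sketch (not part of the original file; the kernel and
// buffer names are illustrative): each warp sums 32 consecutive elements and
// its first lane writes one partial sum. Assumes blockDim.x is a multiple of
// warpSize; out-of-range lanes contribute zero.
template <typename F>
__global__ void warpSumsSketch(const F *in, F *partialSums, uint32_t n)
{
    const uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    F v = i < n ? in[i] : F(0); // all lanes stay active for the shuffles
    v = warpReduceSum<F>(v);
    if (threadIdx.x % warpSize == 0)
        partialSums[i / warpSize] = v; // one partial sum per warp
}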
// Read aligned data entries for the EmbedSOM projection (vectorized copy of
// neighbor rows from src to dst).
template <typename F, typename ArrayF>
__inline__ __device__ void readAligned(F *const __restrict__ dst,
                                       const F *const __restrict__ src,
                                       const knn_entry<F> *const __restrict__ neighbors,
                                       const uint32_t n,
                                       const uint32_t dim,
                                       const uint32_t groupRank,
                                       const uint32_t groupSize)
{
    // how many elements of F fit into one vector load
    constexpr auto size = sizeof(ArrayF) / sizeof(F);

    const uint32_t loadsCount = dim / size;
    const uint32_t dimX = dim / size;

    ArrayF *const dstX = reinterpret_cast<ArrayF *>(dst);
    const ArrayF *const srcX = reinterpret_cast<const ArrayF *>(src);

    for (size_t i = groupRank; i < n * loadsCount; i += groupSize) {
        const auto idx = i / loadsCount;
        const auto off = i % loadsCount;
        dstX[idx * dimX + off] = srcX[neighbors[idx].index * dimX + off];
    }
}
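// Reading aid (not from the original file): with F = float and ArrayF = float4,
// size == 4, so iteration i copies the (i % loadsCount)-th float4 of neighbor
// i / loadsCount. For dim == 16, loadsCount == dimX == 4 and a group of 32
// threads moves 8 neighbor rows per pass; this assumes dim is a multiple of 4
// and that src and dst are 16-byte aligned.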
// Version of readAligned() for the 2D grid (one 2-vector per neighbor).
template <typename F, typename ArrayT = typename Vec<2, F>::Type>
__inline__ __device__ void readAlignedGrid2D(F *const __restrict__ dst,
                                             const F *const __restrict__ src,
                                             const knn_entry<F> *const __restrict__ neighbors,
                                             const uint32_t n,
                                             const uint32_t groupRank,
                                             const uint32_t groupSize)
{
    ArrayT *const dstX = reinterpret_cast<ArrayT *>(dst);
    const ArrayT *const srcX = reinterpret_cast<const ArrayT *>(src);

    for (size_t i = groupRank; i < n; i += groupSize)
        dstX[i] = srcX[neighbors[i].index];
}
// Load the point, its k nearest grid points and their 2D images into the
// shared-memory cache of the group.
template <typename F>
__inline__ __device__ void storeToCache(const uint32_t groupRank,
                                        const uint32_t groupSize,
                                        const F *const __restrict__ points,
                                        F *const __restrict__ pointsCache,
                                        const F *const __restrict__ grid,
                                        F *const __restrict__ gridCache,
                                        const F *const __restrict__ grid2d,
                                        F *const __restrict__ grid2dCache,
                                        const knn_entry<F> *const __restrict__ neighbors,
                                        const uint32_t dim,
                                        const uint32_t gridCacheLeadingDim,
                                        const uint32_t k)
{
    // copy the high-dimensional point itself
    auto copyIdx = groupRank;
    for (; copyIdx < dim; copyIdx += groupSize)
        pointsCache[copyIdx] = points[copyIdx];

    // continue with the k neighbor grid points, rows padded to gridCacheLeadingDim
    copyIdx -= dim;
    for (; copyIdx < k * dim; copyIdx += groupSize) {
        auto globIdx = copyIdx / dim;
        auto globOff = copyIdx % dim;
        gridCache[globIdx * gridCacheLeadingDim + globOff] =
            grid[neighbors[globIdx].index * dim + globOff];
    }

    // finally, the 2D coordinates of the neighbor grid points
    readAlignedGrid2D<F>(grid2dCache, grid2d, neighbors, k, groupRank, groupSize);
}
// Compute scores for a sorted vector of neighbor distances (single-thread version).
template <typename F>
__inline__ __device__ void sortedDistsToScores(knn_entry<F> *const __restrict__ neighbors,
                                               const std::size_t adjustedK,
                                               const std::size_t k,
                                               const F boost)
{
    // weighted mean and standard deviation of the sqrt-distances
    F mean = 0, sd = 0, wsum = 0;
    for (uint32_t i = 0; i < adjustedK; ++i) {
        const F tmp = sqrt(neighbors[i].distance);
        const F w = 1 / F(i + 1);
        mean += tmp * w;
        sd += tmp * tmp * w;
        wsum += w;
        neighbors[i].distance = tmp;
    }
    mean /= wsum;
    sd = boost / sqrt(sd / wsum - mean * mean);
    const F nmax = maxAvoidance / neighbors[adjustedK - 1].distance;
    // convert the distances to scores; the avoidance factor only applies when an extra neighbor is available
    for (uint32_t i = 0; i < k; ++i)
        neighbors[i].distance =
            k < adjustedK
                ? exp((mean - neighbors[i].distance) * sd) *
                      (1 - exp(neighbors[i].distance * nmax - maxAvoidance))
                : exp((mean - neighbors[i].distance) * sd);
}
// Convert neighbor distances to scores cooperatively, using a thread_block_tile
// and its built-in reductions; results are written through the SCORE_STORAGE.
template <typename F, class SCORE_STORAGE, class TILE>
__inline__ __device__ void sortedDistsToScoresGroup(const TILE &tile,
                                                    SCORE_STORAGE storage,
                                                    const std::size_t adjustedK,
                                                    const std::size_t k,
                                                    const F boost)
{
    // tmpScores[] and lastScore are per-thread locals (declarations elided);
    // tmpScores stages this thread's sqrt-distances, indexed by i / tile.size()
    F mean = 0, sd = 0, wsum = 0;
    for (uint32_t i = tile.thread_rank(); i < adjustedK; i += tile.size()) {
        const F tmp = sqrt(storage.getNeighborDistance(i));
        const F w = 1 / F(i + 1);
        mean += tmp * w;
        wsum += w;
        tmpScores[i / tile.size()] = tmp;
    }

    mean = cg::reduce(tile, mean, cg::plus<F>());
    wsum = cg::reduce(tile, wsum, cg::plus<F>());
    mean /= wsum;

    for (uint32_t i = tile.thread_rank(); i < adjustedK; i += tile.size()) {
        const F tmp = tmpScores[i / tile.size()] - mean;
        const F w = 1 / F(i + 1);
        sd += tmp * tmp * w;
    }

    sd = cg::reduce(tile, sd, cg::plus<F>());

    // broadcast the last staged sqrt-distance from the thread that owns it
    const auto lastScoreThreadIdx = (adjustedK - 1) % tile.size();
    const auto lastScoreIdx = (adjustedK - 1) / tile.size();
    lastScore = tile.shfl(tmpScores[lastScoreIdx], lastScoreThreadIdx);

    sd = boost / sqrt(sd / wsum);

    if (k < adjustedK) {
        const F nmax = maxAvoidance / lastScore;
        for (uint32_t i = tile.thread_rank(); i < k; i += tile.size()) {
            const auto scoreIdx = i / tile.size();
            const F score = exp((mean - tmpScores[scoreIdx]) * sd) *
                            (1 - exp(tmpScores[scoreIdx] * nmax - maxAvoidance));
            storage.storeScore(i, score);
        }
    } else {
        for (uint32_t i = tile.thread_rank(); i < k; i += tile.size())
            storage.storeScore(i, exp((mean - tmpScores[i / tile.size()]) * sd));
    }
}
// Add the single-point ("gravity") approximation of one neighbor to the matrix.
template <typename F>
__inline__ __device__ void addGravity(const F score,
                                      const F *const __restrict__ grid2DPoint,
                                      F *const __restrict__ mtx)
{
    const F gs = score * gridGravity;

    mtx[0] += gs;
    mtx[3] += gs;
    mtx[4] += gs * grid2DPoint[0];
    mtx[5] += gs * grid2DPoint[1];
}

// Same as addGravity(), reading the 2D grid point with a single 2-vector load.
template <typename F>
__inline__ __device__ void addGravity2Wise(const F score,
                                           const F *const __restrict__ grid2DPoint,
                                           F *const __restrict__ mtx)
{
    const F gs = score * gridGravity;
    const auto tmpGrid2d =
        *reinterpret_cast<const typename Vec<2, F>::Type *>(grid2DPoint);

    mtx[0] += gs;
    mtx[3] += gs;
    mtx[4] += gs * tmpGrid2d.x;
    mtx[5] += gs * tmpGrid2d.y;
}
// Project the point onto the line defined by grid points I and J; returns
// {dot(point - I, J - I), |J - I|^2} packed into a 2-vector.
template <typename F>
__inline__ __device__ typename Vec<2, F>::Type euclideanProjection(
    const F *const __restrict__ point,
    const F *const __restrict__ gridPointI,
    const F *const __restrict__ gridPointJ,
    const uint32_t dim)
{
    typename Vec<2, F>::Type result{ 0, 0 };
    for (uint32_t k = 0; k < dim; ++k) {
        const F tmp = gridPointJ[k] - gridPointI[k];
        result.y += tmp * tmp;
        result.x += tmp * (point[k] - gridPointI[k]);
    }
    return result;
}

// Same computation with 4-wide vector loads plus a scalar residual loop.
template <typename F>
__inline__ __device__ typename Vec<2, F>::Type euclideanProjection4Wise(
    const F *const __restrict__ point,
    const F *const __restrict__ gridPointI,
    const F *const __restrict__ gridPointJ,
    const uint32_t dim)
{
    const auto *const __restrict__ gridPointI4 =
        reinterpret_cast<const typename Vec<4, F>::Type *>(gridPointI);
    const auto *const __restrict__ gridPointJ4 =
        reinterpret_cast<const typename Vec<4, F>::Type *>(gridPointJ);
    const auto *const __restrict__ point4 =
        reinterpret_cast<const typename Vec<4, F>::Type *>(point);
    typename Vec<2, F>::Type result{ 0, 0 };

#define ACCUMULATE(X)                                                          \
    tmp = tmpGridJ.X - tmpGridI.X;                                             \
    result.y += tmp * tmp;                                                     \
    result.x += tmp * (tmpPoint.X - tmpGridI.X)

    for (uint32_t k = 0; k < dim / 4; ++k) {
        const auto tmpGridI = gridPointI4[k];
        const auto tmpGridJ = gridPointJ4[k];
        const auto tmpPoint = point4[k];
        F tmp;
        ACCUMULATE(x);
        ACCUMULATE(y);
        ACCUMULATE(z);
        ACCUMULATE(w);
    }
#undef ACCUMULATE

    // finish the remaining dim % 4 elements with scalar loads
    for (uint32_t k = dim - (dim % 4); k < dim; ++k) {
        const F tmp = gridPointJ[k] - gridPointI[k];
        result.y += tmp * tmp;
        result.x += tmp * (point[k] - gridPointI[k]);
    }
    return result;
}
// Add the result of one pairwise projection to the approximation matrix.
template <typename F>
__inline__ __device__ void addApproximation(const F scoreI,
                                            const F scoreJ,
                                            const F *const __restrict__ grid2DPointI,
                                            const F *const __restrict__ grid2DPointJ,
                                            const F adjust,
                                            const F scalarProjection,
                                            F *const __restrict__ mtx)
{
    F h[2], hp = 0;
    for (uint32_t i = 0; i < 2; ++i) {
        h[i] = grid2DPointJ[i] - grid2DPointI[i];
        hp += h[i] * h[i];
    }

    if (hp < zeroAvoidance)
        return;

    const F exponent = scalarProjection - .5;
    const F s =
        scoreI * scoreJ * pow(1 + hp, adjust) * exp(-exponent * exponent);
    const F sihp = s / hp;
    const F rhsc = s * (scalarProjection +
                        (h[0] * grid2DPointI[0] + h[1] * grid2DPointI[1]) / hp);

    mtx[0] += h[0] * h[0] * sihp;
    mtx[1] += h[0] * h[1] * sihp;
    mtx[2] += h[1] * h[0] * sihp;
    mtx[3] += h[1] * h[1] * sihp;
    mtx[4] += h[0] * rhsc;
    mtx[5] += h[1] * rhsc;
}
// Same as addApproximation(), reading the 2D grid points as 2-vector loads.
template <typename F>
__inline__ __device__ void addApproximation2Wise(const F scoreI,
                                                 const F scoreJ,
                                                 const F *const __restrict__ grid2DPointI,
                                                 const F *const __restrict__ grid2DPointJ,
                                                 const F adjust,
                                                 const F scalarProjection,
                                                 F *const __restrict__ mtx)
{
    const auto tmpGrid2dI =
        *reinterpret_cast<const typename Vec<2, F>::Type *>(grid2DPointI);
    const auto tmpGrid2dJ =
        *reinterpret_cast<const typename Vec<2, F>::Type *>(grid2DPointJ);

    const F h[2]{ tmpGrid2dJ.x - tmpGrid2dI.x, tmpGrid2dJ.y - tmpGrid2dI.y };
    const F hp = h[0] * h[0] + h[1] * h[1];

    if (hp < zeroAvoidance)
        return;

    const F exponent = scalarProjection - .5;
    const F s =
        scoreI * scoreJ * pow(1 + hp, adjust) * exp(-exponent * exponent);
    const F sihp = s / hp;
    const F rhsc =
        s * (scalarProjection + (h[0] * tmpGrid2dI.x + h[1] * tmpGrid2dI.y) / hp);

    mtx[0] += h[0] * h[0] * sihp;
    mtx[1] += h[0] * h[1] * sihp;
    mtx[2] += h[1] * h[0] * sihp;
    mtx[3] += h[1] * h[1] * sihp;
    mtx[4] += h[0] * rhsc;
    mtx[5] += h[1] * rhsc;
}
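// Reading aid (not from the original file): mtx accumulates the normal
// equations of a small weighted least-squares problem. Writing
//     A = | mtx[0] mtx[1] |        b = | mtx[4] |
//         | mtx[2] mtx[3] |            | mtx[5] |
// each neighbor pair (I, J) adds (s / hp) * h * h^T to A and rhsc * h to b,
// while the gravity terms pull the solution toward the neighbors' 2D
// positions. The kernels later solve A x = b for the projected 2D point x by
// Cramer's rule (the det / projections[0..1] lines below).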
// Map a flat pair index onto a pair of neighbor indices (i, j) with i < j.
template <typename INDEXER>
__inline__ __device__ uint2 getIndices(uint32_t plainIndex, uint32_t k);

// Specialization of getIndices() for the rectangle indexing scheme.
template <>
__inline__ __device__ uint2 getIndices<RectangleIndexer>(uint32_t plainIndex, uint32_t k)
{
    uint2 indices;
    const uint32_t tempI = plainIndex / k;
    const uint32_t tempJ = plainIndex % k;
    const auto invertedI = k - 1 - tempI;
    indices.x = tempJ < invertedI ? tempI : invertedI - 1;
    indices.y = (tempJ < invertedI ? tempJ : tempJ - invertedI) + indices.x + 1;
    return indices;
}
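// Worked example (not from the original file): for k = 4 the pair loop visits
// neighborPairs = k * (k - 1) / 2 = 6 plain indices, which map to
//     0 -> (0,1)   1 -> (0,2)   2 -> (0,3)
//     3 -> (2,3)   4 -> (1,2)   5 -> (1,3)
// i.e. every unordered pair i < j exactly once, with consecutive plain indices
// spread over different pairs rather than walking one triangle row at a time.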
// Base projection kernel: each thread computes the whole projection of one point.
template <typename F>
__global__ void projectionBaseKernel(const F *__restrict__ points,
                                     const F *const __restrict__ grid,
                                     const F *const __restrict__ grid2d,
                                     knn_entry<F> *__restrict__ neighbors,
                                     F *__restrict__ projections,
                                     const uint32_t dim,
                                     const uint32_t n,
                                     const uint32_t gridSize,
                                     const uint32_t k,
                                     const F adjust,
                                     const F boost)
{
    // use one extra neighbor for the avoidance term whenever the grid allows it
    const uint32_t adjustedK = k < gridSize ? k + 1 : k;

    const uint32_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pointIdx >= n)
        return;

    points = points + pointIdx * dim;
    neighbors = neighbors + pointIdx * adjustedK;
    projections = projections + pointIdx * 2;

    sortedDistsToScores<F>(neighbors, adjustedK, k, boost);

    F mtx[6];
    memset(mtx, 0, 6 * sizeof(F));
    for (uint32_t i = 0; i < k; ++i) {
        const uint32_t idxI = neighbors[i].index;
        const F scoreI = neighbors[i].distance; // the distances now hold scores
        addGravity(scoreI, grid2d + idxI * 2, mtx);
        for (uint32_t j = i + 1; j < k; ++j) {
            const uint32_t idxJ = neighbors[j].index;
            const F scoreJ = neighbors[j].distance;
            const auto result = euclideanProjection<F>(
                points, grid + idxI * dim, grid + idxJ * dim, dim);
            F scalarProjection = result.x;
            const F squaredGridPointsDistance = result.y;
            if (squaredGridPointsDistance == F(0))
                continue;
            scalarProjection /= squaredGridPointsDistance;
            addApproximation(scoreI, scoreJ, grid2d + idxI * 2, grid2d + idxJ * 2,
                             adjust, scalarProjection, mtx);
        }
    }

    // solve the accumulated 2x2 system by Cramer's rule
    const F det = mtx[0] * mtx[3] - mtx[1] * mtx[2];
    projections[0] = (mtx[4] * mtx[3] - mtx[5] * mtx[2]) / det;
    projections[1] = (mtx[0] * mtx[5] - mtx[1] * mtx[4]) / det;
}
// Aligned-shared-memory projection kernel: each group of threads cooperates on
// the projection of one point, working out of a per-group shared-memory cache.
template <typename F, typename INDEXER, size_t tileSize>
__global__ void projectionAlignedShMemoryKernel(const F *__restrict__ points,
                                                const F *const __restrict__ grid,
                                                const F *const __restrict__ grid2d,
                                                knn_entry<F> *__restrict__ neighbors,
                                                F *__restrict__ projections,
                                                const uint32_t dim,
                                                const uint32_t n,
                                                const uint32_t gridSize,
                                                const uint32_t k,
                                                const F adjust,
                                                const F boost,
                                                const uint32_t groupSize,
                                                const uint32_t cacheLeadingDim)
{
    extern __shared__ char sharedMemory[];

    const uint32_t groupRank = threadIdx.x % groupSize;
    const uint32_t groupIdx = threadIdx.x / groupSize;
    const uint32_t groupsCount = blockDim.x / groupSize;

    // pad the 2D-grid/score region so every group's cache keeps the alignment
    const auto grid2dPadding =
        (k * 3) % cacheLeadingDim == 0
            ? 0
            : cacheLeadingDim - ((k * 3) % cacheLeadingDim);
    auto sharedMemoryoff =
        reinterpret_cast<F *>(sharedMemory) +
        ((k + 1) * cacheLeadingDim + k * 3 + grid2dPadding) * groupIdx;

    // per-group layout: point row | k grid rows | k 2D grid points | k scores
    F *const __restrict__ pointCache = sharedMemoryoff;
    F *const __restrict__ gridCache = sharedMemoryoff + cacheLeadingDim;
    F *const __restrict__ grid2dCache =
        sharedMemoryoff + (k + 1) * cacheLeadingDim;
    F *const __restrict__ scoreCache = grid2dCache + k * 2;

    // one slot per warp, used to finish the per-group matrix reduction
    F *const __restrict__ reduceFinishStorage =
        reinterpret_cast<F *>(sharedMemory) +
        ((k + 1) * cacheLeadingDim + k * 3 + grid2dPadding) * groupsCount;

    auto tile = cg::tiled_partition<tileSize>(cg::this_thread_block());

    // use one extra neighbor for the avoidance term whenever the grid allows it
    const uint32_t adjustedK = k < gridSize ? k + 1 : k;

    const auto workIdx = blockIdx.x * groupsCount + groupIdx;

    points = points + workIdx * dim;
    neighbors = neighbors + workIdx * adjustedK;
    projections = projections + workIdx * 2;
    // first tile: distances -> scores; the rest prefetch the caches (args elided)
    if (groupRank < tile.size())
        sortedDistsToScoresGroup<F>(/* tile, score storage, adjustedK, k, boost */);
    else
        storeToCache(/* ..., */ groupSize - tile.size(), /* ... */);

    // a group consisting of a single tile fills the caches afterwards
    if (groupSize == tile.size())
        storeToCache(/* ... */);
    F mtx[6];
    memset(mtx, 0, 6 * sizeof(F));

    // single-point (gravity) terms of the approximation
    for (uint32_t i = groupRank; i < k; i += groupSize)
        addGravity2Wise(scoreCache[i], grid2dCache + i * 2, mtx);

    // pairwise terms: one flat pair index per iteration of the strided loop
    const uint32_t neighborPairs = (k * (k - 1)) / 2;
    for (uint32_t i = groupRank; i < neighborPairs; i += groupSize) {
        const auto indices = getIndices<INDEXER>(i, k);
        const auto I = indices.x;
        const auto J = indices.y;

        const auto result = euclideanProjection4Wise<F>(pointCache,
                                                        gridCache + I * cacheLeadingDim,
                                                        gridCache + J * cacheLeadingDim,
                                                        dim);
        F scalarProjection = result.x;
        const F squaredGridPointsDistance = result.y;

        if (squaredGridPointsDistance == F(0))
            continue;

        scalarProjection /= squaredGridPointsDistance;
        addApproximation2Wise(scoreCache[I], scoreCache[J],
                              grid2dCache + I * 2, grid2dCache + J * 2,
                              adjust, scalarProjection, mtx);
    }

    // reduce the six matrix entries over the whole group: within each warp via
    // cg::reduce, then across warps through reduceFinishStorage
    for (size_t i = 0; i < 6; ++i) {
        mtx[i] = cg::reduce(tile, mtx[i], cg::plus<F>());

        const auto warpId = threadIdx.x / warpSize;
        if (threadIdx.x % warpSize == 0 && groupRank != 0)
            reduceFinishStorage[warpId] = mtx[i];

        if (groupRank == 0) {
            for (uint32_t j = 1; j < groupSize / warpSize; ++j) {
                mtx[i] += reduceFinishStorage[warpId + j];
            }
        }
    }

    // the group leader solves the accumulated 2x2 system by Cramer's rule
    if (groupRank == 0) {
        const F det = mtx[0] * mtx[3] - mtx[1] * mtx[2];
        projections[0] = (mtx[4] * mtx[3] - mtx[5] * mtx[2]) / det;
        projections[1] = (mtx[0] * mtx[5] - mtx[1] * mtx[4]) / det;
    }
}
void EsomCuda::runProjectionBaseKernel(float boost, float adjust)
{
    unsigned int blockSize = 256;
    unsigned int blockCount = (mPointsCount + blockSize - 1) / blockSize;
    projectionBaseKernel<float><<<blockCount, blockSize>>>(
        /* point, grid, neighbor and projection buffers plus the dimensions,
           adjust and boost; the argument list is elided in this listing */);
    CUCH(cudaGetLastError());
}
// Launcher body for the aligned-shared-memory kernel: compute the aligned cache
// pitch and the per-block shared-memory size, then launch the kernel.
constexpr size_t alignment = 16;
auto pointBytes = d * sizeof(float);
auto dimCacheResidual = pointBytes % alignment;
auto gridCacheLeadingDim =
    pointBytes + (dimCacheResidual == 0 ? 0 : alignment - dimCacheResidual);
gridCacheLeadingDim /= sizeof(float); // the pitch is kept in floats, not bytes

auto blockSize = 128;
auto groupsPerBlock = blockSize / groupSize;
unsigned int blockCount = (n + groupsPerBlock - 1) / groupsPerBlock;
auto warpCount = blockSize / 32;
auto grid2dPadding =
    (k * 3) % gridCacheLeadingDim == 0
        ? 0
        : gridCacheLeadingDim - ((k * 3) % gridCacheLeadingDim);
auto sharedMem =
    sizeof(float) * warpCount +
    sizeof(float) * (k + 1) * gridCacheLeadingDim * groupsPerBlock +
    sizeof(float) * (k * 3 + grid2dPadding) * groupsPerBlock;

projectionAlignedShMemoryKernel<float, RectangleIndexer, 32>
    <<<blockCount, blockSize, sharedMem>>>(data,
                                           /* remaining kernel arguments elided */
                                           gridCacheLeadingDim);

CUCH(cudaGetLastError());
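// Worked example (not from the original file; parameter values are illustrative):
// for d = 16, k = 10, blockSize = 128 and groupSize = 32 (so groupsPerBlock = 4,
// warpCount = 4):
//   pointBytes = 64, dimCacheResidual = 0, gridCacheLeadingDim = 16 floats,
//   grid2dPadding = 16 - (30 % 16) = 2,
//   sharedMem = 4*4 + 4*(10+1)*16*4 + 4*(10*3+2)*4 = 16 + 2816 + 512
//             = 3344 bytes per block, well under the default 48 KiB limit for
//   dynamic shared memory.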