35#include "cooperative_groups.h"
36#include "cuda_runtime.h"
37#include "device_launch_parameters.h"
// Squared Euclidean distance between two dim-length vectors (lhs, rhs).
// NOTE(review): fragmentary extraction — original lines 43-44 (opening brace
// and, presumably, the accumulator declaration) and the loop tail / return
// statement are elided here; confirm against the full source.
40template<
typename F =
float>
41__inline__ __device__ F
42sqeucl(
const F *__restrict__ lhs,
const F *__restrict__ rhs,
const uint32_t dim)
// Walks both vectors element-wise via pointer post-increment; the
// squared-difference accumulation and final return fall in the elided lines.
45 for (uint32_t d = 0; d < dim; ++d) {
46 F diff = *lhs++ - *rhs++;
// Insert-sort style "bubble up": shifts worse entries one slot toward the
// tail to make room for a newly written entry (see Doxygen summary below:
// "Bubble a knn-entry up a list, insertsort-style").
// NOTE(review): fragmentary extraction — the parameter list (original lines
// 55-58), the loop/condition wrapping this assignment, and the final write of
// the bubbled entry are elided; only one shift statement is visible.
53template<
typename F =
float>
54__inline__ __device__
void
59 topK[idx] = topK[idx - 1];
// Base (non-bitonic) kNN kernel fragment: one thread per query point.
// NOTE(review): fragmentary extraction — the kernel name line, remaining
// parameters, the out-of-range guard (presumably `if (pointIdx >= n) return;`),
// and several closing braces are elided; confirm against the full source.
73 const F *
const __restrict__ grid,
77 const uint32_t gridSize,
// Flat 1D thread index selects which query point this thread owns.
82 const uint32_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x;
// Advance the output and input base pointers to this thread's point.
87 topKs = topKs + pointIdx * k;
88 points = points + pointIdx * dim;
// Phase 1: seed the top-k list with the first k grid entries, keeping it
// sorted via insertion (bubbleUp). The entry's second field (grid index,
// original line 97) is elided here.
95 for (; gridIdx < k; ++gridIdx) {
96 topKs[gridIdx] = { sqeucl<F>(points, grid + gridIdx * dim, dim),
98 bubbleUp<F>(topKs, gridIdx);
// Phase 2: for each remaining grid entry, replace the current worst
// (topKs[k-1]) when closer, then restore sorted order. The comparison
// against topKs[k-1] (elided lines 103-104) presumably gates the replace.
101 for (; gridIdx < gridSize; ++gridIdx) {
102 F dist = sqeucl<F>(points, grid + gridIdx * dim, dim);
105 topKs[k - 1] = { dist, gridIdx };
106 bubbleUp<F>(topKs, k - 1);
// Bitonic top-k selection kernel fragment: K/2 threads cooperate per query
// point, staging candidates in dynamic shared memory and merging them with a
// bitonic top-k update.
// NOTE(review): fragmentary extraction — the kernel signature head, the
// declarations binding `shmTopk` and `shmNewDataBlock` to `sharedMemory`
// (original lines ~131-139 are only partially visible), the pointIdx bounds
// guard, and the __syncthreads()/barriers around the shared-memory phases are
// elided; confirm against the full source.
120template<
typename F,
int K>
123 const F *
const __restrict__ grid,
127 const uint32_t gridSize,
128 const uint32_t actualK)
// Dynamic shared memory blob, partitioned below into a top-k region and a
// new-candidate staging region.
130 extern __shared__
char sharedMemory[];
// Per-group views: each group of K/2 threads owns a K-entry slice.
134 knn_entry<F> *
const shmTopkBlock = shmTopk + K * (threadIdx.x / (K / 2));
// Staging area starts after 2*blockDim.x top-k entries.
136 knn_entry<F> *
const shmNewData = shmTopk + (blockDim.x * 2);
139 K * (threadIdx.x / (K / 2));
// K/2 threads per point: divide the flat thread index by the group size.
143 const uint32_t pointIdx =
144 (blockIdx.x * blockDim.x + threadIdx.x) / (K / 2);
147 points += pointIdx * dim;
148 topKs += pointIdx * actualK;
// Seed the group's K-entry top-k slice from the first K grid rows; lanes
// stride by K/2 so each covers two slots. Out-of-range rows get a filler
// value in the elided branch (original lines 155-158).
151 for (uint32_t i = threadIdx.x % (K / 2); i < K;
153 shmTopkBlock[i] = { i < gridSize
154 ? sqeucl<F>(points, grid + dim * i, dim)
// Main loop: consume the grid K rows at a time, rounded up so the last
// partial tile is padded (guarded by the gridSize test below).
159 const uint32_t gridSizeRoundedToK = ((gridSize + K - 1) / K) * K;
160 for (uint32_t gridOffset = K; gridOffset < gridSizeRoundedToK;
163 for (uint32_t i = threadIdx.x % (K / 2); i < K; i += (K / 2)) {
164 shmNewDataBlock[i] = {
165 (gridOffset + i) < gridSize
166 ? sqeucl<F>(points, grid + dim * (gridOffset + i), dim)
// Merge staged candidates into the kept top-k (helper defined elsewhere;
// a barrier presumably sits in the elided lines between stage and merge).
175 bitonic_topk_update_opt<knn_entry<F>, K / 2>(shmTopk, shmNewData);
// Final full sort of the kept entries before writing out.
181 bitonic_sort<knn_entry<F>, K / 2>(shmTopk);
// Emit only the first actualK of the K kept entries (K is actualK rounded
// up to a power of two — see the wrapper below).
187 for (uint32_t i = threadIdx.x % (K / 2); i < actualK; i += (K / 2))
188 topKs[i] = shmTopkBlock[i];
// Host-side launcher: recursively instantiates itself doubling K until
// K >= actualK (K is the template power-of-two capacity; actualK the runtime
// request), then configures and launches topkBitonicOptKernel.
// NOTE(review): fragmentary extraction — the function signature, remaining
// parameters, the recursive call's argument list, and an else/return between
// the recursion and the launch path are elided; confirm against full source.
197template<
typename F,
int K = 2>
203 uint32_t pointsCount,
// Hard cap: K grew past 256 without covering actualK — callers should have
// routed k > 256 to the base kernel (see runKNNKernel below).
210 if constexpr (K > 256)
211 throw std::runtime_error(
"Ooops, this should never happen. Bitonic "
212 "kernel wrapper was invoked with k > 256.");
// Grow K (power of two) until it can hold actualK entries.
213 else if (K < 2 || K < actualK)
214 runnerWrapperBitonicOpt<F, K * 2>(
224 constexpr unsigned blockSize = 256;
// Power-of-two check: for such values, blockSize | (blockSize-1) sets all
// low bits, so +1 equals blockSize*2.
227 if constexpr (blockSize * 2 != (blockSize | (blockSize - 1)) + 1)
228 throw std::runtime_error(
"CUDA block size must be a power of two "
229 "for bitonic topk selection.");
// NOTE(review): message typo "rounded u to" -> "rounded up to"; left as-is
// here because runtime strings may not be altered in a comments-only pass.
230 if constexpr (K / 2 > blockSize)
231 throw std::runtime_error(
"CUDA block size must be at half of k "
232 "(rounded u to nearest power of 2).");
// K/2 threads serve each point, hence pointsCount*K/2 total threads,
// rounded up to whole blocks.
234 unsigned int blockCount =
235 ((pointsCount * K / 2) + blockSize - 1) / blockSize;
// Dynamic shared memory: 2 top-k entries + 2 staging entries per thread
// (matches the shmNewData = shmTopk + blockDim.x*2 split in the kernel).
238 unsigned int shmSize = blockSize * 4 *
sizeof(
knn_entry<F>);
240 topkBitonicOptKernel<F, K><<<blockCount, blockSize, shmSize>>>(
241 points, grid, topKs, dim, pointsCount, gridSize, actualK);
// NOTE(review): no cudaGetLastError() check is visible after the launch;
// it may live in the elided tail — verify.
// Dispatch fragment from runKNNKernel: the bitonic path only supports
// k <= 256 (see the cap in runnerWrapperBitonicOpt), so larger k falls back
// to the insertion-sort base kernel; otherwise the bitonic wrapper is used.
// NOTE(review): fragmentary extraction — the enclosing function header, the
// else branch structure, and most launch arguments (original lines 257-261,
// 265-269) are elided; confirm against the full source.
251 if (adjusted_k > 256) {
254 constexpr unsigned blockSize = 256;
// One thread per point for the base kernel.
255 unsigned blockCount = (n + blockSize - 1) / blockSize;
256 topkBaseKernel<float><<<blockCount, blockSize>>>(
data,
262 uint32_t(adjusted_k));
// Bitonic path: starts at K=2 and doubles internally up to adjusted_k.
264 runnerWrapperBitonicOpt<float, 2>(
data,
270 uint32_t(adjusted_k));
void runnerWrapperBitonicOpt(const F *points, const F *grid, knn_entry< F > *topKs, uint32_t dim, uint32_t pointsCount, uint32_t gridSize, uint32_t actualK)
Start the bitonic kNN with the right parameters.
__global__ void topkBitonicOptKernel(const F *__restrict__ points, const F *const __restrict__ grid, knn_entry< F > *__restrict__ topKs, const uint32_t dim, const uint32_t n, const uint32_t gridSize, const uint32_t actualK)
Bitonic kNN selection.
__global__ void topkBaseKernel(const F *__restrict__ points, const F *const __restrict__ grid, knn_entry< F > *__restrict__ topKs, const uint32_t dim, const uint32_t n, const uint32_t gridSize, const uint32_t k)
Base kernel for kNN computation.
__inline__ __device__ F sqeucl(const F *__restrict__ lhs, const F *__restrict__ rhs, const uint32_t dim)
Compute squared euclidean distance.
__inline__ __device__ void bubbleUp(knn_entry< F > *const __restrict__ topK, uint32_t idx)
Bubble a knn-entry up a list, insertsort-style.
static float distance(const glm::vec2 &x, const glm::vec2 &y)
void runKNNKernel(size_t d, size_t n, size_t g, size_t adjusted_k)
Execute the CUDA kNN within the context.
knn_entry< float > * knns
A structure for packing neighbor index and distance for kNN search.