BlosSOM
Interactive dimensionality reduction on large datasets (EmbedSOM and FLOWER combined)
embedsom_cuda_knn.cu
Go to the documentation of this file.
1/*
2The MIT License
3
4Copyright (c) 2021 Adam Smelko
5 Martin Krulis
6 Mirek Kratochvil
7
8Permission is hereby granted, free of charge,
9to any person obtaining a copy of this software and
10associated documentation files (the "Software"), to
11deal in the Software without restriction, including
12without limitation the rights to use, copy, modify,
13merge, publish, distribute, sublicense, and/or sell
14copies of the Software, and to permit persons to whom
15the Software is furnished to do so,
16subject to the following conditions:
17
18The above copyright notice and this permission notice
19shall be included in all copies or substantial portions of the Software.
20
21THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
23OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
25ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28*/
29
30#include "embedsom_cuda.h"
31#include <limits>
32
33#include "bitonic.cuh"
34
35#include "cooperative_groups.h"
36#include "cuda_runtime.h"
37#include "device_launch_parameters.h"
38
/** Compute the squared Euclidean distance between two `dim`-dimensional
 * points.
 *
 * Both pointers must reference at least `dim` readable elements; the
 * `__restrict__` qualifiers promise the ranges do not alias.
 */
template<typename F = float>
__inline__ __device__ F
sqeucl(const F *__restrict__ lhs, const F *__restrict__ rhs, const uint32_t dim)
{
    F result(0.0);
    for (uint32_t i = 0; i < dim; ++i) {
        const F diff = lhs[i] - rhs[i];
        result += diff * diff;
    }
    return result;
}
51
/** Bubble a knn-entry up a list, insertsort-style.
 *
 * Repeatedly swaps the entry at `idx` with its predecessor until the
 * prefix `topK[0..idx]` is ordered ascending by `distance` again.
 * Assumes `topK[0..idx-1]` was already sorted before the call.
 */
template<typename F = float>
__inline__ __device__ void
bubbleUp(knn_entry<F> *const __restrict__ topK, uint32_t idx)
{
    for (uint32_t i = idx; i > 0; --i) {
        if (!(topK[i - 1].distance > topK[i].distance))
            break; // in order -> done
        const knn_entry<F> swapped = topK[i - 1];
        topK[i - 1] = topK[i];
        topK[i] = swapped;
    }
}
64
/** Base kernel for kNN computation.
 *
 * Each thread iterates over the whole landmark grid and computes the kNN of
 * one input point, maintaining the running top-k via insertion sort directly
 * in global memory.
 *
 * Launch layout: 1D grid, one thread per input point (threads with
 * pointIdx >= n exit immediately). No shared memory required.
 *
 * @param points   input points, row-major, `n * dim` elements
 * @param grid     landmark grid, row-major, `gridSize * dim` elements
 * @param topKs    output, `n * k` entries; thread i writes entries
 *                 [i*k, (i+1)*k) sorted ascending by distance
 * @param dim      dimensionality of points and grid entries
 * @param n        number of input points
 * @param gridSize number of landmarks in the grid
 * @param k        number of neighbors to keep per point
 *
 * NOTE(review): the first loop below assumes k <= gridSize — if k were
 * larger, `grid + gridIdx * dim` would read past the end of the grid.
 * TODO confirm callers guarantee this.
 */
template<typename F>
__global__ void
topkBaseKernel(const F *__restrict__ points,
               const F *const __restrict__ grid,
               knn_entry<F> *__restrict__ topKs,
               const uint32_t dim,
               const uint32_t n,
               const uint32_t gridSize,
               const uint32_t k)
{
    // assign correct point and topK pointers for a thread
    {
        const uint32_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x;

        if (pointIdx >= n)
            return;

        topKs = topKs + pointIdx * k;
        points = points + pointIdx * dim;
    }

    // iterate over grid points
    {
        uint32_t gridIdx = 0;

        // seed the top-k list with the first k landmarks, keeping it sorted
        // ascending by distance as entries are inserted
        for (; gridIdx < k; ++gridIdx) {
            topKs[gridIdx] = { sqeucl<F>(points, grid + gridIdx * dim, dim),
                               gridIdx };
            bubbleUp<F>(topKs, gridIdx);
        }

        // scan the remaining landmarks; replace the current worst entry
        // (at index k-1) whenever a closer landmark is found
        for (; gridIdx < gridSize; ++gridIdx) {
            F dist = sqeucl<F>(points, grid + gridIdx * dim, dim);

            if (topKs[k - 1].distance > dist) {
                topKs[k - 1] = { dist, gridIdx };
                bubbleUp<F>(topKs, k - 1);
            }
        }
    }
}
111
/** Bitonic kNN selection.
 *
 * This uses a bitonic top-k selection (modified bitonic sort) to run the kNN
 * search. No inputs are explicitly cached, shm is used for intermediate topk
 * results.
 *
 * Launch layout: 1D grid; each group of K/2 consecutive threads cooperates on
 * one input point, so a block of blockDim.x threads processes
 * blockDim.x/(K/2) points. Dynamic shared memory requirement:
 * blockDim.x * 4 * sizeof(knn_entry<F>) bytes (2 top-k slots + 2 new-data
 * slots per thread). blockDim.x must be a multiple of K/2 and K must be a
 * power of 2 (both are enforced by the host-side wrapper).
 *
 * @tparam K is number of top-k selected items and number of threads
 * working cooperatively (keeping 2xK intermediate result in shm) note that K
 * is a power of 2 which is the nearest greater or equal value to actualK
 *
 * @param points   input points, row-major, `n * dim` elements
 * @param grid     landmark grid, row-major, `gridSize * dim` elements
 * @param topKs    output, `n * actualK` entries per point, sorted ascending
 * @param dim      dimensionality of points and grid entries
 * @param n        number of input points
 * @param gridSize number of landmarks in the grid
 * @param actualK  number of neighbors actually requested (actualK <= K)
 */
template<typename F, int K>
__global__ void
topkBitonicOptKernel(const F *__restrict__ points,
                     const F *const __restrict__ grid,
                     knn_entry<F> *__restrict__ topKs,
                     const uint32_t dim,
                     const uint32_t n,
                     const uint32_t gridSize,
                     const uint32_t actualK)
{
    extern __shared__ char sharedMemory[];

    // shared memory layout: first blockDim.x*2 entries hold the running
    // top-k lists, the next blockDim.x*2 entries hold freshly computed
    // distances to be merged in
    knn_entry<F> *const shmTopk = (knn_entry<F> *)(sharedMemory);
    // topk chunk that belongs to this thread
    knn_entry<F> *const shmTopkBlock = shmTopk + K * (threadIdx.x / (K / 2));
    // every thread works on two items at a time
    knn_entry<F> *const shmNewData = shmTopk + (blockDim.x * 2);
    knn_entry<F> *const shmNewDataBlock =
      shmNewData +
      K * (threadIdx.x / (K / 2)); // newData chunk that belongs to this thread
    // assign correct point and topK pointers for a thread (K/2 threads
    // cooperate on each point)
    {
        const uint32_t pointIdx =
          (blockIdx.x * blockDim.x + threadIdx.x) / (K / 2);
        if (pointIdx >= n)
            return;
        points += pointIdx * dim;
        topKs += pointIdx * actualK;
    }
    // fill in initial topk intermediate result; out-of-range landmarks get
    // valueMax so they sort to the end and are never selected
    for (uint32_t i = threadIdx.x % (K / 2); i < K;
         i += (K / 2)) { // yes, this loop should go off exactly twice
        shmTopkBlock[i] = { i < gridSize
                              ? sqeucl<F>(points, grid + dim * i, dim)
                              : valueMax<F>,
                            i };
    }
    // process the grid points in K-sized blocks
    const uint32_t gridSizeRoundedToK = ((gridSize + K - 1) / K) * K;
    for (uint32_t gridOffset = K; gridOffset < gridSizeRoundedToK;
         gridOffset += K) {
        // compute another K new distances (again the loop runs just 2 times)
        for (uint32_t i = threadIdx.x % (K / 2); i < K; i += (K / 2)) {
            shmNewDataBlock[i] = {
                (gridOffset + i) < gridSize
                  ? sqeucl<F>(points, grid + dim * (gridOffset + i), dim)
                  : valueMax<F>,
                gridOffset + i
            };
        }

        // sync the whole block (bitonic update operates on the block)
        __syncthreads();
        // merge the new K distances with the intermediate top ones
        bitonic_topk_update_opt<knn_entry<F>, K / 2>(shmTopk, shmNewData);
        // sync the block again
        __syncthreads();
    }

    // sort the top K results once more, for the last time
    bitonic_sort<knn_entry<F>, K / 2>(shmTopk);
    __syncthreads();

    /* copy the results from shm to global memory (tricky part: some of the
     * results get ignored here, because K must be a power of 2, which may be
     * bigger than the `actualK` number of results that we want. */
    for (uint32_t i = threadIdx.x % (K / 2); i < actualK; i += (K / 2))
        topKs[i] = shmTopkBlock[i];
}
190
/** Start the bitonic kNN with the right parameters
 *
 * This function recursively scans (in compiler!) for a sufficient K to contain
 * the actual number of neighbors required and runs the correct instantiation
 * of topkBitonicOptKernel(). The recursion tries K = 2, 4, 8, ... until
 * K >= actualK, capped at 256.
 *
 * @param points      input points, row-major, `pointsCount * dim` elements
 * @param grid        landmark grid, row-major, `gridSize * dim` elements
 * @param topKs       output, `pointsCount * actualK` entries
 * @param dim         dimensionality of points and grid entries
 * @param pointsCount number of input points
 * @param gridSize    number of landmarks in the grid
 * @param actualK     number of neighbors requested (must be <= 256)
 */
template<typename F, int K = 2>
void
runnerWrapperBitonicOpt(const F *points,
                        const F *grid,
                        knn_entry<F> *topKs,
                        uint32_t dim,
                        uint32_t pointsCount,
                        uint32_t gridSize,
                        uint32_t actualK)
{
    /* TODO make a fallback for this case. Better run some operation a bit
     * slower, than nothing at all
     * TODO 2: report errors to frontend instead of just exploding */
    if constexpr (K > 256)
        throw std::runtime_error("Ooops, this should never happen. Bitonic "
                                 "kernel wrapper was invoked with k > 256.");
    else if (K < 2 || K < actualK)
        runnerWrapperBitonicOpt<F, K * 2>(
          points,
          grid,
          topKs,
          dim,
          pointsCount,
          gridSize,
          actualK); // recursion step (try next power of 2)
    else {
        // here we found K such that actualK <= K <= 256
        constexpr unsigned blockSize = 256;

        // some extra safety checks (should never fire)
        if constexpr (blockSize * 2 != (blockSize | (blockSize - 1)) + 1)
            throw std::runtime_error("CUDA block size must be a power of two "
                                     "for bitonic topk selection.");
        if constexpr (K / 2 > blockSize)
            throw std::runtime_error("CUDA block size must be at half of k "
                                     "(rounded up to nearest power of 2).");

        // K/2 threads cooperate on each point, hence pointsCount * K/2
        // threads in total, rounded up to whole blocks
        unsigned int blockCount =
          ((pointsCount * K / 2) + blockSize - 1) / blockSize;
        /* the kernel requires 4 items per thread
         * (2 in topk, and 2 for newly incoming data) */
        unsigned int shmSize = blockSize * 4 * sizeof(knn_entry<F>);

        topkBitonicOptKernel<F, K><<<blockCount, blockSize, shmSize>>>(
          points, grid, topKs, dim, pointsCount, gridSize, actualK);
        // NOTE(review): no cudaGetLastError() after the launch — launch
        // configuration errors are silently dropped here; consider checking
        // and reporting in line with the TODOs above.
    }
}
244
/** Execute the CUDA kNN within the context.
 *
 * Dispatches between the optimized bitonic top-k kernel (adjusted_k <= 256)
 * and the slow insertion-sort base kernel (larger k).
 *
 * Reads the context members `data` (input points), `lm_hi` (high-dimensional
 * landmark coordinates) and writes `knns` (n * adjusted_k neighbor entries).
 *
 * NOTE(review): the qualified name line was reconstructed from the doxygen
 * index (member of the context struct in embedsom_cuda.h) — verify against
 * the header.
 *
 * @param d          dimensionality of the data and landmarks
 * @param n          number of input points
 * @param g          number of landmarks
 * @param adjusted_k number of neighbors to find per point
 */
void
EmbedSOMCUDAContext::runKNNKernel(size_t d,
                                  size_t n,
                                  size_t g,
                                  size_t adjusted_k)
{
    if (adjusted_k > 256) {
        /* If k is too high, fallback to the base kernel. Fortunately no one
         * ever would like to use such a high k, right? */
        constexpr unsigned blockSize = 256;
        unsigned blockCount = (n + blockSize - 1) / blockSize;
        topkBaseKernel<float><<<blockCount, blockSize>>>(data,
                                                         lm_hi,
                                                         knns,
                                                         uint32_t(d),
                                                         uint32_t(n),
                                                         uint32_t(g),
                                                         uint32_t(adjusted_k));
    } else
        runnerWrapperBitonicOpt<float, 2>(data,
                                          lm_hi,
                                          knns,
                                          uint32_t(d),
                                          uint32_t(n),
                                          uint32_t(g),
                                          uint32_t(adjusted_k));
}
void runnerWrapperBitonicOpt(const F *points, const F *grid, knn_entry< F > *topKs, uint32_t dim, uint32_t pointsCount, uint32_t gridSize, uint32_t actualK)
Start the bitonic kNN with the right parameters.
__global__ void topkBitonicOptKernel(const F *__restrict__ points, const F *const __restrict__ grid, knn_entry< F > *__restrict__ topKs, const uint32_t dim, const uint32_t n, const uint32_t gridSize, const uint32_t actualK)
Bitonic kNN selection.
__global__ void topkBaseKernel(const F *__restrict__ points, const F *const __restrict__ grid, knn_entry< F > *__restrict__ topKs, const uint32_t dim, const uint32_t n, const uint32_t gridSize, const uint32_t k)
Base kernel for kNN computation.
__inline__ __device__ F sqeucl(const F *__restrict__ lhs, const F *__restrict__ rhs, const uint32_t dim)
Compute squared euclidean distance.
__inline__ __device__ void bubbleUp(knn_entry< F > *const __restrict__ topK, uint32_t idx)
Bubble a knn-entry up a list, insertsort-style.
static float distance(const glm::vec2 &x, const glm::vec2 &y)
void runKNNKernel(size_t d, size_t n, size_t g, size_t adjusted_k)
Execute the CUDA kNN within the context.
knn_entry< float > * knns
Definition: embedsom_cuda.h:54
A structure for packing neighbor index and distance for kNN search.