BlosSOM
Interactive dimensionality reduction on large datasets (EmbedSOM and FLOWER combined)
bitonic.cuh
Go to the documentation of this file.
1/*
2The MIT License
3
4Copyright (c) 2021 Martin Krulis
5 Mirek Kratochvil
6
7Permission is hereby granted, free of charge,
8to any person obtaining a copy of this software and
9associated documentation files (the "Software"), to
10deal in the Software without restriction, including
11without limitation the rights to use, copy, modify,
12merge, publish, distribute, sublicense, and/or sell
13copies of the Software, and to permit persons to whom
14the Software is furnished to do so,
15subject to the following conditions:
16
17The above copyright notice and this permission notice
18shall be included in all copies or substantial portions of the Software.
19
20THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
24ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27*/
28
29#ifndef ESOM_CUDA_BITONIC_CUH
30#define ESOM_CUDA_BITONIC_CUH
31
32#include <cuda_runtime.h>
33#include <device_launch_parameters.h>
34
35/** Wrapped compare and swap-to-correct-order function
36 *
37 * This is supposed to be used as a policy template parameter*/
38template<typename T>
40{
41 static __device__ void order(T &a, T &b)
42 {
43 if (a <= b)
44 return;
45 T tmp = a;
46 a = b;
47 b = tmp;
48 }
49};
50
51/** Min-max-into-ordering function
52 *
53 * This is supposed to be used as a policy template parameter*/
54template<typename T>
56{
57 static __device__ void order(T &a, T &b)
58 {
59 T A = a;
60 T B = b;
61 a = min(A, B);
62 b = max(A, B);
63 }
64};
65
66/** Comparator policy that automatically chooses a good implementation */
67template<typename T>
69{
70};
71
72template<>
73struct ComparatorPolicy<float> : OrderWithMinMax<float>
74{
75};
76
77/** A single "layer" of the parallel bitonic comparator. */
78template<typename T,
79 int BLOCK_SIZE,
80 class CMP = ComparatorPolicy<T>,
81 bool UP = false>
82__device__ void
83bitonic_merge_step(T *__restrict__ data)
84{
85 if constexpr (BLOCK_SIZE < 1)
86 return; // this should never happen, but just to be safe
87
88 /* masking lower bits from thread idx (like modulo STEP) */
89 unsigned localMask = (unsigned)BLOCK_SIZE - 1;
90 /* masking remaining bits from thread idx */
91 unsigned blockMask = ~localMask;
92 unsigned localIdx = (unsigned)threadIdx.x & localMask;
93 /* offset of the block has 2x size of threads in local block
94 * (each thread works on 2 values at once) */
95 unsigned blockOffset = ((unsigned)threadIdx.x & blockMask) * 2;
96
97 /* UP is the first phase of the merge, which is different (compares first
98 * with last, second with last second ...).
99 *
100 * Remaining steps use DOWN (ie UP == false) which is regular pairwise
101 * comparison within STEP-size sub-block. */
102 unsigned secondIdx =
103 UP ? blockOffset + ((unsigned)BLOCK_SIZE * 2 - 1) - localIdx
104 : blockOffset + localIdx + (unsigned)BLOCK_SIZE;
105 CMP::order(data[blockOffset + localIdx], data[secondIdx]);
106
107 if constexpr (BLOCK_SIZE > 32)
108 __syncthreads();
109 else
110 __syncwarp();
111}
112
113/** Parallel bitonic merge.
114 *
115 * This runs several layers of the parallel bitonic comparators. */
116template<typename T,
117 int BLOCK_SIZE,
118 class CMP = ComparatorPolicy<T>,
119 bool UP = true>
120__device__ void
121bitonic_merge(T *__restrict__ data)
122{
123 if constexpr (BLOCK_SIZE < 1)
124 return; // recursion bottom case
125
126 /* do one comparison step */
127 bitonic_merge_step<T, BLOCK_SIZE, CMP, UP>(data);
128
129 /* run recursion (note that in this implementation, any subsequent merge
130 * steps are with UP = false, only first step is UP) */
131 bitonic_merge<T, BLOCK_SIZE / 2, CMP, false>(data);
132}
133
134/**
135 * Perform multiple bitonic sorts by all active threads.
136 *
137 * @tparam T data item type
138 *
139 * @tparam BLOCK_SIZE Number of threads that work cooperatively on a block of
140 * items that has exactly 2x BLOCK_SIZE items. Size of thread block must be
141 * multiple of BLOCK_SIZE (multiple data blocks may be sorted by one thread
142 * block).
143 *
144 * @tparam CMP comparator policy class that implements compare and swap on type
145 * T
146 *
147 * @param data pointer to an array of T to be sorted in-place, of size at least
148 * two times the total number of threads
149 */
150template<typename T, int BLOCK_SIZE, class CMP = ComparatorPolicy<T>>
151__device__ __forceinline__ void
152bitonic_sort(T *__restrict__ data)
153{
154 if constexpr (BLOCK_SIZE < 1)
155 return;
156
157 if constexpr (BLOCK_SIZE > 1) {
158 // recursively sort halves of the input (this will be unrolled by
159 // compiler)
160 bitonic_sort<T, BLOCK_SIZE / 2, CMP>(data);
161 if constexpr (BLOCK_SIZE > 32) {
162 __syncthreads();
163 } else {
164 __syncwarp();
165 }
166 }
167
168 bitonic_merge<T, BLOCK_SIZE, CMP>(data);
169}
170
171/** Perform one update step of bitonic topk algorithm.
172 *
173 * The algorith takes two inputs: current topk sub-result and new data (e.g.,
174 * newly computed distances). It sorts inputs and runs a bitonic merge on
175 * these, producing a bitonic sequence with the lower half of the data in topK,
176 * and (as a side product) a bitonic sequence with the upper half of the data
177 * in newData.
178 *
179 * Note that hat way the topk part is updated, but not entirely sorted (so we
180 * save some time).
181 *
182 * @tparam T item data type
183 *
184 * @tparam BLOCK_SIZE Number of threads that work cooperatively on a block of
185 * items. Both topk result and newData of one block have 2x BLOCK_SIZE items.
186 * Size of thread block must be multiple of BLOCK_SIZE (multiple data blocks may
187 * be sorted by one thread block).
188 *
189 * @tparam CMP comparator policy, such as ComparatorPolicy.
190 *
191 * @param topK array of `T`'s with intermediate top results, containing at
192 * least 2x(number of threads) items
193 *
194 * @param newData array of new results to be merged into topK, of the same size
195 */
196template<typename T, int BLOCK_SIZE, class CMP = ComparatorPolicy<T>>
197__device__ __forceinline__ void
198bitonic_topk_update_opt(T *__restrict__ topK, T *__restrict__ newData)
199{
200 /* extra safety */
201 if constexpr (BLOCK_SIZE < 1)
202 return;
203
204 if constexpr (BLOCK_SIZE > 1) {
205 /* recursively sort halves of the input
206 * (this will be unrolled by compiler) */
207 bitonic_sort<T, BLOCK_SIZE, CMP>(topK);
208 bitonic_sort<T, BLOCK_SIZE, CMP>(newData);
209 if constexpr (BLOCK_SIZE > 32)
210 __syncthreads();
211 else
212 __syncwarp();
213 }
214
215 /* masking lower bits from thread idx (like modulo STEP) */
216 unsigned localMask = BLOCK_SIZE - 1;
217 /* masking remaining bits from thread idx */
218 unsigned blockMask = ~localMask;
219 unsigned localIdx = (unsigned)threadIdx.x & localMask;
220 /* offset of the block has 2x size of threads in local block
221 * (each thread works on 2 values at once) */
222 unsigned blockOffset = ((unsigned)threadIdx.x & blockMask) * 2;
223
224 /* compare the upper and lower half of the data, producing a bitonic lower
225 * half with values smaller than the upper half */
226 CMP::order(
227 topK[blockOffset + localIdx],
228 newData[blockOffset + ((unsigned)BLOCK_SIZE * 2 - 1) - localIdx]);
229 CMP::order(topK[blockOffset + localIdx + BLOCK_SIZE],
230 newData[blockOffset + ((unsigned)BLOCK_SIZE - 1) - localIdx]);
231
232 if constexpr (BLOCK_SIZE > 32)
233 __syncthreads();
234 else
235 __syncwarp();
236}
237
238#endif
__device__ void bitonic_merge(T *__restrict__ data)
Parallel bitonic merge.
Definition: bitonic.cuh:121
__device__ __forceinline__ void bitonic_sort(T *__restrict__ data)
Perform multiple bitonic sorts by all active threads.
Definition: bitonic.cuh:152
__device__ void bitonic_merge_step(T *__restrict__ data)
A single "layer" of the parallel bitonic comparator.
Definition: bitonic.cuh:83
__device__ __forceinline__ void bitonic_topk_update_opt(T *__restrict__ topK, T *__restrict__ newData)
Perform one update step of bitonic topk algorithm.
Definition: bitonic.cuh:198
Comparator policy that automatically chooses a good implementation.
Definition: bitonic.cuh:69
Wrapped compare and swap-to-correct-order function.
Definition: bitonic.cuh:40
static __device__ void order(T &a, T &b)
Definition: bitonic.cuh:41
Min-max-into-ordering function.
Definition: bitonic.cuh:56
static __device__ void order(T &a, T &b)
Definition: bitonic.cuh:57