29#ifndef ESOM_CUDA_BITONIC_CUH
30#define ESOM_CUDA_BITONIC_CUH
32#include <cuda_runtime.h>
33#include <device_launch_parameters.h>
41 static __device__
void order(T &a, T &b)
57 static __device__
void order(T &a, T &b)
85 if constexpr (BLOCK_SIZE < 1)
89 unsigned localMask = (unsigned)BLOCK_SIZE - 1;
91 unsigned blockMask = ~localMask;
92 unsigned localIdx = (unsigned)threadIdx.x & localMask;
95 unsigned blockOffset = ((
unsigned)threadIdx.x & blockMask) * 2;
103 UP ? blockOffset + ((unsigned)BLOCK_SIZE * 2 - 1) - localIdx
104 : blockOffset + localIdx + (unsigned)BLOCK_SIZE;
105 CMP::order(data[blockOffset + localIdx], data[secondIdx]);
107 if constexpr (BLOCK_SIZE > 32)
123 if constexpr (BLOCK_SIZE < 1)
127 bitonic_merge_step<T, BLOCK_SIZE, CMP, UP>(data);
150template<
typename T,
int BLOCK_SIZE,
class CMP = ComparatorPolicy<T>>
151__device__ __forceinline__
void
154 if constexpr (BLOCK_SIZE < 1)
157 if constexpr (BLOCK_SIZE > 1) {
161 if constexpr (BLOCK_SIZE > 32) {
168 bitonic_merge<T, BLOCK_SIZE, CMP>(data);
196template<
typename T,
int BLOCK_SIZE,
class CMP = ComparatorPolicy<T>>
197__device__ __forceinline__
void
201 if constexpr (BLOCK_SIZE < 1)
204 if constexpr (BLOCK_SIZE > 1) {
207 bitonic_sort<T, BLOCK_SIZE, CMP>(topK);
208 bitonic_sort<T, BLOCK_SIZE, CMP>(newData);
209 if constexpr (BLOCK_SIZE > 32)
216 unsigned localMask = BLOCK_SIZE - 1;
218 unsigned blockMask = ~localMask;
219 unsigned localIdx = (unsigned)threadIdx.x & localMask;
222 unsigned blockOffset = ((
unsigned)threadIdx.x & blockMask) * 2;
227 topK[blockOffset + localIdx],
228 newData[blockOffset + ((
unsigned)BLOCK_SIZE * 2 - 1) - localIdx]);
229 CMP::order(topK[blockOffset + localIdx + BLOCK_SIZE],
230 newData[blockOffset + ((
unsigned)BLOCK_SIZE - 1) - localIdx]);
232 if constexpr (BLOCK_SIZE > 32)
__device__ void bitonic_merge(T *__restrict__ data)
Parallel bitonic merge.
__device__ __forceinline__ void bitonic_sort(T *__restrict__ data)
Perform multiple bitonic sorts by all active threads.
__device__ void bitonic_merge_step(T *__restrict__ data)
A single "layer" of the parallel bitonic comparator.
__device__ __forceinline__ void bitonic_topk_update_opt(T *__restrict__ topK, T *__restrict__ newData)
Perform one update step of bitonic topk algorithm.
Comparator policy that automatically chooses a good implementation.
Wrapped compare and swap-to-correct-order function.
static __device__ void order(T &a, T &b)
Min-max-into-ordering function.
static __device__ void order(T &a, T &b)