// cocos-engine-external/sources/taskflow/cuda/cuda_algorithm/cuda_reduce.hpp
// (C++ / CUDA header, ~115 lines, 2.7 KiB)
#pragma once
#include "../cuda_error.hpp"
namespace tf {
// Device helper: final-warp stage of the block reduction. Folds the still-valid
// entries of shm[0..63] into shm[0] with the binary operator 'op'. The caller
// (cuda_reduce) invokes it from the first warp only (tid < 32); threads with
// tid >= N have already exited the kernel before this point.
//
// Fixes over the volatile-only original:
//  - __syncwarp(mask) between dependent steps: on Volta+ (independent thread
//    scheduling) lanes of a warp are not implicitly lockstep, so the read of
//    shm[tid+16] could otherwise race with lane tid+16's write. The mask is
//    restricted to the lanes that actually reach this function, since lanes
//    with tid >= N exited the kernel and must not be waited on.
//  - step guards are bounded by min(N, blockDim.x) instead of N alone:
//    shm has only blockDim.x initialized slots, so when blockDim.x < 64 < N
//    the old 'tid + 32 < N' guard read uninitialized shared memory.
template <typename T, typename C>
__device__ void cuda_warp_reduce(
  volatile T* shm, size_t N, size_t tid, C op
) {
  // Number of shm entries that actually hold a partial result.
  size_t valid = (N < blockDim.x) ? N : blockDim.x;
  // Participating lanes are the contiguous lanes 0 .. min(valid, 32) - 1.
  unsigned lanes = (valid < 32) ? static_cast<unsigned>(valid) : 32u;
  unsigned mask  = (lanes == 32u) ? 0xffffffffu : ((1u << lanes) - 1u);
  if(tid + 32 < valid) shm[tid] = op(shm[tid], shm[tid+32]);
  __syncwarp(mask);
  if(tid + 16 < valid) shm[tid] = op(shm[tid], shm[tid+16]);
  __syncwarp(mask);
  if(tid +  8 < valid) shm[tid] = op(shm[tid], shm[tid+ 8]);
  __syncwarp(mask);
  if(tid +  4 < valid) shm[tid] = op(shm[tid], shm[tid+ 4]);
  __syncwarp(mask);
  if(tid +  2 < valid) shm[tid] = op(shm[tid], shm[tid+ 2]);
  __syncwarp(mask);
  if(tid +  1 < valid) shm[tid] = op(shm[tid], shm[tid+ 1]);
}
// Kernel: cuda_reduce
// Reduces the N elements starting at iterator 'first' into *res with the
// binary operator 'op'.
// This reduction kernel assumes only one block to avoid extra output memory.
// (No blockIdx is read anywhere, so launching more than one block would make
// every block race on the same *res.)
//
// Preconditions inferred from the code -- confirm against the launch site:
//  - the memory behind cudaSharedMemory<T> holds at least blockDim.x
//    elements of T (shm[tid] is written by every surviving thread);
//  - blockDim.x is presumably a power of two, since the tree phase keeps
//    halving it and hands off to cuda_warp_reduce at s == 32 -- TODO confirm;
//  - if 'uninitialized' is true, *res is overwritten; otherwise the reduced
//    value is folded into the existing *res.
//
// NOTE(review): threads with tid >= N return before ever reaching the
// __syncthreads() calls below. Exited threads do not block the barrier on
// current hardware, but strictly this is __syncthreads() under divergent
// control flow -- worth confirming against the intended contract.
template <typename I, typename T, typename C, bool uninitialized>
__global__ void cuda_reduce(I first, size_t N, T* res, C op) {
// Per-block scratch: shm[i] holds thread i's running partial result.
cudaSharedMemory<T> shared_memory;
T* shm = shared_memory.get();
size_t tid = threadIdx.x;
if(tid >= N) {
return;
}
// Phase 1: each thread strides over the input with step blockDim.x and
// accumulates a private partial into its shm slot.
shm[tid] = *(first+tid);
for(size_t i=tid+blockDim.x; i<N; i+=blockDim.x) {
shm[tid] = op(shm[tid], *(first+i));
}
__syncthreads();
// Phase 2: tree reduction in shared memory, halving the live span each
// iteration until 64 entries remain (loop stops once s <= 32).
for(size_t s = blockDim.x / 2; s > 32; s >>= 1) {
if(tid < s && tid + s < N) {
shm[tid] = op(shm[tid], shm[tid+s]);
}
__syncthreads();
}
// Phase 3: the first warp folds the remaining entries into shm[0].
if(tid < 32) {
cuda_warp_reduce(shm, N, tid, op);
}
// Thread 0 publishes the block's result according to 'uninitialized'.
if(tid == 0) {
if constexpr (uninitialized) {
*res = shm[0];
}
else {
*res = op(*res, shm[0]);
}
}
}
//template <typename C>
//__device__ void cuda_warp_reduce(
// volatile int* shm, size_t N, size_t tid, size_t gid, C op
//) {
// if(gid + 32 < N) shm[tid] = op(shm[tid], shm[tid+32]);
// if(gid + 16 < N) shm[tid] = op(shm[tid], shm[tid+16]);
// if(gid + 8 < N) shm[tid] = op(shm[tid], shm[tid+8]);
// if(gid + 4 < N) shm[tid] = op(shm[tid], shm[tid+4]);
// if(gid + 2 < N) shm[tid] = op(shm[tid], shm[tid+2]);
// if(gid + 1 < N) shm[tid] = op(shm[tid], shm[tid+1]);
//}
//
//template <typename C>
//__global__ void cuda_reduce(int* din, int* dout, size_t N, C op) {
//
// extern __shared__ int shm[];
//
// size_t tid = threadIdx.x;
// size_t gid = threadIdx.x + blockIdx.x * (blockDim.x);
// size_t gsd = blockDim.x * gridDim.x;
//
// if(gid >= N) {
// return;
// }
//
// //printf("%lu %lu %lu\n", tid, gid, gsd);
//
// shm[tid] = din[gid];
//
// for(size_t nxt = gid + gsd; nxt < N; nxt += gsd) {
// shm[tid] = op(shm[tid], din[nxt]);
// }
//
// __syncthreads();
//
// for(size_t s = blockDim.x / 2; s > 32; s >>= 1) {
// if(tid < s && gid + s < N) {
// shm[tid] = op(shm[tid], shm[tid+s]);
// }
// __syncthreads();
// }
//
// if(tid < 32) {
// warp_reduce(shm, N, tid, gid, op);
// }
//
// if(tid == 0){
// dout[blockIdx.x] = shm[0];
// }
//}
} // end of namespace tf -----------------------------------------------------