I’m using Kepler-architecture code to perform a sum reduction on my arrays (From here: [Kepler architecture reductions][1]). And I wanted to make it portable. In other words, I want the code to run on every architecture.
I was studying and trying to apply the reduce6 code from the NVIDIA samples, but I had some problems, the results weren’t the same.
I post both codes here:
// --- Warp-level sum reduction using register shuffles (requires SM30+).
// After the loop, lane 0 holds the sum of "val" over all 32 lanes of the
// warp; the other lanes hold partial sums and must not be relied upon.
__forceinline__ __device__ float warpReduceSum(float val)
{
    // Full-warp participation mask. The *_sync variants are mandatory on
    // Volta+ (independent thread scheduling); the legacy mask-less
    // __shfl_down was deprecated in CUDA 9 and later removed.
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}
// --- Block-level sum reduction: warp reductions first, then a final warp
// reduction over the per-warp partials. Thread 0 of the block returns the
// block-wide sum; other threads return unspecified partials.
// Assumes blockDim.x <= 1024 (so at most 32 warps per block).
__forceinline__ __device__ float blockReduceSum(float val) {
    // --- One shared slot per warp: 1024 threads / 32 lanes = 32 warps max.
    static __shared__ float shared[32];
    int lane = threadIdx.x % warpSize;  // Lane index within the warp
    int wid  = threadIdx.x / warpSize;  // Warp ID within the block
    // --- Per-warp reduction; lane 0 of each warp ends up with that warp's sum.
    val = warpReduceSum(val);
    // --- Lane 0 of each warp publishes its partial sum to shared memory.
    if (lane == 0) {
        shared[wid] = val;
    }
    // --- Wait until every warp has written its partial sum.
    __syncthreads();
    // --- Number of warps actually present in this block. Ceil division is
    // essential: the original floor division (blockDim.x / warpSize) silently
    // dropped the partial sum of a trailing partial warp whenever blockDim.x
    // was not a multiple of warpSize.
    int numWarps = (blockDim.x + warpSize - 1) / warpSize;
    // --- Threads of warp 0 load the per-warp partials (zero for slots with
    // no corresponding warp), then reduce them.
    val = (threadIdx.x < numWarps) ? shared[lane] : 0.f;
    if (wid == 0) val = warpReduceSum(val);
    return val;
}
// --- Grid-stride reduction kernel: each thread accumulates multiple input
// elements, then the block sum is written to out[blockIdx.x]. A second
// launch over the per-block partials produces the final total.
__global__ void deviceReduceKernel(float *in, float* out, long N)
{
    float sum = 0.f;
    // --- Reduce multiple elements per thread via a grid-stride loop.
    // Index arithmetic is done in "long" to match N: the original "int i"
    // overflowed both in blockIdx.x * blockDim.x and for N > INT_MAX.
    long stride = (long)blockDim.x * gridDim.x;
    for (long i = (long)blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        sum += in[i];
    }
    sum = blockReduceSum(sum);
    // --- Thread 0 holds the block-wide sum after blockReduceSum.
    if (threadIdx.x == 0) {
        out[blockIdx.x] = sum;
    }
}
// --- Host driver: sums N floats from device array "in" and returns the
// total. Performs a two-pass reduction: one launch producing up to 1024
// per-block partials, then a single-block launch to combine them.
__host__ float deviceReduce(float *in, long N) {
    float *device_out;
    gpuErrchk(cudaMalloc(&device_out, sizeof(float)*1024));
    gpuErrchk(cudaMemset(device_out, 0, sizeof(float)*1024));
    const int threads = 512;
    // --- Ceil-div in long arithmetic: the original int(N) truncated N
    // before dividing, overflowing for large inputs. Cap at 1024 blocks so
    // the partials fit in device_out.
    long wanted = (N + threads - 1) / threads;
    int blocks = (int)(wanted < 1024L ? wanted : 1024L);
    if (blocks < 1) blocks = 1;  // N == 0: still launch; grid-stride loop does no work
    deviceReduceKernel<<<blocks, threads>>>(in, device_out, N);
    gpuErrchk(cudaGetLastError());      // catch launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize()); // catch asynchronous execution errors
    // --- Second pass: one block folds the per-block partials in place.
    deviceReduceKernel<<<1, 1024>>>(device_out, device_out, blocks);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    float sum = 0.f;
    gpuErrchk(cudaMemcpy(&sum, device_out, sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(device_out));
    return sum;
}
And the reduce6 kernel:
// --- NVIDIA-samples-style reduction kernel. Template parameters:
//   blockSize — compile-time thread count per block (power of two), so the
//               tree steps below can be pruned at compile time;
//   nIsPow2   — when true, the bounds check on the second load is elided.
// Launch requirement: dynamic shared memory of blockSize * sizeof(float)
// (the extern sdata array below is sized by the 3rd launch parameter).
// Each block writes its partial sum to g_odata[blockIdx.x].
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce6(float *g_idata, float *g_odata, unsigned int n)
{
// dynamically-sized shared scratch; caller must pass the byte count at launch
extern __shared__ float sdata[];
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
// each block covers a 2*blockSize window: every thread loads two elements
unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
float mySum = 0.f;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n)
{
mySum += g_idata[i];
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n)
mySum += g_idata[i+blockSize];
i += gridSize;
}
// each thread puts its local sum into shared memory
sdata[tid] = mySum;
__syncthreads();
// do reduction in shared mem: tree reduction, halving the active thread
// count each step; the (blockSize >= X) tests are compile-time constants
if ((blockSize >= 512) && (tid < 256))
{
sdata[tid] = mySum = mySum + sdata[tid + 256];
}
__syncthreads();
if ((blockSize >= 256) &&(tid < 128))
{
sdata[tid] = mySum = mySum + sdata[tid + 128];
}
__syncthreads();
if ((blockSize >= 128) && (tid < 64))
{
sdata[tid] = mySum = mySum + sdata[tid + 64];
}
__syncthreads();
#if (__CUDA_ARCH__ >= 300 )
// Kepler+ path: finish the last 64 elements with warp shuffles, no shared
// memory needed below 32 threads.
if ( tid < 32 )
{
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64) mySum += sdata[tid + 32];
// Reduce final warp using shuffle
// NOTE(review): legacy mask-less __shfl_down compiles only with CUDA < 9;
// on CUDA 9+ this must become __shfl_down_sync(0xffffffff, ...).
for (int offset = warpSize/2; offset > 0; offset /= 2)
{
mySum += __shfl_down_sync(0xffffffff, mySum, offset);
}
}
#else
// pre-Kepler path: fully unroll reduction within a single warp via shared
// memory (the sample relied on implicit warp synchrony here; the explicit
// __syncthreads() below make it safe but slower than the original)
if ((blockSize >= 64) && (tid < 32))
{
sdata[tid] = mySum = mySum + sdata[tid + 32];
}
__syncthreads();
if ((blockSize >= 32) && (tid < 16))
{
sdata[tid] = mySum = mySum + sdata[tid + 16];
}
__syncthreads();
if ((blockSize >= 16) && (tid < 8))
{
sdata[tid] = mySum = mySum + sdata[tid + 8];
}
__syncthreads();
if ((blockSize >= 8) && (tid < 4))
{
sdata[tid] = mySum = mySum + sdata[tid + 4];
}
__syncthreads();
if ((blockSize >= 4) && (tid < 2))
{
sdata[tid] = mySum = mySum + sdata[tid + 2];
}
__syncthreads();
if ((blockSize >= 2) && ( tid < 1))
{
sdata[tid] = mySum = mySum + sdata[tid + 1];
}
__syncthreads();
#endif
// write result for this block to global mem (tid 0 holds the block sum)
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
The question is: how can I use a reduce6-style reduction in my code, given that my code uses a static shared-memory array of 32 floats (one slot per warp) and doesn’t use the `blockSize` template parameter or the `isPow2` flag the way reduce6 does? Another thing I noticed is that reduce6 sums the per-block partials differently. In my code the final block sum is done like this:
deviceReduceKernel<<<1, 1024>>>(device_out, device_out, blocks);
gpuErrchk(cudaDeviceSynchronize());
float sum = 0.0;
gpuErrchk(cudaMemcpy(&sum,device_out,sizeof(float),cudaMemcpyDeviceToHost));
While in reduce6 the block sum is done like this:
// sum partial block sums on GPU
int s=numBlocks;
int kernel = whichKernel;
while (s > cpuFinalThreshold)
{
int threads = 0, blocks = 0;
getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
reduce<T>(s, threads, blocks, kernel, d_odata, d_odata);
if (kernel < 3)
{
s = (s + threads - 1) / threads;
}
else
{
s = (s + (threads*2-1)) / (threads*2);
}
}
if (s > 1)
{
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost));
for (int i=0; i < s; i++)
{
gpu_result += h_odata[i];
}
}
Also, what is `cpuFinalThreshold`?
[1]: http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/