CUDA reduction

theblur · September 30, 2008, 11:44am

Greetings!

Is there any available code that i can use for finding max or min in an array using CUDA, without the limitation of only working for power-of-2 arrays?

Deyuan

Reimar · September 30, 2008, 12:26pm

Huh, which implementation do you have that does not work for non-power-of-two?

It should be just a matter of adding something like “if (secondindex >= size) maximum = array[firstindex]; else maximum = max(array[firstindex], array[secondindex]);” for the very first reduction step.

theblur · September 30, 2008, 12:54pm

I was looking at the reduction implementation in CUDA SDK. It works only for power-of-2 arrays. LOL, I think u put it right.

But do u have a complete code, so that I don’t need to start from scratch. I don’t know how to optimize this process, like block size, bank conflict avoidance, etc…

Fuchs · September 30, 2008, 1:41pm

This is an untested version:

#include <cutil.h>

#include "sharedmem.cuh"

#include <iostream>

/*

    This version adds multiple elements per thread sequentially.  This reduces the overall

    cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).

    (Brent's Theorem optimization)

*/

template <class T, unsigned int blockSize>

__global__ void reduce_max_kernel(T *g_odata, T *g_idata, unsigned int n)

{

    SharedMemory<T> smem;

    T *sdata = smem.getPointer();

   // perform first level of reduction,

    // reading from global memory, writing to shared memory

    unsigned int tid = threadIdx.x;

    unsigned int i = blockIdx.x*blockSize + threadIdx.x;

    unsigned int gridSize = blockSize*2*gridDim.x;

   // we reduce multiple elements per thread.  The number is determined by the 

    // number of active thread blocks (via gridSize).  More blocks will result

    // in a larger gridSize and therefore fewer elements per thread

    T thMax = fmaxf(g_idata[i], g_idata[i + blockSize]);

    i += gridSize;

    while (i < n)

    {

      T a = fmaxf(g_idata[i], g_idata[i + blockSize]);

      thMax = fmaxf(thMax, a);

      i += gridSize;

    }

    sdata[tid] = thMax;

    __syncthreads();

   // do reduction in shared mem

    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 256]); } __syncthreads(); }

    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 128]); } __syncthreads(); }

    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  64]); } __syncthreads(); }

    

    if (tid < 32)

    {

        if (blockSize >=  64) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 32]); }

        if (blockSize >=  32) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 16]); }

        if (blockSize >=  16) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  8]); }

        if (blockSize >=   8) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  4]); }

        if (blockSize >=   4) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  2]); }

        if (blockSize >=   2) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  1]); }

    }

    

    // write result for this block to global mem 

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

////////////////////////////////////////////////////////////////////////////////

// Wrapper function for kernel launch

////////////////////////////////////////////////////////////////////////////////

template <class T>

T reduce_max(T *d_odata, T *d_idata, int size)

{

  const int maxThreads = 128;

  const int maxBlocks  = 128; // TODO: Test/Increase for future devices w/ more processors

  int threads = 1;

 if( size != 1 ) {

    threads = (size < maxThreads*2) ? size / 2 : maxThreads;

  }

  int blocks = size / (threads * 2);

  blocks = min(maxBlocks, blocks);

 dim3 dimBlock(threads, 1, 1);

  dim3 dimGrid(blocks, 1, 1);

  int smemSize = threads * sizeof(T);

 switch (threads)

  {

  case 512:

      reduce_max_kernel<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 256:

      reduce_max_kernel<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 128:

      reduce_max_kernel<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 64:

      reduce_max_kernel<T,  64><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 32:

      reduce_max_kernel<T,  32><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 16:

      reduce_max_kernel<T,  16><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case  8:

      reduce_max_kernel<T,   8><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case  4:

      reduce_max_kernel<T,   4><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case  2:

      reduce_max_kernel<T,   2><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case  1:

      reduce_max_kernel<T,   1><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  default:

      exit(1);

  }

 T* h_odata = new T[blocks];

  CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, blocks*sizeof(T), cudaMemcpyDeviceToHost) );

 T result = h_odata[0];

  for( int i = 1; i < blocks; i++ ) {

    result = max(result, h_odata[i]);

  }

  delete[] h_odata;

 return result;

}

// Instanciate kernels to prevent linker errors

template int    reduce_max<int   >(int    *d_odata, int    *d_idata, int size);

template float  reduce_max<float >(float  *d_odata, float  *d_idata, int size);

template double reduce_max<double>(double *d_odata, double *d_idata, int size);

Header file:

#ifndef __REDUCTION_MAX_KERNEL_H__

#define __REDUCTION_MAX_KERNEL_H__

template <class T> T reduce_max(T *d_odata, T *d_idata, int size);

#endif // #ifndef __REDUCTION_MAX_KERNEL_H__

HTH

rmjdsouza · September 30, 2008, 2:24pm

I see that the CUDPP libraries can only sort stream of INT UINT FLOAT CHAR and UCHAR. How does one setup CUDPP to sort, find max, add etc of any generic object

This is an untested version:

#include <cutil.h>

#include "sharedmem.cuh"

#include <iostream>

/*

 Â  Â This version adds multiple elements per thread sequentially. Â This reduces the overall

 Â  Â cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).

 Â  Â (Brent's Theorem optimization)

*/

template <class T, unsigned int blockSize>

__global__ void reduce_max_kernel(T *g_odata, T *g_idata, unsigned int n)

{

 Â  Â SharedMemory<T> smem;

 Â  Â T *sdata = smem.getPointer();

Â  Â // perform first level of reduction,

 Â  Â // reading from global memory, writing to shared memory

 Â  Â unsigned int tid = threadIdx.x;

 Â  Â unsigned int i = blockIdx.x*blockSize + threadIdx.x;

 Â  Â unsigned int gridSize = blockSize*2*gridDim.x;

Â  Â // we reduce multiple elements per thread. Â The number is determined by the 

 Â  Â // number of active thread blocks (via gridSize). Â More blocks will result

 Â  Â // in a larger gridSize and therefore fewer elements per thread

 Â  Â T thMax = fmaxf(g_idata[i], g_idata[i + blockSize]);

 Â  Â i += gridSize;

 Â  Â while (i < n)

 Â  Â {

 Â  Â  Â T a = fmaxf(g_idata[i], g_idata[i + blockSize]);

 Â  Â  Â thMax = fmaxf(thMax, a);

 Â  Â  Â i += gridSize;

 Â  Â }

 Â  Â sdata[tid] = thMax;

 Â  Â __syncthreads();

Â  Â // do reduction in shared mem

 Â  Â if (blockSize >= 512) { if (tid < 256) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 256]); } __syncthreads(); }

 Â  Â if (blockSize >= 256) { if (tid < 128) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 128]); } __syncthreads(); }

 Â  Â if (blockSize >= 128) { if (tid < Â 64) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + Â 64]); } __syncthreads(); }

 Â  Â 

 Â  Â if (tid < 32)

 Â  Â {

 Â  Â  Â  Â if (blockSize >= Â 64) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 32]); }

 Â  Â  Â  Â if (blockSize >= Â 32) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 16]); }

 Â  Â  Â  Â if (blockSize >= Â 16) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + Â 8]); }

 Â  Â  Â  Â if (blockSize >= Â  8) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + Â 4]); }

 Â  Â  Â  Â if (blockSize >= Â  4) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + Â 2]); }

 Â  Â  Â  Â if (blockSize >= Â  2) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + Â 1]); }

 Â  Â }

 Â  Â 

 Â  Â // write result for this block to global mem 

 Â  Â if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

////////////////////////////////////////////////////////////////////////////////

// Wrapper function for kernel launch

////////////////////////////////////////////////////////////////////////////////

template <class T>

T reduce_max(T *d_odata, T *d_idata, int size)

{

 Â const int maxThreads = 128;

 Â const int maxBlocks Â = 128; // TODO: Test/Increase for future devices w/ more processors

 Â int threads = 1;

Â if( size != 1 ) {

 Â  Â threads = (size < maxThreads*2) ? size / 2 : maxThreads;

 Â }

 Â int blocks = size / (threads * 2);

 Â blocks = min(maxBlocks, blocks);

Â dim3 dimBlock(threads, 1, 1);

 Â dim3 dimGrid(blocks, 1, 1);

 Â int smemSize = threads * sizeof(T);

Â switch (threads)

 Â {

 Â case 512:

 Â  Â  Â reduce_max_kernel<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case 256:

 Â  Â  Â reduce_max_kernel<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case 128:

 Â  Â  Â reduce_max_kernel<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case 64:

 Â  Â  Â reduce_max_kernel<T, Â 64><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case 32:

 Â  Â  Â reduce_max_kernel<T, Â 32><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case 16:

 Â  Â  Â reduce_max_kernel<T, Â 16><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case Â 8:

 Â  Â  Â reduce_max_kernel<T, Â  8><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case Â 4:

 Â  Â  Â reduce_max_kernel<T, Â  4><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case Â 2:

 Â  Â  Â reduce_max_kernel<T, Â  2><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â case Â 1:

 Â  Â  Â reduce_max_kernel<T, Â  1><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

 Â default:

 Â  Â  Â exit(1);

 Â }

Â T* h_odata = new T[blocks];

 Â CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, blocks*sizeof(T), cudaMemcpyDeviceToHost) );

Â T result = h_odata[0];

 Â for( int i = 1; i < blocks; i++ ) {

 Â  Â result = max(result, h_odata[i]);

 Â }

 Â delete[] h_odata;

Â return result;

}

// Instanciate kernels to prevent linker errors

template int Â  Â reduce_max<int Â  >(int Â  Â *d_odata, int Â  Â *d_idata, int size);

template float Â reduce_max<float >(float Â *d_odata, float Â *d_idata, int size);

template double reduce_max<double>(double *d_odata, double *d_idata, int size);

Header file:

#ifndef __REDUCTION_MAX_KERNEL_H__

#define __REDUCTION_MAX_KERNEL_H__

template <class T> T reduce_max(T *d_odata, T *d_idata, int size);

#endif // #ifndef __REDUCTION_MAX_KERNEL_H__

HTH

[snapback]445991[/snapback]

Fuchs · October 1, 2008, 1:10pm

[url=“http://www.gpgpu.org/developer/cudpp/rel/cudpp_1.0a/html/todo.html”]http://www.gpgpu.org/developer/cudpp/rel/c.../html/todo.html[/url]

There is a reduce_sum on the todo list. So I guess there will be reduction kernels in future version. But not yet…

theblur · October 5, 2008, 11:17am

Thank you!

But I think the code still has the limitation of power-of-2.

This is an untested version:

#include <cutil.h>

#include "sharedmem.cuh"

#include <iostream>

/*

    This version adds multiple elements per thread sequentially.  This reduces the overall

    cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).

    (Brent's Theorem optimization)

*/

template <class T, unsigned int blockSize>

__global__ void reduce_max_kernel(T *g_odata, T *g_idata, unsigned int n)

{

    SharedMemory<T> smem;

    T *sdata = smem.getPointer();

   // perform first level of reduction,

    // reading from global memory, writing to shared memory

    unsigned int tid = threadIdx.x;

    unsigned int i = blockIdx.x*blockSize + threadIdx.x;

    unsigned int gridSize = blockSize*2*gridDim.x;

   // we reduce multiple elements per thread.  The number is determined by the 

    // number of active thread blocks (via gridSize).  More blocks will result

    // in a larger gridSize and therefore fewer elements per thread

    T thMax = fmaxf(g_idata[i], g_idata[i + blockSize]);

    i += gridSize;

    while (i < n)

    {

      T a = fmaxf(g_idata[i], g_idata[i + blockSize]);

      thMax = fmaxf(thMax, a);

      i += gridSize;

    }

    sdata[tid] = thMax;

    __syncthreads();

   // do reduction in shared mem

    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 256]); } __syncthreads(); }

    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 128]); } __syncthreads(); }

    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  64]); } __syncthreads(); }

    

    if (tid < 32)

    {

        if (blockSize >=  64) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 32]); }

        if (blockSize >=  32) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + 16]); }

        if (blockSize >=  16) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  8]); }

        if (blockSize >=   8) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  4]); }

        if (blockSize >=   4) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  2]); }

        if (blockSize >=   2) { sdata[tid] = fmaxf(sdata[tid], sdata[tid +  1]); }

    }

    

    // write result for this block to global mem 

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

////////////////////////////////////////////////////////////////////////////////

// Wrapper function for kernel launch

////////////////////////////////////////////////////////////////////////////////

template <class T>

T reduce_max(T *d_odata, T *d_idata, int size)

{

  const int maxThreads = 128;

  const int maxBlocks  = 128; // TODO: Test/Increase for future devices w/ more processors

  int threads = 1;

 if( size != 1 ) {

    threads = (size < maxThreads*2) ? size / 2 : maxThreads;

  }

  int blocks = size / (threads * 2);

  blocks = min(maxBlocks, blocks);

 dim3 dimBlock(threads, 1, 1);

  dim3 dimGrid(blocks, 1, 1);

  int smemSize = threads * sizeof(T);

 switch (threads)

  {

  case 512:

      reduce_max_kernel<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 256:

      reduce_max_kernel<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 128:

      reduce_max_kernel<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 64:

      reduce_max_kernel<T,  64><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 32:

      reduce_max_kernel<T,  32><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case 16:

      reduce_max_kernel<T,  16><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case  8:

      reduce_max_kernel<T,   8><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case  4:

      reduce_max_kernel<T,   4><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case  2:

      reduce_max_kernel<T,   2><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  case  1:

      reduce_max_kernel<T,   1><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;

  default:

      exit(1);

  }

 T* h_odata = new T[blocks];

  CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, blocks*sizeof(T), cudaMemcpyDeviceToHost) );

 T result = h_odata[0];

  for( int i = 1; i < blocks; i++ ) {

    result = max(result, h_odata[i]);

  }

  delete[] h_odata;

 return result;

}

// Instanciate kernels to prevent linker errors

template int    reduce_max<int   >(int    *d_odata, int    *d_idata, int size);

template float  reduce_max<float >(float  *d_odata, float  *d_idata, int size);

template double reduce_max<double>(double *d_odata, double *d_idata, int size);

Header file:

#ifndef __REDUCTION_MAX_KERNEL_H__

#define __REDUCTION_MAX_KERNEL_H__

template <class T> T reduce_max(T *d_odata, T *d_idata, int size);

#endif // #ifndef __REDUCTION_MAX_KERNEL_H__

HTH

[snapback]445991[/snapback]

theblur · October 6, 2008, 11:50am

And I don’t know why, since it seems to work only for an array less than 32 elements. I’m testing by float. If a bigger array is input, then only the 32 elements at head are checked.

Fuchs · October 6, 2008, 2:18pm

It should work for even number of elements. If you have arrays smaller than (lets say) 512 elements you shouldn’t use GPU reduction anyways. So maybe a little if statement in the C++ function which would handle that case then on the CPU would be better. I use it for 2**20 elements and it just runs 1-2% faster than CUBLAS reduction kernels.

theblur · October 12, 2008, 11:39am

I also gave cublasIsamax() a try. It seems not so fast as Mark Harris’ reduction algorithm. Does it also use Mark Harris’ algorithm or not??

johank · June 7, 2009, 10:12pm

As a note, the code as given does not work.

Minor change is needed:

unsigned int i = blockIdx.x*blockSize + threadIdx.x;

should be

unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;

this change is directly from the nvidia reduction samples…