Hi,
I’m trying to implement a boxcar convolution down 1 dimension of an image, with 32 boxcars of width 1…32 samples.
The fastest implementation I have so far has each thread in a 32-thread warp responsible for a boxcar. Each thread keeps the sum over the last ibc (boxcar index) samples, and the last sample. For each pixel in a column, the threads in the warp update the state and shift the last sample across to the next thread with a __shfl_up.
I’m surprised that I only get 0.68 Gflops and 1.40 GByte/sec on my GPU GeForce GT 750M. Am I doing something wrong?
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
const int N = 1024; // image is N x N; each column has N samples
const int NBOX = 32; // number of boxcar widths (1..NBOX); one warp lane per width
#ifdef __cplusplus
extern "C"
#endif
// Abort with a diagnostic when a CUDA API call fails.
// In debug builds the assert() traps immediately; with NDEBUG defined the
// assert is compiled out and exit(code) terminates instead.
__host__ inline void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s:%d\n", cudaGetErrorString(code), file, line);
assert(code == cudaSuccess);
exit(code);
}
}
// do { } while (0) makes the macro a single statement, so it is safe in
// `if (x) gpuErrchk(y); else ...` — the original bare { } block would not
// compile there (dangling semicolon before `else`).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Event pair used by the tic()/toc() timing helpers below.
cudaEvent_t m_start;
cudaEvent_t m_stop;

// Open a timed region: record the start event on the default stream and
// wait for it so timing begins from a known point.
void tic()
{
    gpuErrchk(cudaEventRecord(m_start));
    gpuErrchk(cudaEventSynchronize(m_start));
}
// Close the timed region opened by tic() and return the elapsed time in
// milliseconds. The synchronize blocks until all preceding work finishes.
float toc()
{
    float elapsed_ms = 0.0f;
    gpuErrchk(cudaEventRecord(m_stop));
    gpuErrchk(cudaEventSynchronize(m_stop));
    gpuErrchk(cudaEventElapsedTime(&elapsed_ms, m_start, m_stop));
    return elapsed_ms;
}
// Boxcar convolution down one image dimension: warp lane ibc computes the
// moving sum of width (ibc+1) over its column, normalised by 1/sqrt(width).
//
// Expected launch layout: blockDim.x == NBOX (one lane per boxcar width),
// blockDim.y columns per block, gridDim.x * blockDim.y == N columns.
// If outdata is NULL the results are computed but not written (for timing).
// Requires the *_sync shuffle (CUDA 9+); the legacy mask-less __shfl_up is
// removed on Volta and later.
__global__ void boxcar_do_kernel (
const float* __restrict__ indata,
float* __restrict__ outdata)
{
const int ibc = threadIdx.x; // boxcar index. ibc=0 is 1 sample long, ibc=1 is 2 samples long, etc
float state = 0.0f; // sum of the last (ibc+1) samples - ideally initialised by a prefix sum, omitted here
float vlast = 0.0f; // value of the sample (ibc+1) samples ago
// Each (threadIdx.y, blockIdx.x) pair starts at the top of one column.
const int offset = N*(threadIdx.y + blockDim.y*blockIdx.x);
const float* __restrict__ iptr = &indata[offset];
float* __restrict__ optr = &outdata[offset + ibc];
// Hoisted loop-invariant: the 1/sqrt(width) normalisation was previously
// recomputed (div + sqrtf) for every pixel; rsqrtf is a fast intrinsic.
const float scale = rsqrtf((float)(ibc + 1));
// for each pixel in the column
for(int t = 0; t < N; ++t) {
// All lanes in the warp read the same address - broadcast load.
const float vin = *iptr++;
// Add the newest sample and drop the one leaving the window - the only FP in the joint.
state += vin - vlast;
// Shift trailing samples one lane up; the whole warp executes this line,
// so a full mask is correct. Leaves vlast for lane 0 unchanged.
vlast = __shfl_up_sync(0xffffffff, vlast, 1, NBOX);
// The 1-sample boxcar's "previous" sample is simply the current one.
if (ibc == 0) {
vlast = vin;
}
// Write the normalised state into the output (skipped when timing
// the compute-only path).
if (outdata != NULL) {
*optr = state * scale; // scale by 1/sqrt(width)
optr += NBOX; // advance to this lane's slot for the next pixel
}
}
}
// Driver: allocate device buffers, time the boxcar kernel with and without
// its global-memory writes, and report Gflops / GByte/sec.
int main(int argc, char* argv[])
{
// Allocate input and output arrays.
float* vin;
float* vout;
// BUG FIX: the original passed ELEMENT counts (N*N and N*N*NBOX) as the
// BYTE counts, allocating 4x too little memory; the kernel then read and
// wrote far past the allocations.
gpuErrchk(cudaMalloc(&vin, N*N*sizeof(float)));
gpuErrchk(cudaMalloc(&vout, N*N*NBOX*sizeof(float)));
// Zero the input so the kernel never reads uninitialised device memory.
gpuErrchk(cudaMemset(vin, 0, N*N*sizeof(float)));
gpuErrchk(cudaEventCreate(&m_start));
gpuErrchk(cudaEventCreate(&m_stop));
// Print GPU name
cudaDeviceProp prop;
gpuErrchk(cudaGetDeviceProperties(&prop, 0));
printf("Using GPU %s\n", prop.name);
int niter = 16;
// Setup grid and block size. Each block processes NBOX boxcars and nblocks
// columns; grid_size blocks cover all N columns.
int nblocks = 8;
int grid_size = N/nblocks;
dim3 block_size(NBOX, nblocks);
// Calculate statistics (float arithmetic to avoid int-overflow worries).
float nops = N*N*NBOX*2.0f*niter; // 2 flops (add + subtract) per boxcar per pixel
float nbytes = sizeof(float)*N*N*(NBOX+1)*niter; // 1 read and NBOX writes per pixel
tic();
for(int ii = 0; ii < niter; ++ii) {
boxcar_do_kernel<<<grid_size, block_size>>>(vin, vout);
}
gpuErrchk(cudaGetLastError()); // catch launch-configuration errors
float time_sec = toc() / 1e3;
printf("Ran %d iterations WITH output in %f seconds = %0.2f Gflops and %0.2f GByte/sec\n", niter, time_sec, nops/time_sec/1e9, nbytes/time_sec/1e9);
// Let's see what happens if we don't write to gmem
nbytes = sizeof(float)*N*N*niter; // 1 read per pixel
tic();
for(int ii = 0; ii < niter; ++ii) {
boxcar_do_kernel<<<grid_size, block_size>>>(vin, NULL);
}
gpuErrchk(cudaGetLastError());
time_sec = toc() / 1e3;
printf("Ran %d iterations WITHOUT output in %f seconds = %0.2f Gflops and %0.2f GByte/sec\n", niter, time_sec, nops/time_sec/1e9, nbytes/time_sec/1e9);
// Release device resources before exiting.
gpuErrchk(cudaEventDestroy(m_start));
gpuErrchk(cudaEventDestroy(m_stop));
gpuErrchk(cudaFree(vin));
gpuErrchk(cudaFree(vout));
return 0;
}