Hi everyone,
I have some problems using cuda streams. Could someone please help?
I’ve developed a kernel that perform the rotation of non-squared images (2336x1200) of 90 degrees:
__global__ void kernel(BYTE *odata, BYTE *idata, int width, int height)
{
__shared__ BYTE block[TILE_DIM][TILE_DIM+1];
int col = blockIdx.x * TILE_DIM + threadIdx.x;
int row = blockIdx.y * TILE_DIM + threadIdx.y;
for (int i=0; i<TILE_DIM; i+= BLOCK_ROWS){
if((col < width) && (row < height))
{
int tid_in = row * width + col;
block[threadIdx.y][threadIdx.x] = idata[tid_in+i*IMAGE_WIDTH];
}
}
__syncthreads();
col = blockIdx.y * TILE_DIM + threadIdx.x;
row = blockIdx.x * TILE_DIM + threadIdx.y;
for(int i=0; i<TILE_DIM; i+= BLOCK_ROWS){
if((col < height) && (row < width))
{
//90° anticlockwise
//int tid_out = (width-row-1) * height + col;
//90° clockwise
int tid_out = row * height + height - col - 1;
odata[tid_out+i*IMAGE_HEIGHT] = block[threadIdx.x][threadIdx.y];
}
}
}
I would like to apply this kernel to a block of images of fixed size (for example a block of 45 images).
Following the post http://devblogs.nvidia.com/parallelforall/how-overlap-data-transfers-cuda-cc/ i’ve written this code:
float ms;
//IMAGE_WIDTH = 2336, IMAGE_HEIGHT = 1200, NIMAGES = 30
int n = IMAGE_WIDTH * IMAGE_HEIGHT * NIMAGES;
// size of memory required to store the matrix
const int mem_size = sizeof(BYTE) * n;
//host input and output data
BYTE *h_idata,*h_odata;
//device input and output data
BYTE *d_idata,*d_odata;
checkCudaErrors(cudaHostAlloc((void **)&h_idata, mem_size, cudaHostAllocWriteCombined));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, mem_size, cudaHostAllocDefault));
checkCudaErrors(cudaMalloc((void **) &d_idata, NSTREAMS * chunk_size*sizeof(BYTE)));
checkCudaErrors(cudaMalloc((void **) &d_odata, NSTREAMS * chunk_size*sizeof(BYTE)));
// initalize host input data
.......
cudaEvent_t start, stop;
cudaStream_t stream[NSTREAMS];
double dim_grid_x = ((double)IMAGE_WIDTH/TILE_DIM);
double dim_grid_y = ((double)IMAGE_HEIGHT/TILE_DIM);
dim3 grid(ceil(dim_grid_x),ceil(dim_grid_y),1), threads(TILE_DIM,BLOCK_ROWS,1);
const int streamSize = n / NSTREAMS;
const int streamBytes = streamSize * sizeof(BYTE);
for (int i = 0; i < NSTREAMS; ++i)
checkCudaErrors( cudaStreamCreate(&stream[i]) );
checkCudaErrors( cudaEventCreate(&start) );
checkCudaErrors( cudaEventCreate(&stop) );
checkCudaErrors( cudaEventRecord(start,0) );
for (int i = 0; i < NSTREAMS; ++i)
{
//int offset = i * streamSize;
checkCudaErrors( cudaMemcpyAsync(&d_idata[i * streamSize], &h_idata[i * streamSize], streamBytes, cudaMemcpyHostToDevice,stream[i]) );
}
for (int i = 0; i < NSTREAMS; ++i)
{
//int offset = i * streamSize;
kernel<<<grid, threads, 0, stream[i]>>>(d_odata, d_idata, IMAGE_WIDTH, IMAGE_HEIGHT,i);
}
for (int i = 0; i < NSTREAMS; ++i)
{
//int offset = i * streamSize;
checkCudaErrors( cudaMemcpyAsync(&h_odata[i * streamSize], &d_odata[i * streamSize], streamBytes, cudaMemcpyDeviceToHost,stream[i]) );
}
checkCudaErrors( cudaEventRecord(stop, 0) );
checkCudaErrors( cudaEventSynchronize(stop));
checkCudaErrors( cudaEventElapsedTime(&ms, start, stop) );
printf("Computation Time (ms): %f \n\n", ms);
checkCudaErrors( cudaEventDestroy(start));
checkCudaErrors( cudaEventDestroy(stop) );
for (int i = 0; i < NSTREAMS; ++i)
checkCudaErrors( cudaStreamDestroy(stream[i]) );
The code works correctly only if i set a number of streams exactly equal to the number of images to rotate (NSTREAMS = NIMAGES).
On the contrary, if i have a number of images per stream greater than 1 (NSTREAMS = 15, NIMAGES = 45), the rotation don’t work as expected: only the first 15 images are ok.
How can i solve this problem?? I’m new on using cuda so i’ll appreciate any suggestions.
Thanks in advance!!
Domenico