Hello,
I made a program which use cudaStream to try performance on my shield tablet.
Here is the code:
#define WIDTH 6400
#define HEIGHT 4800
#define NB_STREAM 10
#define BLOC_X 32
#define BLOC_Y 32
cudaStream_t Stream[NB_STREAM];
cudaArray * Array_PatchsMaxDevice;
texture<u_int8_t, 2,cudaReadModeElementType> Image;
__global__ void SobelKernel(u_int8_t *ptDataOut,int hoffset,int widthToProcess,int heightToProcess)
{
int x = blockIdx.x*blockDim.x;
int y = blockIdx.y*blockDim.y;
int xglobal = x + threadIdx.x;
int yglobal = y + threadIdx.y;
if(xglobal>=widthToProcess || yglobal >= heightToProcess )
return;
// atomicAdd(&ptDataOut[hoffset*WIDTH + xglobal +yglobal*WIDTH],30000);
ptDataOut[hoffset*WIDTH + xglobal +yglobal*WIDTH] = tex2D(Image,xglobal,yglobal+hoffset);
}
void processFilter()
{
u_int8_t *u8_PtImageHost;
u_int8_t *u8_ptDataOutHost;
u_int8_t *u8_ptDataOutDevice;
u_int8_t u8_Used[NB_STREAM];
u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc,WIDTH,HEIGHT ));
checkCudaErrors(cudaBindTextureToArray(Image,Array_PatchsMaxDevice));
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));
//ClearKernel<<<blocks,threads>>>(u8_ptDataOutDevice,WIDTH,HEIGHT);
int blockh = HEIGHT/NB_STREAM;
for(int i=0;i<NB_STREAM;i++)
{
cudaSetDevice(0);
cudaStreamCreate(&Stream[i]);
}
cudaEvent_t Start;
cudaEvent_t Stop;
cudaEventCreate(&Start);
cudaEventCreate(&Stop);
cudaEventRecord(Start, 0);
for(int i=0;i<NB_STREAM;i++)
{
if(i == 0)
{
int localHEIGHT = blockh;
checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
0,
0,
u8_PtImageHost,
WIDTH,
WIDTH,
blockh,
cudaMemcpyHostToDevice ,
Stream[i]));
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHEIGHT/BLOC_Y));
SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHEIGHT-1);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpyAsync(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*(localHEIGHT-1)*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
u8_Used[i] = 1;
}else{
int ioffsetImage = WIDTH*(HEIGHT/NB_STREAM );
int hoffset = HEIGHT/NB_STREAM *i;
int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
int localHEIGHT = min(HEIGHT - (blockh*i),blockh);
//printf("hoffset: %d hoffsetkernel %d localHEIGHT %d rest %d ioffsetImage %d \n",hoffset,hoffsetkernel,localHEIGHT,HEIGHT - (blockh +1 +blockh*(i-1)),ioffsetImage*i/WIDTH);
checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
0,
hoffset,
&u8_PtImageHost[ioffsetImage*i],
WIDTH,
WIDTH,
localHEIGHT,
cudaMemcpyHostToDevice ,
Stream[i]));
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHEIGHT/BLOC_Y));
SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,hoffsetkernel,WIDTH,localHEIGHT);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpyAsync(&u8_ptDataOutHost[hoffsetkernel*WIDTH],&u8_ptDataOutDevice[hoffsetkernel*WIDTH],WIDTH*localHEIGHT*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
u8_Used[i] = 1;
if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
{
break;
}
}
}
for(int i=0;i<NB_STREAM;i++)
{
cudaStreamSynchronize(Stream[i]);
}
cudaEventRecord(Stop, 0);
cudaEventSynchronize(Start);
cudaEventSynchronize(Stop);
float dt_ms;
cudaEventElapsedTime(&dt_ms, Start, Stop);
printf("dt_ms %f \n",dt_ms);
LOGD("dt_ms %f \n",dt_ms);
}
The problem is that there is no difference between stream size 1 or 10, the problem takes approximatly 120ms.
Is there a problem in my code?
Because it should be faster with 10 Streams? Should I have a huge difference?