CUDA GPU slower than CPU
Hello, I am having trouble figuring out why my CUDA code runs slower than my CPU code.

My desktop configuration is an i7-2600S with a GeForce 560 Ti.

My code is as follows:


int** kernel_shiftSeam(int **MCEnergyMat, int **newE, int *seam, int width, int height, int direction)
{
    // time measurement
    float elapsed_time_ms = 0;
    cudaEvent_t start, stop;

    // threads per block
    dim3 threads(16, 16);
    // blocks
    dim3 blocks((width + threads.x - 1) / threads.x, (height + threads.y - 1) / threads.y);

    int *device_Seam;
    int *host_Seam;
    int seamSize;

    if (direction == 1)
    {
        seamSize = height * sizeof(int);
        host_Seam = (int*)malloc(seamSize);
        for (int i = 0; i < height; i++)
            host_Seam[i] = seam[i];
    }
    else
    {
        seamSize = width * sizeof(int);
        host_Seam = (int*)malloc(seamSize);
        for (int i = 0; i < width; i++)
            host_Seam[i] = seam[i];
    }

    cudaMalloc((void**)&device_Seam, seamSize);
    cudaMemcpy(device_Seam, host_Seam, seamSize, cudaMemcpyHostToDevice);

    // global_MC/global_MC2, new_MC/new_MC2 and execTime are globals allocated elsewhere
    global_host_MC = MCEnergyMat;
    new_host_MC = newE;

    // copy host arrays to device: first the array of device row pointers, then each row
    cudaMemcpy(global_MC, global_MC2, sizeof(int*) * width, cudaMemcpyHostToDevice);
    for (int i = 0; i < width; i++)
        cudaMemcpy(global_MC2[i], global_host_MC[i], sizeof(int) * height, cudaMemcpyHostToDevice);

    cudaMemcpy(new_MC, new_MC2, sizeof(int*) * width, cudaMemcpyHostToDevice);
    for (int i = 0; i < width; i++)
        cudaMemcpy(new_MC2[i], new_host_MC[i], sizeof(int) * height, cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // do some operations on the 2d matrix
    gpu_shiftSeam<<< blocks, threads >>>(global_MC, new_MC, device_Seam, width, height);

    // measure end time for the gpu calculations
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time_ms, start, stop);

    execTime += elapsed_time_ms;

    // copy the result back to the host (RESULT), one row at a time
    for (int i = 0; i < width; i++)
    {
        cudaMemcpy(newE[i], new_MC2[i], sizeof(int) * height, cudaMemcpyDeviceToHost);
    }

    // release per-call resources (these leak across 800 iterations otherwise)
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(device_Seam);
    free(host_Seam);

    return newE;
}

I looped it 800 times and got the following results:

GPU
Computation time (the gpu_shiftSeam part): 1176 ms
Total program run time: 22 s

CPU
Computation time (same operation as gpu_shiftSeam, but on the host): 12522 ms
Total program run time: 12 s


Apparently the GPU computation time is way shorter than the CPU's, but for some reason the total program run time for the GPU version is a lot longer. Does anyone know why? Is the number of threads/blocks I am assigning incorrect? Or is the "slowness" coming from allocating memory on the device?
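
I suppose I could time the copy section the same way I already time the kernel. A minimal sketch (copyTime here is a hypothetical accumulator, analogous to execTime):

    cudaEvent_t cpyStart, cpyStop;
    float cpy_ms = 0;
    cudaEventCreate(&cpyStart);
    cudaEventCreate(&cpyStop);

    cudaEventRecord(cpyStart, 0);
    // ... the per-row cudaMemcpy loops from kernel_shiftSeam go here ...
    cudaEventRecord(cpyStop, 0);
    cudaEventSynchronize(cpyStop);
    cudaEventElapsedTime(&cpy_ms, cpyStart, cpyStop);

    copyTime += cpy_ms; // hypothetical accumulator, like execTime

    cudaEventDestroy(cpyStart);
    cudaEventDestroy(cpyStop);

If that number is large over the 800 iterations, the copies rather than the kernel would explain the 22 s.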


Thanks a lot!

#1
Posted 04/30/2012 03:23 AM   
There is overhead from "booting" the card and from the memory allocations. Try measuring the total execution time without the shift part, just to see how much those cost.
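
For example, a minimal standalone sketch for measuring just the startup and allocation costs (the buffer size here is an arbitrary placeholder):

    #include <cstdio>
    #include <cstdlib>
    #include <ctime>
    #include <cuda_runtime.h>

    int main()
    {
        clock_t t0 = clock();

        cudaFree(0); // first CUDA call pays the context-creation ("boot") cost

        clock_t t1 = clock();

        // a representative allocation + copy, with no kernel at all
        const int n = 1024 * 1024;
        int *h = (int*)malloc(n * sizeof(int));
        int *d;
        cudaMalloc((void**)&d, n * sizeof(int));
        cudaMemcpy(d, h, n * sizeof(int), cudaMemcpyHostToDevice);
        cudaDeviceSynchronize();

        clock_t t2 = clock();

        printf("context init: %.0f ms\n", 1000.0 * (t1 - t0) / CLOCKS_PER_SEC);
        printf("alloc + copy: %.0f ms\n", 1000.0 * (t2 - t1) / CLOCKS_PER_SEC);

        cudaFree(d);
        free(h);
        return 0;
    }

Run once, this tells you how much of the total run time is pure setup rather than computation.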

#2
Posted 04/30/2012 09:33 AM   
This is mainly the memory copy operations, plus the fact that you don't overlap memory copies with kernel execution.

Each cudaMemcpy also has a large fixed overhead when copying only a little data, so I suggest preparing one big contiguous block and copying it with a single cudaMemcpy() instead of looping, and overlapping block preparation and copying on the CPU while the kernel is running on the GPU.
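
A rough sketch of the single-block idea (the flatten helper, the row-major layout, and the flat kernel are my assumptions, not the original code):

    #include <cstring>
    #include <cuda_runtime.h>

    // Pack the int** rows into one contiguous buffer, so a single
    // cudaMemcpy replaces 'width' separate small copies.
    void copyMatrixToDevice(int **hostRows, int *h_flat, int *d_flat,
                            int width, int height)
    {
        for (int i = 0; i < width; i++)
            memcpy(h_flat + (size_t)i * height, hostRows[i],
                   height * sizeof(int));

        cudaMemcpy(d_flat, h_flat, (size_t)width * height * sizeof(int),
                   cudaMemcpyHostToDevice);
    }

    // The kernel then indexes the flat buffer instead of chasing
    // device pointers: hostRows[i][j] becomes flat[i * height + j].
    __global__ void gpu_shiftSeam_flat(const int *mc, int *out, const int *seam,
                                       int width, int height)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int j = blockIdx.y * blockDim.y + threadIdx.y;
        if (i < width && j < height)
            out[i * height + j] = mc[i * height + j]; // placeholder for the real shift
    }

For the overlap part: allocate h_flat with cudaMallocHost() (pinned memory), issue the copy with cudaMemcpyAsync() on a stream, and prepare the next block on the CPU while the current kernel runs.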


#3
Posted 05/01/2012 03:36 PM   