CUDA GPU slower than CPU

Hello, I am having trouble figuring out why my CUDA code runs slower than my CPU code.

My desktop configuration is an i7-2600S with a GeForce 560 Ti,

and my code is as follows:

int** kernel_shiftSeam(int **MCEnergyMat, int **newE, int *seam, int width, int height, int direction)
{
    // time measurement
    float elapsed_time_ms = 0;
    cudaEvent_t start, stop;

    // threads per block
    dim3 threads(16, 16);
    // blocks per grid
    dim3 blocks((width + threads.x - 1) / threads.x, (height + threads.y - 1) / threads.y);

    int *device_Seam;
    int *host_Seam;

    // copy the seam into a host buffer; its length depends on the seam direction
    int seamSize;
    if (direction == 1)
    {
        seamSize = height * sizeof(int);
        host_Seam = (int*)malloc(seamSize);
        for (int i = 0; i < height; i++)
            host_Seam[i] = seam[i];
    }
    else
    {
        seamSize = width * sizeof(int);
        host_Seam = (int*)malloc(seamSize);
        for (int i = 0; i < width; i++)
            host_Seam[i] = seam[i];
    }

    cudaMalloc((void**)&device_Seam, seamSize);
    cudaMemcpy(device_Seam, host_Seam, seamSize, cudaMemcpyHostToDevice);

    // global_host_MC, new_host_MC, global_MC, global_MC2, new_MC, new_MC2 and
    // execTime are globals set up elsewhere in the program
    global_host_MC = MCEnergyMat;
    new_host_MC = newE;

    // copy host arrays to device, one row at a time
    cudaMemcpy(global_MC, global_MC2, sizeof(int*) * width, cudaMemcpyHostToDevice);
    for (int i = 0; i < width; i++)
        cudaMemcpy(global_MC2[i], global_host_MC[i], sizeof(int) * height, cudaMemcpyHostToDevice);

    cudaMemcpy(new_MC, new_MC2, sizeof(int*) * width, cudaMemcpyHostToDevice);
    for (int i = 0; i < width; i++)
        cudaMemcpy(new_MC2[i], new_host_MC[i], sizeof(int) * height, cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // do some operations on the 2d matrix
    gpu_shiftSeam<<< blocks, threads >>>(global_MC, new_MC, device_Seam, width, height);

    // measure end time for the kernel
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time_ms, start, stop);

    execTime += elapsed_time_ms;

    // copy the result back to the host (RESULT), one row at a time
    for (int i = 0; i < width; i++)
    {
        cudaMemcpy(newE[i], new_MC2[i], sizeof(int) * height, cudaMemcpyDeviceToHost);
    }

    // release per-call resources so repeated calls don't leak
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(device_Seam);
    free(host_Seam);

    return newE;
}

I looped it 800 times and got the following results:

GPU
Computation time (the gpu_shiftSeam part): 1176 ms
Total program run time: 22 s

CPU
Computation time (same operation as gpu_shiftSeam, but on the host): 12522 ms
Total program run time: 12 s

Apparently the GPU computation time is way shorter than the CPU's, but for some reason the total program run time for the GPU version is a lot longer. Does anyone know why? Is it because the number of threads/blocks I am assigning is incorrect? Or is the "slowness" coming from allocating memory on the device?

Thanks a lot!

There is overhead from "booting" the card and from the memory allocations. Try measuring the total execution time without the shift part, just to see how much those account for.
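
A minimal sketch of how you could time the setup phase (allocations plus copies) separately from the kernel; the buffer sizes and the placement of the timers here are assumptions for illustration, not your actual code:

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
    using clock = std::chrono::high_resolution_clock;
    const int width = 512, height = 512;          // placeholder sizes
    const size_t bytes = (size_t)width * height * sizeof(int);

    auto t0 = clock::now();

    // setup phase: device allocation + host-to-device copy
    int *h_buf = (int*)calloc((size_t)width * height, sizeof(int));
    int *d_buf;
    cudaMalloc((void**)&d_buf, bytes);
    cudaMemcpy(d_buf, h_buf, bytes, cudaMemcpyHostToDevice);

    auto t1 = clock::now();

    // ... kernel launch would go here ...
    cudaDeviceSynchronize();

    auto t2 = clock::now();

    printf("setup:  %f ms\n", std::chrono::duration<double, std::milli>(t1 - t0).count());
    printf("kernel: %f ms\n", std::chrono::duration<double, std::milli>(t2 - t1).count());

    cudaFree(d_buf);
    free(h_buf);
    return 0;
}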

This is mainly the memory copy operations, and the fact that you don't overlap the memory copies with kernel execution.
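
For reference, overlapping copies with kernel execution generally requires pinned host memory, cudaMemcpyAsync and streams. A rough sketch of the pattern (the kernel and the sizes here are placeholders, not your code):

#include <cstring>
#include <cuda_runtime.h>

__global__ void dummyKernel(int *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1;                       // placeholder work
}

int main()
{
    const int n = 1 << 20;
    const int half = n / 2;

    int *h_data;                                   // pinned host memory is required for async copies
    cudaMallocHost((void**)&h_data, n * sizeof(int));
    memset(h_data, 0, n * sizeof(int));
    int *d_data;
    cudaMalloc((void**)&d_data, n * sizeof(int));

    cudaStream_t s0, s1;
    cudaStreamCreate(&s0);
    cudaStreamCreate(&s1);

    // while the first half is being processed, the second half can still be copying
    cudaMemcpyAsync(d_data,        h_data,        half * sizeof(int), cudaMemcpyHostToDevice, s0);
    dummyKernel<<<(half + 255) / 256, 256, 0, s0>>>(d_data, half);

    cudaMemcpyAsync(d_data + half, h_data + half, half * sizeof(int), cudaMemcpyHostToDevice, s1);
    dummyKernel<<<(half + 255) / 256, 256, 0, s1>>>(d_data + half, half);

    cudaDeviceSynchronize();

    cudaStreamDestroy(s0);
    cudaStreamDestroy(s1);
    cudaFree(d_data);
    cudaFreeHost(h_data);
    return 0;
}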

Each cudaMemcpy also has a large fixed overhead, which dominates when you only copy a small amount of data. I suggest packing everything into one large contiguous block and copying it with a single cudaMemcpy() instead of looping, and preparing the next block on the CPU while the kernel is running on the GPU.
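
As an illustration of the single-block idea applied to a width x height int matrix like yours (the helper name is made up for the example, and error checking is omitted):

#include <cstring>
#include <cstdlib>
#include <cuda_runtime.h>

// hypothetical helper: copies an int** matrix to the device as one flat block
int* copyMatrixToDevice(int **mat, int width, int height)
{
    size_t bytes = (size_t)width * height * sizeof(int);

    // pack the rows into one contiguous host buffer
    int *h_flat = (int*)malloc(bytes);
    for (int i = 0; i < width; i++)
        memcpy(h_flat + (size_t)i * height, mat[i], height * sizeof(int));

    // one allocation + one copy instead of `width` small copies
    int *d_flat;
    cudaMalloc((void**)&d_flat, bytes);
    cudaMemcpy(d_flat, h_flat, bytes, cudaMemcpyHostToDevice);

    free(h_flat);
    return d_flat;   // the kernel then indexes element (i, j) as d_flat[i * height + j]
}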