Overlapping GPU and CPU computation?
Hi,

I am experiencing a problem running CUDA and CPU code concurrently. My understanding (after looking at page 13 of http://www.nvidia.co.../1122_GTC09.pdf ) is that it is possible to launch a CUDA kernel asynchronously and carry out work on the CPU while the kernel is executing. Consider the example below:

Sample.cu
[code]
__global__ void CUDA_Long_Kernel(float* num)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;

for (int i=0; i< 90000; i++)
{
num[y*blockDim.x + x] = 0;

__syncthreads();
}
}

extern "C" void CUDA_Long_Kernel(dim3 threadsPerBlock, dim3 numBlocks, cudaStream_t stream, float* pos)
{

CUDA_Long_Kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(pos);


}
[/code]


main.cpp
[code]
#include <stdio.h>
#include <cuda.h>
#include <cutil.h>
#include <cuda_runtime_api.h>
#include <cutil_inline_drvapi.h>
#include <cutil_inline_runtime.h>
#include <windows.h>


extern "C" void CUDA_Long_Kernel(dim3 threadsPerBlock, dim3 numBlocks, cudaStream_t stream, float* pos);

int main(int argc, char* argv[])
{
cudaStream_t stream;
cudaStreamCreate(&stream);

dim3 threadsPerBlock(16, 16);
dim3 numBlocks(30, 30, 1);
float* pos;
cudaMalloc((void**)&pos, numBlocks.x * numBlocks.y * threadsPerBlock.x * threadsPerBlock.y);

unsigned int hTimer;
cutCreateTimer(&hTimer);
cutResetTimer(hTimer);
cutStartTimer(hTimer);

//GPU work - (this takes around 530ms on my machine)
CUDA_Long_Kernel(threadsPerBlock, numBlocks, stream, pos);


//CPU work - just waste some time (this takes around 93ms on my machine)
int x = 0;
for (int i=0; i< 5000000; i++)
{
x =pow(x,(double)2);
}



cudaThreadSynchronize();

printf("Processing time: %f msec - %i\n", cutGetTimerValue(hTimer), x);
cutDeleteTimer(hTimer);

cudaFree(pos);

cudaStreamDestroy(stream);
getchar();
}
[/code]


If I execute the CPU work alone it takes about 93 ms, and the GPU work alone takes around 530 ms (on a Core i7 930 with a GeForce GTX 470). When I run the example above, the total runtime is about 630 ms. Shouldn't the asynchronously launched GPU kernel hide the work being carried out on the CPU? I.e., I would expect the total runtime to be somewhere around 530 ms, not the sum of the two. Can anyone please shed some light on why this is happening?

Thanks

Steven
Hi,



I am experiencing a problem running CUDA and CPU code concurrently. My understanding (after looking at page 13 of http://www.nvidia.co.../1122_GTC09.pdf ) is that it is possible to launch a CUDA kernel asynchronously and carry out work on the CPU while the kernel is executing. Consider the example below:



Sample.cu



__global__ void CUDA_Long_Kernel(float* num)

{

int x = blockIdx.x * blockDim.x + threadIdx.x;

int y = blockIdx.y * blockDim.y + threadIdx.y;



for (int i=0; i< 90000; i++)

{

num[y*blockDim.x + x] = 0;



__syncthreads();

}

}



extern "C" void CUDA_Long_Kernel(dim3 threadsPerBlock, dim3 numBlocks, cudaStream_t stream, float* pos)

{



CUDA_Long_Kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(pos);





}






main.cpp



#include <stdio.h>

#include <cuda.h>

#include <cutil.h>

#include <cuda_runtime_api.h>

#include <cutil_inline_drvapi.h>

#include <cutil_inline_runtime.h>

#include <windows.h>





extern "C" void CUDA_Long_Kernel(dim3 threadsPerBlock, dim3 numBlocks, cudaStream_t stream, float* pos);



int main(int argc, char* argv[])

{

cudaStream_t stream;

cudaStreamCreate(&stream);



dim3 threadsPerBlock(16, 16);

dim3 numBlocks(30, 30, 1);

float* pos;

cudaMalloc((void**)&pos, numBlocks.x * numBlocks.y * threadsPerBlock.x * threadsPerBlock.y);



unsigned int hTimer;

cutCreateTimer(&hTimer);

cutResetTimer(hTimer);

cutStartTimer(hTimer);



//GPU work - (this takes around 530ms on my machine)

CUDA_Long_Kernel(threadsPerBlock, numBlocks, stream, pos);





//CPU work - just waste some time (this takes around 93ms on my machine)

int x = 0;

for (int i=0; i< 5000000; i++)

{

x =pow(x,(double)2);

}







cudaThreadSynchronize();



printf("Processing time: %f msec - %i\n", cutGetTimerValue(hTimer), x);

cutDeleteTimer(hTimer);



cudaFree(pos);



cudaStreamDestroy(stream);

getchar();

}






If I execute the CPU work alone it takes about 93 ms, and the GPU work alone takes around 530 ms (on a Core i7 930 with a GeForce GTX 470). When I run the example above, the total runtime is about 630 ms. Shouldn't the asynchronously launched GPU kernel hide the work being carried out on the CPU? I.e., I would expect the total runtime to be somewhere around 530 ms, not the sum of the two. Can anyone please shed some light on why this is happening?



Thanks



Steven

#1
Posted 11/15/2010 01:50 PM   
Hi,

I am experiencing a problem running CUDA and CPU code concurrently. My understanding (after looking at page 13 of http://www.nvidia.co.../1122_GTC09.pdf ) is that it is possible to launch a CUDA kernel asynchronously and carry out work on the CPU while the kernel is executing. Consider the example below:

Sample.cu
[code]
__global__ void CUDA_Long_Kernel(float* num)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;

for (int i=0; i< 90000; i++)
{
num[y*blockDim.x + x] = 0;

__syncthreads();
}
}

extern "C" void CUDA_Long_Kernel(dim3 threadsPerBlock, dim3 numBlocks, cudaStream_t stream, float* pos)
{

CUDA_Long_Kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(pos);


}
[/code]


main.cpp
[code]
#include <stdio.h>
#include <cuda.h>
#include <cutil.h>
#include <cuda_runtime_api.h>
#include <cutil_inline_drvapi.h>
#include <cutil_inline_runtime.h>
#include <windows.h>


extern "C" void CUDA_Long_Kernel(dim3 threadsPerBlock, dim3 numBlocks, cudaStream_t stream, float* pos);

int main(int argc, char* argv[])
{
cudaStream_t stream;
cudaStreamCreate(&stream);

dim3 threadsPerBlock(16, 16);
dim3 numBlocks(30, 30, 1);
float* pos;
cudaMalloc((void**)&pos, numBlocks.x * numBlocks.y * threadsPerBlock.x * threadsPerBlock.y);

unsigned int hTimer;
cutCreateTimer(&hTimer);
cutResetTimer(hTimer);
cutStartTimer(hTimer);

//GPU work - (this takes around 530ms on my machine)
CUDA_Long_Kernel(threadsPerBlock, numBlocks, stream, pos);


//CPU work - just waste some time (this takes around 93ms on my machine)
int x = 0;
for (int i=0; i< 5000000; i++)
{
x =pow(x,(double)2);
}



cudaThreadSynchronize();

printf("Processing time: %f msec - %i\n", cutGetTimerValue(hTimer), x);
cutDeleteTimer(hTimer);

cudaFree(pos);

cudaStreamDestroy(stream);
getchar();
}
[/code]


If I execute the CPU work alone it takes about 93 ms, and the GPU work alone takes around 530 ms (on a Core i7 930 with a GeForce GTX 470). When I run the example above, the total runtime is about 630 ms. Shouldn't the asynchronously launched GPU kernel hide the work being carried out on the CPU? I.e., I would expect the total runtime to be somewhere around 530 ms, not the sum of the two. Can anyone please shed some light on why this is happening?

Thanks

Steven
Hi,



I am experiencing a problem running CUDA and CPU code concurrently. My understanding (after looking at page 13 of http://www.nvidia.co.../1122_GTC09.pdf ) is that it is possible to launch a CUDA kernel asynchronously and carry out work on the CPU while the kernel is executing. Consider the example below:



Sample.cu



__global__ void CUDA_Long_Kernel(float* num)

{

int x = blockIdx.x * blockDim.x + threadIdx.x;

int y = blockIdx.y * blockDim.y + threadIdx.y;



for (int i=0; i< 90000; i++)

{

num[y*blockDim.x + x] = 0;



__syncthreads();

}

}



extern "C" void CUDA_Long_Kernel(dim3 threadsPerBlock, dim3 numBlocks, cudaStream_t stream, float* pos)

{



CUDA_Long_Kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(pos);





}






main.cpp



#include <stdio.h>

#include <cuda.h>

#include <cutil.h>

#include <cuda_runtime_api.h>

#include <cutil_inline_drvapi.h>

#include <cutil_inline_runtime.h>

#include <windows.h>





extern "C" void CUDA_Long_Kernel(dim3 threadsPerBlock, dim3 numBlocks, cudaStream_t stream, float* pos);



int main(int argc, char* argv[])

{

cudaStream_t stream;

cudaStreamCreate(&stream);



dim3 threadsPerBlock(16, 16);

dim3 numBlocks(30, 30, 1);

float* pos;

cudaMalloc((void**)&pos, numBlocks.x * numBlocks.y * threadsPerBlock.x * threadsPerBlock.y);



unsigned int hTimer;

cutCreateTimer(&hTimer);

cutResetTimer(hTimer);

cutStartTimer(hTimer);



//GPU work - (this takes around 530ms on my machine)

CUDA_Long_Kernel(threadsPerBlock, numBlocks, stream, pos);





//CPU work - just waste some time (this takes around 93ms on my machine)

int x = 0;

for (int i=0; i< 5000000; i++)

{

x =pow(x,(double)2);

}







cudaThreadSynchronize();



printf("Processing time: %f msec - %i\n", cutGetTimerValue(hTimer), x);

cutDeleteTimer(hTimer);



cudaFree(pos);



cudaStreamDestroy(stream);

getchar();

}






If I execute the CPU work alone it takes about 93 ms, and the GPU work alone takes around 530 ms (on a Core i7 930 with a GeForce GTX 470). When I run the example above, the total runtime is about 630 ms. Shouldn't the asynchronously launched GPU kernel hide the work being carried out on the CPU? I.e., I would expect the total runtime to be somewhere around 530 ms, not the sum of the two. Can anyone please shed some light on why this is happening?



Thanks



Steven

#2
Posted 11/15/2010 01:50 PM   
It is considered bad form to cross post in the web forums since it can accidentally split the discussion in two, depending on which thread people see first. I'd suggest people jump over to the other thread to answer this question:

http://forums.nvidia.com/index.php?showtopic=185713
It is considered bad form to cross post in the web forums since it can accidentally split the discussion in two, depending on which thread people see first. I'd suggest people jump over to the other thread to answer this question:



http://forums.nvidia.com/index.php?showtopic=185713

#3
Posted 11/15/2010 02:00 PM   
It is considered bad form to cross post in the web forums since it can accidentally split the discussion in two, depending on which thread people see first. I'd suggest people jump over to the other thread to answer this question:

http://forums.nvidia.com/index.php?showtopic=185713
It is considered bad form to cross post in the web forums since it can accidentally split the discussion in two, depending on which thread people see first. I'd suggest people jump over to the other thread to answer this question:



http://forums.nvidia.com/index.php?showtopic=185713

#4
Posted 11/15/2010 02:00 PM   
[quote name='seibert' date='15 November 2010 - 02:00 PM' timestamp='1289829605' post='1146849']
It is considered bad form to cross post in the web forums since it can accidentally split the discussion in two, depending on which thread people see first. I'd suggest people jump over to the other thread to answer this question:

http://forums.nvidia.com/index.php?showtopic=185713
[/quote]


Hi, thanks for your reply. I wasn't sure whether the question should go under programming or general CUDA behaviour.
[quote name='seibert' date='15 November 2010 - 02:00 PM' timestamp='1289829605' post='1146849']

It is considered bad form to cross post in the web forums since it can accidentally split the discussion in two, depending on which thread people see first. I'd suggest people jump over to the other thread to answer this question:



http://forums.nvidia.com/index.php?showtopic=185713







Hi, thanks for your reply. I wasn't sure whether the question should go under programming or general CUDA behaviour.

#5
Posted 11/15/2010 02:27 PM   
[quote name='seibert' date='15 November 2010 - 02:00 PM' timestamp='1289829605' post='1146849']
It is considered bad form to cross post in the web forums since it can accidentally split the discussion in two, depending on which thread people see first. I'd suggest people jump over to the other thread to answer this question:

http://forums.nvidia.com/index.php?showtopic=185713
[/quote]


Hi, thanks for your reply. I wasn't sure whether the question should go under programming or general CUDA behaviour.
[quote name='seibert' date='15 November 2010 - 02:00 PM' timestamp='1289829605' post='1146849']

It is considered bad form to cross post in the web forums since it can accidentally split the discussion in two, depending on which thread people see first. I'd suggest people jump over to the other thread to answer this question:



http://forums.nvidia.com/index.php?showtopic=185713







Hi, thanks for your reply. I wasn't sure whether the question should go under programming or general CUDA behaviour.

#6
Posted 11/15/2010 02:27 PM   
Scroll To Top