Concurrent Kernels
I'm running the example from the SDK, and in some cases the kernels do not run concurrently.

#include <cuda_runtime.h>

__global__ void clock_block0(unsigned long count)
{
    // busy-wait loop: spin until the counter reaches 'count'
    unsigned long c = 0;

    while (c < count) {
        c = c * 1.0;
        c = c + 1.0;
    }
}

__global__ void clock_block1(unsigned long count)
{
    // identical busy-wait loop in a second kernel
    unsigned long c = 0;

    while (c < count) {
        c = c * 1.0;
        c = c + 1.0;
    }
}

int main(int argc, char **argv)
{
    cudaError_t cudaStatus;
    int nkernels = 6;                               // number of concurrent kernels
    int nstreams = nkernels + 1;                    // use one more stream than concurrent kernels
    int nbytes = nkernels * sizeof(unsigned long);  // number of data bytes
    float kernel_time = 10;                         // time the kernel should run in ms
    float elapsed_time;                             // timing variables
    int cuda_device = 0;

    cudaStream_t streams[2];
    cudaStreamCreate(&streams[0]);
    cudaStreamCreate(&streams[1]);

    // allocate host memory
    unsigned long *a = 0;                           // pointer to the array data in host memory
    cudaMallocHost((void **)&a, nbytes);

    // allocate device memory
    unsigned long *d_a = 0;                         // pointer to data and init value in device memory
    cudaMalloc((void **)&d_a, nbytes);

    cudaMemcpyAsync(d_a, a, sizeof(unsigned long), cudaMemcpyHostToDevice, streams[0]);

    cudaStreamSynchronize(streams[0]);

    int i = 0;
    while (i < nkernels) {
        // spin until at least one of the two streams is no longer busy
        while ((cudaStatus = cudaStreamQuery(streams[0])) == cudaErrorNotReady &&
               (cudaStatus = cudaStreamQuery(streams[1])) == cudaErrorNotReady);

        cudaStatus = cudaStreamQuery(streams[0]);
        if (cudaStatus == cudaSuccess) {
            clock_block0<<<3, 768, 0, streams[0]>>>(100000);
            i++;
        }

        cudaStatus = cudaStreamQuery(streams[1]);
        if (cudaStatus == cudaSuccess) {
            clock_block1<<<1, 768, 0, streams[1]>>>(100000);
            i++;
        }
    }

    cudaMemcpy(a, d_a, sizeof(unsigned long), cudaMemcpyDeviceToHost);

    // release resources
    for (int i = 0; i < 2; i++)
        cudaStreamDestroy(streams[i]);

    cudaFreeHost(a);
    cudaFree(d_a);

    cudaDeviceReset();
}


[Image: http://i.stack.imgur.com/hWEFs.jpg]

Now, when I change the code:

cudaStatus = cudaStreamQuery(streams[0]);
if (cudaStatus == cudaSuccess) {
    clock_block0<<<3, 768, 0, streams[0]>>>(100000);
    i++;
}

cudaStatus = cudaStreamQuery(streams[1]);
if (cudaStatus == cudaSuccess) {
    clock_block1<<<1, 768, 0, streams[1]>>>(200000);
    i++;
}


[Image: http://i.stack.imgur.com/AH4Q8.jpg]

Only the first two kernel launches run concurrently.

My GPU is compute capability 2.1 and has 2 SMs,
so up to 4 blocks of 768 threads can be resident at once for this kernel.

Does anyone know why this happens?
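For reference, here is a minimal sketch (not part of the original sample) that prints the device limits behind that estimate; on a compute capability 2.1 device, cudaGetDeviceProperties reports 1536 resident threads per SM, i.e. two 768-thread blocks per SM and four blocks across both SMs:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);               // device 0, as in the code above

    printf("SMs: %d\n", prop.multiProcessorCount);   // 2 on this GPU
    printf("Max resident threads per SM: %d\n",
           prop.maxThreadsPerMultiProcessor);        // 1536 on compute capability 2.x
    printf("Concurrent kernels supported: %d\n",
           prop.concurrentKernels);

    // 1536 / 768 = 2 blocks of 768 threads per SM, times 2 SMs = 4 resident blocks
    return 0;
}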

#1
Posted 04/02/2013 04:13 PM   
We've just solved a concurrency problem in another thread on this forum.

It was the work queues of the WDDM drivers in Vista, Windows 7 and 8, causing kernel launches to get delayed. There we inserted a cudaStreamQuery(streams[x]) right after the kernel launches to force the work queue to be flushed and the kernel launch to begin immediately. Try this here, maybe?

Christian

#2
Posted 04/02/2013 04:12 PM   
Hello Christian,

Thank you for your help, but it doesn't work.

I saw the post in https://devtalk.nvidia.com/default/topic/538148/cuda-programming-and-performance/overlapping-cpu-and-gpu-operations-using-streams-total-failure-any-help-/post/3775423/#3775423

Modifying the code to:

cudaStatus = cudaStreamQuery(streams[0]);
if (cudaStatus == cudaSuccess) {
    clock_block0<<<3, 768, 0, streams[0]>>>(100000);
    cudaStreamQuery(streams[0]);   // flush the launch immediately
    i++;
}

cudaStatus = cudaStreamQuery(streams[1]);
if (cudaStatus == cudaSuccess) {
    clock_block1<<<1, 768, 0, streams[1]>>>(200000);
    cudaStreamQuery(streams[1]);   // flush the launch immediately
    i++;
}


I get the following result:
[Image: http://i.stack.imgur.com/xZMzt.jpg]

#3
Posted 04/02/2013 04:33 PM   
On Windows operating systems using the WDDM driver model (Vista, 7, and 8), work is submitted to the GPU in command buffers, and only work in the same command buffer can execute concurrently. cudaStreamQuery(stream) flushes the CUDA user-mode work queue, and each flush results in a WDDM KMD command buffer. You only want to flush the user-mode work queue when you need to do additional CPU processing; calling a *Synchronize function or any synchronous command will also flush it.

Your example should result in concurrent execution on Windows XP, Linux, Mac OS X, and under the Windows Tesla Compute Cluster (TCC) driver.
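A minimal sketch of that advice, reusing the kernels, streams, and nkernels from the code above: submit all launches back to back without querying the streams in between, and synchronize only once after everything is enqueued, so the launches can end up in the same command buffer.

// Sketch only: submit all work first, flush/synchronize once afterwards.
// clock_block0, clock_block1, streams[] and nkernels come from the code above.
for (int k = 0; k < nkernels; k += 2) {
    clock_block0<<<3, 768, 0, streams[0]>>>(100000);
    clock_block1<<<1, 768, 0, streams[1]>>>(100000);
}

// A single synchronize (or any other synchronous call) flushes the user-mode
// work queue; no per-launch cudaStreamQuery() calls are needed.
cudaDeviceSynchronize();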

#4
Posted 04/02/2013 11:48 PM   