I am trying to get a deeper understanding of kernel concurrency, so I have created a very simple program to test it:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
__global__ void K1()
{
printf("Hello from K1!\n");
}
__global__ void K2()
{
printf("Hello from K2!\n");
}
int main()
{
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
for (int i = 0; i < 10; i++)
K1<<<1, 1, 0, stream1>>>();
for (int i = 0; i < 10; i++)
K2<<<1, 1, 0, stream2>>>();
for (int i = 0; i < 10; i++) {
K1<<<1, 1, 0, stream1>>>();
K2<<<1, 1, 0, stream2>>>();
}
for (int i = 0; i < 10; i++)
K2<<<1, 1, 0, stream2>>>();
for (int i = 0; i < 10; i++)
K1<<<1, 1, 0, stream1>>>();
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaDeviceReset();
return 0;
}
The output from the profiler is shown in the attached image.
Now, my question is: is this really expected behavior? Is it not odd that the execution of kernel K2 has to wait until almost all of the K1 launches have been drained from the submit queue? Wouldn't it be logical for CUDA to recognize that kernel K2 is in a different stream and start executing it immediately?
Indeed, I also achieved complete concurrency when compiling and running on a device that supports cc5.2. My first experiment was on cc3.0 (Tesla K10). I would dare to say that this is a pretty serious limitation for devices with compute capability below 3.5.