I am currently learning how to use cuda streams and I am trying to create a simple example. In this example I have a simple kernel that squares the elements in an array. I also created two streams and an input array for each stream. Then on each stream I copy the data to the device and launch the kernel. Here is my code
#include <stdio.h>
__global__ void square_array(float *a, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N)
a[idx] = a[idx] * a[idx];
}
int main(void) {
// Pointer to host & device arrays
float *h_a1, *d_a1;
float *h_a2, *d_a2;
// Number of elements in arrays
const int N = 100000;
int block_size = 4;
int n_blocks = N / block_size + (N % block_size == 0 ? 0 : 1);
cudaStream_t s1;
cudaStreamCreate(&s1);
cudaStream_t s2;
cudaStreamCreate(&s2);
// Allocate array on host
h_a1 = (float *) malloc(N * sizeof(float));
h_a2 = (float *) malloc(N * sizeof(float));
// Allocate array on device using pinned memory
cudaMallocHost((void **) &d_a1, N * sizeof(float));
cudaMallocHost((void **) &d_a2, N * sizeof(float));
// Initialize host array and copy it to CUDA device
for (int i = 0; i < N; i++) {
h_a1[i] = (float) i;
h_a2[i] = (float) i;
}
cudaMemcpyAsync(d_a1, h_a1, N * sizeof(float), cudaMemcpyHostToDevice, s1);
cudaMemcpyAsync(d_a2, h_a2, N * sizeof(float), cudaMemcpyHostToDevice, s2);
// Do calculation on device:
square_array<<<n_blocks, block_size, 0, s1>>>(d_a1, N);
square_array<<<n_blocks, block_size, 0, s2>>>(d_a2, N);
// Retrieve result from device and store it in host array
cudaMemcpyAsync(h_a1, d_a1, sizeof(float) * N, cudaMemcpyDeviceToHost, s1);
cudaMemcpyAsync(h_a2, d_a2, sizeof(float) * N, cudaMemcpyDeviceToHost, s2);
// Cleanup
free(h_a1);
cudaFree(d_a1);
free(h_a2);
cudaFree(d_a2);
cudaStreamDestroy(s1);
cudaStreamDestroy(s2);
}
I am trying to make it so the operations in both streams (i.e. the memory transfers and kernel launches) occur concurrently. However when I profile this code with nvvp I see that the overlap in kernel execution is very minimal. Specifically, the kernel on the first stream runs from 71.93ms to 72.717ms and the kernel from the second stream runs from 72.694ms to 73.473ms. So my question is shouldn’t the execution be overlapping more? If so, is there something wrong with my code?