Below are two snippets of code that perform the same task. The kernel takes an input array, adds a constant to each element, and writes the result to an output array.
Could someone explain to me the differences between these two snippets of code?
The first one is significantly faster than the second snippet.
Could this be because the first snippet uses pinned host memory that is mapped into the GPU's address space (zero-copy), rather than explicitly copying the data to the device?
If you see/know something, please let me know!
Thanks, BHa
// Zero-copy (mapped pinned) variant: the buffers live in HOST RAM, pinned and
// mapped into the device's address space via cudaHostAllocMapped. The kernel
// accesses them directly over the PCIe bus, so no explicit cudaMemcpy is needed
// — but every kernel access pays bus latency instead of device-memory latency.
float *inputA = NULL;
float *outputA = NULL;
// NOTE(review): on pre-UVA platforms this additionally requires
// cudaSetDeviceFlags(cudaDeviceMapHost) before context creation and
// cudaHostGetDevicePointer() to obtain the device-side alias of each buffer;
// passing the host pointer straight to the kernel only works under UVA
// (64-bit host, compute capability >= 2.0) — confirm the target platform.
// NOTE(review): return codes of all CUDA API calls should be checked in
// production code (e.g. via a CUDA_CHECK macro); omitted here to keep the
// snippet minimal.
cudaHostAlloc((void**)&inputA, dataSize, cudaHostAllocMapped);
cudaHostAlloc((void**)&outputA, dataSize, cudaHostAllocMapped);
add<<<numBlocks, blockSize>>>(inputA, outputA);
// Kernel launches are asynchronous. Without this barrier the buffers below
// could be freed while the kernel is still reading/writing them
// (use-after-free), and the host would never reliably observe the results
// in outputA. This sync is also what makes any host-side timing of the
// kernel meaningful.
cudaDeviceSynchronize();
cudaFreeHost(inputA);
cudaFreeHost(outputA);
// Explicit-copy variant: pinned host staging buffers (cudaMallocHost) plus
// separate device buffers, with an H2D copy, the kernel launch, and a D2H
// copy. The pinned allocations are what allow cudaMemcpyAsync to be truly
// asynchronous; the cost relative to the zero-copy variant is the two bulk
// transfers and the two extra device allocations.
float *inputA = NULL;
float *devA = NULL;
float *outputA = NULL;
float *devOutA = NULL;
// NOTE(review): all CUDA API return codes should be checked (CUDA_CHECK
// macro or equivalent); omitted here to keep the snippet minimal.
cudaMallocHost(&inputA, dataSize);   // pinned host memory
cudaMalloc(&devA, dataSize);
cudaMallocHost(&outputA, dataSize);  // pinned host memory
cudaMalloc(&devOutA, dataSize);
// Explicit direction instead of cudaMemcpyDefault, for clarity and for
// consistency with the device-to-host copy below (cudaMemcpyDefault only
// infers the direction correctly under UVA).
cudaMemcpyAsync(devA, inputA, dataSize, cudaMemcpyHostToDevice);
add<<<numBlocks, blockSize>>>(devA, devOutA);
cudaMemcpyAsync(outputA, devOutA, dataSize, cudaMemcpyDeviceToHost);
// The async copies and the kernel launch are merely enqueued on the default
// stream. Without this barrier, outputA may be freed (or read by the host)
// before the device-to-host copy has completed — a race / use-after-free.
cudaDeviceSynchronize();
cudaFreeHost(inputA);
cudaFree(devA);
cudaFreeHost(outputA);
cudaFree(devOutA);