Hi.
I’m testing a task based runtime which uses Unified Memory to run CUDA kernels. The problem I have is that when adding prefetch operations, the performance degrades a lot.
The problem only appears when I need to run a lot of kernels (hundreds/thousands), when only a few kernels are involved there is no issue. I have tested it on a microbenchmark, which divided an array of data into a number of blocks and runs a kernel for each block. With many small blocks, the performance is horrible.
#define DEVICE 0
// KERNEL
__global__ void kernel(long int n, float* x, float *y)
{
long int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < n) x[i] = x[i] + y[i];
}
void init_data(long int N, float *x, float *y){
for(long int i = 0; i < N; i++){
x[i] = i;
y[i] = i+2;
}
}
// KERNEL CALL
void call_kernels(long int N, long int BS, float *x, float *y){
long int blockSize = 128;
long int gridSize = ceil( BS / blockSize );
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaEvent_t event;
for (long int i=0; i<N; i+=BS ){
cudaMemPrefetchAsync(x, BS*sizeof(float), DEVICE, stream);
cudaMemPrefetchAsync(y, BS*sizeof(float), DEVICE, stream);
kernel<<<gridSize, blockSize>>>(BS, &x[i], &y[i]);
cudaMemPrefetchAsync(x, BS*sizeof(float), cudaCpuDeviceId, stream);
cudaMemPrefetchAsync(y, BS*sizeof(float), cudaCpuDeviceId, stream);
}
cudaDeviceSynchronize();
cudaStreamDestroy(stream);
}
void check_results(long int N, float *x){
for(long int i = 0; i < N; ++i){
if(x[i] != i+i+2){
printf("Error when checking results in position %ld, is %f, should be %ld\n", i, x[i], i+i+2);
return;
}
}
printf("Results are correct\n");
}
//THREAD MAIN
int main(){
long unsigned int size = ceil((long int) 2 * 1024 * 1024 * 1024 / sizeof(float)); //TWO GB of floats
//Data
float *x, *y;
cudaSetDevice(DEVICE);
cudaMallocManaged((void **) &x, size*sizeof(float), cudaMemAttachGlobal);
cudaMallocManaged((void **) &y, size*sizeof(float), cudaMemAttachGlobal);
long int BS = size / 8192;
init_data(size, x, y);
call_kernels(size, BS, x, y);
check_results(size, x);
return NULL;
}
When removing the prefetch calls the program runs well, however, with the prefetch calls the performance degrades for many blocks. Another version of the same program which does not use unified memory and instead uses cudaMemcpy where the cudaMemPrefetchAsyncs are located also runs well.
Is this due to overhead on the prefetch operation or am I doing something wrong?
Also, I am running on a Pascal Titan X.
Thank you!