Hi all,
I'm trying to do matrix multiplication with two GPUs, letting device 0 compute the upper half of matrix C and device 1 compute the lower half of matrix C, using zero-copy (mapped pinned host) memory.
First, I don't know whether we have to use one kernel or two kernels.
Second, how can I control which device computes the upper half and which computes the lower half?
Third, do I have to use cudaMemcpyAsync()?
External Image External Image External Image
Here is what I did:
// Zero-copy (mapped, pinned host memory) setup for running the same kernel on
// device 0 and device 1 against ONE shared set of host matrices.
//
// Fixes relative to the original sketch:
//   * cudaSetDevice() is called before every per-device operation; without it
//     both sections ran on the current device (device 0).
//   * Host-mapping is enabled on both devices up front, before any allocation
//     or launch makes a device active.
//   * The host buffers are allocated once, with cudaHostAllocPortable, so both
//     devices see the same a_h / b_h / c_h (the original allocated them twice
//     and leaked the first set).
//   * b_map / c_map are obtained from b_h / c_h (the original used a_h for all
//     three, so every kernel argument aliased matrix A).
//   * Launch errors are checked and both devices are synchronized before the
//     host touches c_h.

//float *a_h, *b_h, *c_h;        // host buffers (declarations elided in the original post)
//float *a_map, *b_map, *c_map;  // device-side aliases of the host buffers (elided as well)

// Enable mapping of pinned host memory on both devices.
for (int dev = 0; dev < 2; ++dev) {
    cudaGetDeviceProperties(&prop, dev);
    if (!prop.canMapHostMemory)
        exit(0);
    cudaSetDevice(dev);
    cudaSetDeviceFlags(cudaDeviceMapHost);
}

// Allocate the matrices once. Mapped => addressable from the GPU without an
// explicit copy; Portable => pinned/mapped for every CUDA context, not only
// the one that performed the allocation.
cudaSetDevice(0);
cudaHostAlloc(&a_h, nBytes, cudaHostAllocMapped | cudaHostAllocPortable);
cudaHostAlloc(&b_h, nBytes, cudaHostAllocMapped | cudaHostAllocPortable);
cudaHostAlloc(&c_h, nBytes, cudaHostAllocMapped | cudaHostAllocPortable);

// NOTE(review): as in the original, both launches below receive identical
// arguments, so each GPU would compute whatever `kernel` computes for the full
// problem. To have device 0 produce the upper half of C and device 1 the lower
// half, the second launch presumably needs a row offset (e.g. offset a_map and
// c_map by half the rows, with a grid covering half the rows) - this depends
// on the kernel's indexing, which is not shown here; TODO confirm.

//device 0 //
cudaSetDevice(0);
// Device pointers are queried per device; each one must come from its own
// host buffer.
cudaHostGetDevicePointer(&a_map, a_h, 0);
cudaHostGetDevicePointer(&b_map, b_h, 0);
cudaHostGetDevicePointer(&c_map, c_h, 0);
// Kernel launches are asynchronous, so device 0 keeps working while the host
// goes on to set up and launch on device 1.
kernel<<<gridSize, blockSize>>>(a_map, b_map, c_map);
if (cudaGetLastError() != cudaSuccess)
    exit(1);

//device 1//
cudaSetDevice(1);
// Kernel arguments are copied at launch time, so reusing the *_map variables
// here does not disturb the kernel already running on device 0.
cudaHostGetDevicePointer(&a_map, a_h, 0);
cudaHostGetDevicePointer(&b_map, b_h, 0);
cudaHostGetDevicePointer(&c_map, c_h, 0);
kernel<<<gridSize, blockSize>>>(a_map, b_map, c_map);
if (cudaGetLastError() != cudaSuccess)
    exit(1);

// With zero-copy memory no cudaMemcpy/cudaMemcpyAsync is needed, but the host
// must wait for both devices before reading c_h.
for (int dev = 0; dev < 2; ++dev) {
    cudaSetDevice(dev);
    if (cudaDeviceSynchronize() != cudaSuccess)
        exit(1);
}
Looking forward to some help.
Thanks