Hi folks,
In preparation for my bachelor thesis, I've been working on a matrix–matrix multiplication implementation on a multi-GPU basis in order to get some reference times, so I came up with the following code based on the multi-GPU CUDA sample.
I'd really appreciate it if you would take a look and provide any further suggestions.
Best regards
#include "matMulMultiGPU.h"

#include <fstream>
#include <iostream>
#include <vector>
const int MAX_GPU_COUNT = 3;
int main() {
int block_size = 32;
GPUplan plans[MAX_GPU_COUNT];
size_t ptxSize;
char *filename = "./matMul.cu";
dim3 dimsA(8192, 8192, 1);
dim3 dimsB(8192, 8192, 1);
dim3 dimsC(dimsB.x, dimsA.y, 1);
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int size_B = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
unsigned int mem_size_B = sizeof(float) * size_B;
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float* h_mat_a, *h_mat_b, *h_mat_c;
std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);
std::streampos pos = inputFile.tellg();
size_t inputSize = (size_t)pos;
char * memBlock = new char[inputSize + 1];
inputFile.seekg(0, std::ios::beg);
inputFile.read(memBlock, inputSize);
inputFile.close();
memBlock[inputSize] = '\x0';
nvrtcProgram prog;
nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL);
nvrtcCompileProgram(prog, 0, NULL);
nvrtcGetPTXSize(prog, &ptxSize);
char *ptx = (char *)malloc(sizeof(char) * ptxSize);
nvrtcGetPTX(prog, ptx);
nvrtcDestroyProgram(&prog);
CUmodule module;
cuInit(0);
//Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked)
int nrows = dimsA.y / MAX_GPU_COUNT;
int lastrows = dimsA.y - nrows*(MAX_GPU_COUNT - 1);
for (int i = 0; i < MAX_GPU_COUNT; i++)
{
CUDA_ERROR_CHECK(cuDeviceGet(&plans[i].device, i));
CUDA_ERROR_CHECK(cuCtxCreate(&plans[i].context, 0, plans[i].device));
CUDA_ERROR_CHECK(cuCtxSetCurrent(plans[i].context));
CUDA_ERROR_CHECK(cuStreamCreate(&plans[i].stream, CU_STREAM_NON_BLOCKING));
int rows = nrows;
if (i == MAX_GPU_COUNT - 1) {
rows = lastrows;
rows = (rows + block_size - 1);
}
CUDA_ERROR_CHECK(cuMemAlloc(&plans[i].d_mat_a, rows * dimsA.x * sizeof(float)));
CUDA_ERROR_CHECK(cuMemAlloc(&plans[i].d_mat_b, mem_size_B));
CUDA_ERROR_CHECK(cuMemAlloc(&plans[i].d_mat_c, rows * dimsC.x * sizeof(float)));
CUDA_ERROR_CHECK(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_ERROR_CHECK(cuModuleGetFunction(&plans[i].kernel_addr, module, "matrixMul"));
if (i == 0) {
CUDA_ERROR_CHECK(cuMemHostAlloc((void**)&h_mat_a, mem_size_A, CU_MEMHOSTALLOC_PORTABLE));
CUDA_ERROR_CHECK(cuMemHostAlloc((void**)&h_mat_b, mem_size_B, CU_MEMHOSTALLOC_PORTABLE));
CUDA_ERROR_CHECK(cuMemHostAlloc((void**)&h_mat_c, mem_size_C, CU_MEMHOSTALLOC_PORTABLE));
for (int i = 0; i < dimsA.x * dimsA.y; i++) {
h_mat_a[i] = 1.0f;
}
for (int i = 0; i < dimsB.x * dimsB.y; i++) {
h_mat_b[i] = 1.0f;
}
}
}
// Data transfer and kernel execution
for (int i = 0; i < MAX_GPU_COUNT; i++)
{
int rows = nrows;
int grid_y = nrows / threads.y;
if (i == MAX_GPU_COUNT - 1) {
rows = lastrows;
grid_y = (rows + threads.y - 1) / threads.y;
}
int offset = dimsA.x *( nrows * i);
//Set device
CUDA_ERROR_CHECK(cuCtxSetCurrent(plans[i].context));
//Copy input data from CPU
CUDA_ERROR_CHECK(cuMemcpyHtoDAsync(plans[i].d_mat_a, &(h_mat_a[offset]), rows * dimsA.x * sizeof(float), plans[i].stream));
CUDA_ERROR_CHECK(cuMemcpyHtoDAsync(plans[i].d_mat_b, h_mat_b, mem_size_B, plans[i].stream));
void *arr[] = { (void *)&plans[i].d_mat_a, (void *)&plans[i].d_mat_b, (void *)&plans[i].d_mat_c, (void *)&dimsA.x, (void *)&dimsB.x };
CUDA_ERROR_CHECK(cuLaunchKernel(plans[i].kernel_addr,
grid.x, grid_y, grid.z,
threads.x, threads.y, threads.z,
0, plans[i].stream,
&arr[0],
0));
CUDA_ERROR_CHECK(cuMemcpyDtoHAsync(&(h_mat_c[offset]), plans[i].d_mat_c, rows * dimsA.x * sizeof(float), plans[i].stream));
}
for (int i = 0; i < MAX_GPU_COUNT; i++) {
CUDA_ERROR_CHECK(cuCtxSetCurrent(plans[i].context));
CUDA_ERROR_CHECK(cuStreamSynchronize(plans[i].stream));
}
std::cout << "First: " << h_mat_c[0] << std::endl;
std::cout << "Last: " << h_mat_c[dimsC.x * dimsC.y - 1] << std::endl;
//Clean up
for (int i = 0; i < MAX_GPU_COUNT; i++) {
CUDA_ERROR_CHECK(cuMemFree(plans[i].d_mat_a));
CUDA_ERROR_CHECK(cuMemFree(plans[i].d_mat_b));
CUDA_ERROR_CHECK(cuMemFree(plans[i].d_mat_c));
CUDA_ERROR_CHECK(cuStreamDestroy(plans[i].stream));
CUDA_ERROR_CHECK(cuCtxDestroy(plans[i].context));
}
return 0;
}