Hi, I’m implementing matrix-multiplication code from a book.
But I ran into trouble because the size of the matrix could not be made large…
I assume the matrices are square, and A × B = C → [N*N] × [N*N] = [N*N],
and I also use a tile for the block index.
How can I set the matrix size to 1000 × 1000 or larger?
(I use a Titan X, and this GPU supports 1024 threads per block.)
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "book.h"
#include <stdio.h>
#include <stdlib.h>
#define N 128
#define TILE_WIDTH 16
// Computes C = A * B for square width x width matrices stored row-major.
// Expects a 2D launch: one thread per output element, blockDim = (TILE_WIDTH, TILE_WIDTH).
// The bounds guard makes the kernel correct even when width is not a
// multiple of the block size (e.g. width = 1000 with 16x16 blocks).
__global__ void matMulkernel(const int *a, const int *b, int *c, int width){
	// Use blockDim instead of a hard-coded TILE_WIDTH so the kernel is
	// correct for any 2D block shape the host chooses.
	int Row = blockIdx.y * blockDim.y + threadIdx.y;
	int Col = blockIdx.x * blockDim.x + threadIdx.x;
	// Threads in the partial tiles at the right/bottom edges must not
	// read or write out of bounds.
	if (Row < width && Col < width) {
		// Accumulate in int: inputs and output are int, so a float
		// accumulator would silently lose precision for large widths.
		int Pvalue = 0;
		for (int k = 0; k < width; ++k)
			Pvalue += a[Row * width + k] * b[k * width + Col];
		c[Row * width + Col] = Pvalue;
	}
}
int main(){
	// Host buffers go on the HEAP: int a[N*N] on the stack overflows
	// once N reaches ~1000 (1000*1000*4 bytes = ~4 MB per array, far
	// beyond the default stack size). This is what limited the matrix size.
	size_t _size = (size_t)N * N * sizeof(int);
	int *a = (int*)malloc(_size);
	int *b = (int*)malloc(_size);
	int *c = (int*)malloc(_size);
	if (a == NULL || b == NULL || c == NULL) {
		fprintf(stderr, "host allocation failed\n");
		return 1;
	}
	int *dev_a, *dev_b, *dev_c;
	for (int i = 0; i < N * N; i++){
		a[i] = 2;
		b[i] = 3;
		c[i] = 0;
	}
	HANDLE_ERROR(cudaMalloc((void**)&dev_a, _size));
	HANDLE_ERROR(cudaMalloc((void**)&dev_b, _size));
	HANDLE_ERROR(cudaMalloc((void**)&dev_c, _size));
	HANDLE_ERROR(cudaMemcpy(dev_a, a, _size, cudaMemcpyHostToDevice));
	HANDLE_ERROR(cudaMemcpy(dev_b, b, _size, cudaMemcpyHostToDevice));
	// Ceil-division so partial tiles are covered when N is not a
	// multiple of TILE_WIDTH (N/TILE_WIDTH would truncate and skip
	// the right/bottom edges, e.g. N = 1000 with TILE_WIDTH = 16).
	dim3 dimBlocks(TILE_WIDTH, TILE_WIDTH);
	dim3 dimGrids((N + TILE_WIDTH - 1) / TILE_WIDTH,
	              (N + TILE_WIDTH - 1) / TILE_WIDTH);
	matMulkernel<<<dimGrids, dimBlocks>>>(dev_a, dev_b, dev_c, N);
	// Kernel launches don't return an error directly; fetch it explicitly
	// so a bad launch configuration is reported rather than silently
	// producing garbage output.
	HANDLE_ERROR(cudaGetLastError());
	// cudaMemcpy is a synchronizing call, so the kernel has finished
	// (and any async execution error surfaces here) before we read c.
	HANDLE_ERROR(cudaMemcpy(c, dev_c, _size, cudaMemcpyDeviceToHost));
	for (int i = 0; i < N * N; i++)
		printf("%d \n", c[i]);
	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);
	free(a);
	free(b);
	free(c);
	return 0;
}