I’m trying to do matrix multiplication A=W*B using shared memory. My code is below:
// Tiled matrix multiplication A = W * B using shared memory.
//
// All three matrices are addressed as flat row-major arrays:
//   W is Wrows x Wcols, B is Wcols x Bcols, A is Wrows x Bcols.
//
// Each thread accumulates one element A[row][col]. The shared dimension
// (Wcols) is walked in tiles of MAXBLOCKSIZE; for every tile each thread
// stages one element of W and one element of B into shared memory, and the
// block then multiplies the two staged tiles.
//
// Launch requirements:
//   - 2D grid / 2D block; row is derived from the y index, col from the x index.
//   - NOTE(review): the tile arrays are written at [threadIdx.y][threadIdx.x]
//     and read over the full 0..MAXBLOCKSIZE-1 range, so the kernel appears to
//     require blockDim.x == blockDim.y == MAXBLOCKSIZE -- confirm at the
//     launch site.
//   - Static shared memory: 2 * MAXBLOCKSIZE * MAXBLOCKSIZE floats per block.
__global__ void forward (float* B, float* A, float* W, int Wcols, int Wrows, int Bcols){
    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    const int col = blockDim.x*blockIdx.x + threadIdx.x;

    __shared__ float _B[MAXBLOCKSIZE][MAXBLOCKSIZE];
    __shared__ float _W[MAXBLOCKSIZE][MAXBLOCKSIZE];

    // Ceil-division: number of tiles needed to cover the shared dimension.
    const int numTiles = (MAXBLOCKSIZE + Wcols - 1)/MAXBLOCKSIZE;

    float c = 0.0f;
    for (int i = 0; i < numTiles; i++){
        // Column of W / row of B that this thread stages for tile i.
        const int wCol = i*MAXBLOCKSIZE + threadIdx.x;
        const int bRow = i*MAXBLOCKSIZE + threadIdx.y;

        // Out-of-range positions are zero-filled so that partial tiles add
        // nothing to the dot product. Offsets are widened to size_t so that
        // large matrices do not overflow 32-bit int arithmetic.
        if (wCol < Wcols && row < Wrows)
            _W[threadIdx.y][threadIdx.x] = W[(size_t)row*Wcols + wCol];
        else
            _W[threadIdx.y][threadIdx.x] = 0.0f;

        if (bRow < Wcols && col < Bcols)
            _B[threadIdx.y][threadIdx.x] = B[(size_t)bRow*Bcols + col];
        else
            _B[threadIdx.y][threadIdx.x] = 0.0f;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        // Multiply the staged tiles. The B operand is read from the shared
        // tile _B, not from the global pointer B.
        for (int j = 0; j < MAXBLOCKSIZE; j++){
            c += _W[threadIdx.y][j]*_B[j][threadIdx.x];
        }

        // Do not let the next iteration overwrite tiles that are still in use.
        __syncthreads();
    }

    // Threads that fall outside A took part in loading and in the barriers
    // above, but must not store a result.
    if (row < Wrows && col < Bcols){
        A[(size_t)row*Bcols + col] = c;
    }
}
The problem is that my code is faster when I use 16x16 blocks and slower when I use 32x32 blocks. Why does this happen, given that using 16x16 blocks also creates 16-way bank conflicts when accessing _W[threadIdx.y][j]?