Hi all,
My real issue is that I found some random error when I try to do a matrix product in cuda. I found that the issue is related to some really strange behavior in the kernel.
For example, the value of threadIdx, BlockDim, are wrong when I print them. Here is an example of the printf from the code below:
idx_element :=31 idx_Chunk_Copy_Smem := 0 ---- threadIdx:(3, 0, 0) blockIdx:(0, 0, 0) blockDim:(0, 0, 32) gridDim:(32, 1, 4) -- zero = 4
As you can see the blockDim is 0 in x and y ! And threadIdx.x = 3 is printed whereas in the code I asked to print only when threadIdx.x == 0 (cf the if statement in line 178 in the code)
Also note the value of zero which is 4 while it is just a constant value initialized to 0 (cf. line 177 in the code).
If I change the value of the variable
// Matrix Info
const INDEX_DEV bugZero = 128;
in line 204 in the code from 128 to 32, then I get the following result
idx_element :=0 idx_Chunk_Copy_Smem := 0 ---- threadIdx:(0, 0, 0) blockIdx:(0, 0, 0) blockDim:(0, 0, 32) gridDim:(32, 1, 1) -- zero = 1
As you can see, the value of zero is now 1 !!
Here is the code. I tried to get rid of some part of the full code in order to just to focus on the printf but then I could not reproduce the behavior anymore so I put the full code here. It’s long but at least you can copy/past it directly and should work.
Here ismy environment:
Windows 10 64 bits
Visual Studio 2017
CUDA 9.1
GTX1080Ti
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include <string>
#include <stdio.h>
#include <iostream>
#include <memory>
#include <typeinfo> // operator typeid
//----------------------------------------------------------------------------------------------------
// Defines Data Types
typedef double REAL;
typedef float FLOAT32;
typedef double FLOAT64;
typedef unsigned int UINT32;
typedef long INT32;
typedef long long int INT64;
typedef unsigned long long int UINT64;
typedef size_t INDEX_CUDA;
typedef size_t INDEX_DEV;
const INDEX_CUDA CONST_BLOCK_DIM_32 = 32;
// Return the error code to the caller on error. To be used wuth all cuda*** methods
#define CUDA_SAFE_CALL(funccall) \
{ \
const cudaError_t err_id = funccall; \
if (err_id != cudaSuccess) \
{ \
__printCudaError(err_id, __FILE__,__LINE__ , "CUDA_SAFE_CALL"); \
return -1; \
} \
}
// Print information on the error
inline void __printCudaError(const cudaError_t err_id, const char * const file, const int line, const char * const funcName)
{
std::cout << std::endl;
std::cout << "---------------------------------------------------------------------------------------------------" << std::endl;
std::cout << funcName << " Runtime API failed." << std::endl;
std::cout << " Reason: " << cudaGetErrorString(err_id) << std::endl;
std::cout << " Error Code: " << err_id << std::endl;
std::cout << " File: " << file << std::endl;
std::cout << " Line: " << static_cast<FLOAT64>(line) << std::endl;
std::cout << "---------------------------------------------------------------------------------------------------" << '\n' << std::endl;
}
//----------------------------------------------------------------------------------------------------
template<class T>
__global__ void MatrixProdPadDynSmem_k(const T * const pA,
const T * const pB,
const INDEX_CUDA SizeColsRes,
const INDEX_CUDA SizeRowsRes,
const INDEX_CUDA SizeColsA_RowsB,
const INDEX_CUDA commonSize_Smem_ColsA_RowsB,
const INDEX_CUDA start_idx_ptr_Smem_B,
T * const oRes)
{
/*========================================================================
* Dynamic Rectangular shared memory (The result oRes is a Matrix C:= A*B)
* ========================================================================
*
* s_PAD_DYN_SMEM_TILE_A is of size : BlockDim.y x commonSize_Smem_ColsA_RowsB
* - We must have BlockDim.y since the number of rows in A that can be handled is exactly this value
* because we want a single thread to handle a single element in C
* - The size of commonSize_Smem_ColsA_RowsB is desirable to be a multiple of warp size (x32) to allow a Coalesced read
* of an entire row in the matrix A
*
* s_PAD_DYN_SMEM_TILE_B is of size : commonSize_Smem_ColsA_RowsB x BlockDim.x
* - We must have BlockDim.x as sizeCols here since the number of cols in B that can be handled is exactly this value
* because we want a single thread to handle a single element in C
* - The size of BlockDim.x is desirable to be a multiple of warp size (x32) to allow a Coalesced read
* of an entire row in the matrix B
*/
extern __shared__ T s_TOTAL_DYN_SMEM_TILE[];
T * const s_PAD_DYN_SMEM_TILE_A = &s_TOTAL_DYN_SMEM_TILE[0];
T * const s_PAD_DYN_SMEM_TILE_B = &s_TOTAL_DYN_SMEM_TILE[start_idx_ptr_Smem_B];
/*----------------------------------------------------------------------------------------------------------------
* STEP 1: One warp performs a coalesced read of a rows from a block of the matrix A and B stored in global memory
------------------------------------------------------------------------------------------------------------------*/
/*- matrix coordinate : (iy, ix) -> (line , Column)
* - global linear memory index : idx = iy * SizeCols + ix
*/
// First step: map the thread and block index to the coordinate of the matrix C (Accessing with Row-major order)
const UINT32 col_idx_Res = threadIdx.x + blockIdx.x * blockDim.x; // Col index
const UINT32 row_idx_Res = threadIdx.y + blockIdx.y * blockDim.y; // Row index
const INDEX_CUDA NB_CHUNK_COPY_SMEM_A_B = SizeColsA_RowsB / commonSize_Smem_ColsA_RowsB;
/*----------------------------------------------------------------------------------------------------------------
* STEP 2: The warp then writes the data into shared memory using row-major ordering.
------------------------------------------------------------------------------------------------------------------*/
if (col_idx_Res < SizeColsRes && row_idx_Res < SizeRowsRes)
{
//========================================================================
// Case of matrix A: the shared memory has the same rows as BlockDim.y
const INDEX_CUDA UnrollingFactorColsSmemA = commonSize_Smem_ColsA_RowsB / blockDim.x;
const INDEX_CUDA SIZE_COLS_SMEM_A = commonSize_Smem_ColsA_RowsB;
const UINT32 STEP_COLS_SINGLE_THREAD(blockDim.x);
// Coordinate in matrix A. Accessing with Row-major order
const UINT32 row_idx_A = threadIdx.y + blockIdx.y * blockDim.y; // Row index
//========================================================================
// Case of matrix B: the shared memory has the same cols as BlockDim.x
const INDEX_CUDA UnrollingFactorRowsSmemB = commonSize_Smem_ColsA_RowsB / blockDim.y;
const INDEX_CUDA SIZE_ROWS_SMEM_B = commonSize_Smem_ColsA_RowsB;
const INDEX_CUDA SIZE_COLS_SMEM_B = blockDim.x;
const UINT32 STEP_ROW_SINGLE_THREAD(blockDim.y);
// Coordinate in matrix B. Accessing with Row-major order
const UINT32 col_idx_B = threadIdx.x + blockIdx.x * blockDim.x; // Col index
// Use double to avoid loss of accuracy
FLOAT64 runningSum(0.0);
// For each Tile/Chunk we copy the data in shared memory
for (INDEX_CUDA idx_Chunk_Copy_Smem = 0; idx_Chunk_Copy_Smem < NB_CHUNK_COPY_SMEM_A_B; ++idx_Chunk_Copy_Smem)
{
//========================================================================
/* Case of matrix A: the shared memory has the same rows as BlockDim.y
* Each 32 consecutive threads in a warp will handle "UnrollingFactorColsSmemA" Cols of Matrix A
*/
for (INDEX_CUDA k = 0; k < UnrollingFactorColsSmemA; ++k)
{
const UINT32 col_idx_A = threadIdx.x + k * STEP_COLS_SINGLE_THREAD + idx_Chunk_Copy_Smem * SIZE_COLS_SMEM_A; // Col index
if (col_idx_A < SizeColsA_RowsB)
{
// linear global memory index for matrix A
const UINT32 idx_A_RowMajor = row_idx_A * SizeColsA_RowsB + col_idx_A;
// load data from global memory to shared memory
s_PAD_DYN_SMEM_TILE_A[threadIdx.y * SIZE_COLS_SMEM_A + (threadIdx.x + k * STEP_COLS_SINGLE_THREAD)] = pA[idx_A_RowMajor];
}
}
//========================================================================
/* Case of matrix B: the shared memory has the same cols as BlockDim.x
* Each 32 consecutive threads in a warp will handle "UnrollingFactorRowsSmemB" Rows of Matrix B
*/
for (INDEX_CUDA k = 0; k < UnrollingFactorRowsSmemB; ++k)
{
const UINT32 row_idx_B = threadIdx.y + k * STEP_ROW_SINGLE_THREAD + idx_Chunk_Copy_Smem * SIZE_ROWS_SMEM_B; // Row index
if (row_idx_B < SizeColsA_RowsB)
{
// linear global memory index for matrix B
const UINT32 idx_B_RowMajor = row_idx_B * SizeColsRes + col_idx_B;
// load data from global memory to shared memory
s_PAD_DYN_SMEM_TILE_B[(threadIdx.y + k * STEP_ROW_SINGLE_THREAD) * SIZE_COLS_SMEM_B + threadIdx.x] = pB[idx_B_RowMajor];
}
}
// thread synchronization
__syncthreads();
// Here we are in a single fixed BLOCK of SMEM handled by a single BlockDim
for (INDEX_CUDA idx_element = 0; idx_element < commonSize_Smem_ColsA_RowsB; ++idx_element)
{
// Load an entire row of A and entire col of B from their respective shared memory
// This line is necessary otherwiose we must initialize the shared memory with zero
if (idx_element + idx_Chunk_Copy_Smem * commonSize_Smem_ColsA_RowsB < SizeColsA_RowsB)
{
const unsigned int zero = 0;
if (threadIdx.y == zero && threadIdx.x == zero && blockIdx.x == zero && blockIdx.y == zero)
{
printf("idx_element :=%d idx_Chunk_Copy_Smem := %d "
"---- threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) blockDim:(%d, %d, %d) "
"gridDim:(%d, %d, %d) -- zero = %d \n", idx_element, idx_Chunk_Copy_Smem,
threadIdx.x, threadIdx.y, threadIdx.z,
blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, blockDim.z,
gridDim.x, gridDim.y, gridDim.z, zero);
}
runningSum += s_PAD_DYN_SMEM_TILE_A[threadIdx.y * commonSize_Smem_ColsA_RowsB + idx_element]
* s_PAD_DYN_SMEM_TILE_B[idx_element * commonSize_Smem_ColsA_RowsB + threadIdx.x];
}
}
}
/* The global index is then
* const INDEX_CUDA idx_byRow = iy * SizeCols + ix;
*/
oRes[row_idx_Res * SizeColsRes + col_idx_Res] = runningSum;
}
}
int main()
{
// Matrix Info
const INDEX_DEV bugZero = 128;
const INDEX_DEV SizeRowsA = bugZero;
const INDEX_DEV SizeColsA = bugZero;
const INDEX_DEV SizeRowsB = SizeColsA;
const INDEX_DEV SizeColsB = bugZero;
const INDEX_DEV SIZE_COLS_Res = SizeColsB;
const INDEX_DEV SIZE_ROWS_Res = SizeRowsA;
const INDEX_DEV SIZE_MATTRIX_A = SizeColsA * SizeRowsA;
const INDEX_DEV SIZE_MATRIX_BYTES_A = SIZE_MATTRIX_A * sizeof(REAL);
const INDEX_DEV SIZE_MATTRIX_B = SizeColsB * SizeRowsB;
const INDEX_DEV SIZE_MATRIX_BYTES_B = SIZE_MATTRIX_B * sizeof(REAL);
const INDEX_DEV SIZE_MATTRIX_Res = SIZE_COLS_Res * SIZE_ROWS_Res;
const INDEX_DEV SIZE_MATRIX_BYTES_Res = SIZE_MATTRIX_Res * sizeof(REAL);
// Exce config
const dim3 BlockDim(CONST_BLOCK_DIM_32, CONST_BLOCK_DIM_32);
const FLOAT64 numBlocks_x = std::ceil(static_cast<double>(SIZE_COLS_Res) / BlockDim.x);
const FLOAT64 numBlocks_y = std::ceil(static_cast<double>(SIZE_ROWS_Res) / BlockDim.y);
const dim3 GridDim(static_cast<UINT32>(numBlocks_x), static_cast<UINT32>(numBlocks_y), 1);
std::cout << "\n ===========================================================================" << std::endl;
std::cout << "Starting the kernel with: " << std::endl;
std::cout << " BlockDim : " << "(" << BlockDim.x << ", " << BlockDim.y << ", " << BlockDim.z << ")" << std::endl;
std::cout << " GridDim : " << "(" << GridDim.x << ", " << GridDim.y << ", " << GridDim.z << ")" << std::endl;
std::cout << " Matrix size A : " << "(" << SizeRowsA << " x " << SizeColsA << ")" << std::endl;
std::cout << " Matrix size B : " << "(" << SizeRowsB << " x " << SizeColsB << ")" << std::endl;
std::cout << " Matrix size C : " << "(" << SIZE_ROWS_Res << " x " << SIZE_COLS_Res << ")" << std::endl;
std::cout << "===========================================================================\n " << std::endl;
// Device pointers
REAL * devPtr_A = nullptr;
REAL * devPtr_B = nullptr;
REAL * devPtr_Res = nullptr;
// Memory allocation on Device
CUDA_SAFE_CALL(cudaMalloc((void **)&devPtr_A, SIZE_MATRIX_BYTES_A));
CUDA_SAFE_CALL(cudaMalloc((void **)&devPtr_B, SIZE_MATRIX_BYTES_B));
CUDA_SAFE_CALL(cudaMalloc((void **)&devPtr_Res, SIZE_MATRIX_BYTES_Res));
// transfer data from host to device
CUDA_SAFE_CALL(cudaMemset(devPtr_A, 0, SIZE_MATRIX_BYTES_Res));
CUDA_SAFE_CALL(cudaMemset(devPtr_B, 0, SIZE_MATRIX_BYTES_Res));
CUDA_SAFE_CALL(cudaMemset(devPtr_Res, 0, SIZE_MATRIX_BYTES_Res));
// Use CONST_BLOCK_DIM_32 for the number of Cols in the shared memory Tile A to allow the access by a warp with 32 consecutive threads
const UINT32 paddingValue = 0; /// GetPaddingValue();
const INDEX_CUDA commonSize_Smem_ColsA_RowsB = CONST_BLOCK_DIM_32;
// Setup the Rectangular Paddded Shared Memory for matrix A as Rows x Cols
const INDEX_CUDA sizeRows_Tile_Smem_A = CONST_BLOCK_DIM_32;
const INDEX_CUDA SIZE_DYN_SMEM_TILE_A = BlockDim.y * (sizeRows_Tile_Smem_A + paddingValue);
const INDEX_CUDA SIZE_DYN_SMEM_TILE_A_BYTES = SIZE_DYN_SMEM_TILE_A * sizeof(REAL);
// Setup the Rectangular Paddded Shared Memory for matrix B as Rows x Cols
const INDEX_CUDA sizeCols_Tile_Smem_B = CONST_BLOCK_DIM_32;
const INDEX_CUDA SIZE_DYN_SMEM_TILE_B_BYTES = (sizeCols_Tile_Smem_B + paddingValue) * BlockDim.x * sizeof(REAL);
// Manually set up two arrays in shared memory.
const INDEX_CUDA start_idx_ptr_Smem_B = SIZE_DYN_SMEM_TILE_A;
const INDEX_CUDA CONST_SMEM_TILE_BYTES = SIZE_DYN_SMEM_TILE_B_BYTES + SIZE_DYN_SMEM_TILE_A_BYTES;
MatrixProdPadDynSmem_k <<< GridDim, BlockDim, CONST_SMEM_TILE_BYTES >> > (devPtr_A, devPtr_B, SIZE_COLS_Res, SIZE_ROWS_Res, SizeColsA, commonSize_Smem_ColsA_RowsB, start_idx_ptr_Smem_B, devPtr_Res);
if (cudaPeekAtLastError() != cudaSuccess)
{
std::cout << "Error in CUDA \n " << std::endl;
}
CUDA_SAFE_CALL(cudaFree(devPtr_A));
CUDA_SAFE_CALL(cudaFree(devPtr_B));
CUDA_SAFE_CALL(cudaFree(devPtr_Res));
devPtr_A = nullptr;
devPtr_B = nullptr;
devPtr_Res = nullptr;
cudaDeviceReset();
return 0;
}
I got the same issue a couple of weeks ago for another kernel but the issue disappeared after 2 days (I did not do anything !!) so I think it is kind of radom behavior …
Can someone reproduce it and if yes, any ideas of what’s wrong here ?
Thanks,
kernel.cu (15.4 KB)