Hi,
As per documentation from this link cuBLAS :: CUDA Toolkit Documentation,
cublasGemmEx() is not working for INT8 matrix multiplications.
It says:
“For CUDA_R_32I computation type the matrix types combinations supported by cublasGemmEx are listed below. This path is only supported with alpha, beta being either 1 or 0; A, B being 32-bit aligned; and lda, ldb being multiples of 4.” and ““the combination of the parameters Atype, Btype and Ctype and the algorithm type, algo is not supported””
I am getting below error:
CUBLAS_STATUS_NOT_SUPPORTED
Attached my code below: don’t know what’s wrong. It’S not working according to documentation.
I checked for all the algos, but it doesnt work.
My cublas compilation is also correct picking cublas from cuda-8.0 :
nvcc -arch=sm_61 -o cublas cublas.cu -L /usr/local/cuda-8.0/lib64/ -lcublas
==========================================================================================================
#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_USE_INT128
#include <thrust/device_vector.h>
#include <cublas_v2.h>
#include
// C-style indexing
int ci(int row, int column, int nColumns) {
return row*nColumns+column;
}
int main(void)
{
cudaSetDevice(3);
int rowD = 40 ; // number of rows of D
int colD = 40; // number of columns of D
int rowE = colD; // number of rows of E
int colE = 40; // number of columns of E
int rowF = rowD;
int colF = colE;
// initialize data
thrust::device_vector D(rowD * colD);
thrust::device_vector E(rowE * colE);
thrust::device_vector F(rowF * colF);
for (size_t i = 0; i < rowD; i++){
for (size_t j = 0; j < colD; j++){
D[ci(i,j,colD)]=(i+j) ;
// std::cout << D[ci(i,j,colD)] << " ";
}
//std::cout << “\n”;
}
for (size_t i = 0; i < rowE; i++){
for (size_t j = 0; j < colE; j++){
E[ci(i,j,colE)]=(i+j);
//std::cout << E[ci(i,j,colE)] << " ";
}
//std::cout << “\n”;
}
for (size_t i = 0; i < rowF; i++)
for (size_t j = 0; j < colF; j++)
F[ci(i,j,colF)]=0;
cublasHandle_t handle;
/* Initialize CUBLAS */
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << “!!! CUBLAS initialization error\n”;
}
float alpha = 1.0f;float beta = 0.0f;
#if 0
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
colE, rowD, colD,
&alpha, thrust::raw_pointer_cast(&E[0]), colE,
thrust::raw_pointer_cast(&D[0]), colD,
&beta, thrust::raw_pointer_cast(&F[0]), colE);// colE x rowD
#endif
status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
colE, rowD, colD,
&alpha, thrust::raw_pointer_cast(&E[0]), CUDA_R_8I ,colE,
thrust::raw_pointer_cast(&D[0]), CUDA_R_8I ,colD,
&beta, thrust::raw_pointer_cast(&F[0]), CUDA_R_32I ,colE, CUDA_R_32I,CUBLAS_GEMM_ALGO0);// colE x rowD
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << “!!! 0 kernel execution error.\n” << status << std::endl;
}
#if 0
for (size_t i = 0; i < rowF; i++){
for (size_t j = 0; j < colF; j++){
std::cout << F[ci(i,j,colF)] << " ";
}
#endif
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << “!!! shutdown error (A)\n”;
}
return 0;
}