How does a cublasGemmEx() call work with CUDA_R_16F inputs and CUDA_R_32F computeType?

Hi

I have written a small test to see how cublasGemmEx() works.
I first called cublasGemmEx() with CUDA_R_8I inputs to get a CUDA_R_32I output for a matrix multiplication (no bias term). I converted this output from int to float. It works!

Now I want to call the same CUDA API, cublasGemmEx(), but WITH a bias term, so I set beta to 1 along with alpha.
I am multiplying the same inputs as in the first call, but now as CUDA_R_16F. The values they contain are the same as in the CUDA_R_8I case.

My float output already contains the multiplication result of the first call, so I expect the second call to multiply the same inputs, produce the same product, and add it to that output. Basically, after the second call to cublasGemmEx(), every value should be doubled.

But I am getting very small values, almost zeros.
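For reference, here is a minimal host-side sketch of what I expect the two calls to compute. This is my own helper (not from cuBLAS), written only for the square m = k = n, column-major, both-operands-transposed case in my test:

// GEMM semantics as I understand them: C = alpha * op(A) * op(B) + beta * C.
// First call:  beta = 0  ->  C = A^T * B^T
// Second call: beta = 1  ->  C = A^T * B^T + C, i.e. every entry doubles.
void reference_gemm_tt(const float *A, const float *B, float *C,
                       int m, int k, int n, float alpha, float beta) {
  for (int j = 0; j < n; ++j) {      // column of C
    for (int i = 0; i < m; ++i) {    // row of C
      float acc = 0.0f;
      for (int p = 0; p < k; ++p)
        acc += A[p + i * m] * B[j + p * k];  // A^T(i,p) * B^T(p,j)
      C[i + j * m] = alpha * acc + beta * C[i + j * m];
    }
  }
}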

I attach my code here:

File: testCublasAgain.cu

#include <iostream>
#include <cstdint>
#include <ctime>
#include <cublas_v2.h>
#include <curand.h>

void print_matrix(const float *A, int nr_rows_A, int nr_cols_A);
void print_matrix_int16(const int16_t *A, int nr_rows_A, int nr_cols_A);
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on GPU
void GPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) {
  // Create a pseudo-random number generator
  curandGenerator_t prng;
  curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);

  // Set the seed for the random number generator using the system clock
  curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());

  // Fill the array with random numbers on the device
  curandGenerateUniform(prng, A, nr_rows_A * nr_cols_A);
}

void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
  int lda = m, ldb = k, ldc = m;
  const float alf = 1;
  const float bet = 0;
  const float *alpha = &alf;
  const float *beta = &bet;

  cublasHandle_t handle;
  cublasCreate(&handle);

  // Do the actual multiplication
  cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
  //cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha, A, CUDA_R_32F, lda, B, CUDA_R_32F, ldb, beta, C, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT);

  // Destroy the handle
  cublasDestroy(handle);
}
// Kernel to widen the CUDA_R_32I GEMM result to float, element by element.
__global__ void Int2Float(int *in, float *out, int noOfElements)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < noOfElements)
  {
    out[i] = in[i];
  }
}

void gpu_blas_mmul_int8(const int8_t *A, const int8_t *B, int *C, const int m, const int k, const int n) {
  int lda = m, ldb = k, ldc = m;
  const float alf = 1;
  const float bet = 0;
  const int alf1 = 1;
  const int bet1 = 0;
  const float *alpha = &alf;
  const float *beta = &bet;
  const int *alpha1 = &alf1;
  const int *beta1 = &bet1;

  int16_t *h_L = (int16_t *)malloc(m * k * sizeof(int16_t));
  int16_t *h_M = (int16_t *)malloc(k * n * sizeof(int16_t));
  float *h_N = (float *)malloc(m * n * sizeof(float));

  for(int i=0; i<m * k;i++)
          h_L[i]  = i;

  for(int i=0; i<k * n;i++)
          h_M[i]  = i;

 for(int i=0; i<m * n;i++)
          h_N[i]  = 0;

  int16_t *d_L, *d_M;
  float *d_N;
  cudaMalloc(&d_L,m * k * sizeof(int16_t));
  cudaMalloc(&d_M,k * n * sizeof(int16_t));
  cudaMalloc(&d_N,m * n * sizeof(float));

  cudaMemcpy(d_L, h_L, m * k * sizeof(int16_t),cudaMemcpyHostToDevice);
  cudaMemcpy(d_M, h_M, k * n  * sizeof(int16_t),cudaMemcpyHostToDevice);
  std::cout << "h_L =" << std::endl;
  print_matrix_int16(h_L, m, k);
  std::cout << "h_M =" << std::endl;
  print_matrix_int16(h_M, k, n);

    cublasHandle_t handle;
    cublasCreate(&handle);
  // Do the actual multiplication
  //cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
  cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha1, A, CUDA_R_8I, lda, B, CUDA_R_8I,  ldb, beta1, C, CUDA_R_32I, ldc, CUDA_R_32I, CUBLAS_GEMM_DFALT);

  //cudaMemcpy(d_N, C, m * n  * sizeof(int),cudaMemcpyDeviceToDevice);
  Int2Float<<<((m*n) / 1024) + 1, 1024>>>(C, d_N, m*n);
  cudaMemcpy(h_N, d_N, m * n  * sizeof(float),cudaMemcpyDeviceToHost);
  std::cout << "h_N =" << std::endl;
  print_matrix(h_N, m, n);


    cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha1, d_L, CUDA_R_16F, lda, d_M, CUDA_R_16F,  ldb, alpha1, d_N, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT);

  cudaMemcpy(h_N, d_N, m * n  * sizeof(float),cudaMemcpyDeviceToHost);
  std::cout << "h_N Again after adding Bias term =" << std::endl;
  print_matrix(h_N, m, n);

     // Destroy the handle
 cublasDestroy(handle);
 cudaFree(d_L);
 cudaFree(d_M);
 cudaFree(d_N);

 // Free CPU memory
 free(h_L);
 free(h_M);
 free(h_N);

}
//Print matrix A(nr_rows_A, nr_cols_A) storage in column-major format
void print_matrix_int8(const int8_t *A, int nr_rows_A, int nr_cols_A) {

  for(int i = 0; i < nr_rows_A; ++i){
      for(int j = 0; j < nr_cols_A; ++j){
          std::cout << static_cast<int16_t>(A[j * nr_rows_A + i]) << " ";
      }
      std::cout << std::endl;
  }
 std::cout << std::endl;

}

//Print matrix A(nr_rows_A, nr_cols_A) storage in column-major format
void print_matrix_int(const int *A, int nr_rows_A, int nr_cols_A) {

  for(int i = 0; i < nr_rows_A; ++i){
      for(int j = 0; j < nr_cols_A; ++j){
          std::cout << A[j * nr_rows_A + i] << " ";
      }
      std::cout << std::endl;
  }
 std::cout << std::endl;

}

//Print matrix A(nr_rows_A, nr_cols_A) storage in column-major format
void print_matrix_int16(const int16_t *A, int nr_rows_A, int nr_cols_A) {

  for(int i = 0; i < nr_rows_A; ++i){
      for(int j = 0; j < nr_cols_A; ++j){
          std::cout << static_cast<int16_t>(A[j * nr_rows_A + i]) << " ";
      }
      std::cout << std::endl;
  }
 std::cout << std::endl;

}

//Print matrix A(nr_rows_A, nr_cols_A) storage in column-major format
void print_matrix(const float *A, int nr_rows_A, int nr_cols_A) {

  for(int i = 0; i < nr_rows_A; ++i){
      for(int j = 0; j < nr_cols_A; ++j){
          std::cout << A[j * nr_rows_A + i] << " ";
      }
      std::cout << std::endl;
  }
 std::cout << std::endl;

}

int main() {
  // Allocate 3 arrays on CPU
  int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;

  // for simplicity we are going to use square arrays
  nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = 4;

  float *h_A = (float *)malloc(nr_rows_A * nr_cols_A * sizeof(float));
  float *h_B = (float *)malloc(nr_rows_B * nr_cols_B * sizeof(float));
  float *h_C = (float *)malloc(nr_rows_C * nr_cols_C * sizeof(float));

  int8_t *h_P = (int8_t *)malloc(nr_rows_A * nr_cols_A * sizeof(int8_t));
  int8_t *h_Q = (int8_t *)malloc(nr_rows_B * nr_cols_B * sizeof(int8_t));
  int *h_R = (int *)malloc(nr_rows_C * nr_cols_C * sizeof(int));

  // Allocate 3 arrays on GPU
  float *d_A, *d_B, *d_C;
  cudaMalloc(&d_A, nr_rows_A * nr_cols_A * sizeof(float));
  cudaMalloc(&d_B, nr_rows_B * nr_cols_B * sizeof(float));
  cudaMalloc(&d_C, nr_rows_C * nr_cols_C * sizeof(float));

  // Fill the arrays A and B on GPU with random numbers
  //GPU_fill_rand(d_A, nr_rows_A, nr_cols_A);
  //GPU_fill_rand(d_B, nr_rows_B, nr_cols_B);

  int8_t *d_P, *d_Q;
  int *d_R;
  cudaMalloc(&d_P, nr_rows_A * nr_cols_A * sizeof(int8_t));
  cudaMalloc(&d_Q, nr_rows_B * nr_cols_B * sizeof(int8_t));
  cudaMalloc(&d_R, nr_rows_C * nr_cols_C * sizeof(int));

for(int i=0; i<nr_rows_A * nr_cols_A;i++)
    h_A[i]  = i;

for(int i=0; i<nr_rows_B * nr_cols_B;i++)
    h_B[i]  = i;

for(int i=0; i<nr_rows_C* nr_cols_C;i++)
    h_C[i]  = 0;

for(int i=0; i<nr_rows_A * nr_cols_A;i++)
    h_P[i]  = i;

for(int i=0; i<nr_rows_B * nr_cols_B;i++)
    h_Q[i]  = i;

for(int i=0; i<nr_rows_C* nr_cols_C;i++)
    h_R[i]  = 0;



 // Optionally we can copy the data back on CPU and print the arrays
//cudaMemcpy(h_B,d_B,nr_rows_B * nr_cols_B * sizeof(float),cudaMemcpyDeviceToHost);
 std::cout << "P =" << std::endl;
 print_matrix_int8(h_P, nr_rows_A, nr_cols_A);
 std::cout << "Q =" << std::endl;
 print_matrix_int8(h_Q, nr_rows_B, nr_cols_B);

 //Copy data to Device
 cudaMemcpy(d_A, h_A, nr_rows_A * nr_cols_A * sizeof(float),cudaMemcpyHostToDevice);
 cudaMemcpy(d_B, h_B, nr_rows_B * nr_cols_B * sizeof(float),cudaMemcpyHostToDevice);

 cudaMemcpy(d_P, h_P, nr_rows_A * nr_cols_A * sizeof(int8_t),cudaMemcpyHostToDevice);
 cudaMemcpy(d_Q, h_Q, nr_rows_B * nr_cols_B * sizeof(int8_t),cudaMemcpyHostToDevice);

 // Multiply A and B on GPU
 //gpu_blas_mmul(d_A, d_B, d_C, nr_rows_A, nr_cols_A, nr_cols_B);
 gpu_blas_mmul_int8(d_P, d_Q, d_R, nr_rows_A, nr_cols_A, nr_cols_B);

 // Copy (and print) the result on host memory
 //cudaMemcpy(h_C,d_C,nr_rows_C * nr_cols_C * sizeof(float),cudaMemcpyDeviceToHost);
 //cudaMemcpy(h_R,d_R,nr_rows_C * nr_cols_C * sizeof(int),cudaMemcpyDeviceToHost);
 //std::cout << "R =" << std::endl;
 //print_matrix_int(h_R, nr_rows_C, nr_cols_C);
//Free GPU memory
 cudaFree(d_A);
 cudaFree(d_B);
 cudaFree(d_C);

 cudaFree(d_P);
 cudaFree(d_Q);
 cudaFree(d_R);

 // Free CPU memory
 free(h_A);
 free(h_B);
 free(h_C);

 free(h_P);
 free(h_Q);
 free(h_R);

 return 0;

}

Command to Compile:

nvcc testCublasAgain.cu -lcublas -lcurand -o mmul_1 -g

Command to Run:
./mmul_1
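
(Side note: the test above ignores the cuBLAS return codes. A minimal check I plan to add, using nothing beyond the standard cublasStatus_t return value and with my own helper name, would be:)

// Tiny helper (my own, not part of cuBLAS) to surface GemmEx failures
// instead of silently printing a stale output buffer.
void checkCublas(cublasStatus_t st, const char *what) {
  if (st != CUBLAS_STATUS_SUCCESS)
    std::cerr << what << " failed: cublasStatus_t = " << st << std::endl;
}

// Usage: checkCublas(cublasGemmEx(...), "cublasGemmEx");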

Output:

P =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

Q =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

h_L =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

h_M =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

h_N =
56 62 68 74
152 174 196 218
248 286 324 362
344 398 452 506

h_N Again after adding Bias term =
7.84727e-44 8.68805e-44 9.52883e-44 1.03696e-43
2.12997e-43 2.43826e-43 2.74654e-43 3.05483e-43
3.47522e-43 4.00771e-43 4.54021e-43 5.0727e-43
4.82047e-43 5.57717e-43 6.33387e-43 7.09057e-43

However, when I change the cudaDataType_t of the inputs from CUDA_R_16F to CUDA_R_32F, I get the expected answer, as below.

I changed this line
from
cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha1, d_L, CUDA_R_16F, lda, d_M, CUDA_R_16F, ldb, alpha1, d_N, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT);

to

cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha1, d_L, CUDA_R_32F, lda, d_M, CUDA_R_32F, ldb, alpha1, d_N, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT);

Of course, with the data type of d_L and d_M changed from int16_t to float.

Expected output:

P =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

Q =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

h_L =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

h_M =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

h_N =
56 62 68 74
152 174 196 218
248 286 324 362
344 398 452 506

h_N Again after adding Bias term =
112 124 136 148
304 348 392 436
496 572 648 724
688 796 904 1012
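
So the 32F path behaves as expected. My current suspicion for the 16F path, which I still need to verify: CUDA_R_16F means IEEE half precision (__half), while my d_L/d_M buffers hold raw int16_t values, so the GEMM would read them as (near-zero) half bit patterns. If so, I would need an explicit conversion first, something like this untested sketch (kernel name is my own):

#include <cuda_fp16.h>

// Hypothetical helper: turn int16 values into real __half values so the
// buffer actually contains CUDA_R_16F data rather than integer bit patterns.
__global__ void Int16ToHalf(const int16_t *in, __half *out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = __float2half(static_cast<float>(in[i]));
}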

Update:

I changed the value of alpha back to float from int, as before. Now the output contains real values, but it is just the earlier result; the addition did not happen. :(

Getting there…

P =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

Q =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

h_L =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

h_M =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15

** On entry to GEMM_EX parameter number 6 had an illegal value
h_N =
56 62 68 74
152 174 196 218
248 286 324 362
344 398 452 506

h_N Again after adding Bias term =
56 62 68 74
152 174 196 218
248 286 324 362
344 398 452 506
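
For my own notes, this is the call shape I believe the cublasGemmEx() docs ask for when computeType is CUDA_R_32F: alpha and beta must point to float, just as they must point to int for the CUDA_R_32I call. A sketch of the second call with that fixed (still assuming d_L/d_M hold valid __half data):

const float oneF = 1.0f;  // alpha and beta must match computeType CUDA_R_32F
cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k,
             &oneF, d_L, CUDA_R_16F, lda,
             d_M, CUDA_R_16F, ldb,
             &oneF, d_N, CUDA_R_32F, ldc,
             CUDA_R_32F, CUBLAS_GEMM_DFALT);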
