Hi
I have written a small test to see how cublasGemmEX works.
I tried to call first cublasGemmEX () first with CUDA_R_8I inputs to get CUDA_R_32I output for matrix multiplication(No bias term). I converted this output from int to float. It works!
Now I want to call the same cuda API cublasGemmEX () but WITH Bias term. so I set beta to 1 along with alpha.
I am trying to multiply again same inputs as in first call to cublasGemmEX () but now in CUDA_R_16F. But values contained in them are same as when in CUDA_R_8I.
My float output already contained multiplication result of first call, I expect to multiply again same inputs to get the same output as above and add to it. So, basically my output after second call to cublasGemmEX () should be double of each values.
But I am getting very small values , almost zeros.
I attach my code here:
File : testCublasAgain.cu
#include
#include <cublas_v2.h>
#include <curand.h>
void print_matrix(const float *A, int nr_rows_A, int nr_cols_A);
void print_matrix_int16(const int16_t *A, int nr_rows_A, int nr_cols_A);
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on GPU
void GPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) {
// Create a pseudo-random number generator
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
// Set the seed for the random number generator using the system clock
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
// Fill the array with random numbers on the device
curandGenerateUniform(prng, A, nr_rows_A * nr_cols_A);
}
void gpu_blas_mmul( const float *A, const float *B, float *C, const int m, const int k, const int n) {
int lda=m,ldb=k,ldc=m;
const float alf = 1;
const float bet = 0;
const int alf1 = 1;
const int bet1 = 0;
const float *alpha = &alf;
const float *beta = &bet;
const int *alpha1 = &alf1;
const int *beta1 = &bet1;
cublasHandle_t handle;
cublasCreate(&handle);
// Do the actual multiplication
cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
//cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha, A, CUDA_R_32F, lda, B, CUDA_R_32F, ldb, beta, C, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT);
// Destroy the handle
cublasDestroy(handle);
}
global void Int2Float(int * in,float * out, int noOfElements)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
//Dummy
if(i < noOfElements)
{
out[i] = in[i];
}
}
void gpu_blas_mmul_int8( const int8_t *A, const int8_t *B, int *C, const int m, const int k, const int n) {
int lda=m,ldb=k,ldc=m;
const float alf = 1;
const float bet = 0;
const int alf1 = 1;
const int bet1 = 0;
const float *alpha = &alf;
const float *beta = &bet;
const int *alpha1 = &alf1;
const int *beta1 = &bet1;
int16_t *h_L = (int16_t *)malloc(m * k * sizeof(int16_t));
int16_t *h_M = (int16_t *)malloc(k * n * sizeof(int16_t));
float *h_N = (float *)malloc(m * n * sizeof(float));
for(int i=0; i<m * k;i++)
h_L[i] = i;
for(int i=0; i<k * n;i++)
h_M[i] = i;
for(int i=0; i<m * n;i++)
h_N[i] = 0;
int16_t *d_L, *d_M;
float *d_N;
cudaMalloc(&d_L,m * k * sizeof(int16_t));
cudaMalloc(&d_M,k * n * sizeof(int16_t));
cudaMalloc(&d_N,m * n * sizeof(float));
cudaMemcpy(d_L, h_L, m * k * sizeof(int16_t),cudaMemcpyHostToDevice);
cudaMemcpy(d_M, h_M, k * n * sizeof(int16_t),cudaMemcpyHostToDevice);
std::cout << "h_L =" << std::endl;
print_matrix_int16(h_L, m, k);
std::cout << "h_M =" << std::endl;
print_matrix_int16(h_M, k, n);
cublasHandle_t handle;
cublasCreate(&handle);
// Do the actual multiplication
//cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha1, A, CUDA_R_8I, lda, B, CUDA_R_8I, ldb, beta1, C, CUDA_R_32I, ldc, CUDA_R_32I, CUBLAS_GEMM_DFALT);
//cudaMemcpy(d_N, C, m * n * sizeof(int),cudaMemcpyDeviceToDevice);
Int2Float<<<((m*n) / 1024) + 1, 1024>>>(C, d_N, m*n);
cudaMemcpy(h_N, d_N, m * n * sizeof(float),cudaMemcpyDeviceToHost);
std::cout << "h_N =" << std::endl;
print_matrix(h_N, m, n);
cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha1, d_L, CUDA_R_16F, lda, d_M, CUDA_R_16F, ldb, alpha1, d_N, CUDA_R_32F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT);
cudaMemcpy(h_N, d_N, m * n * sizeof(float),cudaMemcpyDeviceToHost);
std::cout << "h_N Again after adding Bias term =" << std::endl;
print_matrix(h_N, m, n);
// Destroy the handle
cublasDestroy(handle);
cudaFree(d_L);
cudaFree(d_M);
cudaFree(d_N);
// Free CPU memory
free(h_L);
free(h_M);
free(h_N);
}
//Print matrix A(nr_rows_A, nr_cols_A) storage in column-major format
void print_matrix_int8(const int8_t *A, int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << static_cast<int16_t>(A[j * nr_rows_A + i]) << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
//Print matrix A(nr_rows_A, nr_cols_A) storage in column-major format
void print_matrix_int(const int *A, int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << A[j * nr_rows_A + i] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
//Print matrix A(nr_rows_A, nr_cols_A) storage in column-major format
void print_matrix_int16(const int16_t *A, int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << static_cast<int16_t>(A[j * nr_rows_A + i]) << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
//Print matrix A(nr_rows_A, nr_cols_A) storage in column-major format
void print_matrix(const float *A, int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << A[j * nr_rows_A + i] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int main() {
// Allocate 3 arrays on CPU
int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
// for simplicity we are going to use square arrays
nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = 4;
float *h_A = (float *)malloc(nr_rows_A * nr_cols_A * sizeof(float));
float *h_B = (float *)malloc(nr_rows_B * nr_cols_B * sizeof(float));
float *h_C = (float *)malloc(nr_rows_C * nr_cols_C * sizeof(float));
int8_t *h_P = (int8_t *)malloc(nr_rows_A * nr_cols_A * sizeof(int8_t));
int8_t *h_Q = (int8_t *)malloc(nr_rows_B * nr_cols_B * sizeof(int8_t));
int *h_R = (int *)malloc(nr_rows_C * nr_cols_C * sizeof(int));
// Allocate 3 arrays on GPU
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A,nr_rows_A * nr_cols_A * sizeof(float));
cudaMalloc(&d_B,nr_rows_B * nr_cols_B * sizeof(float));
cudaMalloc(&d_C,nr_rows_C * nr_cols_C * sizeof(float));
// Fill the arrays A and B on GPU with random numbers
//GPU_fill_rand(d_A, nr_rows_A, nr_cols_A);
//GPU_fill_rand(d_B, nr_rows_B, nr_cols_B);
int8_t *d_P, *d_Q;
int *d_R;
cudaMalloc(&d_P,nr_rows_A * nr_cols_A * sizeof(int8_t));
cudaMalloc(&d_Q,nr_rows_B * nr_cols_B * sizeof(int8_t));
cudaMalloc(&d_R,nr_rows_C * nr_cols_C * sizeof(int));
for(int i=0; i<nr_rows_A * nr_cols_A;i++)
h_A[i] = i;
for(int i=0; i<nr_rows_B * nr_cols_B;i++)
h_B[i] = i;
for(int i=0; i<nr_rows_C* nr_cols_C;i++)
h_C[i] = 0;
for(int i=0; i<nr_rows_A * nr_cols_A;i++)
h_P[i] = i;
for(int i=0; i<nr_rows_B * nr_cols_B;i++)
h_Q[i] = i;
for(int i=0; i<nr_rows_C* nr_cols_C;i++)
h_R[i] = 0;
// Optionally we can copy the data back on CPU and print the arrays
//cudaMemcpy(h_B,d_B,nr_rows_B * nr_cols_B * sizeof(float),cudaMemcpyDeviceToHost);
std::cout << "P =" << std::endl;
print_matrix_int8(h_P, nr_rows_A, nr_cols_A);
std::cout << "Q =" << std::endl;
print_matrix_int8(h_Q, nr_rows_B, nr_cols_B);
//Copy data to Device
cudaMemcpy(d_A, h_A, nr_rows_A * nr_cols_A * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, nr_rows_B * nr_cols_B * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_P, h_P, nr_rows_A * nr_cols_A * sizeof(int8_t),cudaMemcpyHostToDevice);
cudaMemcpy(d_Q, h_Q, nr_rows_B * nr_cols_B * sizeof(int8_t),cudaMemcpyHostToDevice);
// Multiply A and B on GPU
//gpu_blas_mmul(d_A, d_B, d_C, nr_rows_A, nr_cols_A, nr_cols_B);
gpu_blas_mmul_int8(d_P, d_Q, d_R, nr_rows_A, nr_cols_A, nr_cols_B);
// Copy (and print) the result on host memory
//cudaMemcpy(h_C,d_C,nr_rows_C * nr_cols_C * sizeof(float),cudaMemcpyDeviceToHost);
//cudaMemcpy(h_R,d_R,nr_rows_C * nr_cols_C * sizeof(int),cudaMemcpyDeviceToHost);
//std::cout << "R =" << std::endl;
//print_matrix_int(h_R, nr_rows_C, nr_cols_C);
//Free GPU memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFree(d_P);
cudaFree(d_Q);
cudaFree(d_R);
// Free CPU memory
free(h_A);
free(h_B);
free(h_C);
free(h_P);
free(h_Q);
free(h_R);
return 0;
}
Command to Compile:
nvcc testCublasAgain.cu -lcublas -lcurand -o mmul_1 -g
Command to Run:
./mmul_1
Output:
P =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
Q =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
h_L =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
h_M =
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
h_N =
56 62 68 74
152 174 196 218
248 286 324 362
344 398 452 506
h_N Again after adding Bias term =
7.84727e-44 8.68805e-44 9.52883e-44 1.03696e-43
2.12997e-43 2.43826e-43 2.74654e-43 3.05483e-43
3.47522e-43 4.00771e-43 4.54021e-43 5.0727e-43
4.82047e-43 5.57717e-43 6.33387e-43 7.09057e-43