The problem reproduces on Fedora 20 and on Ubuntu 15.04, 14.04 and 12.04, using GTX 970, GTX 1080, GTX TITAN X and Tesla K20x GPUs.
It was tested with CUDA Toolkit 7.5 and 8.0 RC.
Please compile the two programs below using:
nvcc cublas_test.cc -o cublas_code -L/usr/local/cuda/lib64 -lcublas --compiler-options -mssse3
nvcc cusparse_test.cc -o cusparse_code -L/usr/local/cuda/lib64 -lcusparse --compiler-options -mssse3
cublas_test.cc:
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <cstdlib>
#include <vector>
#include <tmmintrin.h>
using std::vector;
void allocate(void ** ptr, int len) {
cudaError_t ret = cudaMalloc(ptr, len);
if (ret != cudaSuccess) {
printf("error in allocate\n");
exit(1);
}
}
// Executes a single _mm_hadd_pi32 intrinsic on two zero-valued __m64 operands and
// discards the result. test() calls this at the start of every iteration except
// the first, so the first iteration serves as the untouched baseline.
// NOTE(review): no _mm_empty() is issued after the __m64 operation. Presumably this
// leaves the CPU's MMX/x87 state un-reset, which may be what disturbs the library
// call that follows -- confirm against the compiler's intrinsics documentation.
void break_cublas() {
__m64 a = (__m64)0LL;
__m64 b = (__m64)0LL;
// The value of c is never read; only executing the intrinsic matters here.
__m64 c = _mm_hadd_pi32(a, b);
// Alternative intrinsic on the same operands, currently disabled:
// __m64 c = _mm_shuffle_pi8(a, b);
}
void test(cublasHandle_t & cublasHandle) {
for (int i = 0; i < 3; i++) {
printf("------- %d ------- \n", i);
if (i > 0)
break_cublas();
cudaError_t res;
cublasStatus_t cublasStatus;
int m = 8;
int n = 8;
int k = 1;
double * A_val;
allocate((void**)&A_val, m * n * sizeof(double));
cudaMemset(A_val, 0, sizeof(double) * m * n);
double * B_val;
allocate((void**)&B_val, sizeof(double) * n * k);
cudaMemset(B_val, 0, sizeof(double) * n * k);
double * C_val;
allocate((void**)&C_val, sizeof(double) * m * k);
cudaMemset(C_val, 0, sizeof(double) * n * k);
vector<double> r(m * k);
cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
printf("(before multiply) C = ");
for (int i = 0; i < r.size(); i++)
printf("%f ", r[i]);
printf("\n");
double alpha = 1.0;
double beta = 0.0;
cublasStatus = cublasDgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, m, k, n,
&alpha, A_val, m, B_val, n, &beta, C_val, m);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
printf("error in multiply\n");
exit(1);
}
cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
printf("(after multiply) C = ");
for (int i = 0; i < r.size(); i++)
printf("%f ", r[i]);
printf("\n\n");
cudaFree(A_val);
cudaFree(B_val);
cudaFree(C_val);
}
}
int main() {
cublasHandle_t cublasHandle;
cublasStatus_t cublasStatus;
cublasStatus = cublasCreate(&cublasHandle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
printf("error cublas create\n");
exit(1);
}
test(cublasHandle);
cublasDestroy(cublasHandle);
return 0;
}
cusparse_test.cc:
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <cstdlib>
#include <vector>
#include <tmmintrin.h>
using std::vector;
void allocate(void ** ptr, int len) {
cudaError_t ret = cudaMalloc(ptr, len);
if (ret != cudaSuccess) {
printf("error in allocate\n");
exit(1);
}
}
// Executes a single _mm_hadd_pi32 intrinsic on two zero-valued __m64 operands and
// discards the result. test() calls this at the start of every iteration except
// the first, so the first iteration serves as the untouched baseline.
// NOTE(review): no _mm_empty() is issued after the __m64 operation. Presumably this
// leaves the CPU's MMX/x87 state un-reset, which may be what disturbs the library
// call that follows -- confirm against the compiler's intrinsics documentation.
void break_cusparse() {
__m64 a = (__m64)0LL;
__m64 b = (__m64)0LL;
// The value of c is never read; only executing the intrinsic matters here.
__m64 c = _mm_hadd_pi32(a, b);
}
void test(cusparseHandle_t & cusparseHandle, cusparseMatDescr_t & descr) {
for (int i = 0; i < 3; i++) {
printf("-------- %d ---------- \n", i);
if (i > 0)
break_cusparse();
cudaError_t res;
cusparseStatus_t cusparseStatus;
int m = 8;
int n = 8;
int k = 1;
double * A_val;
allocate((void**)&A_val, 0 * sizeof(double));
int * A_col;
allocate((void**)&A_col, 0);
int * A_row;
allocate((void**)&A_row, sizeof(int) * (m + 1));
cudaMemset(A_row, 0, sizeof(int) * (m + 1));
int A_nnz = 0;
double alpha = 1.0;
double beta = 0.0;
double * B_val;
allocate((void**)&B_val, sizeof(double) * n * k);
cudaMemset(B_val, 7, sizeof(double) * n * k);
double * C_val;
allocate((void**)&C_val, sizeof(double) * m * k);
cudaMemset(C_val, 0, sizeof(double) * n * k);
vector<double> r(m * k);
cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
printf("(before multiply) C = ");
for (int i = 0; i < r.size(); i++)
printf("%f ", r[i]);
printf("\n");
cusparseStatus = cusparseDcsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
m, k, n, A_nnz, &alpha, descr, A_val, A_row, A_col,
B_val, n, &beta, C_val, m);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
printf("error multiply\n");
exit(1);
}
cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
printf("(after multiply) C = ");
for (int i = 0; i < r.size(); i++)
printf("%f ", r[i]);
printf("\n\n");
cudaFree(A_val);
cudaFree(A_row);
cudaFree(A_col);
cudaFree(B_val);
cudaFree(C_val);
}
}
// Entry point of the cuSPARSE program: creates a cuSPARSE handle and a matrix
// descriptor configured as a general matrix with zero-based indices, passes both
// to test(), then destroys them and returns 0.
//
// Every setup call is checked; on failure a message naming the failing step is
// printed and the process exits with status 1.
int main() {
    cusparseHandle_t cusparseHandle;
    cusparseMatDescr_t descr;
    cusparseStatus_t cusparseStatus;

    cusparseStatus = cusparseCreate(&cusparseHandle);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        printf("error cusparse create\n");
        exit(1);
    }

    cusparseStatus = cusparseCreateMatDescr(&descr);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        printf("error cusparseCreateMatDescr\n");
        // Release the handle that was already created before bailing out.
        cusparseDestroy(cusparseHandle);
        exit(1);
    }

    // The descriptor setters also report a status; a failure here would otherwise
    // go unnoticed and test() would run with an unexpected descriptor.
    cusparseStatus = cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        printf("error cusparseSetMatType\n");
        exit(1);
    }
    cusparseStatus = cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        printf("error cusparseSetMatIndexBase\n");
        exit(1);
    }

    test(cusparseHandle, descr);

    cusparseDestroy(cusparseHandle);
    cusparseDestroyMatDescr(descr);
    return 0;
}