gemm returns NaN in cuBLAS and cuSPARSE when ssse3 intel intrinsics are used

Happens on Fedora 20, Ubuntu 15.04, 14.04, 12.04 using GTX 970, GTX 1080, GTX TITAN X and TESLA K20x.
Tested on CUDA Toolkit 7.5 and 8.0 RC.

Please compile using:

nvcc cublas_test.cc -o cublas_code -L/usr/local/cuda/lib64 -lcublas --compiler-options -mssse3
nvcc cusparse_test.cc -o cusparse_code -L/usr/local/cuda/lib64 -lcusparse --compiler-options -mssse3

cublas_test.cc:

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <cstdlib>
#include <vector>
#include <tmmintrin.h>

using std::vector;

void allocate(void ** ptr, int len) {
    cudaError_t ret = cudaMalloc(ptr, len);
    if (ret != cudaSuccess) {
        printf("error in allocate\n");
        exit(1);
    }
}

// Deliberately corrupts host floating-point state to reproduce the bug:
// MMX intrinsics such as _mm_hadd_pi32 on __m64 operands execute on the
// legacy x87/MMX register file, and without a closing _mm_empty() (EMMS)
// the x87 unit is left in MMX mode. Subsequent host-side floating-point
// work — e.g. printing the values copied back from the GPU — then goes
// wrong, showing up as NaNs. Adding _mm_empty() at the end of this
// function removes the failure (confirmed later in this thread).
void break_cublas() {
    __m64 a = (__m64)0LL;
    __m64 b = (__m64)0LL;
    __m64 c = _mm_hadd_pi32(a, b);
//     __m64 c = _mm_shuffle_pi8(a, b);
}

// Runs cublasDgemm three times on zero-filled device matrices.
// Before iterations 1 and 2 it calls break_cublas() to dirty the host
// MMX/x87 state, reproducing the NaN output described above.
// C (m x k) = alpha * A (m x n) * B (n x k) + beta * C, all zeros.
void test(cublasHandle_t & cublasHandle) {

    for (int i = 0; i < 3; i++) {

        printf("------- %d ------- \n", i);
        if (i > 0)
            break_cublas();

        cublasStatus_t cublasStatus;

        int m = 8;
        int n = 8;
        int k = 1;

        double * A_val;
        allocate((void**)&A_val, m * n * sizeof(double));
        cudaMemset(A_val, 0, sizeof(double) * m * n);

        double * B_val;
        allocate((void**)&B_val, sizeof(double) * n * k);
        cudaMemset(B_val, 0, sizeof(double) * n * k);

        double * C_val;
        allocate((void**)&C_val, sizeof(double) * m * k);
        // Fixed: was sizeof(double) * n * k, which only matched the
        // allocation size by accident because m == n in this repro.
        cudaMemset(C_val, 0, sizeof(double) * m * k);

        vector<double> r(m * k);
        cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
        printf("(before multiply) C = ");
        for (size_t j = 0; j < r.size(); j++)
            printf("%f ", r[j]);
        printf("\n");

        double alpha = 1.0;
        double beta = 0.0;

        // cuBLAS is column-major; leading dimensions are m, n, m.
        cublasStatus = cublasDgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, m, k, n,
                                   &alpha, A_val, m, B_val, n, &beta, C_val, m);

        if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
            printf("error in multiply\n");
            exit(1);
        }

        cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
        printf("(after multiply) C = ");
        for (size_t j = 0; j < r.size(); j++)
            printf("%f ", r[j]);
        printf("\n\n");

        cudaFree(A_val);
        cudaFree(B_val);
        cudaFree(C_val);
    }
}

// Entry point: create a cuBLAS handle, run the three-iteration repro,
// then release the handle.
int main() {
    cublasHandle_t cublasHandle;
    if (cublasCreate(&cublasHandle) != CUBLAS_STATUS_SUCCESS) {
        printf("error cublas create\n");
        exit(1);
    }

    test(cublasHandle);

    cublasDestroy(cublasHandle);
    return 0;
}

cusparse_test.cc:

#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <cstdlib>
#include <vector>
#include <tmmintrin.h>

using std::vector;

void allocate(void ** ptr, int len) {
    cudaError_t ret = cudaMalloc(ptr, len);
    if (ret != cudaSuccess) {
        printf("error in allocate\n");
        exit(1);
    }
}

// Deliberately corrupts host floating-point state to reproduce the bug:
// the MMX intrinsic _mm_hadd_pi32 on __m64 operands uses the legacy
// x87/MMX register file, and without a closing _mm_empty() (EMMS) the
// x87 unit stays in MMX mode, so later host-side floating-point work
// (printing the values copied back from the GPU) shows up as NaNs.
// Adding _mm_empty() here removes the failure (confirmed in this thread).
void break_cusparse() {
    __m64 a = (__m64)0LL;
    __m64 b = (__m64)0LL;
    __m64 c = _mm_hadd_pi32(a, b);
}

// Runs cusparseDcsrmm three times with an empty (nnz == 0) CSR matrix A,
// so C = beta * C = 0 is expected. Before iterations 1 and 2 it calls
// break_cusparse() to dirty the host MMX/x87 state, reproducing the NaN
// output described above.
void test(cusparseHandle_t & cusparseHandle, cusparseMatDescr_t & descr) {

    for (int i = 0; i < 3; i++) {

        printf("-------- %d ---------- \n", i);
        if (i > 0)
            break_cusparse();

        cusparseStatus_t cusparseStatus;

        int m = 8;
        int n = 8;
        int k = 1;

        // A is an m x n CSR matrix with zero nonzeros: empty val/col
        // arrays and an all-zero row-pointer array of length m + 1.
        double * A_val;
        allocate((void**)&A_val, 0 * sizeof(double));

        int * A_col;
        allocate((void**)&A_col, 0);

        int * A_row;
        allocate((void**)&A_row, sizeof(int) * (m + 1));
        cudaMemset(A_row, 0, sizeof(int) * (m + 1));

        int A_nnz = 0;
        double alpha = 1.0;
        double beta = 0.0;

        double * B_val;
        allocate((void**)&B_val, sizeof(double) * n * k);
        // cudaMemset is byte-wise: every byte becomes 0x07, so B holds
        // small nonzero doubles — presumably just "arbitrary nonzero
        // data" for this repro.
        cudaMemset(B_val, 7, sizeof(double) * n * k);

        double * C_val;
        allocate((void**)&C_val, sizeof(double) * m * k);
        // Fixed: was sizeof(double) * n * k, which only matched the
        // allocation size by accident because m == n in this repro.
        cudaMemset(C_val, 0, sizeof(double) * m * k);

        vector<double> r(m * k);
        cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
        printf("(before multiply) C = ");
        for (size_t j = 0; j < r.size(); j++)
            printf("%f ", r[j]);
        printf("\n");
        cusparseStatus = cusparseDcsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                        m, k, n, A_nnz, &alpha, descr, A_val, A_row, A_col,
                                        B_val, n, &beta, C_val, m);

        if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
            printf("error multiply\n");
            exit(1);
        }

        cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
        printf("(after multiply) C = ");
        for (size_t j = 0; j < r.size(); j++)
            printf("%f ", r[j]);
        printf("\n\n");

        cudaFree(A_val);
        cudaFree(A_row);
        cudaFree(A_col);
        cudaFree(B_val);
        cudaFree(C_val);
    }
}

// Entry point: create a cuSPARSE handle and a general, zero-based matrix
// descriptor, run the repro, then release both.
int main() {
    cusparseHandle_t cusparseHandle;
    cusparseMatDescr_t descr;

    if (cusparseCreate(&cusparseHandle) != CUSPARSE_STATUS_SUCCESS) {
        printf("error cusparse create\n");
        exit(1);
    }

    if (cusparseCreateMatDescr(&descr) != CUSPARSE_STATUS_SUCCESS) {
        printf("error cusparseCreateMatDescr\n");
        exit(1);
    }
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    test(cusparseHandle, descr);

    cusparseDestroy(cusparseHandle);
    cusparseDestroyMatDescr(descr);
    return 0;
}

On 64-bit Windows, __m64 is not supported, so I changed the break_cublas() function as follows:

// 64-bit Windows replacement for break_cublas(): MSVC does not support
// the __m64 type on x64, so this variant uses 128-bit SSE registers
// (__m128i) with the equivalent SSSE3 intrinsics instead. As the output
// below shows, this version does NOT reproduce the NaN failure —
// presumably because SSE instructions do not use the x87/MMX register
// file, so no dirty MMX state is left behind (see the _mm_empty()
// discussion later in this thread).
void break_cublas() {
    __m128i a;
    __m128i b;
    __m128i c;
    memset (&a, 0, sizeof(a));
    memset (&b, 0, sizeof(b));
    c = _mm_hadd_epi32(a, b);
    c = _mm_shuffle_epi8(a, b);
}

With that change and compiled with CUDA 7.5, the app runs normally, printing a bunch of zeros at the end:

------- 0 -------
(before multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
(after multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

------- 1 -------
(before multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
(after multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

------- 2 -------
(before multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
(after multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

Based on

__m64 should be accessible on 64-bit Windows too.

Just need to include xmmintrin.h instead.

I tried building Microsoft’s _mm_hadd_pi32() example code (from the Microsoft Docs page for that intrinsic) using standalone MSVC, without any CUDA toolchain involvement, and could not get it to build successfully. Googling for an explanation I came across information that says __m64 is not supported on x64 platforms. That’s all I know at this point. BTW, the Microsoft page you pointed me to says as much: “The __m64 data type is not supported on x64 processors.”

It would be interesting to see what happens on your Linux platform if you exchange your version of break_cublas() with mine. I have no idea how use of SSE intrinsics could possibly break CUBLAS; the only thing I could imagine is that it breaks something in the host code, causing the data returned by the GPU to be incorrectly displayed as NaNs.

Adding your changes to my code fixes the issue (no NaN).

As far as I recall (vague memories) __m64 data is handled by the old style x87/MMX units, not by the SSE units. And after using the MMX units, an FPU context switch is required by calling _mm_empty(), otherwise subsequent host-side floating-point processing can/will go off the rails.

What happens if you add a call to _mm_empty() at the end of your original break_cublas() function?

Adding _mm_empty() to the end of break_cublas and break_cusparse functions fixed the issue.

Thanks.

nice work njuffa