Batched SVD on CUDA surprisingly slow

Hello,

I tried to use batched SVD based on this example https://docs.nvidia.com/cuda/cusolver/index.html#batchgesvdj-example1, using CUDA 9.1 on a GeForce GTX 1070.

I have many (> 2,000,000) small 4x4 matrices to which I want to apply SVD.

The problem is that the call to cusolverDnSgesvdjBatched in my CUDA implementation takes longer (> 4 seconds) than consecutive multi-threaded calls to the analogous OpenCV function (< 2 seconds), which I find rather counter-intuitive.
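
For reference, the multi-threaded OpenCV baseline I am comparing against looks roughly like this (a minimal sketch, not my exact code; it assumes cv::SVD::compute inside a cv::parallel_for_ loop over the same number of 4x4 matrices):

#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>

int main(){
    const int N = 2000000;

    // Build N small 4x4 matrices (placeholder values, just for timing).
    std::vector<cv::Mat> A(N);
    for (int i = 0; i < N; ++i)
        A[i] = cv::Mat::eye(4, 4, CV_32F);

    // Decompose all of them on the CPU, split across threads.
    cv::parallel_for_(cv::Range(0, N), [&](const cv::Range& r){
        for (int i = r.start; i < r.end; ++i){
            cv::Mat w, u, vt;
            cv::SVD::compute(A[i], w, u, vt); // singular values and vectors
        }
    });
    return 0;
}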

Here is the CUDA example code:

#include <stdio.h>
#include <iostream>

#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <chrono>


#define cudaErr(err) { cudaCheckErr((err), __FILE__, __LINE__); }
inline void cudaCheckErr(cudaError_t errCode, const char *file, int line, bool abort=true){
   if (errCode != cudaSuccess){
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(errCode), file, line);
      if (abort) exit(errCode);
   }
}


__global__
void prepareSVD(
        int N,
        float* d_A
    ){
    int a_stride = 16; // size of every matrix in A
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int a_beg = a_stride*i;
    if (i < N){
        d_A[a_beg +0] =  1;
        d_A[a_beg +4] =  5;
        d_A[a_beg +8] =  2;
        d_A[a_beg +12] = -1;

        d_A[a_beg +1] =  -2;
        d_A[a_beg +5] =  8;
        d_A[a_beg +9] =  1;
        d_A[a_beg +13] =  4;

        d_A[a_beg +2] =  3;
        d_A[a_beg +6] =  -1;
        d_A[a_beg +10] =  1;
        d_A[a_beg +14] =  -3;

        d_A[a_beg +3] =  3;
        d_A[a_beg +7] =  4;
        d_A[a_beg +11] =  5;
        d_A[a_beg +15] =  6;
    }
}


int main(void){
    int N = 2000000;
    int n, m, nm;
    n = m = 4; // for this particular domain problem
    nm = n*m; // 16

    float* d_A  = NULL;
    float* d_V  = NULL; 
    float* d_U  = NULL;
    float* d_S  = NULL;
    int * d_info  = NULL;
    int lwork = 0;
    float *d_work = NULL;

    cudaErr(cudaMalloc((void**)&d_A, sizeof(float)*nm*N));
    cudaErr(cudaMalloc((void**)&d_V, sizeof(float)*nm*N));
    cudaErr(cudaMalloc((void**)&d_U, sizeof(float)*nm*N));
    cudaErr(cudaMalloc((void**)&d_S, sizeof(float)*n*N));
    cudaErr(cudaMalloc((void**)&d_info, sizeof(int)*N));
    cudaErr(cudaDeviceSynchronize());


    int blockSize = 512;
    // Fill the A with values on device side
    prepareSVD<<< (N + blockSize-1)/blockSize, blockSize>>>(
        N,
        d_A
    );

    cudaErr(cudaDeviceSynchronize());

    int lda = m; /* lda >= m */
    int ldu = m; /* ldu >= m */
    int ldv = n; /* ldv >= n */

    cusolverDnHandle_t cusolverH = NULL;
    cudaStream_t stream = NULL;
    gesvdjInfo_t gesvdj_params = NULL;

    cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;

    int* info = (int*)malloc(N*sizeof(int));

    const float tol = 1.e-7;
    const int max_sweeps = 15;

    const int sort_svd  = 0;  //don't sort singular values
    const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; //compute singular vectors

    //   create cusolver handle, bind a stream
    status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    cudaErr(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    status = cusolverDnSetStream(cusolverH, stream);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    //configuration of gesvdj
    status = cusolverDnCreateGesvdjInfo(&gesvdj_params);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    //default value of tolerance is machine zero
    status = cusolverDnXgesvdjSetTolerance(gesvdj_params,tol);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    // default value of max. sweeps is 100
    status = cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    // disable sorting
    status = cusolverDnXgesvdjSetSortEig(gesvdj_params,sort_svd);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    status = cusolverDnSgesvdjBatched_bufferSize(
        cusolverH,
        jobz,
        m,
        n,
        d_A,
        lda,
        d_S,
        d_U,
        ldu,
        d_V,
        ldv,
        &lwork,
        gesvdj_params,
        N
    );
    cudaErr(cudaDeviceSynchronize());
    assert(CUSOLVER_STATUS_SUCCESS == status);
    cudaErr(cudaMalloc((void**)&d_work, sizeof(float)*lwork));
    cudaErr(cudaDeviceSynchronize());
    auto startTime = std::chrono::steady_clock::now();

    // this is the call that takes too long
    status = cusolverDnSgesvdjBatched(
        cusolverH,
        jobz,
        m,
        n,
        d_A,
        lda,
        d_S,
        d_U,
        ldu,
        d_V,
        ldv,
        d_work,
        lwork,
        d_info,
        gesvdj_params,
        N
    );
    cudaErr(cudaDeviceSynchronize());
    auto diff = std::chrono::steady_clock::now() - startTime;
    double diff_sec = std::chrono::duration<double>(diff).count();
    std::cout << "[TIME] Time for SVD call: " << diff_sec << " s" << std::endl;
 
    assert(CUSOLVER_STATUS_SUCCESS == status);
    cudaErr(cudaMemcpy(info, d_info, sizeof(int) * N, cudaMemcpyDeviceToHost));
    cudaErr(cudaDeviceSynchronize());
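
    // (Sketch added for this post, not in my original code:) since gesvdj is
    // iterative, I also count how many of the N problems actually converged.
    // Per the cuSOLVER documentation, info[i] == 0 means matrix i converged,
    // while a positive value means it did not reach tol within max_sweeps.
    int notConverged = 0;
    for (int i = 0; i < N; ++i){
        if (info[i] != 0) notConverged++;
    }
    std::cout << "[INFO] matrices that did not converge: " << notConverged << std::endl;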

    free(info);
    cudaErr(cudaFree(d_S));
    cudaErr(cudaFree(d_U));
    cudaErr(cudaFree(d_info));
    cudaErr(cudaFree(d_A));
    cudaErr(cudaFree(d_V));
    cudaErr(cudaFree(d_work));

    cusolverDnDestroyGesvdjInfo(gesvdj_params);
    cusolverDnDestroy(cusolverH);
    cudaErr(cudaStreamDestroy(stream));

    return 0;
}

Is this expected behaviour?
Is there a way to make this faster?

Thank you very much in advance.

Hi labroides,
Did you ever find a solution to this problem?
Thanks a lot.