cuda 7.0 -- many small parallel svds in MATLAB

Hi,

I am developing an matlab function to speed-up solving many small SVDs on the gpu. The size of the problem is about 200x200 but there should be about 100+ of these that I would like to solve in parallel. Since there is a new SVD solver in the cuda 7.0 cusolver I decided to give it a try. The problem is that at the moment it seems like it’s much faster to use the matlab svd (mostlikely MKL based) routine on the CPU than my current cuda solution so I hope you can help me figure out if this comes from my implementation or it is just impossible to get the performance gains I hope for.

First of all my configuration is an 8 CPU Xeon with a Titan Z card and matlab 2013a with cuda 7.0 and driver version 346.46. I found a post on this http://stackoverflow.com/questions/17401765/parallel-implementation-for-multiple-svds-using-cuda. The idea was to use streams to run the solvers in parallel. There seems to still be synchronization because of the C API i.e. the solver needs to finish before a new one is run so I had to multi-thread it and for that the simplest solution was OMP. So the implementation I have is the following:

#define CUDA_API_PER_THREAD_DEFAULT_STREAM

#include "mex.h"
#include "matrix.h"
#include "gpu/mxGPUArray.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <math.h>
#include <assert.h>
#include <omp.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) { exit(code); }
   }
}

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            return "CUSOLVER_SUCCESS";

        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";

        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";

        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";

        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";

        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";

        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";

        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    }

    return "<unknown>";
}

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if(CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d\n\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                                _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
    }
}

void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

// create the solver
int MAX_THREADS=omp_get_max_threads();
cusolverDnHandle_t *solver_handle;
float *work=NULL;
float *d_U=NULL, *d_V=NULL, *d_S=NULL;
cudaStream_t *streams;

int work_size = 0;
int *devInfo;

int Nrows;
int Ncols;
int Nims;

float const *X;

/**
 * Replacement for mxGPUCopyFromMxArray
 */
mxGPUArray * mxGPUCopyFromMxArray2(mxArray const * mx) {
    if (mxIsGPUArray(mx)) {
    	mexPrintf("gpu!\n");
        mxGPUArray const * tmp = mxGPUCreateFromMxArray(mx);
        mxGPUArray * returnValue = mxGPUCopyGPUArray(tmp);
        mxGPUDestroyGPUArray(tmp);
        return returnValue;
    } else {
    	mexPrintf("cpu!\n");
        return const_cast<mxGPUArray *>(mxGPUCreateFromMxArray(mx));
    }
}

/*****************************************************************************
 * mex function
 ****************************************************************************/
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
	const mwSize* Xdims;
    int num_streams=MAX_THREADS;

    mxInitGPU();
    const mxGPUArray* Xgpu = mxGPUCreateFromMxArray(prhs[0]);
    mxClassID classID = mxGPUGetClassID(Xgpu);

	if ( classID != mxSINGLE_CLASS ) {
		mexErrMsgTxt("Input arrays must be single precision!");
	}

	Xdims = mxGPUGetDimensions(Xgpu);
	mwSize Xndims = mxGPUGetNumberOfDimensions(Xgpu);
	if (Xndims == 3) {
		Nims = Xdims[2];
	} else {
		Nims = 1;
	}

	Nrows = Xdims[0];
	Ncols = Xdims[1];

	mxGPUArray * out=mxGPUCopyFromMxArray2(prhs[0]);
	float * X=(float* ) mxGPUGetData(out);

	//X = (float const *) mxGPUGetDataReadOnly(Xgpu);
	mexPrintf("%d %d %d\n",Nrows,Ncols,Nims);

	gpuErrchk(cudaMalloc(&devInfo,          sizeof(int)));

	// these need to be allocated by matlab to remain after exit
    const mwSize Udims[3] = {Nrows,Nrows,Nims};
    mxGPUArray* mxd_U = mxGPUCreateGPUArray(3, Udims, mxSINGLE_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
	d_U = (float *) mxGPUGetData(mxd_U);
	const mwSize Vdims[3]= {Ncols,Ncols,Nims};
	mxGPUArray* mxd_V = mxGPUCreateGPUArray(3, Vdims, mxSINGLE_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
	d_V = (float *)mxGPUGetData(mxd_V);
	const mwSize Sdims[2]= {Ncols,Nims};
	mxGPUArray* mxd_S = mxGPUCreateGPUArray(2, Sdims, mxSINGLE_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
	d_S = (float *)mxGPUGetData(mxd_S);

	// create the streams and allocate the outputs
	streams 		= (cudaStream_t*) malloc(MAX_THREADS*sizeof(cudaStream_t));
	solver_handle 	= (cusolverDnHandle_t*) malloc(MAX_THREADS*sizeof(cusolverDnHandle_t));
	for (int i = 0; i < num_streams; i++) {
		//gpuErrchk(cudaStreamCreateWithFlags(&streams[i],cudaStreamNonBlocking));
		gpuErrchk(cudaStreamCreate(&streams[i]));

		// allocate the work space
		cusolveSafeCall(cusolverDnCreate(&solver_handle[i]));
		cusolveSafeCall(cusolverDnSetStream(solver_handle[i], streams[i]));
	}

	cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle[0], Nrows, Ncols, &work_size));
	gpuErrchk(cudaMalloc(&work, num_streams*work_size * sizeof(float)));

	// launch -- this needs to be in a separate loop otherwise it will synch because of the cudaStreamCreate calls
	// mexPrintf("max threads %d for %d images\n",MAX_THREADS,Nims);
#pragma omp parallel for
	for (int i = 0; i < Nims; i++) {
		// launch one worker kernel per stream
		int thread_num = omp_get_thread_num();
		//mexPrintf("im %d on thread no %d\n",i,thread_num);

		cusolveSafeCall(cusolverDnSgesvd(solver_handle[thread_num], 'A', 'A', Nrows, Ncols, &X[Nrows*Ncols*i], Nrows, &d_S[Ncols*i], &d_U[Nrows*Nrows*i], Nrows, &d_V[Ncols*Ncols*i], Ncols, &work[work_size*thread_num], work_size, NULL, devInfo));
		//gpuErrchk(cudaStreamSynchronize(streams[thread_num]));
	}

	mexPrintf("Destroying solver handles!\n");
	for (int i = 0; i<num_streams; i++) {
		//gpuErrchk(cudaStreamSynchronize(streams[i]));
		//cusolveSafeCall(cusolverDnDestroy(solver_handle[i]));
	}
	//mexPrintf("Releasing resources!\n");
	//free(streams);
	//free(solver_handle);
//	cudaDeviceSynchronize();

	plhs[0] = mxGPUCreateMxArrayOnGPU(mxd_U);
	plhs[2] = mxGPUCreateMxArrayOnGPU(mxd_V);
	plhs[1] = mxGPUCreateMxArrayOnGPU(mxd_S);
	return;
}

I am aware that there are memory leaks and not all the synchronizations are kosher but even like this the implementation is about 3 times slower than the MKL version from matlab. I’m running 10 threads and a tensor 200x200x10 which is a bit small for what i want but in the right ballpark.

I’m adding the profiler output. There are a lot of very odd things there like all the HtoD which shouldn’t appear at all since the data is on the GPU and never moved from there. The threads are clearly working but there doesn’t seem to be any major performance improvement.

External Media

Any ideas ?

Help would be greatly appreciated.

Relative to what? The multithreaded CPU version? The single-stream GPU version? Are you using both of the GPUs on the Titan Z? You will need to distribute work manually across the two GPUs on the board.

100 SVDs on matrices of dimension 200 is a small problem for a massive GPU like the Titan Z. Taking into considerations overhead for data movement, kernel launches etc, it may be too small a problem to produce very significant speedup. I have used batched solvers before (not as complex as SVD) and that required at least a few thousand matrices to achieve meaningful speedups compared to a multi-threaded CPU version (that benefits from all data fitting into the last level cache for small problem sizes). Luckily many use cases had on the order of 10,000 to 100,000 matrices to processes.

Yes, relative to the CPU multithreaded version. Mine is a multi stream GPU version which does indeed run in parallel with OMP as the NVVP profile shows. I cannot use both GPUs because matlab doesn’t support it for now and that would just halve the processing time at best while hogging my entire system.

There should be no data movement outside from the one internal to the SVD since my data is already on the GPU to begin with. I mean SVD overwrites the data matrix so I’m cloning it but that should be a very small tensor so I don’t see that being the problem. From what I understand kernel launches should be a small overhead compared to the workload. I could change to a OS threaded version to avoid the OMP overhead but from small checks it seems that the OMP doesn’t have that great an overhead as I initially thought. What i’m saying is that it seems SVD doesn’t seem parallelize well at least in the case I am considering but I would like for someone to point me in the right direction if I’m wrong.

Yes, kernel overhead should be small, but can be more of an issue on Windows with WDDM driver, especially when kernels have short run times. There must be data movement when MATLAB downloads the source data to the GPU and subsequently uploads the SVD results to the host. My hunch is that your matrices are too small to see significant acceleration compared to a well-optimized CPU implementation that operates pretty much entirely out of the last-level cache for a problem of this size.

It would be helpful if someone with hands-on experience with cuSolve would jump in here to comment on realistic performance expectations for small problem sizes.

What I find most puzzling is why the default thread is doing device to host copies for the entire length of the run ? There should be none of that and it might be the reason for the slowdown. The profile snapshot is of a run with 10 matrices (200x200x10) for 100 the slowdown is 5x with respect to the MKL so it seems to be growing with size. Again help would be greatly appreciated.

“There should be none of that”. This statement probably rests on false assumptions, perhaps something like “If all the data is on the GPU, all the activity should be on the GPU.”

I’m not able to run your code directly since I don’t have a convenient install of matlab to use, but I ran a simple test with a single call to cusolverDnSgesvd. I then profiled just that call using nvprof. Turns out that it is a very “busy” API call with all sorts of activity going on (including many D2H memcopies).

So I think your statement is not accurate. That API call is a complex function, performing various operations on both host and device, and needs to copy data back and forth, based on what I can see.

Also, in case this is the area of your confusion, activiites performed by the GPU driver are not necessarily going to be performed in any application threads you spin up.

Thanks @txbob for the answer ! Indeed I assumed that once the data is on the gpu the work will be solely on the GPU but it seems indeed that this is not the case. OMP seems necessary here, without it there is only one stream running (I checked). So in the end at least for this problem size it doesn’t seem like SVD on the GPU is anywhere close to the CPU version. That is quite disappointing. It would be nice if someone from NVIDIA with intimate knowledge of the implementation would have a say on this…