Why is my cuBLAS so slow, and is there anything I can do to fix it?

In a previous post, we noticed that performing a dgemm with cuBLAS on my GTX 1080 Ti was substantially slower than a naive matrix multiplication implementation with nested loops.

For instance, the following code (which I run from MATLAB as a MEX function) calculates C = A*B and reports the time using both cuBLAS and a custom multiplication kernel. There is, however, one difference from a conventional matrix multiplication: certain rows and columns of A and B should be ignored during the multiplication. This is the code:

#include "mex.h"
#include "cuda_runtime.h" 
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <cublas_v2.h>

/* Manual matmult + reduction kernel without cublas */
__global__ void ManualMatMultRedKernel(const double *__restrict__ A, const double *__restrict__ B, const double * __restrict__ y, double *SSE, const size_t m, const size_t n, const size_t p, const size_t k, const uint64_T *UseColindex, const uint64_T *smalln, const uint64_T *UseRowIndex, const size_t smallm) {
	double e;
	// Loop over pages in B (k)
	for (int v = blockIdx.z * blockDim.z + threadIdx.z; v < k; v += blockDim.z * gridDim.z) {
		// Loop rows of A (smallm)
		for (int r = blockIdx.x * blockDim.x + threadIdx.x; r < smallm; r += blockDim.x * gridDim.x) {
			// Loop columns of B (p)
			for (int c = blockIdx.y * blockDim.y + threadIdx.y; c < p; c += blockDim.y * gridDim.y) {
				e = 0;
				// Inner loop over rows of B (smalln[v]) 
				for (int steps = 0; steps < smalln[v]; steps++) {
					e += A[UseRowIndex[r] + m*UseColindex[v + k*steps]] * B[UseColindex[v + k*steps] + n*c + n*p*v];
				}
				// NOTE! This kernel also reduces the matrix product after it is calculated, which cublas does not.
				atomicAdd(SSE + v + k*c, e);  // Sum the matrix product along rows as it is calculated.
			} 
		} 
	}
} 
/* Kernel for filling SmallA with relevant rows from A */
__global__ void FillPartialA(const double *__restrict__ A, const size_t m, const size_t n, const uint64_T *UseRowIndex, const size_t smallm, double *SmallA) {
	// Loop rows of A (smallm)
	for (int r = blockIdx.x * blockDim.x + threadIdx.x; r < smallm; r += blockDim.x * gridDim.x) {
		// Inner loop over columns of A (n)
		for (int steps = blockIdx.y * blockDim.y + threadIdx.y; steps < n; steps += blockDim.y * gridDim.y) {
			SmallA[r + smallm*steps] = A[UseRowIndex[r] + m*steps];		
		}
	}
}

/* ------------------------------ Start of Matlab gateway function ----------------------------- */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
	
	/* ————————————————————————————————————————————————————————————————————————————————————————*/
	/* Get inputs from Matlab.																   */
	const double *A, *B, *y;
	const bool *UseCol, *UseRow;
	A       = mxGetPr(prhs[0]);				   // [m-by-n] Matrix A.
	B       = mxGetPr(prhs[1]);				   // [n-by-p-by-k] Matrix B.
	y       = mxGetPr(prhs[2]);				   // [m-by-1] vector y. NOTE! THIS VARIABLE IS CURRENTLY NOT USED.
	UseCol  = mxGetLogicals(prhs[3]);          // [k-by-n] logical matrix with columns to use.
	UseRow  = mxGetLogicals(prhs[4]);          // [m-by-1] logical vector with rows to use.

	/* Get size variables for inputs. */
	size_t m, n, p, k;

	const size_t *pDims;
	const mwSize ndims = mxGetNumberOfDimensions(prhs[1]);
	pDims = mxGetDimensions(prhs[1]);

	m = mxGetM(prhs[2]);    // Number of rows of A and y.
	n = pDims[0];		    // Number of columns of A/rows in B.
	p = pDims[1];           // Number of columns in B.
	if (ndims == 2) {
		k = 1;			    // Number of pages in B.
	}
	else {
		k = pDims[2];      // Number of pages in B.
	}

	/* ————————————————————————————————————————————————————————————————————————————————————————*/
	/* Allocate memory on GPU and transfer variables.                                          */
	double *dA, *dB, *dy, *dSSE1, *dSSE2;
	bool *dUseCol, *dUseRow;

	/* Allocate memory on GPU.																     */
	cudaMalloc(&dA     , sizeof(double)   * (int)m   * (int)n            );
	cudaMalloc(&dB     , sizeof(double)   * (int)n   * (int)p   * (int)k );
	cudaMalloc(&dy     , sizeof(double)   * (int)m                       );
	cudaMalloc(&dUseCol, sizeof(bool)     * (int)k   * (int)n            );
	cudaMalloc(&dUseRow, sizeof(bool)     * (int)m                       );
	
	cudaMalloc(&dSSE1  , sizeof(double)   * (int)k   * (int)p            );
	cudaMemset(dSSE1, 0, sizeof(double)   * (int)k   * (int)p            ); // make sure SSE1 is initialized to 0
	
	cudaMalloc(&dSSE2  , sizeof(double)   * (int)k   * (int)p            );
	cudaMemset(dSSE2, 0, sizeof(double)   * (int)k   * (int)p            ); // make sure SSE2 is initialized to 0
	
	/* Transfer variables from host to device.												     */
	cudaMemcpy(dA     , A     , sizeof(double) * (int)m * (int)n         , cudaMemcpyHostToDevice);
	cudaMemcpy(dB     , B     , sizeof(double) * (int)n * (int)p * (int)k, cudaMemcpyHostToDevice);
	cudaMemcpy(dy     , y     , sizeof(double) * (int)m                  , cudaMemcpyHostToDevice);
	cudaMemcpy(dUseCol, UseCol, sizeof(bool)   * (int)k   * (int)n       , cudaMemcpyHostToDevice);
	cudaMemcpy(dUseRow, UseRow, sizeof(bool)   * (int)m                  , cudaMemcpyHostToDevice);


	clock_t t1, t2, t3, t4;	
	/*	———————————————————————————————————————————————————————————————————————————————————————— */
	/* Time non-destructive manual matrix multiplication + reduction kernel                      */
	/*	———————————————————————————————————————————————————————————————————————————————————————— */
	
	// Count active variables in UseCol and store their indices.
	uint64_T *UseColindex, *smalln;
	UseColindex = (uint64_T*)malloc(sizeof(uint64_T) * (int)k * (int)n); // [k-by-n] (overallocated)
	smalln = (uint64_T*)calloc(k, sizeof(uint64_T)); // [k-by-1]
	for (int c = 0; c < n; c++) {
		for (int r = 0; r < k; r++) {
			if (UseCol[r + k*c] == true) {
				UseColindex[r + k*smalln[r]] = c;
				smalln[r] += 1;
			}
		}
	}
	// Copy to device.
	uint64_T *dsmalln, *dUseColindex;
	cudaMalloc(&dsmalln, sizeof(uint64_T)   * (int)k);
	cudaMemcpy(dsmalln, smalln, sizeof(uint64_T) * (int)k, cudaMemcpyHostToDevice);
	cudaMalloc(&dUseColindex, sizeof(uint64_T)   * (int)k * (int)n);
	cudaMemcpy(dUseColindex, UseColindex, sizeof(uint64_T) * (int)k * (int)n, cudaMemcpyHostToDevice);

	// Count active observations in UseRow and store their indices.
	uint64_T *UseRowIndex;
	size_t smallm;
	UseRowIndex = (uint64_T*)malloc(sizeof(uint64_T) * (int)m); // [m-by-1] (overallocated)
	smallm = 0;
	for (int r = 0; r < m; r++) {
		if (UseRow[r] == true) {
			UseRowIndex[smallm] = r;
			smallm++;
		}
	}
	// Copy to device.
	uint64_T *dUseRowIndex;
	cudaMalloc(&dUseRowIndex, sizeof(uint64_T)   * (int)m);
	cudaMemcpy(dUseRowIndex, UseRowIndex, sizeof(uint64_T) * (int)m, cudaMemcpyHostToDevice);

	/* Specify kernel launch configuration.													     */
	int xnTPB = 8; int ynTPB = 16; int znTPB = 8;
	int xnumBlocks = (smallm + xnTPB - 1) / xnTPB;
	int ynumBlocks = (n + ynTPB - 1) / ynTPB;
	int znumBlocks = (k + znTPB - 1) / znTPB;
	dim3 Nblocks(xnumBlocks, ynumBlocks, znumBlocks);
	dim3 nTPB(xnTPB, ynTPB, znTPB);

	t1 = clock();

	// Call manual matrix multiplication + reduction kernel.
	ManualMatMultRedKernel<<<Nblocks, nTPB >>>(dA, dB, dy, dSSE2, m, n, p, k, dUseColindex, dsmalln, dUseRowIndex, smallm);
	cudaDeviceSynchronize();

	t2 = clock();
	printf("Manual matmult kernel took: %.3f ms\n", (double)(t2 - t1) / CLOCKS_PER_SEC * 1000);

	/*———————————————————————————————————————————————————————————————————————————————————————— */
	/* Time non-destructive big cublasDgemm call						                       */
	/*———————————————————————————————————————————————————————————————————————————————————————— */

	cublasHandle_t hdl;
	cublasCreate(&hdl);
	double alpha = 1.0;
	double beta = 0.0; // could also prefill AB with y, set beta to -1.0  so as to compute AB - y

	// Smaller A with only rows which are true in UseRow included and AB.
	double *dSmallA;
	cudaMalloc(&dSmallA, sizeof(double)   * (int)smallm   * (int)n);
	double *dAB;
	cudaMalloc(&dAB, sizeof(double)       * (int)smallm   * (int)p  * (int)k);

	/* Remove rows from A that are not set to true in UseRow.											*/
	int xnTPB2 = 8;	int ynTPB2 = 16; 	int znTPB2 = 1;
	int xnumBlocks2 = (smallm + xnTPB2 - 1) / xnTPB2;
	int ynumBlocks2 = (n + ynTPB2 - 1) / ynTPB2;
	int znumBlocks2 = 1;
	dim3 Nblocks2(xnumBlocks2, ynumBlocks2, znumBlocks2);
	dim3 nTPB2(xnTPB2, ynTPB2, znTPB2);

	FillPartialA<<<Nblocks2, nTPB2>>>(dA, m, n, dUseRowIndex, smallm, dSmallA);
	cudaDeviceSynchronize();
	
	t3 = clock();
	size_t pk = p*k;
	 // Calculate and time C = A*B  with cublas
	cublasDgemm(hdl,
	  CUBLAS_OP_N,  // No transpose
	  CUBLAS_OP_N,  // No transpose
		   smallm,  // rows of dSmallA
		       pk,	// columns of dB
		        n,  // columns of dSmallA
		   &alpha,  // scalar alpha
	  	  dSmallA,  // dSmallA matrix
  		   smallm,  // LDA
		       dB,  // dB matrix
		        n,  // LDB
		    &beta,  // scalar beta
		      dAB,  // output matrix [smallm-by-p-by-k]
		   smallm); // LDC
	cudaDeviceSynchronize();
	t4 = clock();

	printf("dgemm alone took: %.3f ms\n", (double)(t4 - t3) / CLOCKS_PER_SEC * 1000);

	cublasDestroy(hdl);
	cudaFree(dSmallA);
	cudaFree(dAB);
	free(UseColindex);
	free(UseRowIndex);
	free(smalln);
	cudaFree(dUseColindex);
	cudaFree(dUseRowIndex);
	cudaFree(dsmalln);
	
	/* Free allocated memory */
	cudaFree(dA);
	cudaFree(dB);
	cudaFree(dy);
	cudaFree(dSSE1);
	cudaFree(dSSE2);
	cudaFree(dUseCol);
	cudaFree(dUseRow);
}

When I run this on my GTX 1080 Ti with input dimensions A = [1e6-by-256] and B = [256-by-25-by-50], and with UseCol and UseRow set to about 20% true values, the execution time is 173 ms for the manual matrix multiplication kernel and 316 ms for dgemm.
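As a back-of-envelope sanity check (assuming roughly 20% of the 1e6 rows are active, so smallm is about 2e5): the dgemm call still runs over the full inner dimension n = 256, since only the rows of A are compacted into dSmallA, so it performs about 2 * 2e5 * 256 * (25*50) ≈ 1.3e11 floating-point operations. The manual kernel, by contrast, only loops over the roughly 20% of columns selected in UseCol, so it does on the order of five times fewer operations.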

When I profile the code with Nsight, I get 100% occupancy for my manual kernel but only 12.5% occupancy for maxwell_dgemm_128x64_raggedMn_nn.
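For what it's worth, this is roughly how I sanity-check the theoretical occupancy of my own kernel from host code, using the CUDA occupancy API (a minimal sketch; the 8*16*8 = 1024 threads-per-block value matches the launch configuration in the listing above). I obviously cannot do the same for the dgemm kernel, since cuBLAS launches that one internally:

	// Sketch: query how many blocks of ManualMatMultRedKernel can be resident
	// per SM at the block size used above (8*16*8 = 1024 threads per block).
	int maxActiveBlocksPerSM = 0;
	cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocksPerSM,
	                                              ManualMatMultRedKernel,
	                                              1024,  // threads per block
	                                              0);    // dynamic shared memory (none used)
	printf("Max resident blocks per SM at 1024 threads/block: %d\n", maxActiveBlocksPerSM);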

In the past I have also used other cuBLAS functions such as dgemv, dscal, etc., and I have always been pretty unimpressed with their performance as well (although I have no benchmarks to back that up).

What could be causing cuBLAS to be so slow on my GPU? Could it be an error in the installation of cuBLAS or the GPU driver? Why do I not get full occupancy on such a huge matrix multiplication?

I'm using CUDA 8.0.
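In case the clock()-based timing above is misleading, this is how I would time the dgemm call with CUDA events instead (a sketch only, using the same arguments as in the listing above; it is not part of the code I benchmarked):

	cudaEvent_t start, stop;
	float elapsedMs = 0.0f;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start);
	cublasDgemm(hdl, CUBLAS_OP_N, CUBLAS_OP_N, smallm, pk, n,
	            &alpha, dSmallA, smallm, dB, n, &beta, dAB, smallm);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);               // wait until the dgemm has finished
	cudaEventElapsedTime(&elapsedMs, start, stop);
	printf("dgemm (event timing): %.3f ms\n", elapsedMs);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);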

I am experiencing a similar phenomenon and am curious about the answer as well: other calculations are fast, but the cuBLAS functions are slower.