At this point I have decent results but ONLY with square images, or rather images that only contain “on” pixels in the top-left-most square region of the image. I have no idea why this might be, but there are injected artifacts that I cannot explain. This is the function I am using to read a sparsely populated edge-detected image and generate its transpose using CSC->CSR.
void cuSPARSE_T(float* f_image)
{
cusparseStatus_t cs_status;
cusparseHandle_t cs_handle;
cusparseMatDescr_t descr;
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
cusparseDirection_t dir = CUSPARSE_DIRECTION_COLUMN;
cudaError_t err;
int nnzTotal;
int* nnzPerRowCol;
float* csrVal;
int* csrRowPtr;
int* csrColInd;
float* cscVal;
int* cscRowInd;
int* cscColPtr;
cs_status = cusparseCreate(&cs_handle);
if (cs_status != CUSPARSE_STATUS_SUCCESS)
{
printf("\ncuSparse initilization failed: %d\n", cs_status);
exit(1);
}
cs_status = cusparseCreateMatDescr(&descr);
if (cs_status != CUSPARSE_STATUS_SUCCESS)
{
printf("\ncuSparse Mat Descr failed: %d\n", cs_status);
exit(1);
}
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
float* d_f_image;
err = cudaMalloc((void**)&d_f_image, sizeof(float) * ip.h * ip.w);
if (err != CUDA_SUCCESS)
{
printf("\nd_f_image Alloc failed: %d", err);
exit(1);
}
err = cudaMemcpy(d_f_image, f_image, sizeof(float) * ip.h * ip.w, cudaMemcpyHostToDevice);
if (err != CUDA_SUCCESS)
{
printf("\nd_f_image memcopy (H2D) failed: %d", err);
exit(1);
}
cudaMalloc((void**)&nnzPerRowCol, sizeof(int) * ip.w);
cs_status = cusparseSnnz(cs_handle, dir, ip.w, ip.h, descr, d_f_image, ip.w, nnzPerRowCol, &nnzTotal);//whw
if (cs_status != CUSPARSE_STATUS_SUCCESS)
{
printf("\ncuSparse NNZ failed: %d\n", cs_status);
exit(1);
}
printf("\nnnzTotal : %d", nnzTotal);
printf("\nDensity : %1.1f%%", 100 * (float)nnzTotal / (ip.w * ip.h));
printf("\ncuSparse NNZ -> success");
cudaMalloc((void**)&csrVal, sizeof(float) * nnzTotal);
cudaMalloc((void**)&csrRowPtr, sizeof(int) * (ip.h + 1));
cudaMalloc((void**)&csrColInd, sizeof(int) * nnzTotal);
cudaMalloc((void**)&cscVal, sizeof(float) * nnzTotal);
cudaMalloc((void**)&cscRowInd, sizeof(int) * nnzTotal);
cudaMalloc((void**)&cscColPtr, sizeof(int) * (ip.w + 1));
//convert to sparse CSC
cs_status = cusparseSdense2csc(cs_handle, ip.w, ip.h, descr, d_f_image, ip.w, nnzPerRowCol, cscVal, cscRowInd, cscColPtr);//whw
if (cs_status != CUSPARSE_STATUS_SUCCESS)
{
printf("\ncuSparse DENSE -> CSC failed: %d\n", cs_status);
exit(1);
}
printf("\ncuSparse DENSE -> CSC success");
//implicit transpose by changing from CSR to CSC format
cs_status = cusparseScsr2csc(cs_handle, ip.w, ip.h, nnzTotal, cscVal, cscColPtr, cscRowInd, csrVal, csrColInd, csrRowPtr, copyValues, idxBase);//wh
if (cs_status != CUSPARSE_STATUS_SUCCESS)
{
printf("\ncuSparse CSC -> CSR failed: %d\n", cs_status);
exit(1);
}
printf("\ncuSparse CSC -> CSR success");
//convert to dense matrix (image output)
cs_status = cusparseScsr2dense(cs_handle, ip.h, ip.w, descr, cscVal, cscColPtr, cscRowInd, d_f_image, ip.h);//hwh
if (cs_status != CUSPARSE_STATUS_SUCCESS)
{
printf("\ncuSparse CSR -> DENSE failed: %d\n", cs_status);
exit(1);
}
printf("\ncuSparse CSR -> DENSE success");
cudaDeviceSynchronize();
cudaMemcpy(f_image, d_f_image, sizeof(float) * ip.h * ip.w, cudaMemcpyDeviceToHost);
cudaFree(d_f_image);
cudaFree(csrVal);
cudaFree(csrRowPtr);
cudaFree(csrColInd);
cudaFree(cscVal);
cudaFree(cscRowInd);
cudaFree(cscColPtr);
cusparseDestroyMatDescr(descr);
cusparseDestroy(cs_handle);
}
…after looking at this one more time, I found it necessary to swap the lengths of csrRowPtr and cscColPtr on lines 65 and 70. This fixed the problem. Column-major indexing tripped me up many times through this.