Hello,
I tried to use cusparseSnnz but I couldn’t get proper results then I try to run and compile the example cusparse.cu from the book “Professional Cuda C programming” but also I cannot get good results. I try to print the vectors of the matrix in the CSR format but they are not as I expected.
I add the version with some modifications. Thank you.
#include “…/common/common.h”
#include <stdio.h>
#include <stdlib.h>
#include <cusparse_v2.h>
#include <cuda.h>
/*
- This is an example demonstrating usage of the cuSPARSE library to perform a
- sparse matrix-vector multiplication on randomly generated data.
*/
/*
- M = # of rows
- N = # of columns
*/
int M = 3;
int N = 3;
/*
-
Generate a vector of length N with random single-precision floating-point
-
values between 0 and 100.
*/
void generate_random_vector(int N, float **outX)
{
int i;
double rMax = (double)RAND_MAX;
float *X = (float *)malloc(sizeof(float) * N);for (i = 0; i < N; i++)
{
int r = rand();
double dr = (double)r;
X[i] = (dr / rMax) * 100.0;
}*outX = X;
}
/*
-
Generate random dense matrix A in column-major order, while rounding some
-
elements down to zero to ensure it is sparse.
*/
int generate_random_dense_matrix(int M, int N, float **outA)
{
int i, j;
double rMax = (double)RAND_MAX;
float *A = (float *)malloc(sizeof(float) * M * N);
int totalNnz = 0;for (j = 0; j < N; j++)
{
for (i = 0; i < M; i++)
{
int r = rand();
float *curr = A + (j * M + i);if (r % 3 > 0) { *curr = 0.0f; } else { double dr = (double)r; *curr = (dr / rMax) * 100.0; } if (*curr != 0.0f) { totalNnz++; } }
}
*outA = A;
return totalNnz;
}
int main(int argc, char **argv)
{
int row;
float *A, *dA;
int *dNnzPerRow;
float *dCsrValA;
int *dCsrRowPtrA;
int *dCsrColIndA;
int totalNnz;
float alpha = 3.0f;
float beta = 4.0f;
float *dX, *X;
float *dY, *Y;
cusparseHandle_t handle = 0;
cusparseMatDescr_t descr = 0;
// Generate input
srand(9384);
int trueNnz = generate_random_dense_matrix(M, N, &A);
generate_random_vector(N, &X);
generate_random_vector(M, &Y);
// Create the cuSPARSE handle
CHECK_CUSPARSE(cusparseCreate(&handle));
// Allocate device memory for vectors and the dense form of the matrix A
CHECK(cudaMalloc((void **)&dX, sizeof(float) * N));
CHECK(cudaMalloc((void **)&dY, sizeof(float) * M));
CHECK(cudaMalloc((void **)&dA, sizeof(float) * M * N));
CHECK(cudaMalloc((void **)&dNnzPerRow, sizeof(int) * M));
// Construct a descriptor of the matrix A
CHECK_CUSPARSE(cusparseCreateMatDescr(&descr));
CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CHECK_CUSPARSE(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
// Transfer the input vectors and dense matrix A to the device
CHECK(cudaMemcpy(dX, X, sizeof(float) * N, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(dY, Y, sizeof(float) * M, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(dA, A, sizeof(float) * M * N, cudaMemcpyHostToDevice));
int *nnzh=NULL;
// Compute the number of non-zero elements in A
CHECK_CUSPARSE(cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA,
M, dNnzPerRow, &totalNnz));
nnzh = (int *)malloc(sizeof(int)*N);
cudaMemcpy(dNnzPerRow,nnzh, N*sizeof(int), cudaMemcpyDeviceToHost);
for( int i=0 ; i<N ;i++)
{
printf(“nnzh[%d]=%d\n”,i,nnzh[i]);
}
printf(“totalNnz=%d\n”,totalNnz);
if (totalNnz != trueNnz)
{
fprintf(stderr, "Difference detected between cuSPARSE NNZ and true "
"value: expected %d but got %d\n", trueNnz, totalNnz);
return 1;
}
// Allocate device memory to store the sparse CSR representation of A
CHECK(cudaMalloc((void **)&dCsrValA, sizeof(float) * totalNnz));
CHECK(cudaMalloc((void **)&dCsrRowPtrA, sizeof(int) * (M + 1)));
CHECK(cudaMalloc((void **)&dCsrColIndA, sizeof(int) * totalNnz));
// Convert A from a dense formatting to a CSR formatting, using the GPU
CHECK_CUSPARSE(cusparseSdense2csr(handle, M, N, descr, dA, M, dNnzPerRow,
dCsrValA, dCsrRowPtrA, dCsrColIndA));
// Perform matrix-vector multiplication with the CSR-formatted matrix A
CHECK_CUSPARSE(cusparseScsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
M, N, totalNnz, &alpha, descr, dCsrValA,
dCsrRowPtrA, dCsrColIndA, dX, &beta, dY));
// Copy the result vector back to the host
CHECK(cudaMemcpy(Y, dY, sizeof(float) * M, cudaMemcpyDeviceToHost));
for (row = 0; row < 10; row++)
{
printf("%2.2f\n", Y[row]);
}
printf("...\n");
free(A);
free(X);
free(Y);
CHECK(cudaFree(dX));
CHECK(cudaFree(dY));
CHECK(cudaFree(dA));
CHECK(cudaFree(dNnzPerRow));
CHECK(cudaFree(dCsrValA));
CHECK(cudaFree(dCsrRowPtrA));
CHECK(cudaFree(dCsrColIndA));
CHECK_CUSPARSE(cusparseDestroyMatDescr(descr));
CHECK_CUSPARSE(cusparseDestroy(handle));
return 0;
}