example of sparse LL^t or QR functionality

Hi All,
where can i find example of solving Ax=b on card for sparse matrix A using QR or LU decomposition? I implemnted it myself but got segfault…
Thanks,
Alex

the new cusolver library in CUDA 7.0 provides a sparse API that includes functions that may be useful:

[url]https://docs.nvidia.com/cuda/cusolver/index.html#cusolver-high-level-function-reference[/url]

although it’s not exactly what you describe, there is a batched sparse solver example code in the doc:

[url]https://docs.nvidia.com/cuda/cusolver/index.html#csrqr_batch_examples[/url]

Hi,
I’ve implemented example but it segfault on all matrices from florida collection exсept small matrix LF10_M.mtx. Other (hd4800b.mtx, ex33.mtx) return segfault on cusolverSpXcsrqrAnalysisBatched. Could you help me with this issue? Example below:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include <cusolverSp.h>
#include <cuda_runtime_api.h>

#include <time.h>
#include <sys/time.h>

#define SPBLAS_DATA double

//#define _UPPER_TRIANGLE

SPBLAS_DATA zero = 0.0;
SPBLAS_DATA one = 1.0;
SPBLAS_DATA quarter1 = 1.25;

#include “service.h”

int read_matrix2(const char * filename, matrix *a, int k)
{
FILE *fp;
int i;

printf("starting reading data from %s file ... \n", filename); fflush(0); 

int tmp1 = 0;
fp = fopen(filename, "r");
char tmp;
while (1){
	if ((tmp = getc(fp)) == '%')
	{
		while (tmp != '\n')
			fscanf(fp, "%c", &tmp);
	}
	else { goto read; }
}

read:
fseek(fp, -1, SEEK_CUR);

fscanf(fp, "%d %d %d \n", &(a->m), &(a->n), &(a->nnz));
a->n = a->m;

int* ia = (int*)malloc(sizeof(int)* (a->m + 1));
int* ja = (int*)malloc(sizeof(int)* (a->nnz));
double* val = (SPBLAS_DATA*)malloc(sizeof(SPBLAS_DATA)* (a->nnz));

int nnzA = a->nnz;
int m = a->m;
a->k = k;
int* ax = (int*)malloc(sizeof(int)* (a->nnz));
int* ay = (int*)malloc(sizeof(int)* (a->nnz));
double* av = (SPBLAS_DATA*)malloc(sizeof(SPBLAS_DATA)* (a->nnz));

for (i = 0; i<nnzA; i++)
{
	fscanf(fp, "%d %d %lf\n", &ax[i], &ay[i], &av[i]);
}
fclose(fp);

a->y = (SPBLAS_DATA*)malloc(sizeof(SPBLAS_DATA)* a->m * a->k);
a->x = (SPBLAS_DATA*)malloc(sizeof(SPBLAS_DATA)* a->n * a->k);

int counter = 0; // number of non-zero elements in upper triangle part of the matrix
int j;


for (i = 0; i < m+1; i++)
{
	ia[i] = 0;
}

for (i = 0; i < nnzA; i++)
{
	ia[ax[i]+1]++;
}

ia[0] = 1;
for (i = 1; i < m+1; i++)
{
	ia[i] += ia[i - 1];
}

for (i = 0; i < nnzA; i++)
{
	int j = ax[i];
	 ja[ia[j - 1] - 1] = ay[i];
	val[ia[j - 1] - 1] = av[i];
	ia[j - 1]++;
}

int temp1 = ia[0];
int temp2 = ia[0];
ia[0] = 1;
for (i = 1; i<m; i++)
{
	temp2 = ia[i];
	ia[i] = temp1;
	temp1 = temp2;
}

free(ax);
free(ay);
free(av);

for (i = 0; i < m; i++)
{
	int startRow = ia[i];
	int endRow = ia[i + 1] - 1;
	int ib1 = 1;
	int sorted = 0;
	int elem;
	while (!sorted)
	{
		sorted = 1;
		for (elem = startRow - ib1; elem < endRow - ib1 - 1; elem++)
		{

			const int col1 = ja[elem];
			const int col2 = ja[elem + 1];
			if (col1 > col2)
			{
				// exchange elements
				sorted = 0; // next iteration required
				ja[elem] = col2;
				ja[elem + 1] = col1;
				double tmp;
				tmp= val[elem];
				val[elem] = val[elem + 1];
				val[elem + 1] = tmp;
			}
		}
	}
}

a->ia = ia;
a->ja = ja;
a->val = val;

for (i = 0; i < a->m * a->k; i++)
	a->y[i] = zero;
for (i = 0; i < a->n * a->k; i++)
	a->x[i] = one;

return 0;

}

cusolverStatus_t test_solver(cusolverSpHandle_t *handle, cusparseMatDescr_t *descr, matrix *a, matrix *ca, FILE fp, const char * filename)
{
const int ninner = 5;
const int nouter = 1;
const int max_ext = 1;
int st, ext, i, j;
double res[nouter
max_ext];
int counter = 0;
cudaError_t stat;
double cuone = 1.0;
cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;

int ind = a->ia[0];

for (i = 0; i < a->n; i++)
{
	a->y[i] = 0.;
	a->x[i] = 1.;
}

for (i = 0; i < a->n; i++)
{
	for (j = a->ia[i]-ind; j < a->ia[i + 1] - ind; j++)
	{
		a->y[i] += (a->val[j])*(a->x[j]);
	}
}

for (ext = 0; ext < max_ext; ++ext)
{
	for (st = 0; st < nouter; st++)
	{


			double tol = 1.e-10;
			int reorder = 0, batchSize = 1;
			int singularity;
			size_t size_internal, workspaceInBytes;
			void *pBuffer = NULL;
			cudaError_t cudaStat1 = cudaSuccess;

			cudaStat1 = cudaMemcpy(ca->y, a->y, sizeof(double)*(a->n), cudaMemcpyHostToDevice);
			assert(cudaStat1 == cudaSuccess);
			for (i = 0; i < a->n; i++)
			{
				a->y[i] = 0.;
				a->x[i] = 0.;
			}

			csrqrInfo_t info = NULL;
			cusolverSpCreateCsrqrInfo(&info);

			cusolver_status = cusolverSpXcsrqrAnalysisBatched(*handle, a->m, a->m, a->nnz, *descr, ca->ia, ca->ja, info); 

			assert(cusolver_status == CUSOLVER_STATUS_SUCCESS); 

			cusolver_status = cusolverSpDcsrqrBufferInfoBatched(*handle, a->m, a->m, a->nnz, *descr, ca->val, ca->ia, ca->ja, batchSize, info, &size_internal, &workspaceInBytes);
			assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

			cudaError_t cudaStat = cudaMalloc(&pBuffer, workspaceInBytes);
			assert(cudaStat == cudaSuccess);

			double start = read_timer();

			for (i = 0; i < ninner; i++)
			{
				cusolver_status = cusolverSpDcsrqrsvBatched(*handle, a->m, a->m, a->nnz, *descr, ca->val, ca->ia, ca->ja, ca->y, ca->x, batchSize, info, pBuffer);
				assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);
			}

			double finish = read_timer();
			double seconds = finish - start;

			cusolverSpDestroyCsrqrInfo(info);

			cudaStat1 = cudaMemcpy(a->x, ca->x, sizeof(double)*(a->m)*batchSize, cudaMemcpyDeviceToHost);
			assert(cudaStat1 == cudaSuccess);

		double gflops = 1.;
			printf("Perf: %.3lf Gflop/sec Cycles:%7.5f\n", gflops, seconds/ninner); fflush(0);
	}
	fprintf(fp, "%s,  %d, %d, %d, %d\n", filename, a->n, a->m, a->k, a->nnz);


}

return cusolver_status;

}

int main(int argc, char*argv)
{
char filename[1024], outfile[1024];
FILE *fp;
matrix a, ca;

cusolverSpHandle_t cusolverH = NULL;
csrqrInfo_t info = NULL;
cusparseMatDescr_t descrA = NULL;

cusparseStatus_t cusparse_status = CUSPARSE_STATUS_SUCCESS;
cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
cudaError_t cudaStat5 = cudaSuccess;

int *d_csrRowPtrA = NULL;
int *d_csrColIndA = NULL;
double *d_csrValA = NULL;
double *d_b = NULL; // batchSize * m
double *d_x = NULL; // batchSize * m

size_t size_qr = 0;
size_t size_internal = 0;
void *buffer_qr = NULL; 

// Set up the library handle and data
int m, nnzA;

if (argc < 3)
{
	printf("Usage: ./executable matrix.in matrix.out\n");
	return -1;
}
sprintf(filename, "%s", argv[1]);
sprintf(outfile, "%s", argv[2]);

if (read_matrix2(filename, &a, 1) != 0)
	exit(1); 

/* if (test_matrix(&a, 1) != 0)
exit(1); */

fp = fopen(outfile, "a");

if (fp == NULL)
{
	printf("WARNING! Can't open output file '\%' for writing \n", outfile);
}
else 
{
	fprintf(fp, "matrix name,  nrow, ncol, nnz, GFlop/s\n");
}

ca.n = a.n;
ca.m = a.n;
ca.k = a.k;
ca.nnz = a.nnz;

if (alloc_cuda_matrix(&ca) != 0)
	exit(1);

nnzA = a.nnz;
m = a.n;

cusolver_status = cusolverSpCreate(&cusolverH);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

cusparse_status = cusparseCreateMatDescr(&descrA);
assert(cusparse_status == CUSPARSE_STATUS_SUCCESS);

cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE); // base-1

cusolver_status = cusolverSpCreateCsrqrInfo(&info);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

if (copy2card(&a, &ca) != 0)
	exit(1);

test_solver(&cusolverH, &descrA, &a, &ca, fp, filename);

return 0;

}