Hello everyone,
I used the cuBLAS function cublasDgetrfBatched() to compute the LU decomposition of a square matrix. Here is the code:
/*
 * In-place LU factorisation of `batchsize` square n x n matrices using
 * cublasDgetrfBatched().
 *
 * a          Host buffer holding `batchsize` matrices stored back to back,
 *            n * n doubles each (cuBLAS interprets each one as column-major
 *            with leading dimension n). On success it is overwritten with the
 *            factors produced by cuBLAS.
 * n          Order of every matrix.
 * batchsize  Number of matrices in `a`.
 *
 * The pivot indices and the per-matrix info codes are computed on the device
 * but are not returned to the caller.
 *
 * Because the function returns void there is no channel for reporting an
 * error: if any CUDA or cuBLAS call fails, `a` is left unmodified and all
 * resources are released.
 *
 * Performance note: cublasDgetrfBatched() is asynchronous with respect to the
 * host and is intended for batches of small matrices. For one very large
 * matrix the factorisation itself is slow, and any blocking call issued
 * afterwards (such as the device-to-host copy) simply waits for it. Time the
 * section up to cudaDeviceSynchronize() separately from the copy, and consider
 * a dense solver routine such as cuSOLVER's cusolverDnDgetrf() for a single
 * large matrix.
 */
void cublas_lu(double *a, int n, int batchsize )
{
    if (!a || n <= 0 || batchsize <= 0)
        return;

    /* Do all size arithmetic in size_t: n * n overflows int for n > 46340. */
    const size_t matrixElems = (size_t)n * (size_t)n;
    const size_t batchCount  = (size_t)batchsize;
    const size_t dataBytes   = matrixElems * batchCount * sizeof(double);
    const size_t tableBytes  = batchCount * sizeof(double *);

    cublasHandle_t handle;
    if (cublasCreate_v2(&handle) != CUBLAS_STATUS_SUCCESS)
        return;

    int *P = 0, *INFO = 0;
    double *a_d = 0;
    double **A_d = 0;

    /* Host-side table of device pointers, one entry per matrix in the batch. */
    double **A_h = new double *[batchCount];

    bool ok =
        cudaMalloc(&P, (size_t)n * batchCount * sizeof(int)) == cudaSuccess &&
        cudaMalloc(&INFO, batchCount * sizeof(int)) == cudaSuccess &&
        cudaMalloc(&a_d, dataBytes) == cudaSuccess &&
        cudaMalloc(&A_d, tableBytes) == cudaSuccess &&
        cudaMemcpy(a_d, a, dataBytes, cudaMemcpyHostToDevice) == cudaSuccess;

    if (ok) {
        for (size_t i = 0; i < batchCount; ++i)
            A_h[i] = a_d + i * matrixElems;
        /* The source must be the HOST table that contains the device
           addresses, not a device address itself. */
        ok = cudaMemcpy(A_d, A_h, tableBytes, cudaMemcpyHostToDevice) == cudaSuccess;
    }

    if (ok)
        ok = cublasDgetrfBatched(handle, n, A_d, n, P, INFO, batchsize)
             == CUBLAS_STATUS_SUCCESS;

    /* Wait for the factorisation here so that execution errors surface now
       and its run time is not attributed to the copy below. */
    if (ok)
        ok = cudaDeviceSynchronize() == cudaSuccess;

    if (ok)
        cudaMemcpy(a, a_d, dataBytes, cudaMemcpyDeviceToHost);

    /* Release everything on every path; cudaFree(0) is a harmless no-op. */
    delete[] A_h;
    cudaFree(A_d);
    cudaFree(a_d);
    cudaFree(INFO);
    cudaFree(P);
    cublasDestroy_v2(handle);
}
Everything works fine, except that the final copy of the data from device to host:
cudaMemcpy( a, a_d, n * n * sizeof(double),cudaMemcpyDeviceToHost);
takes far too long: for a 10000x10000 matrix it takes an unbelievable ~260 sec…
Copying this amount of data should typically take ~1-2 sec at most.
Interestingly, when I run the code without cublasDgetrfBatched(), the copy takes a “normal” 2-3 sec, as expected.
I tried cudaDeviceSynchronize(), cudaMemcpyAsync() and cublasGetMatrix(), and nothing helps :(
Can anybody help?