I am trying to use CUBLAS GEMV to compute a matrix-vector operation. To be sure that the concept is correct I built a version using only CPU and got the correct results however when using the GPU CUBLAS GEMV I get invalid results. I think it may be ordering but don’t know for sure.
The code is listed below. I apologize if it is messy, the project I am working on involves a lot of classes but the source of the problem is the kernel listed in the code. I commented-out the CPU code that works.
If anyone can think of a reason why cuBlasSgemv gives wrong answer but the CPU version doesn’t please let me know. Any hint/idea would be greatly appreciated.
/* Solution is held at T->L() array which is n*n*n in length */
/* T is a pointer to node class */
chebyshev* cheb = T->cheb;
precomputation* pcomp = T->pcomp;
int n = cheb->n;
int N = n*n*n;
int LIDX = T->which();
float lam = (1<<(-T->kern->lambda)*T->level());
node *J;
int row = 216*N, col = N;
std::vector<float> A(row*col);
std::vector<float> x(col);
std::vector<float> y(col);
for(int i(0); i < row; ++i){
for(int j(0); j < col; ++j){
A[col*i + j] = 0.0f;
}
}
// store in ROW-MAJOR order ...
int I = 0;
for(int l(0); l < N; ++l) {
x[l] = 1.0f;
y[l] = T->L()[l];
for(int j(0); j < 216; ++j) {
if(T->interactants()[j].pointer != NULL) {
J = T->interactants()[j].pointer;
for(int m(0); m < N; ++m) {
float mat = pcomp->matrix_M2L[LIDX][j][m][l];
A[col*I + m] = J->M()[m]*mat;
}
}
++I;
}
}
/*
// COLUMN-MAJOR access using CPU - gives correct results ...
for(int i(0); i < col; ++i) {
for(int j(0); j < row; ++j) {
y[i] += (lam*A[row*i + j]*x[i]);
}
}
for(int i(0); i < N; ++i) {
T->L()[i] = y[i]; // store back for solution
}
*/
float *d_A, *d_x, *d_y;
cublasHandle_t handle;
int size = row*col;
cudaMalloc((void**)&d_A, size*sizeof(float));
cudaMalloc((void**)&d_x, col*sizeof(float));
cudaMalloc((void**)&d_y, col*sizeof(float));
cudaMemcpy(d_A, A.data(), size*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, x.data(), col*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y.data(), col*sizeof(float), cudaMemcpyHostToDevice);
float alf = lam, beta = 1.0f;
cublasCreate(&handle);
cudaEventRecord(start, 0);
// call to cublas returns INCORRECT results
cublasSgemv(handle, CUBLAS_OP_T, col, row, &alf, d_A, col, d_x, 1, &beta, d_y, 1);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&curtime, start, stop);
time += curtime;
cudaMemcpy(T->L(), d_y, col*sizeof(float), cudaMemcpyDeviceToHost);
Edit: Could the cublasSgemv call be returning incorrect values due to a mis-match with C/C++ style array and the fact that cublas follows FORTRAN style? If so, is there examples with how to call cublasSgemv with C-style arrays?