Cublas, sum and dot. Newbie question.

I am trying to use cuBLAS to compute the sum of the elements of a vector and a vector dot product. My problem is that I can’t get the results back, due to a problem with the memory allocation of the double/pointer that will hold the result at the end of the computation.

/* Problem size, plus host (h_*) and device (d_*) pointers for the two input vectors. */
int  vector_size = 3;

double* h_M;

  double* h_H;

  double* d_M = 0;

  double* d_H = 0;

cublasStatus_t status;

  cublasHandle_t handle;

/* Create the cuBLAS context; the cublasDasum/cublasDdot calls later in this
   snippet take `handle` as their first argument. */
status = cublasCreate(&handle);

if (status != CUBLAS_STATUS_SUCCESS) {

        fprintf (stderr, "!!!! CUBLAS initialization error\n");

        return EXIT_FAILURE;

  }

/* Allocate host memory for the two input vectors (vector_size doubles each) */

  h_M = (double*)malloc( vector_size * sizeof(h_M[0]));

  if (h_M == 0) {

    fprintf (stderr, "!!!! host memory allocation error (M)\n");

    return EXIT_FAILURE;

  }

h_H = (double*)malloc( vector_size * sizeof(h_H[0]));

  if (h_H == 0) {

    /* NOTE(review): h_M is still allocated on this early return; it is not freed here. */
    fprintf (stderr, "!!!! host memory allocation error (H)\n");

    return EXIT_FAILURE;

  }

/* Test data, hard-coded for three elements: h_M = (1, 0, 0), h_H = (0, 1, 0).
   For these inputs the sum of absolute values of h_M is 1 and the dot
   product of h_M and h_H is 0. */
h_M[0]=1.0;

  h_M[1]=0.0;

  h_M[2]=0.0;

h_H[0]=0.0;

  h_H[1]=1.0;

  h_H[2]=0.0;

/* Allocate device memory for the two vectors (vector_size doubles each).
   The "A"/"B" labels in the error messages refer to d_M and d_H respectively. */

    if (cudaMalloc((void**)&d_M, vector_size * sizeof(d_M[0])) != cudaSuccess) {

        fprintf (stderr, "!!!! device memory allocation error (allocate A)\n");

        return EXIT_FAILURE;

    }

    if (cudaMalloc((void**)&d_H, vector_size * sizeof(d_H[0])) != cudaSuccess) {

        fprintf (stderr, "!!!! device memory allocation error (allocate B)\n");

        return EXIT_FAILURE;

    }

/* Initialize the device vectors with the host vectors: each call copies
   vector_size elements of sizeof(double) bytes, with stride 1 on both sides. */

  status = cublasSetVector(vector_size, sizeof(h_M[0]), h_M, 1, d_M, 1);

  if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device access error (write M)\n");

    return EXIT_FAILURE;

  }

  status = cublasSetVector(vector_size, sizeof(h_H[0]), h_H, 1, d_H, 1);

  if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device access error (write H)\n");

    return EXIT_FAILURE;

  }

/* Sum of absolute values of the vector_size elements of d_M; cuBLAS writes
   the scalar through the last argument. */
/* NOTE(review): `result` is used here, but the only declaration visible in
   this snippet (`double* result = 0;`) appears further down, so as shown
   this line refers to an undeclared name. */
/* NOTE(review): per the cuBLAS documentation a new handle is in host pointer
   mode, where the last argument is expected to be a valid host address
   (e.g. the address of a local `double`) - TODO confirm against the cuBLAS
   version in use. */

status = cublasDasum(handle, vector_size, d_M,1,result);

  if (status != CUBLAS_STATUS_SUCCESS) {

      fprintf (stderr, "!!!! kernel execution error.\n");

      return EXIT_FAILURE;

  }

//   /* Read the result back */

   /* NOTE(review): `sum` is a null pointer. It is passed below as the
      destination of the copy and then dereferenced when printing, so both
      accesses go through address 0. */
   double* sum = 0;

   /* NOTE(review): the element size passed here is sizeof(void**), i.e. the
      size of a pointer, while the data being copied is a double; the two
      sizes only coincide on platforms where pointers are 8 bytes. */
   status = cublasGetVector(1, sizeof(void**), result, 1, sum, 1);// I only require the first element of result, that's why I chose 1 as vector length, is that ok?

   if (status != CUBLAS_STATUS_SUCCESS) {

     fprintf (stderr, "!!!! device access error (read C)\n");

     return EXIT_FAILURE;

   } 

cout << *sum << endl;

/* Dot product of the vector_size-element device vectors d_M and d_H; cuBLAS
   writes the scalar through the last argument. */

   /* NOTE(review): `result` is a null pointer. It is handed to cublasDdot as
      the output location and dereferenced when printing, so no storage ever
      backs the result. A plain `double` whose address is passed would
      provide that storage (host pointer mode is the documented default -
      TODO confirm for the cuBLAS version in use). */
   double* result = 0;

   status = cublasDdot(handle, vector_size, d_M,1,d_H,1, result);// is it ok to not have allocated result into the memory, the manual says that it could be located in the host.

   if (status != CUBLAS_STATUS_SUCCESS) {

     fprintf (stderr, "!!!! kernel execution error.\n");

     return EXIT_FAILURE;

   }

cout << *result << endl;

/* Memory clean up: release the two device buffers. */
/* NOTE(review): in the code shown, the host buffers h_M and h_H are never
   passed to free() and `handle` is never passed to cublasDestroy(). */

   if (cudaFree(d_H) != cudaSuccess) {

     fprintf (stderr, "!!!! memory free error (H)\n");

     return EXIT_FAILURE;

   }

   if (cudaFree(d_M) != cudaSuccess) {

     fprintf (stderr, "!!!! memory free error (M)\n");

     return EXIT_FAILURE;

   }

I can’t find the mistake, and I get a “Segmentation fault” when I run the executable. I’m really puzzled about what’s wrong; your help would be much appreciated.

many thanks!

I have a question. After the line `double* sum = 0;`, shouldn’t you allocate `sum` with room for one element using malloc? Do you know at which line it crashes?

I don’t know whether I have to allocate it. The manual entry for this function says that the result argument (“dot”) can reside on either the host or the device. So my question is: does cuBLAS fill it in automatically on the host, or do I have to allocate it on the device and then copy it back to the host?

(I’ll try meanwhile)

It fails at the BLAS operations.

Ok, I tried this

// /* Performs operation using cublas */

/* Second attempt: keep the dot-product result in device memory and copy it
   back to the host afterwards. */
double*  result ;

/* NOTE(review): the closing parenthesis of cudaMalloc is misplaced. As
   written, the size argument is the comparison `sizeof(double) != cudaSuccess`
   (which evaluates to 1, since cudaSuccess is 0), so one byte is requested
   rather than sizeof(double) bytes, and the `if` tests cudaMalloc's raw
   return value. */
if (cudaMalloc((void**)&result , sizeof(double) != cudaSuccess)) { //*result &result result double

    fprintf (stderr, "!!!! device memory allocation error (allocate result)\n");

    return EXIT_FAILURE;

  }

/* NOTE(review): `result` is a device address here. Per the cuBLAS
   documentation a device result pointer requires
   cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); no such call is
   visible in this thread - TODO confirm. */
status = cublasDdot(handle, vector_size, d_M,1,d_H,1, result);

  if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! kernel execution error.\n");

    return EXIT_FAILURE;

  }

/* NOTE(review): `dot` is an uninitialized pointer. */
double* dot;

  /* NOTE(review): cudaMemcpy takes (dst, src, count, kind). Here the
     destination is the address of the host variable `result` itself and the
     source is the uninitialized `dot`, so the device value that `result`
     points to is never read. The return code is also ignored. */
  cudaMemcpy( &result, dot, sizeof(double),cudaMemcpyDeviceToHost);

/* NOTE(review): this streams the pointer `dot`, not a double value. */
cout <<   dot << endl;

Again, if I comment out the cublasDdot call it runs and prints 0. If I keep the cublasDdot call, it reports that the operation failed, and I’m puzzled.

thanks again for your advice.

I found the solution; it was really an easy problem once I figured it out. The declaration of the result variable was incorrect: I hadn’t allocated any memory for it. It should have been `double* sum = new double[1];` and not just `double* sum`.

Congrats. You discovered the same thing I said previously.

Hi guys, I’m new to cuBLAS… I’m trying to use cublasDdot and I’m not sure why I’m getting the answer t = 0.
It would be great if you could take a look. Cheers, Ken

// NOTE(review): the header names of the original include list were lost when
// the post was extracted (only bare `#include` tokens remained). The list
// below is the set main() actually needs - TODO confirm against the original.
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
int i,n;
double *x, *y, *t;
double *d_x, *d_y, *d_t;
n = 100;

x = (double*)malloc(n *sizeof(double));
y = (double*)malloc(n *sizeof(double));
t = (double*)malloc(sizeof(double));

for( i = 0; i < n; i++)
{
	x[i] = i;
	y[i] = i + 3;
}
cudaMalloc((void **)&d_x, n*sizeof(double));
cudaMalloc((void **)&d_y, n*sizeof(double));
cudaMalloc((void **)&d_t, sizeof(double));
cudaMemcpy(d_x, x, n*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, n*sizeof(double), cudaMemcpyHostToDevice);

cublasHandle_t handle;
cublasCreate(&handle);
cublasDdot(handle, n, d_x, 1, d_y, 1, d_t);
cudaMemcpy(t, d_t, sizeof(double), cudaMemcpyDeviceToHost);
printf("GPU = %lf\n",  t);
system("PAUSE");

free(x);
free(y);
free(t);
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_t);
cublasDestroy(handle);
return 0;

}