Cublas, sum and dot. Newbie question.
I am trying to use cuBLAS to compute the sum of the elements of a vector and a vector dot product. My problem is that I can't get the results back, due to a problem with the memory allocation of the double/pointer that will contain the result at the end of the computation.


[code]


// Excerpt from inside a function that returns an int status; the enclosing
// signature and the #include lines are not part of the post.
int vector_size = 3;

// h_* pointers are filled with malloc (host); d_* pointers with cudaMalloc (device).
double* h_M;
double* h_H;
double* d_M = 0;
double* d_H = 0;

cublasStatus_t status;
cublasHandle_t handle;

status = cublasCreate(&handle);

if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}


/* Allocate host memory for the two vectors */
h_M = (double*)malloc( vector_size * sizeof(h_M[0]));
if (h_M == 0) {
fprintf (stderr, "!!!! host memory allocation error (M)\n");
return EXIT_FAILURE;
}

h_H = (double*)malloc( vector_size * sizeof(h_H[0]));
if (h_H == 0) {
fprintf (stderr, "!!!! host memory allocation error (H)\n");
return EXIT_FAILURE;
}


/* M = (1, 0, 0) and H = (0, 1, 0) */
h_M[0]=1.0;
h_M[1]=0.0;
h_M[2]=0.0;

h_H[0]=0.0;
h_H[1]=1.0;
h_H[2]=0.0;



/* Allocate device memory for the two vectors */
if (cudaMalloc((void**)&d_M, vector_size * sizeof(d_M[0])) != cudaSuccess) {
fprintf (stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void**)&d_H, vector_size * sizeof(d_H[0])) != cudaSuccess) {
fprintf (stderr, "!!!! device memory allocation error (allocate B)\n");
return EXIT_FAILURE;
}

/* Copy the host vectors into the device buffers */
status = cublasSetVector(vector_size, sizeof(h_M[0]), h_M, 1, d_M, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write M)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(vector_size, sizeof(h_H[0]), h_H, 1, d_H, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write H)\n");
return EXIT_FAILURE;
}

/* Performs operation using cublas */
// NOTE(review): `result` is not declared until further down (just before the
// cublasDdot call), so this line cannot compile as posted.
status = cublasDasum(handle, vector_size, d_M,1,result);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}


// /* Read the result back */
// NOTE(review): `sum` is a null pointer. It is handed to cublasGetVector as
// the destination and dereferenced by the cout below, while no storage is
// ever assigned to it in this excerpt.
double* sum = 0;
// NOTE(review): the element size passed here is sizeof(void**) (the size of a
// pointer), whereas the value being copied is a double.
status = cublasGetVector(1, sizeof(void**), result, 1, sum, 1);// I only require the first element of result, that's why I chose 1 as vector length, is that ok?
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}

cout << *sum << endl;



/* Performs operation using cublas */
// NOTE(review): `result` is null when it is passed as the output argument of
// cublasDdot and when it is dereferenced by the cout below. The result may be
// placed in host or device memory, but it has to point at real storage either
// way (for example `double result;` passed as `&result` when the handle uses
// its default host pointer mode) -- confirm against the cuBLAS reference.
double* result = 0;
status = cublasDdot(handle, vector_size, d_M,1,d_H,1, result);// is it ok to not have allocated result into the memory, the manual says that it could be located in the host.
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}


cout << *result << endl;

/* Memory clean up */
// NOTE(review): only the device buffers are released here; free(h_M),
// free(h_H) and cublasDestroy(handle) do not appear in this excerpt.
if (cudaFree(d_H) != cudaSuccess) {
fprintf (stderr, "!!!! memory free error (H)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_M) != cudaSuccess) {
fprintf (stderr, "!!!! memory free error (M)\n");
return EXIT_FAILURE;
}

[/code]

I can't find the mistake, and I get a "Segmentation fault" when I run the executable. I'm really puzzled about what's wrong; your help would be much appreciated.
many thanks!
I am trying to use cuBLAS to compute the sum of the elements of a vector and a vector dot product. My problem is that I can't get the results back, due to a problem with the memory allocation of the double/pointer that will contain the result at the end of the computation.











// Excerpt from inside a function that returns an int status; the enclosing
// signature and the #include lines are not part of the post.
int vector_size = 3;



// h_* pointers are filled with malloc (host); d_* pointers with cudaMalloc (device).
double* h_M;

double* h_H;

double* d_M = 0;

double* d_H = 0;



cublasStatus_t status;

cublasHandle_t handle;



status = cublasCreate(&handle);



if (status != CUBLAS_STATUS_SUCCESS) {

fprintf (stderr, "!!!! CUBLAS initialization error\n");

return EXIT_FAILURE;

}





/* Allocate host memory for the two vectors */

h_M = (double*)malloc( vector_size * sizeof(h_M[0]));

if (h_M == 0) {

fprintf (stderr, "!!!! host memory allocation error (M)\n");

return EXIT_FAILURE;

}



h_H = (double*)malloc( vector_size * sizeof(h_H[0]));

if (h_H == 0) {

fprintf (stderr, "!!!! host memory allocation error (H)\n");

return EXIT_FAILURE;

}





/* M = (1, 0, 0) and H = (0, 1, 0) */
h_M[0]=1.0;

h_M[1]=0.0;

h_M[2]=0.0;



h_H[0]=0.0;

h_H[1]=1.0;

h_H[2]=0.0;







/* Allocate device memory for the two vectors */

if (cudaMalloc((void**)&d_M, vector_size * sizeof(d_M[0])) != cudaSuccess) {

fprintf (stderr, "!!!! device memory allocation error (allocate A)\n");

return EXIT_FAILURE;

}

if (cudaMalloc((void**)&d_H, vector_size * sizeof(d_H[0])) != cudaSuccess) {

fprintf (stderr, "!!!! device memory allocation error (allocate B)\n");

return EXIT_FAILURE;

}



/* Copy the host vectors into the device buffers */

status = cublasSetVector(vector_size, sizeof(h_M[0]), h_M, 1, d_M, 1);

if (status != CUBLAS_STATUS_SUCCESS) {

fprintf (stderr, "!!!! device access error (write M)\n");

return EXIT_FAILURE;

}

status = cublasSetVector(vector_size, sizeof(h_H[0]), h_H, 1, d_H, 1);

if (status != CUBLAS_STATUS_SUCCESS) {

fprintf (stderr, "!!!! device access error (write H)\n");

return EXIT_FAILURE;

}



/* Performs operation using cublas */

// NOTE(review): `result` is not declared until further down (just before the
// cublasDdot call), so this line cannot compile as posted.
status = cublasDasum(handle, vector_size, d_M,1,result);

if (status != CUBLAS_STATUS_SUCCESS) {

fprintf (stderr, "!!!! kernel execution error.\n");

return EXIT_FAILURE;

}





// /* Read the result back */

// NOTE(review): `sum` is a null pointer. It is handed to cublasGetVector as
// the destination and dereferenced by the cout below, while no storage is
// ever assigned to it in this excerpt.
double* sum = 0;

// NOTE(review): the element size passed here is sizeof(void**) (the size of a
// pointer), whereas the value being copied is a double.
status = cublasGetVector(1, sizeof(void**), result, 1, sum, 1);// I only require the first element of result, that's why I chose 1 as vector length, is that ok?

if (status != CUBLAS_STATUS_SUCCESS) {

fprintf (stderr, "!!!! device access error (read C)\n");

return EXIT_FAILURE;

}



cout << *sum << endl;







/* Performs operation using cublas */

// NOTE(review): `result` is null when it is passed as the output argument of
// cublasDdot and when it is dereferenced by the cout below. The result may be
// placed in host or device memory, but it has to point at real storage either
// way (for example `double result;` passed as `&result` when the handle uses
// its default host pointer mode) -- confirm against the cuBLAS reference.
double* result = 0;

status = cublasDdot(handle, vector_size, d_M,1,d_H,1, result);// is it ok to not have allocated result into the memory, the manual says that it could be located in the host.

if (status != CUBLAS_STATUS_SUCCESS) {

fprintf (stderr, "!!!! kernel execution error.\n");

return EXIT_FAILURE;

}





cout << *result << endl;



/* Memory clean up */

// NOTE(review): only the device buffers are released here; free(h_M),
// free(h_H) and cublasDestroy(handle) do not appear in this excerpt.
if (cudaFree(d_H) != cudaSuccess) {

fprintf (stderr, "!!!! memory free error (H)\n");

return EXIT_FAILURE;

}

if (cudaFree(d_M) != cudaSuccess) {

fprintf (stderr, "!!!! memory free error (M)\n");

return EXIT_FAILURE;

}






I can't find the mistake, and I get a "Segmentation fault" when I run the executable. I'm really puzzled about what's wrong; your help would be much appreciated.

many thanks!

#1
Posted 04/19/2012 04:24 PM   
I have a question. After the line double* sum = 0; shouldn't you allocate sum of size 1 with malloc? Do you know at which line it crashes?
I have a question. After the line double* sum = 0; shouldn't you allocate sum of size 1 with malloc? Do you know at which line it crashes?

#2
Posted 04/19/2012 08:10 PM   
[quote name='pasoleatis' date='19 April 2012 - 08:10 PM' timestamp='1334866226' post='1398447']
I have question. After the line double* sum = 0; shouldn't you allocate sum of size 1 with malloc? Do you know at which line does it crash?
[/quote]

I don't know if I have to allocate it. The manual for this function says that the result ("dot") could be on the host or on the device. So my question is: does cuBLAS fill it in automatically on the host, or do I have to allocate it on the device and then copy it back to the host?
(I'll try meanwhile)

It fails at blas operations.
[quote name='pasoleatis' date='19 April 2012 - 08:10 PM' timestamp='1334866226' post='1398447']

I have question. After the line double* sum = 0; shouldn't you allocate sum of size 1 with malloc? Do you know at which line does it crash?





I don't know if I have to allocate it. In the manual for this function says that the "role of dot" could be in in the host or the device. Then my question is, does cudablas fill it automatically in the host? or do I have to send it to the device and the copy it back to the host?

(I'll try meanwhile)



It fails at blas operations.

#3
Posted 04/20/2012 12:47 AM   
[quote name='gatts' date='20 April 2012 - 12:47 AM' timestamp='1334882876' post='1398530']
I don't know if I have to allocate it. In the manual for this function says that the "role of dot" could be in in the host or the device. Then my question is, does cudablas fill it automatically in the host? or do I have to send it to the device and the copy it back to the host?
(I'll try meanwhile)

It fails at blas operations.
[/quote]

Ok, I tried this


[code]
// /* Performs operation using cublas */
// Second attempt from the thread: the dot-product result is placed in a
// device allocation and then copied back with cudaMemcpy.


double* result ;

// NOTE(review): the closing parenthesis is misplaced. As written, the size
// argument handed to cudaMalloc is the value of the comparison
// `sizeof(double) != cudaSuccess`, not sizeof(double), and the `if` tests the
// raw return value of cudaMalloc.
if (cudaMalloc((void**)&result , sizeof(double) != cudaSuccess)) { //*result &result result double
fprintf (stderr, "!!!! device memory allocation error (allocate result)\n");
return EXIT_FAILURE;
}


// NOTE(review): `result` is a device pointer here. cuBLAS only accepts a
// device address for the scalar result when the handle's pointer mode has
// been switched to device mode; no such call is visible in this excerpt --
// confirm against the cuBLAS reference.
status = cublasDdot(handle, vector_size, d_M,1,d_H,1, result);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}


// NOTE(review): `dot` is never given a value. cudaMemcpy takes
// (destination, source, ...), so this call uses the address of the host
// variable `result` as the destination and the uninitialised `dot` as the
// device source.
double* dot;
cudaMemcpy( &result, dot, sizeof(double),cudaMemcpyDeviceToHost);

// NOTE(review): this streams the pointer `dot`, not a double value.
cout << dot << endl;

[/code]


Again, if I comment out the cublasDdot call it runs and I get 0 printed on screen. If I keep the cublasDdot call it says the operation failed, and I'm puzzled.
thanks again for your advice.
[quote name='gatts' date='20 April 2012 - 12:47 AM' timestamp='1334882876' post='1398530']

I don't know if I have to allocate it. In the manual for this function says that the "role of dot" could be in in the host or the device. Then my question is, does cudablas fill it automatically in the host? or do I have to send it to the device and the copy it back to the host?

(I'll try meanwhile)



It fails at blas operations.





Ok, I tried this







// /* Performs operation using cublas */
// Second attempt from the thread: the dot-product result is placed in a
// device allocation and then copied back with cudaMemcpy.





double* result ;



// NOTE(review): the closing parenthesis is misplaced. As written, the size
// argument handed to cudaMalloc is the value of the comparison
// `sizeof(double) != cudaSuccess`, not sizeof(double), and the `if` tests the
// raw return value of cudaMalloc.
if (cudaMalloc((void**)&result , sizeof(double) != cudaSuccess)) { //*result &result result double

fprintf (stderr, "!!!! device memory allocation error (allocate result)\n");

return EXIT_FAILURE;

}





// NOTE(review): `result` is a device pointer here. cuBLAS only accepts a
// device address for the scalar result when the handle's pointer mode has
// been switched to device mode; no such call is visible in this excerpt --
// confirm against the cuBLAS reference.
status = cublasDdot(handle, vector_size, d_M,1,d_H,1, result);

if (status != CUBLAS_STATUS_SUCCESS) {

fprintf (stderr, "!!!! kernel execution error.\n");

return EXIT_FAILURE;

}





// NOTE(review): `dot` is never given a value. cudaMemcpy takes
// (destination, source, ...), so this call uses the address of the host
// variable `result` as the destination and the uninitialised `dot` as the
// device source.
double* dot;

cudaMemcpy( &result, dot, sizeof(double),cudaMemcpyDeviceToHost);



// NOTE(review): this streams the pointer `dot`, not a double value.
cout << dot << endl;








And again if I comment the cublasDot operation it works and I obtain 0 printed on screen. If do the cudblasDot operation it says operation failed, and I'm puzzled.

thanks again for your advice.

#4
Posted 04/20/2012 01:08 AM   
I got the solution; it was really an easy problem once I figured it out. The declaration of double* result was incorrect: I didn't allocate memory for it. It should have been double* sum = new double[1]; and not just double* sum.
I got the solution; it was really an easy problem once I figured it out. The declaration of double* result was incorrect: I didn't allocate memory for it. It should have been double* sum = new double[1]; and not just double* sum.

#5
Posted 04/20/2012 10:12 PM   
[quote name='gatts' date='20 April 2012 - 11:12 PM' timestamp='1334959972' post='1398888']
I got the solution, it was really an easy problem once I figured out. The declaration of double *result was incorrect. I didn't allocate enough memory, It should have been double* sum = new double[1]; and not only double* sum.
[/quote]

Congrats. You discovered the same thing I said previously.
[quote name='gatts' date='20 April 2012 - 11:12 PM' timestamp='1334959972' post='1398888']

I got the solution, it was really an easy problem once I figured out. The declaration of double *result was incorrect. I didn't allocate enough memory, It should have been double* sum = new double[1]; and not only double* sum.





Congrats. You discovered the same thing I said previous.

#6
Posted 04/21/2012 09:52 AM   
Hi guys, I new to CUBLAS... trying to use cublasDdot; not sure why I'm getting answer t = 0? will be great if you can help take a look. cheers Ken #include #include #include #include #include #include #include "cuda.h" #include #include #include #include #include int main(void) { int i,n; double *x, *y, *t; double *d_x, *d_y, *d_t; n = 100; x = (double*)malloc(n *sizeof(double)); y = (double*)malloc(n *sizeof(double)); t = (double*)malloc(sizeof(double)); for( i = 0; i < n; i++) { x[i] = i; y[i] = i + 3; } cudaMalloc((void **)&d_x, n*sizeof(double)); cudaMalloc((void **)&d_y, n*sizeof(double)); cudaMalloc((void **)&d_t, sizeof(double)); cudaMemcpy(d_x, x, n*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_y, y, n*sizeof(double), cudaMemcpyHostToDevice); cublasHandle_t handle; cublasCreate(&handle); cublasDdot(handle, n, d_x, 1, d_y, 1, d_t); cudaMemcpy(t, d_t, sizeof(double), cudaMemcpyDeviceToHost); printf("GPU = %lf\n", t); system("PAUSE"); free(x); free(y); free(t); cudaFree(d_x); cudaFree(d_y); cudaFree(d_t); cublasDestroy(handle); return 0; }
Hi guys, I'm new to cuBLAS... I'm trying to use cublasDdot; not sure why I'm getting the answer t = 0?
It would be great if you could help take a look. Cheers, Ken



// NOTE: the header names in the original post were lost when the page was
// extracted (only bare "#include" tokens survived). The headers below are the
// ones the code in main() requires.
#include <stdio.h>
#include <stdlib.h>

#include "cuda.h"
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
int i,n;
double *x, *y, *t;
double *d_x, *d_y, *d_t;
n = 100;

x = (double*)malloc(n *sizeof(double));
y = (double*)malloc(n *sizeof(double));
t = (double*)malloc(sizeof(double));

for( i = 0; i < n; i++)
{
x[i] = i;
y[i] = i + 3;
}
cudaMalloc((void **)&d_x, n*sizeof(double));
cudaMalloc((void **)&d_y, n*sizeof(double));
cudaMalloc((void **)&d_t, sizeof(double));
cudaMemcpy(d_x, x, n*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, n*sizeof(double), cudaMemcpyHostToDevice);

cublasHandle_t handle;
cublasCreate(&handle);
cublasDdot(handle, n, d_x, 1, d_y, 1, d_t);
cudaMemcpy(t, d_t, sizeof(double), cudaMemcpyDeviceToHost);
printf("GPU = %lf\n", t);
system("PAUSE");

free(x);
free(y);
free(t);
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_t);
cublasDestroy(handle);
return 0;

}

#7
Posted 11/29/2012 10:52 AM   
Scroll To Top