Help for my Tesis.

Hi. I’m a computer science student and I’m writing the tesiper my degree on CUDA. In my algorithm for scalar product I have a problem of segmentation fault when N = 1000000. The kernel is the following:

global void scalar_prod( long float *a, long float *b, long float *c){

int tid = threadIdx.x+blockDim.x*blockIdx.x;

while(tid < N){
c[tid]=a[tid]*b[tid];

 tid += blockDim.x*gridDim.x;

}
}

If I use size array 1000000 of float, use 11 MB of Global Memory because are 3 Array. But Global memory of Tesla c2050 is 2GB. Why I’have this problem? Sorry for my english, I’m italian.

You do not say what the specific problem/error is with the code(what is the exact error message?).

This snippet in general looks ok, but need to see the launch configuration and parameters. Also check to sizes of the memory allocations/deallocations and do the usual cudaError_t error checking after all device calls.

Also if you are using 64-bit float, it probably is better to say (const double *a, const double *b, double *c), and if you are ok with 32-bit use just float *.

And where is ‘N’ declared and defined? I do not see it in the input parameters for the kernel.

Hello,

I want to add 25969139 integers by 5 on a GTX 560Ti. I can’t figure out my mistake in the code:

#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include
#include
#include
using namespace std;

double fRand(double fMin, double fMax)
{
double f = (double)rand() / RAND_MAX;
return fMin + f * (fMax - fMin);
}

global
void distance(int count_face_G, int *d_V0, int *d_V1, int d_V2)
{
int i = blockIdx.x
blockDim.x+threadIdx.x;
if(i<count_face_G)
{
d_V0[i]=5;
d_V1[i]=5;
d_V2[i]=5;
}
}

int main()
{
string garbage;
int block = 25361, thread = 1024;
int count_vertex_G, count_face_G, count_vertex_R, count_face_R, *V0, *V1, *V2, *d_V0, *d_V1, *d_V2;
float *X_G, *Y_G, *Z_G, *distance, *d_X_G, *d_Y_G, *d_Z_G, *d_distance, X_R, Y_R, Z_R;
ifstream stream_G, stream_R;
stream_G.open(“C:/Users/Yu/Downloads/fountain_ply/fountain.ply”);
stream_R.open(“C:/Users/Yu/Downloads/fountain_ply/octree_128_13_9.off”);
for(unsigned int i=0;i<6;i++)
stream_G >> garbage;
stream_G >> count_vertex_G;
for(unsigned int i=0;i<11;i++)
stream_G >> garbage;
stream_G >> count_face_G;
for(unsigned int i=0;i<6;i++)
stream_G >> garbage;
stream_R>>garbage;
stream_R>>count_vertex_R;
stream_R>>count_face_R;
stream_R>>garbage;

cout << count_vertex_G << endl;
cout << count_face_G << endl;
cout << count_vertex_R << endl;
cout << count_face_R << endl;

X_G = (float*)malloc(count_vertex_G*sizeof(float));
Y_G = (float*)malloc(count_vertex_G*sizeof(float));
Z_G = (float*)malloc(count_vertex_G*sizeof(float));
cudaMalloc((void **)&d_X_G, count_vertex_G*sizeof(float));//149 MByte
cudaMalloc((void **)&d_Y_G, count_vertex_G*sizeof(float));
cudaMalloc((void **)&d_Z_G, count_vertex_G*sizeof(float));
V0 = (int*)malloc(count_face_G*sizeof(int));
V1 = (int*)malloc(count_face_G*sizeof(int));
V2 = (int*)malloc(count_face_G*sizeof(int));
cudaMalloc((void **)&d_V0, count_face_G*sizeof(int));//298 MByte
cudaMalloc((void **)&d_V1, count_face_G*sizeof(int));
cudaMalloc((void **)&d_V2, count_face_G*sizeof(int));
distance = (float*)malloc(count_vertex_R*sizeof(float));
cudaMalloc((void **)&d_distance, count_vertex_R*sizeof(float));//50 MByte
for(int vertex=0; vertex<count_vertex_G; vertex++)
{
	stream_G >> X_G[vertex];
	stream_G >> Y_G[vertex];
	stream_G >> Z_G[vertex];
	X_G[vertex]=3.0;
	Y_G[vertex]=3.0;
	Z_G[vertex]=3.0;

}
for(int face=0; face<count_face_G; face++)
{
	stream_G >> V0[face];
	stream_G >> V1[face];
	stream_G >> V2[face];
	V0[face]=3;
	V1[face]=3;
	V2[face]=3;
}
cudaMemcpy(d_X_G,X_G,count_vertex_G*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_Y_G,Y_G,count_vertex_G*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_Z_G,Z_G,count_vertex_G*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_V0,V0,count_face_G*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_V1,V1,count_face_G*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_V2,V2,count_face_G*sizeof(int),cudaMemcpyHostToDevice);
dim3 dimgrid(block);
dim3 dimblock(thread);
for(int vertex=0; vertex<count_vertex_R; vertex++)
{
	stream_R >> X_R;
	stream_R >> Y_R;
	stream_R >> Z_R;
	distance<<<dimgrid, dimblock>>>(count_face_G, d_V0, d_V1, d_V2);
}
	//distance<<<dimgrid, dimblock>>>(block*thread, d_X, d_Y);

cudaMemcpy(distance,d_distance,count_vertex_R*sizeof(float),cudaMemcpyDeviceToHost);

free(V0); free(V1); free(V2); free(X_G); free(Y_G); free(Z_G); cudaFree(distance);
cudaFree(d_X_G); cudaFree(d_Y_G); cudaFree(d_Z_G);
cudaFree(d_V0); cudaFree(d_V1); cudaFree(d_V2); cudaFree(d_distance);
cin.ignore();
return 0;

}

error message:

4	IntelliSense: expected an expression	c:\Users\Yu\Documents\Visual Studio 2012\Projects\reconstruction_precision\kernel.cu	99

Error 3 error MSB3721: The command ““C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\bin\nvcc.exe” -gencode=arch=compute_10,code="sm_10,compute_10" --use-local-env --cl-version 2012 -ccbin “C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin” -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include” -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler “/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o Debug\kernel.cu.obj “C:\Users\Yu\Documents\Visual Studio 2012\Projects\reconstruction_precision\kernel.cu”” exited with code 2. C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V110\BuildCustomizations\CUDA 6.0.targets 597

Error 2 error : expression preceding parentheses of apparent call must have (pointer-to-) function type C:\Users\Yu\Documents\Visual Studio 2012\Projects\reconstruction_precision\kernel.cu 99

Perhaps under water the integer is actually a floating point number this can cause problems, besides from the compile problem…

Just change pointer name in:
float *X_G, *Y_G, *Z_G, *distance, *d_X_G, *d_Y_G, *d_Z_G, *d_distance, X_R, Y_R, Z_R;
from ‘distance’ to something else like ‘h_distance’.
Hope that helps.