Hi there, I am very new to CUDA programming. I ran into a problem while trying to allocate a very large 1D array on my GPU (NVIDIA GeForce GTX 745, 4 GB) with CUDA 7.5.
// Host entry point (excerpt): allocates one double per voxel of the volume,
// once on the device (loc1Dx) and once on the host (h_loc1Dx).
int main(){
    // Memory allocation for voxel locations.
    // Compute the byte count in size_t: a product of plain ints can overflow
    // before sizeof widens it (assumes the three dimensions are ints, as in
    // the kernel's parameter list -- TODO confirm against their declarations).
    size_t numBytes = (size_t)vol3DcolSize * vol3DrowSize * vol3DdepSize * sizeof(double);

    // Check the device allocation; if it fails silently, every later CUDA
    // call that uses loc1Dx fails as well.
    cudaError_t err = cudaMalloc( (void **)&loc1Dx, numBytes );
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc of %llu bytes failed: %s\n",
                (unsigned long long)numBytes, cudaGetErrorString(err));
        return 1;
    }

    // malloc returns void*, so the cast must be to a pointer type.
    h_loc1Dx = (double *)malloc(numBytes);
    if (h_loc1Dx == NULL) {
        fprintf(stderr, "malloc of %llu bytes failed\n", (unsigned long long)numBytes);
        cudaFree(loc1Dx);
        return 1;
    }

    // … (remaining host code elided in the original)
    // NOTE(review): wherever the kernel is launched, check cudaGetLastError()
    // right after the launch and the result of cudaDeviceSynchronize(); a
    // rejected or aborted launch produces no device printf output at all.
}
-------------> Here, let's say depSize = 600, colSize = 600, rowSize = 600 elements
// Writes one value per voxel into the flattened 3D volume loc1Dx and prints it.
//
// Thread mapping: x -> depth index, y -> column index, z -> row index.
// Flattened layout: index = dep + (col + row * vol3DcolSize) * vol3DdepSize.
//
// Parameters:
//   vol3DcolSize, vol3DrowSize, vol3DdepSize  extent of the volume per axis
//   loc1Dx                                    output array, indexed by the
//                                             flattened voxel index
//
// NOTE(review): device printf writes into a fixed-size buffer that is flushed
// to the host only at synchronization points. When very many threads print,
// output can be lost, and if the launch itself fails nothing is printed.
// Raise the limit with cudaDeviceSetLimit(cudaLimitPrintfFifoSize, ...) or
// print from only a few threads, and check the launch for errors on the host.
__global__ void kernel( int vol3DcolSize, int vol3DrowSize, int vol3DdepSize, double *loc1Dx){
    int depIndex = blockDim.x * blockIdx.x + threadIdx.x; //dep
    int colIndex = blockDim.y * blockIdx.y + threadIdx.y; //col
    int rowIndex = blockDim.z * blockIdx.z + threadIdx.z; //row

    // Threads outside the volume do nothing (the grid may overshoot the data).
    if ( (rowIndex < vol3DrowSize) && (colIndex < vol3DcolSize) && (depIndex < vol3DdepSize) ){
        // Flatten in size_t so the index cannot overflow int on large volumes.
        size_t index = (size_t)depIndex
                     + ((size_t)colIndex + (size_t)rowIndex * vol3DcolSize) * vol3DdepSize;
        loc1Dx[index] = some calculations.....
        printf("loc1Dx[%llu] = %lf\n", (unsigned long long)index, loc1Dx[index]);
    }
    // Kept outside the branch above so that every thread of the block reaches it.
    __syncthreads();
}
Now, my problem is that if the size of the array is reduced, the printf works; otherwise, it does not print anything.
Can someone please help me figure out what I am doing wrong here?