I am having problems using 3D surface memory. My system is cuda 5.0, driver 306.94, a GeForce GTX 580 GPU, and Visual Studio 2010 with Win7 64 bit. In the code below I create a 3D image that consists of an array of 2D images. I set all pixels to the same value. I use Nsight 3.0 to check the value of variables within a kernel. I transfer the 3D image to a cuda array and bind it to both a 3D surface and texture. Within a kernel the texture produces the value that I set the image to. When I read from the surface I get zero. Any suggestions or insight?
texture reconTex;
surface reconSurf;
global static void cudaConebeamTestSurfaceKernel(int cols, int rows, int slices)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(col >= cols || row >= rows) {return;}
for(int slice = 0; slice < slices; slice++)
{
float cv = 0.0f;
surf3Dread(&cv, reconSurf, col * sizeof(float), row, slice, cudaBoundaryModeZero);
cv += 5;
surf3Dwrite(cv, reconSurf, col * sizeof(float), row, slice, cudaBoundaryModeZero);
}
}
global static void cudaConebeamTestTextureKernel(int cols, int rows, int slices)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if(col >= cols || row >= rows) {return;}
for(int slice = 0; slice < slices; slice++)
{
float cv = tex3D(reconTex, col + 0.5, row + 0.5, slice + 0.5);
float rr = cv;
}
}
void cudaSurfaceMemoryTest()
{
int cols = 256, rows = 256, slices = 256;
// Create a 3D image which consists of an array of 2D images (slices).
float** sliceArray = new float*[slices];
for(int s = 0; s < slices; s++) {sliceArray[s] = new float[cols * rows];}
for(int s = 0; s < slices; s++)
{
for(int r = 0; r < rows; r++)
{
for(int c = 0; c < cols; c++)
{
sliceArray[s][c + r * cols] = 5.444;
}
}
}
// Setup cuda array.
cudaArray* imgArray;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc();
cudaExtent extent = make_cudaExtent(cols, rows, slices);
cudaMalloc3DArray(&imgArray, &channelDesc, extent, cudaArraySurfaceLoadStore);
for(long s = 0; s < slices; s++)
{
cudaMemcpy3DParms params = {0};
params.srcPtr = make_cudaPitchedPtr((void*)sliceArray[s], cols * sizeof(float), cols, rows);
params.dstArray = imgArray;
params.extent = make_cudaExtent(cols, rows, 1);
params.dstPos.x = 0;
params.dstPos.y = 0;
params.dstPos.z = s;
params.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(¶ms);
}
cudaGetChannelDesc(&channelDesc, imgArray);
cudaBindSurfaceToArray(reconSurf, imgArray, channelDesc);
//CudaUtil::checkForCudaError(“”);
reconTex.addressMode[0] = cudaAddressModeClamp;
reconTex.addressMode[1] = cudaAddressModeClamp;
reconTex.addressMode[2] = cudaAddressModeClamp;
reconTex.filterMode = cudaFilterModeLinear;
reconTex.normalized = false;
cudaBindTextureToArray(reconTex, imgArray, channelDesc);
//CudaUtil::checkForCudaError(“”);
// Set block and grid dimensions
dim3 dimBlock, dimGrid;
dimBlock.x = 16;
dimBlock.y = 16;
dimBlock.z = 1;
dimGrid.x = (uint)ceil((float)cols / (float)dimBlock.x);
dimGrid.y = (uint)ceil((float)rows / (float)dimBlock.y);
dimGrid.z = 1;
// Run kernels
for(long i = 0; i < 1; i++)
{
printf("
%d", i);
cudaConebeamTestTextureKernel(cols, rows, slices);
cudaThreadSynchronize();
//CudaUtil::checkForCudaError(“A”);
cudaConebeamTestSurfaceKernel(cols, rows, slices);
cudaThreadSynchronize();
//CudaUtil::checkForCudaError(“B”);
}
cudaFreeArray(imgArray);
// Free image memory
for(int s = 0; s < slices; s++) {if(sliceArray != NULL) delete sliceArray[s];}
if(sliceArray != NULL) {delete sliceArray;}
}