3D surface memory bug?

I am having problems using 3D surface memory. My system is cuda 5.0, driver 306.94, a GeForce GTX 580 GPU, and Visual Studio 2010 with Win7 64 bit. In the code below I create a 3D image that consists of an array of 2D images. I set all pixels to the same value. I use Nsight 3.0 to check the value of variables within a kernel. I transfer the 3D image to a cuda array and bind it to both a 3D surface and texture. Within a kernel the texture produces the value that I set the image to. When I read from the surface I get zero. Any suggestions or insight?

// 3D float texture (element-type reads; unnormalized coords configured at bind time).
// NOTE: the template arguments were stripped by forum markup in the original post.
texture<float, cudaTextureType3D, cudaReadModeElementType> reconTex;

// Writable 3D surface reference; the backing cudaArray must be allocated with
// cudaArraySurfaceLoadStore for the bind to succeed.
surface<void, cudaSurfaceType3D> reconSurf;

// Reads every voxel back through the 3D surface, adds 5, and writes it back.
// Launch with a 2D grid covering (cols, rows); each thread walks all slices in z.
// (The original post lost the __global__ underscores to forum markup.)
__global__ static void cudaConebeamTestSurfaceKernel(int cols, int rows, int slices)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: grid dims are rounded up to block-size multiples.
    if (col >= cols || row >= rows) { return; }

    for (int slice = 0; slice < slices; slice++)
    {
        float cv = 0.0f;
        // The surface x-coordinate is in BYTES, hence the sizeof(float) scaling.
        surf3Dread(&cv, reconSurf, col * sizeof(float), row, slice, cudaBoundaryModeZero);
        cv += 5.0f;  // float literal: avoid implicit promotion to double
        surf3Dwrite(cv, reconSurf, col * sizeof(float), row, slice, cudaBoundaryModeZero);
    }
}

// Samples every voxel through the 3D texture (read-only sanity-check kernel).
// Launch with a 2D grid covering (cols, rows); each thread walks all slices in z.
// (The original post lost the __global__ underscores to forum markup.)
__global__ static void cudaConebeamTestTextureKernel(int cols, int rows, int slices)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= cols || row >= rows) { return; }

    for (int slice = 0; slice < slices; slice++)
    {
        // +0.5f centers the sample on the texel for unnormalized coordinates;
        // float suffixes avoid double-precision promotion.
        float cv = tex3D(reconTex, col + 0.5f, row + 0.5f, slice + 0.5f);

        // Result is intentionally unused: this kernel exists only so the fetched
        // value can be inspected in the debugger (e.g. Nsight).
        float rr = cv;
        (void)rr;  // silence unused-variable warning without changing behavior
    }
}

// Host-side repro: fills a 3D volume with a constant, uploads it slice-by-slice
// into a cudaArray, binds both a 3D texture and a 3D surface to that array,
// then launches the texture- and surface-reading test kernels.
// Fixes vs. the original post: cudaCreateChannelDesc<float>() (template argument
// was stripped), restored <<<grid, block>>> launch syntax, repaired the split
// printf, replaced smart quotes, and used delete[] for new[] allocations.
void cudaSurfaceMemoryTest()
{
    int cols = 256, rows = 256, slices = 256;

    // Create a 3D image which consists of an array of 2D images (slices).
    float** sliceArray = new float*[slices];
    for (int s = 0; s < slices; s++) { sliceArray[s] = new float[cols * rows]; }

    for (int s = 0; s < slices; s++)
    {
        for (int r = 0; r < rows; r++)
        {
            for (int c = 0; c < cols; c++)
            {
                sliceArray[s][c + r * cols] = 5.444f;  // float literal, not double
            }
        }
    }

    // Set up the cuda array. cudaArraySurfaceLoadStore is required so the array
    // can legally be bound to a surface reference.
    cudaArray* imgArray;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaExtent extent = make_cudaExtent(cols, rows, slices);
    cudaMalloc3DArray(&imgArray, &channelDesc, extent, cudaArraySurfaceLoadStore);

    // Copy one slice at a time into successive z positions of the array.
    for (long s = 0; s < slices; s++)
    {
        cudaMemcpy3DParms params = {0};
        params.srcPtr = make_cudaPitchedPtr((void*)sliceArray[s], cols * sizeof(float), cols, rows);
        params.dstArray = imgArray;
        params.extent = make_cudaExtent(cols, rows, 1);
        params.dstPos.x = 0;
        params.dstPos.y = 0;
        params.dstPos.z = s;
        params.kind = cudaMemcpyHostToDevice;
        cudaMemcpy3D(&params);
    }

    cudaGetChannelDesc(&channelDesc, imgArray);

    cudaBindSurfaceToArray(reconSurf, imgArray, channelDesc);
    //CudaUtil::checkForCudaError("");

    reconTex.addressMode[0] = cudaAddressModeClamp;
    reconTex.addressMode[1] = cudaAddressModeClamp;
    reconTex.addressMode[2] = cudaAddressModeClamp;
    reconTex.filterMode = cudaFilterModeLinear;
    reconTex.normalized = false;
    cudaBindTextureToArray(reconTex, imgArray, channelDesc);
    //CudaUtil::checkForCudaError("");

    // Set block and grid dimensions: 16x16 threads, ceil-divide so the whole
    // image is covered (tail threads are guarded inside the kernels).
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x,
                 (rows + dimBlock.y - 1) / dimBlock.y,
                 1);

    // Run kernels. The <<< >>> launch chevrons were stripped by forum markup
    // in the original post.
    for (long i = 0; i < 1; i++)
    {
        printf("\n%ld", i);

        cudaConebeamTestTextureKernel<<<dimGrid, dimBlock>>>(cols, rows, slices);
        cudaThreadSynchronize();
        //CudaUtil::checkForCudaError("A");

        cudaConebeamTestSurfaceKernel<<<dimGrid, dimBlock>>>(cols, rows, slices);
        cudaThreadSynchronize();
        //CudaUtil::checkForCudaError("B");
    }

    cudaFreeArray(imgArray);

    // Free host image memory — new[] allocations must be released with delete[].
    if (sliceArray != NULL)
    {
        for (int s = 0; s < slices; s++) { delete[] sliceArray[s]; }
        delete[] sliceArray;
    }
}

Hi,
I am having the same problem. Could you solve it?
Martin