Hi all,
my problem should have a simple answer but I am stuck finding it. The following simple code, where
I used 1d texture memory to store some data and tried to fetch it using the tex1Dfetch, fails
(the code returns zeros) on Fermi architecture, both GTX480 and Tesla 2050, and functions
okay (returning 0…9) on Tesla 1060, and I do not see at all why this should happen.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>
cudaArray* testArray;
texture<float, 1, cudaReadModeElementType> texTest;
__global__ void testKernel(float *_arr)
{
_arr[threadIdx.x] = tex1Dfetch(texTest, threadIdx.x);
}
int main( int narg, char *args[])
{
cudaSetDevice(0);
int nData = 10;
float *h_testData = (float*)malloc(nData*sizeof(float));
for (int i=0; i<nData; i++)
h_testData[i] = (float)i;
float *d_testData;
cudaMalloc((void **) &d_testData, nData*sizeof(float));
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
CUDA_SAFE_CALL( cudaMallocArray( &testArray, &channelDesc, nData, 1 ));
CUDA_SAFE_CALL( cudaMemcpyToArray( testArray, 0, 0, h_testData, nData*sizeof(float), cudaMemcpyHostToDevice));
texTest.addressMode[0] = cudaAddressModeClamp;
texTest.addressMode[1] = cudaAddressModeClamp;
texTest.filterMode = cudaFilterModePoint;
texTest.normalized = false;
CUDA_SAFE_CALL( cudaBindTextureToArray( texTest, testArray, channelDesc));
testKernel <<< 1, nData >>> (d_testData);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUDA_SAFE_CALL( cudaMemcpy(h_testData, d_testData, nData*sizeof(float), cudaMemcpyDeviceToHost) );
for (int i=0; i<nData; i++)
printf(" testData[%i]=%f \n", i, h_testData[i]);
}
I would really appreciate it if somebody had an idea why this happens…