Hi,
I have run into a problem: using texture memory does not improve speed compared to global memory. I need 3D texture memory because the data is a 3D image. All the kernel has to do is repeatedly read from texture memory instead of global memory.
The process is:
host part:
1) Read the 3D image and save it in a 1D array (say “ima”) by vectorization.
2) Load “ima” into 3D texture memory (“ima_texture”)
device part:
3) Read from ima_texture
Below is the part of the code that declares the texture, loads the data into texture memory, and reads from it. I don’t think there is anything wrong with my code, because everything runs and the results are correct. But I would like some ideas about why this (no speed improvement, and even a longer run time) might happen.
Declare and load:
// Texture reference through which device code reads the 3D image: one float
// per texel, returned unconverted (cudaReadModeElementType).
// NOTE(review): texture references must be declared at file scope, and the
// texture-reference API is deprecated in recent CUDA toolkits in favour of
// texture objects — confirm the toolkit version this targets.
texture<float,cudaTextureType3D,cudaReadModeElementType> ima_tex;
// Channel layout of the backing array: a single float component.
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
// Volume extent; for cudaArray allocations and copies it is given in elements.
cudaExtent imaSize = make_cudaExtent(dimx,dimy,dimz);
cudaArray *ima=0;
// Image
// NOTE(review): none of the runtime calls below have their cudaError_t return
// value checked; a failed allocation/copy/bind would go unnoticed here.
cudaMalloc3DArray(&ima, &channelDesc, imaSize);
cudaMemcpy3DParms copyParams1 = {0};
// Source is the linear buffer ima_input, described as tightly packed rows:
// the pitch is exactly width*sizeof(float) bytes, with no row padding.
copyParams1.srcPtr = make_cudaPitchedPtr((void*)ima_input, imaSize.width*sizeof(float), imaSize.width, imaSize.height);
copyParams1.dstArray = ima; // destination array
copyParams1.extent = imaSize; // dimensions of the transferred area in elements
copyParams1.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&copyParams1);
// Unnormalized coordinates: texels are addressed in [0, dim) along each axis.
ima_tex.normalized = false;
ima_tex.filterMode = cudaFilterModePoint; //cudaFilterModePoint; cudaFilterModeLinear;
// Wrap/mirror addressing is only supported with normalized coordinates, so
// with normalized == false the effective mode is clamp; say so explicitly.
ima_tex.addressMode[0] = cudaAddressModeClamp;
ima_tex.addressMode[1] = cudaAddressModeClamp;
ima_tex.addressMode[2] = cudaAddressModeClamp;
cudaBindTextureToArray(ima_tex, ima, channelDesc);
Read:
/*
 * Squared-difference distance between two cubic patches of the 3D image
 * bound to the texture ima_tex.
 *
 * The patches are centred at (x,y,z) and (nx,ny,nz) and extend
 * gcfg->patchsize voxels in each direction along every axis. Coordinates
 * that fall outside [0, dim) are reflected back into the volume before the
 * fetch, so every tex3D read uses an in-range index.
 *
 * ima        : unused in this body; all reads go through ima_tex. Kept so
 *              the signature stays compatible with existing callers.
 * x, y, z    : centre of the first patch.
 * nx, ny, nz : centre of the second patch.
 *
 * Returns the sum of squared voxel differences multiplied by
 * gcfg->rpatchnomalize.
 */
__device__ float distance(float* ima,int x,int y,int z,int nx,int ny,int nz)
{
    float distancetotal,temp;
    int i,j,k,ni1,nj1,ni2,nj2,nk1,nk2;

    // Loop-invariant configuration, read once instead of on every iteration.
    // NOTE(review): assumes the gcfg dimension fields are integer-typed — confirm.
    const int f=gcfg->patchsize;
    const int dimx=gcfg->dimx;
    const int dimy=gcfg->dimy;
    const int dimz=gcfg->dimz;

    distancetotal=0.f;
    for(k=-f;k<=f;k++)
    {
        nk1=z+k;
        nk2=nz+k;
        // Reflect out-of-range z indices back into the volume.
        if(nk1<0) nk1=-nk1;
        if(nk2<0) nk2=-nk2;
        if(nk1>=dimz) nk1=2*dimz-nk1-1;
        if(nk2>=dimz) nk2=2*dimz-nk2-1;
        for(j=-f;j<=f;j++)
        {
            nj1=y+j;
            nj2=ny+j;
            // Same reflection along y.
            if(nj1<0) nj1=-nj1;
            if(nj2<0) nj2=-nj2;
            if(nj1>=dimy) nj1=2*dimy-nj1-1;
            if(nj2>=dimy) nj2=2*dimy-nj2-1;
            for(i=-f;i<=f;i++)
            {
                ni1=x+i;
                ni2=nx+i;
                // Same reflection along x.
                if(ni1<0) ni1=-ni1;
                if(ni2<0) ni2=-ni2;
                if(ni1>=dimx) ni1=2*dimx-ni1-1;
                if(ni2>=dimx) ni2=2*dimx-ni2-1;
                // Two texture fetches per voxel pair; accumulate the squared difference.
                temp = tex3D(ima_tex,ni1,nj1,nk1)-tex3D(ima_tex,ni2,nj2,nk2);
                distancetotal = distancetotal + temp * temp;
            }
        }
    }
    // Scale the accumulated sum by the configured normalisation factor.
    return distancetotal*gcfg->rpatchnomalize;
}