Guide: cudaMalloc3D and cudaArray's
Hi All,

I'm writing this short guide as a reference for those who wish to use cudaMalloc3D with cudaArray's allocated using cudaMalloc3DArray. This includes using the 3D textures and 2DLayered textures bound to 3D cudaArrays. After looking around on google for a bit and not finding much I figured that others could probably use this information.

In the following code I use a type called 'cudaMatrix' extensively. 'cudaMatrix' is simply a front end for managing linear memory allocated with cudaMalloc3D(). I've included a header file that defines this class and also includes some working examples of how to use cudaMalloc3D() and cudaMemcpy3D().

Ok, so the root of the problem: Populating a 3D texture with device generated data without having to copy the data back to the host.


-----------------------------------------------

[u]Step 1: Setup data in linear memory allocated with cudaMalloc3D.[/u]

Information on this can be found in this guide [url="http://forums.nvidia.com/index.php?showtopic=165400"]here[/url]. Or you can use the cudaMatrixT<type T> front end, which makes this really easy.
[code]
cudaMatrixT<float> initial_data(nx,ny,nz);
[/code]

Elements in the cudaMatrix object can be accessed simply through the overloaded parentheses operator. For the purposes of this example my data is just a bunch of random floats.

[code]
__global__
void fill_kernel(cudaMatrix<float> data,int3 dims,cudaMatrixT<curandState>random_states)
{

unsigned int idx = threadIdx.x;
unsigned int idy = threadIdx.y;
unsigned int idz = threadIdx.z;

unsigned int gidx = blockIdx.x*blockDim.x+idx;
unsigned int gidy = blockIdx.y*blockDim.y+idy;
unsigned int gidz = blockIdx.z*blockDim.z+idz;

if((gidx < dims.x)&&(gidy < dims.y)&&(gidz < dims.z))
{
data(gidx,gidy,gidz) = (curand_uniform(&random_states(gidx,gidy,gidz))*100);
}

}
[/code]



[u]Step 2: Allocate a cudaArray with cudaMalloc3DArray()[/u]

[code]
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();

cudaExtent extent;
extent.width = nx; // Note, for cudaArrays the width field is the width in elements, not bytes
extent.height = ny;
extent.depth = nz;

cudaArray *array = 0;



cudaMalloc3DArray(&array,&desc,extent,cudaArrayLayered);

[/code]



[u]Step 3: Copy data in linear memory to a 3D cudaArray using cudaMemcpy3D[/u]

[code]
cudaPitchedPtr matrixPtr = initial_data.getptr(); // This is the pointer returned from cudaMalloc3D()

cudaMemcpy3DParms params = {0}; // Initialize to 0

params.srcPtr = matrixPtr;
params.dstArray = array;
params.kind = cudaMemcpyDeviceToDevice;
params.extent = extent; // This is the extent used to allocate the cudaArray 'array'

cudaMemcpy3D(&params);
[/code]



[u]Step 4: Bind the cudaArray to a texture reference[/u]

[code]
texture<float,cudaTextureType2DLayered,cudaReadModeElementType> texref;

cudaBindTextureToArray(texref,array); // This is using the high-level api

[/code]



[u]Step 5: Access the data through the texture reference interface[/u]

[code]
__global__
void check_kernel(cudaMatrixf data,int3 dims)
{

unsigned int idx = threadIdx.x;
unsigned int idy = threadIdx.y;
unsigned int idz = threadIdx.z;

unsigned int gidx = blockIdx.x*blockDim.x+idx;
unsigned int gidy = blockIdx.y*blockDim.y+idy;
unsigned int gidz = blockIdx.z*blockDim.z+idz;

float mydata;
float texdata;

if((gidx < dims.x)&&(gidy < dims.y)&&(gidz < dims.z))
{
texdata = tex2DLayered(texref,gidx,gidy,gidz);
mydata = data(gidx,gidy,gidz);

printf(" mydata = %f, texdata = %f @ %i, %i\n",mydata,texdata,gidx,gidy);

}
}
[/code]

And that is how you use cudaMalloc3DArray(), cudaMemcpy3D() with an array as the destination, and the bind it to a texture reference. I hope that this was useful. If anyone has any suggestions on how I could improve this or if I did something terribly wrong I'd greatly appreciate the feedback.

**Note, cudamatrix.h and cudamatrix_types.h should be .cuh, but the forum wouldn't let me upload those file types.
[attachment=21649:cudamatrix.h]
[attachment=21651:cudamatrix_types.h]
[attachment=21650:texture_memory.cu]
Hi All,



I'm writing this short guide as a reference for those who wish to use cudaMalloc3D with cudaArray's allocated using cudaMalloc3DArray. This includes using the 3D textures and 2DLayered textures bound to 3D cudaArrays. After looking around on google for a bit and not finding much I figured that others could probably use this information.



In the following code I use a type called 'cudaMatrix' extensively. 'cudaMatrix' is simply a front end for managing linear memory allocated with cudaMalloc3D(). I've included a header file that defines this class and also includes some working examples of how to use cudaMalloc3D() and cudaMemcpy3D().



Ok, so the root of the problem: Populating a 3D texture with device generated data without having to copy the data back to the host.





-----------------------------------------------



Step 1: Setup data in linear memory allocated with cudaMalloc3D.



Information on this can be found in this guide here. Or you can use the cudaMatrixT<type T> front end, which makes this really easy.



cudaMatrixT<float> initial_data(nx,ny,nz);




Elements in the cudaMatrix object can be accessed simply through the overloaded parentheses operator. For the purposes of this example my data is just a bunch of random floats.





__global__

void fill_kernel(cudaMatrix<float> data,int3 dims,cudaMatrixT<curandState>random_states)

{



unsigned int idx = threadIdx.x;

unsigned int idy = threadIdx.y;

unsigned int idz = threadIdx.z;



unsigned int gidx = blockIdx.x*blockDim.x+idx;

unsigned int gidy = blockIdx.y*blockDim.y+idy;

unsigned int gidz = blockIdx.z*blockDim.z+idz;



if((gidx < dims.x)&&(gidy < dims.y)&&(gidz < dims.z))

{

data(gidx,gidy,gidz) = (curand_uniform(&random_states(gidx,gidy,gidz))*100);

}



}








Step 2: Allocate a cudaArray with cudaMalloc3DArray()





cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();



cudaExtent extent;

extent.width = nx; // Note, for cudaArrays the width field is the width in elements, not bytes

extent.height = ny;

extent.depth = nz;



cudaArray *array = 0;







cudaMalloc3DArray(&array,&desc,extent,cudaArrayLayered);










Step 3: Copy data in linear memory to a 3D cudaArray using cudaMemcpy3D





cudaPitchedPtr matrixPtr = initial_data.getptr(); // This is the pointer returned from cudaMalloc3D()



cudaMemcpy3DParms params = {0}; // Initialize to 0



params.srcPtr = matrixPtr;

params.dstArray = array;

params.kind = cudaMemcpyDeviceToDevice;

params.extent = extent; // This is the extent used to allocate the cudaArray 'array'



cudaMemcpy3D(&params);








Step 4: Bind the cudaArray to a texture reference





texture<float,cudaTextureType2DLayered,cudaReadModeElementType> texref;



cudaBindTextureToArray(texref,array); // This is using the high-level api










Step 5: Access the data through the texture reference interface





__global__

void check_kernel(cudaMatrixf data,int3 dims)

{



unsigned int idx = threadIdx.x;

unsigned int idy = threadIdx.y;

unsigned int idz = threadIdx.z;



unsigned int gidx = blockIdx.x*blockDim.x+idx;

unsigned int gidy = blockIdx.y*blockDim.y+idy;

unsigned int gidz = blockIdx.z*blockDim.z+idz;



float mydata;

float texdata;



if((gidx < dims.x)&&(gidy < dims.y)&&(gidz < dims.z))

{

texdata = tex2DLayered(texref,gidx,gidy,gidz);

mydata = data(gidx,gidy,gidz);



printf(" mydata = %f, texdata = %f @ %i, %i\n",mydata,texdata,gidx,gidy);



}

}




And that is how you use cudaMalloc3DArray(), cudaMemcpy3D() with an array as the destination, and the bind it to a texture reference. I hope that this was useful. If anyone has any suggestions on how I could improve this or if I did something terribly wrong I'd greatly appreciate the feedback.



**Note, cudamatrix.h and cudamatrix_types.h should be .cuh, but the forum wouldn't let me upload those file types.

[attachment=21649:cudamatrix.h]

[attachment=21651:cudamatrix_types.h]

[attachment=21650:texture_memory.cu]

#1
Posted 07/10/2011 12:29 AM   
Scroll To Top

Add Reply