Passing structures into CUDA kernels

Hello, I’m pretty new to programming, and I’m really new to CUDA.

Is it possible to pass structures into CUDA kernels?

For example, I have:

struct matrix{
	int width;
	int height;
	int size;
	int bitSize;
	int wstart;
	int hstart;
	int *arrayPtr;
};

int main(){

	struct matrix h_sample, h_f, h_result;
	struct matrix d_sample, d_f, d_result;

	//then I assign each parameter of h_sample and h_f values, and an array for arrayPtr
	//allocate host memory, etc, etc

	//allocate device memory
	cudaMalloc( (void **) &d_sample.arrayPtr, d_sample.bitSize);
	cudaMalloc( (void **) &d_f.arrayPtr, d_f.bitSize);
	cudaMalloc( (void **) &d_result.arrayPtr, d_result.bitSize);

	//copy the arrays
	cudaMemcpy( d_sample.arrayPtr, h_sample.arrayPtr, h_sample.bitSize, cudaMemcpyHostToDevice );
	cudaMemcpy( d_f.arrayPtr, h_f.arrayPtr, h_f.bitSize, cudaMemcpyHostToDevice );
	cudaMemcpy( d_result.arrayPtr, h_result.arrayPtr, h_result.bitSize, cudaMemcpyHostToDevice );

	//copy other values of the structures
	d_sample = h_sample;
	d_f = h_f;
	d_result = h_result;

	int numThreadsPerBlock = 4;
	dim3 dimGrid(68/numThreadsPerBlock, 68);
	dim3 dimBlock(numThreadsPerBlock);

	//launch kernel on Device
	dilateOnDevice<<< dimGrid, dimBlock >>>(d_sample, d_f, d_result);

	// block until the device has completed
	cudaThreadSynchronize();

	// Check for any CUDA errors
	checkCUDAError("kernel invocation");

	.....
}

My program always quits at this point!

My checkCUDAError function and my kernel are:

void checkCUDAError(const char *msg)
{
	cudaError_t err = cudaGetLastError();
	if( cudaSuccess != err )
	{
		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err ) );
		system("pause");
		exit(EXIT_FAILURE);
	}
}

//my kernel:

__global__ void dilateOnDevice(matrix d_sample, matrix d_f, matrix d_result)
{
	int i,j,down,across,max,value,resultIndex, cheight, bheight;
	i = blockDim.x*blockIdx.x+threadIdx.x;
	j = blockDim.y*blockIdx.y+threadIdx.y;
	resultIndex = j+i*d_result.width;
	d_result.arrayPtr[resultIndex]=0;
	for(down=0;down<d_f.height;down++){		//multiplies each column
		max = 0;
		cheight = down*d_result.width;
		bheight = down*d_f.width;
		for(across=0;across<d_f.width;across++){		//multiplies each row
			value = d_f.arrayPtr[across+bheight]*d_sample.arrayPtr[resultIndex + across+cheight];
			if(value>max)
				max = value;
		}
	}
	d_result.arrayPtr[resultIndex]=max;
}

I’m trying to write a simple dilation function here…

Since checkCUDAError prints out “unknown error”, I’m pretty sure the error is in the kernel, but I don’t know where it is.

Or is my kernel just plain wrong?

My CPU version of the code works, and I just tried to do the same thing in my kernel (is that the reason my kernel misbehaves?):

void dilateOnHost(matrix a, matrix b, matrix c, int *result){
	int i,j,down,across,max,value,resultIndex, cheight, bheight;
	for(i=0;i<c.height;i++){	//moves filter down
		for (j=0;j<c.width;j++){	//moves filter across
			resultIndex = j+i*c.width;
			result[resultIndex]=0;
			for(down=0;down<b.height;down++){		//multiplies each column
				max = 0;
				cheight = down*c.width;
				bheight = down*b.width;
				for(across=0;across<b.width;across++){		//multiplies each row
					value = b.arrayPtr[across+bheight]*a.arrayPtr[resultIndex + across+cheight];
					if(value>max)
						max = value;
				}
			}
			result[resultIndex]=max;
		}
	}
}

One suspicion I have right now is that I can’t pass structures into the kernel that way.

So, what is the right way to pass a structure into a kernel?

This bit:

	//copy other values of the structures
	d_sample = h_sample;
	d_f = h_f;
	d_result = h_result;

looks pretty suspicious. Won’t it overwrite your just-allocated and assigned device memory pointers with the host values?
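
One minimal way around that (just a sketch using the variables from your post, assuming h_sample is fully initialized; not necessarily the cleanest fix) is to reverse the order: do the struct assignment first, then overwrite arrayPtr with freshly allocated device memory:

	d_sample = h_sample;	// copies the scalar fields (and, for now, the host arrayPtr)
	cudaMalloc( (void **) &d_sample.arrayPtr, h_sample.bitSize );	// replace arrayPtr with device memory
	cudaMemcpy( d_sample.arrayPtr, h_sample.arrayPtr, h_sample.bitSize, cudaMemcpyHostToDevice );
	// d_sample now holds valid sizes plus a device pointer, and can be passed to the kernel by value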


Aren’t they supposed to be the same to begin with? Hence the cudaMemcpy…


I have used structures many times in CUDA; I generally use structure pointers to do this. Here is an example, hope it helps you.

struct ve_s{
	int idx;
	float real;
};

__global__ void Kernel(…, struct ve_s *sv, …)
{
	sv->real = 34.5f;
}

int main()
{
	struct ve_s vs;
	struct ve_s *d_vs;
	cutilSafeCall( cudaMalloc((void**)&d_vs, sizeof(ve_s)) );
	cutilSafeCall( cudaMemcpy(d_vs, &vs, sizeof(ve_s), cudaMemcpyHostToDevice) );

	Kernel<<< blocksPerGrid, threadsPerBlock >>>(…, d_vs, …);
}
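
If you go this route, remember that the kernel’s writes land in device memory, so you would copy the struct back once the kernel finishes (same sketch, same names as above):

	cutilSafeCall( cudaMemcpy(&vs, d_vs, sizeof(ve_s), cudaMemcpyDeviceToHost) );
	// vs.real is now 34.5 on the host
	cudaFree(d_vs);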


Yes, the same data should eventually reside in both the host and device memory areas, but there are at least two big problems there:

  • avidday is right: the implicit copy constructor for the matrix structure will just set the value of the device arrayPtr to the value of the host arrayPtr, without copying over any of the data that it points to

  • even if you had an explicit matrix copy constructor which copied the elements of the matrix one by one, it would still not work (my guess is it would even crash), because by default the host address space is not mapped into the device address space. Unless you use cudaHostAlloc, copying from host to device involves first copying to a newly created pinned memory area on the host and then setting up a DMA transfer to the GPU; if you also pass cudaHostAllocMapped to cudaHostAlloc, the GPU can instead access the host buffer directly (a sketch of that mapped case follows this list)
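
For reference, the mapped (zero-copy) case looks roughly like this (a minimal sketch; bytes is a placeholder size, the device must support mapped memory, and the cudaSetDeviceFlags call has to happen before any other CUDA call):

	cudaSetDeviceFlags(cudaDeviceMapHost);	// opt in to mapped pinned memory

	int *h_data;	// hypothetical host buffer
	cudaHostAlloc( (void **) &h_data, bytes, cudaHostAllocMapped );

	int *d_data;	// device-side alias of the same allocation
	cudaHostGetDevicePointer( (void **) &d_data, h_data, 0 );

	// a kernel can now read and write d_data directly, with no explicit cudaMemcpy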


Thanks a lot! You saved my morning with this suggestion.