Passing structures into CUDA kernels
Hello, I'm pretty new to programming, and I'm really new to CUDA

Is it possible to pass structures into CUDA kernels?

for example, I have:

[code]struct matrix{int width; int height; int size; int bitSize; int wstart; int hstart; int *arrayPtr;};

int main(){

struct matrix h_sample, h_f, h_result;
struct matrix d_sample, d_f, d_result;

//then I assign each parameter of h_sample and h_f values, and an array for arrayPtr
//allocate host memory, etc, etc

//allocate device memory
cudaMalloc( (void **) &d_sample.arrayPtr, d_sample.bitSize);
cudaMalloc( (void **) &d_f.arrayPtr, d_f.bitSize);
cudaMalloc( (void **) &d_result.arrayPtr, d_result.bitSize);

//copy the arrays
cudaMemcpy( d_sample.arrayPtr, h_sample.arrayPtr, h_sample.bitSize, cudaMemcpyHostToDevice );
cudaMemcpy( d_f.arrayPtr, h_f.arrayPtr, h_f.bitSize, cudaMemcpyHostToDevice );
cudaMemcpy( d_result.arrayPtr, h_result.arrayPtr, h_result.bitSize, cudaMemcpyHostToDevice );

//copy other values of the structures
d_sample = h_sample;
d_f = h_f;
d_result = h_result;

int numThreadsPerBlock = 4;
dim3 dimGrid(68/numThreadsPerBlock, 68);
dim3 dimBlock(numThreadsPerBlock);

//launch kernel on Device
dilateOnDevice<<< dimGrid, dimBlock >>>(d_sample, d_f, d_result);

// block until the device has completed
cudaThreadSynchronize();

// Check for any CUDA errors
checkCUDAError("kernel invocation");

.....
}[/code]
//my program always quits at this point! :verymad:

my function for checkCUDAerror and my kernel are:
[code]void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
system("pause");
exit(EXIT_FAILURE);
}
}

//my kernel:

__global__ void dilateOnDevice(matrix d_sample, matrix d_f, matrix d_result)
{
int i,j,down,across,max,value,resultIndex, cheight, bheight;

i = blockDim.x*blockIdx.x+threadIdx.x;
j = blockDim.y*blockIdx.y+threadIdx.y;

resultIndex = j+i*d_result.width;
d_result.arrayPtr[resultIndex]=0;
for(down=0;down<d_f.height;down++){ //multiplies each column
max = 0;
cheight = down*d_result.width;
bheight = down*d_f.width;
for(across=0;across<d_f.width;across++){ //multiplies each row
value = d_f.arrayPtr[across+bheight]*d_sample.arrayPtr[resultIndex + across+cheight];
if(value>max)
max = value;
}
}
d_result.arrayPtr[resultIndex]=max;
}[/code]

I'm trying to write a simple dilation function here...

since checkCUDAerror print out "unknown error" I'm pretty sure the error is in the kernel, but I don't know where it is?
or is my kernel just retarded?

my CPU version of the code is working, and I just tried to do the same for my kernel (is this the reason my kernel is being retarded??):
[code]void dilateOnHost(matrix a, matrix b, matrix c,int *result){
int i,j,down,across,max,value,resultIndex, cheight, bheight;

for(i=0;i<c.height;i++){ //moves filter down
for (j=0;j<c.width;j++){ //moves filter across
resultIndex = j+i*c.width;
result[resultIndex]=0;
for(down=0;down<b.height;down++){ //multiplies each column
max = 0;
cheight = down*c.width;
bheight = down*b.width;
for(across=0;across<b.width;across++){ //multiplies each row
value = b.arrayPtr[across+bheight]*a.arrayPtr[resultIndex + across+cheight];
if(value>max)
max = value;
}
}
result[resultIndex]=max;
}
}
}[/code]

one suspicion I have right now is that I can't pass structure into the kernel that way.
so, what is the right way to pass structure into the kernel??
Hello, I'm pretty new to programming, and I'm really new to CUDA



Is it possible to pass structures into CUDA kernels?



for example, I have:



struct matrix{int width; int height; int size; int bitSize; int wstart; int hstart; int *arrayPtr;};



int main(){



struct matrix h_sample, h_f, h_result;

struct matrix d_sample, d_f, d_result;



//then I assign each parameter of h_sample and h_f values, and an array for arrayPtr

//allocate host memory, etc, etc



//allocate device memory

cudaMalloc( (void **) &d_sample.arrayPtr, d_sample.bitSize);

cudaMalloc( (void **) &d_f.arrayPtr, d_f.bitSize);

cudaMalloc( (void **) &d_result.arrayPtr, d_result.bitSize);



//copy the arrays

cudaMemcpy( d_sample.arrayPtr, h_sample.arrayPtr, h_sample.bitSize, cudaMemcpyHostToDevice );

cudaMemcpy( d_f.arrayPtr, h_f.arrayPtr, h_f.bitSize, cudaMemcpyHostToDevice );

cudaMemcpy( d_result.arrayPtr, h_result.arrayPtr, h_result.bitSize, cudaMemcpyHostToDevice );



//copy other values of the structures

d_sample = h_sample;

d_f = h_f;

d_result = h_result;



int numThreadsPerBlock = 4;

dim3 dimGrid(68/numThreadsPerBlock, 68);

dim3 dimBlock(numThreadsPerBlock);



//launch kernel on Device

dilateOnDevice<<< dimGrid, dimBlock >>>(d_sample, d_f, d_result);



// block until the device has completed

cudaThreadSynchronize();



// Check for any CUDA errors

checkCUDAError("kernel invocation");



.....

}


//my program always quits at this point! :verymad:



my function for checkCUDAerror and my kernel are:

void checkCUDAError(const char *msg)

{

cudaError_t err = cudaGetLastError();

if( cudaSuccess != err)

{

fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );

system("pause");

exit(EXIT_FAILURE);

}

}



//my kernel:



__global__ void dilateOnDevice(matrix d_sample, matrix d_f, matrix d_result)

{

int i,j,down,across,max,value,resultIndex, cheight, bheight;



i = blockDim.x*blockIdx.x+threadIdx.x;

j = blockDim.y*blockIdx.y+threadIdx.y;



resultIndex = j+i*d_result.width;

d_result.arrayPtr[resultIndex]=0;

for(down=0;down<d_f.height;down++){ //multiplies each column

max = 0;

cheight = down*d_result.width;

bheight = down*d_f.width;

for(across=0;across<d_f.width;across++){ //multiplies each row

value = d_f.arrayPtr[across+bheight]*d_sample.arrayPtr[resultIndex + across+cheight];

if(value>max)

max = value;

}

}

d_result.arrayPtr[resultIndex]=max;

}




I'm trying to write a simple dilation function here...



since checkCUDAerror print out "unknown error" I'm pretty sure the error is in the kernel, but I don't know where it is?

or is my kernel just retarded?



my CPU version of the code is working, and I just tried to do the same for my kernel (is this the reason my kernel is being retarded??):

void dilateOnHost(matrix a, matrix b, matrix c,int *result){

int i,j,down,across,max,value,resultIndex, cheight, bheight;



for(i=0;i<c.height;i++){ //moves filter down

for (j=0;j<c.width;j++){ //moves filter across

resultIndex = j+i*c.width;

result[resultIndex]=0;

for(down=0;down<b.height;down++){ //multiplies each column

max = 0;

cheight = down*c.width;

bheight = down*b.width;

for(across=0;across<b.width;across++){ //multiplies each row

value = b.arrayPtr[across+bheight]*a.arrayPtr[resultIndex + across+cheight];

if(value>max)

max = value;

}

}

result[resultIndex]=max;

}

}

}




one suspicion I have right now is that I can't pass structure into the kernel that way.

so, what is the right way to pass structure into the kernel??

#1
Posted 09/17/2010 11:41 PM   
Hello, I'm pretty new to programming, and I'm really new to CUDA

Is it possible to pass structures into CUDA kernels?

for example, I have:

[code]struct matrix{int width; int height; int size; int bitSize; int wstart; int hstart; int *arrayPtr;};

int main(){

struct matrix h_sample, h_f, h_result;
struct matrix d_sample, d_f, d_result;

//then I assign each parameter of h_sample and h_f values, and an array for arrayPtr
//allocate host memory, etc, etc

//allocate device memory
cudaMalloc( (void **) &d_sample.arrayPtr, d_sample.bitSize);
cudaMalloc( (void **) &d_f.arrayPtr, d_f.bitSize);
cudaMalloc( (void **) &d_result.arrayPtr, d_result.bitSize);

//copy the arrays
cudaMemcpy( d_sample.arrayPtr, h_sample.arrayPtr, h_sample.bitSize, cudaMemcpyHostToDevice );
cudaMemcpy( d_f.arrayPtr, h_f.arrayPtr, h_f.bitSize, cudaMemcpyHostToDevice );
cudaMemcpy( d_result.arrayPtr, h_result.arrayPtr, h_result.bitSize, cudaMemcpyHostToDevice );

//copy other values of the structures
d_sample = h_sample;
d_f = h_f;
d_result = h_result;

int numThreadsPerBlock = 4;
dim3 dimGrid(68/numThreadsPerBlock, 68);
dim3 dimBlock(numThreadsPerBlock);

//launch kernel on Device
dilateOnDevice<<< dimGrid, dimBlock >>>(d_sample, d_f, d_result);

// block until the device has completed
cudaThreadSynchronize();

// Check for any CUDA errors
checkCUDAError("kernel invocation");

.....
}[/code]
//my program always quits at this point! :verymad:

my function for checkCUDAerror and my kernel are:
[code]void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
system("pause");
exit(EXIT_FAILURE);
}
}

//my kernel:

__global__ void dilateOnDevice(matrix d_sample, matrix d_f, matrix d_result)
{
int i,j,down,across,max,value,resultIndex, cheight, bheight;

i = blockDim.x*blockIdx.x+threadIdx.x;
j = blockDim.y*blockIdx.y+threadIdx.y;

resultIndex = j+i*d_result.width;
d_result.arrayPtr[resultIndex]=0;
for(down=0;down<d_f.height;down++){ //multiplies each column
max = 0;
cheight = down*d_result.width;
bheight = down*d_f.width;
for(across=0;across<d_f.width;across++){ //multiplies each row
value = d_f.arrayPtr[across+bheight]*d_sample.arrayPtr[resultIndex + across+cheight];
if(value>max)
max = value;
}
}
d_result.arrayPtr[resultIndex]=max;
}[/code]

I'm trying to write a simple dilation function here...

since checkCUDAerror print out "unknown error" I'm pretty sure the error is in the kernel, but I don't know where it is?
or is my kernel just retarded?

my CPU version of the code is working, and I just tried to do the same for my kernel (is this the reason my kernel is being retarded??):
[code]void dilateOnHost(matrix a, matrix b, matrix c,int *result){
int i,j,down,across,max,value,resultIndex, cheight, bheight;

for(i=0;i<c.height;i++){ //moves filter down
for (j=0;j<c.width;j++){ //moves filter across
resultIndex = j+i*c.width;
result[resultIndex]=0;
for(down=0;down<b.height;down++){ //multiplies each column
max = 0;
cheight = down*c.width;
bheight = down*b.width;
for(across=0;across<b.width;across++){ //multiplies each row
value = b.arrayPtr[across+bheight]*a.arrayPtr[resultIndex + across+cheight];
if(value>max)
max = value;
}
}
result[resultIndex]=max;
}
}
}[/code]

one suspicion I have right now is that I can't pass structure into the kernel that way.
so, what is the right way to pass structure into the kernel??
Hello, I'm pretty new to programming, and I'm really new to CUDA



Is it possible to pass structures into CUDA kernels?



for example, I have:



struct matrix{int width; int height; int size; int bitSize; int wstart; int hstart; int *arrayPtr;};



int main(){



struct matrix h_sample, h_f, h_result;

struct matrix d_sample, d_f, d_result;



//then I assign each parameter of h_sample and h_f values, and an array for arrayPtr

//allocate host memory, etc, etc



//allocate device memory

cudaMalloc( (void **) &d_sample.arrayPtr, d_sample.bitSize);

cudaMalloc( (void **) &d_f.arrayPtr, d_f.bitSize);

cudaMalloc( (void **) &d_result.arrayPtr, d_result.bitSize);



//copy the arrays

cudaMemcpy( d_sample.arrayPtr, h_sample.arrayPtr, h_sample.bitSize, cudaMemcpyHostToDevice );

cudaMemcpy( d_f.arrayPtr, h_f.arrayPtr, h_f.bitSize, cudaMemcpyHostToDevice );

cudaMemcpy( d_result.arrayPtr, h_result.arrayPtr, h_result.bitSize, cudaMemcpyHostToDevice );



//copy other values of the structures

d_sample = h_sample;

d_f = h_f;

d_result = h_result;



int numThreadsPerBlock = 4;

dim3 dimGrid(68/numThreadsPerBlock, 68);

dim3 dimBlock(numThreadsPerBlock);



//launch kernel on Device

dilateOnDevice<<< dimGrid, dimBlock >>>(d_sample, d_f, d_result);



// block until the device has completed

cudaThreadSynchronize();



// Check for any CUDA errors

checkCUDAError("kernel invocation");



.....

}


//my program always quits at this point! :verymad:



my function for checkCUDAerror and my kernel are:

void checkCUDAError(const char *msg)

{

cudaError_t err = cudaGetLastError();

if( cudaSuccess != err)

{

fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );

system("pause");

exit(EXIT_FAILURE);

}

}



//my kernel:



__global__ void dilateOnDevice(matrix d_sample, matrix d_f, matrix d_result)

{

int i,j,down,across,max,value,resultIndex, cheight, bheight;



i = blockDim.x*blockIdx.x+threadIdx.x;

j = blockDim.y*blockIdx.y+threadIdx.y;



resultIndex = j+i*d_result.width;

d_result.arrayPtr[resultIndex]=0;

for(down=0;down<d_f.height;down++){ //multiplies each column

max = 0;

cheight = down*d_result.width;

bheight = down*d_f.width;

for(across=0;across<d_f.width;across++){ //multiplies each row

value = d_f.arrayPtr[across+bheight]*d_sample.arrayPtr[resultIndex + across+cheight];

if(value>max)

max = value;

}

}

d_result.arrayPtr[resultIndex]=max;

}




I'm trying to write a simple dilation function here...



since checkCUDAerror print out "unknown error" I'm pretty sure the error is in the kernel, but I don't know where it is?

or is my kernel just retarded?



my CPU version of the code is working, and I just tried to do the same for my kernel (is this the reason my kernel is being retarded??):

void dilateOnHost(matrix a, matrix b, matrix c,int *result){

int i,j,down,across,max,value,resultIndex, cheight, bheight;



for(i=0;i<c.height;i++){ //moves filter down

for (j=0;j<c.width;j++){ //moves filter across

resultIndex = j+i*c.width;

result[resultIndex]=0;

for(down=0;down<b.height;down++){ //multiplies each column

max = 0;

cheight = down*c.width;

bheight = down*b.width;

for(across=0;across<b.width;across++){ //multiplies each row

value = b.arrayPtr[across+bheight]*a.arrayPtr[resultIndex + across+cheight];

if(value>max)

max = value;

}

}

result[resultIndex]=max;

}

}

}




one suspicion I have right now is that I can't pass structure into the kernel that way.

so, what is the right way to pass structure into the kernel??

#2
Posted 09/17/2010 11:41 PM   
This bit:
[code] //copy other values of the structures
d_sample = h_sample;
d_f = h_f;
d_result = h_result;[/code]

looks pretty suspicious. Won't it overwrite your just allocated and assigned device memory pointers with the host values?
This bit:

//copy other values of the structures

d_sample = h_sample;

d_f = h_f;

d_result = h_result;




looks pretty suspicious. Won't it overwrite your just allocated and assigned device memory pointers with the host values?

#3
Posted 09/18/2010 07:17 AM   
This bit:
[code] //copy other values of the structures
d_sample = h_sample;
d_f = h_f;
d_result = h_result;[/code]

looks pretty suspicious. Won't it overwrite your just allocated and assigned device memory pointers with the host values?
This bit:

//copy other values of the structures

d_sample = h_sample;

d_f = h_f;

d_result = h_result;




looks pretty suspicious. Won't it overwrite your just allocated and assigned device memory pointers with the host values?

#4
Posted 09/18/2010 07:17 AM   
[quote name='avidday' post='1119120' date='Sep 18 2010, 12:17 AM']This bit:
[code] //copy other values of the structures
d_sample = h_sample;
d_f = h_f;
d_result = h_result;[/code]

looks pretty suspicious. Won't it overwrite your just allocated and assigned device memory pointers with the host values?[/quote]

aren't they supposed to be the same to begin with? hence the cudaMemCpy...
[quote name='avidday' post='1119120' date='Sep 18 2010, 12:17 AM']This bit:

//copy other values of the structures

d_sample = h_sample;

d_f = h_f;

d_result = h_result;




looks pretty suspicious. Won't it overwrite your just allocated and assigned device memory pointers with the host values?



aren't they supposed to be the same to begin with? hence the cudaMemCpy...

#5
Posted 09/20/2010 09:30 PM   
[quote name='avidday' post='1119120' date='Sep 18 2010, 12:17 AM']This bit:
[code] //copy other values of the structures
d_sample = h_sample;
d_f = h_f;
d_result = h_result;[/code]

looks pretty suspicious. Won't it overwrite your just allocated and assigned device memory pointers with the host values?[/quote]

aren't they supposed to be the same to begin with? hence the cudaMemCpy...
[quote name='avidday' post='1119120' date='Sep 18 2010, 12:17 AM']This bit:

//copy other values of the structures

d_sample = h_sample;

d_f = h_f;

d_result = h_result;




looks pretty suspicious. Won't it overwrite your just allocated and assigned device memory pointers with the host values?



aren't they supposed to be the same to begin with? hence the cudaMemCpy...

#6
Posted 09/20/2010 09:30 PM   
I have used structures many times in CUDA. I generally used structure pointers to do this. Here I show you an example. Hope it helps you.

struct ve_s{
int idx;
float real;
}

__global__ void Kernel(...., struct ve_s* sv, .....)
{
....
sv->real = 34.5;
....
}

main()
{
struct ve_s vs;
cutilSafeCall( cudaMalloc((void**)&d_vs, sizeof(ve_s)) );
cutilSafeCall( cudaMemcpy(d_vs, &vs, sizeof(ve_s), cudaMemcpyHostToDevice) );

Kernel<<< blocksPerGrid, threadsPerBlock>>>(..., d_vs,....);
}
I have used structures many times in CUDA. I generally used structure pointers to do this. Here I show you an example. Hope it helps you.



struct ve_s{

int idx;

float real;

}



__global__ void Kernel(...., struct ve_s* sv, .....)

{

....

sv->real = 34.5;

....

}



main()

{

struct ve_s vs;

cutilSafeCall( cudaMalloc((void**)&d_vs, sizeof(ve_s)) );

cutilSafeCall( cudaMemcpy(d_vs, &vs, sizeof(ve_s), cudaMemcpyHostToDevice) );



Kernel<<< blocksPerGrid, threadsPerBlock>>>(..., d_vs,....);

}

#7
Posted 09/24/2010 01:19 PM   
I have used structures many times in CUDA. I generally used structure pointers to do this. Here I show you an example. Hope it helps you.

struct ve_s{
int idx;
float real;
}

__global__ void Kernel(...., struct ve_s* sv, .....)
{
....
sv->real = 34.5;
....
}

main()
{
struct ve_s vs;
cutilSafeCall( cudaMalloc((void**)&d_vs, sizeof(ve_s)) );
cutilSafeCall( cudaMemcpy(d_vs, &vs, sizeof(ve_s), cudaMemcpyHostToDevice) );

Kernel<<< blocksPerGrid, threadsPerBlock>>>(..., d_vs,....);
}
I have used structures many times in CUDA. I generally used structure pointers to do this. Here I show you an example. Hope it helps you.



struct ve_s{

int idx;

float real;

}



__global__ void Kernel(...., struct ve_s* sv, .....)

{

....

sv->real = 34.5;

....

}



main()

{

struct ve_s vs;

cutilSafeCall( cudaMalloc((void**)&d_vs, sizeof(ve_s)) );

cutilSafeCall( cudaMemcpy(d_vs, &vs, sizeof(ve_s), cudaMemcpyHostToDevice) );



Kernel<<< blocksPerGrid, threadsPerBlock>>>(..., d_vs,....);

}

#8
Posted 09/24/2010 01:19 PM   
[quote name='KChou' post='1120274' date='Sep 21 2010, 12:30 AM']aren't they supposed to be the same to begin with? hence the cudaMemCpy...[/quote]Yes the same data should eventually reside at both the host and device memory areas but there are at least two big problems there:
- avidday is right, the implicit copy constructor for the matrix structure will just set the value of the device arrayPtr to the value of the host arrayPtr _without copying over any of the data that it points to_
- even if you had an explicit matrix copy constructor which copies the elements of the matrix one by one it would still not work (my guess is it would crash even) as by default the host address space is not mapped into the device address space so unless you use cudaHostAlloc, copying from host to device involves first copying to a newly created pinned memory area on the host and then, unless you also pass the cudaHostAllocMapped to cudaHostAlloc, setting up a DMA transfer to the GPU
[quote name='KChou' post='1120274' date='Sep 21 2010, 12:30 AM']aren't they supposed to be the same to begin with? hence the cudaMemCpy...[/quote]Yes the same data should eventually reside at both the host and device memory areas but there are at least two big problems there:

- avidday is right, the implicit copy constructor for the matrix structure will just set the value of the device arrayPtr to the value of the host arrayPtr _without copying over any of the data that it points to_

- even if you had an explicit matrix copy constructor which copies the elements of the matrix one by one it would still not work (my guess is it would crash even) as by default the host address space is not mapped into the device address space so unless you use cudaHostAlloc, copying from host to device involves first copying to a newly created pinned memory area on the host and then, unless you also pass the cudaHostAllocMapped to cudaHostAlloc, setting up a DMA transfer to the GPU

#9
Posted 09/25/2010 05:27 PM   
[quote name='KChou' post='1120274' date='Sep 21 2010, 12:30 AM']aren't they supposed to be the same to begin with? hence the cudaMemCpy...[/quote]Yes the same data should eventually reside at both the host and device memory areas but there are at least two big problems there:
- avidday is right, the implicit copy constructor for the matrix structure will just set the value of the device arrayPtr to the value of the host arrayPtr _without copying over any of the data that it points to_
- even if you had an explicit matrix copy constructor which copies the elements of the matrix one by one it would still not work (my guess is it would crash even) as by default the host address space is not mapped into the device address space so unless you use cudaHostAlloc, copying from host to device involves first copying to a newly created pinned memory area on the host and then, unless you also pass the cudaHostAllocMapped to cudaHostAlloc, setting up a DMA transfer to the GPU
[quote name='KChou' post='1120274' date='Sep 21 2010, 12:30 AM']aren't they supposed to be the same to begin with? hence the cudaMemCpy...[/quote]Yes the same data should eventually reside at both the host and device memory areas but there are at least two big problems there:

- avidday is right, the implicit copy constructor for the matrix structure will just set the value of the device arrayPtr to the value of the host arrayPtr _without copying over any of the data that it points to_

- even if you had an explicit matrix copy constructor which copies the elements of the matrix one by one it would still not work (my guess is it would crash even) as by default the host address space is not mapped into the device address space so unless you use cudaHostAlloc, copying from host to device involves first copying to a newly created pinned memory area on the host and then, unless you also pass the cudaHostAllocMapped to cudaHostAlloc, setting up a DMA transfer to the GPU

#10
Posted 09/25/2010 05:27 PM   
Scroll To Top