Hello, I’m pretty new to programming, and I’m really new to CUDA
Is it possible to pass structures into CUDA kernels?
for example, I have:
struct matrix{int width; int height; int size; int bitSize; int wstart; int hstart; int *arrayPtr;};
int main(){
struct matrix h_sample, h_f, h_result;
struct matrix d_sample, d_f, d_result;
//then I assign each parameter of h_sample and h_f values, and an array for arrayPtr
//allocate host memory, etc, etc
//allocate device memory
cudaMalloc( (void **) &d_sample.arrayPtr, d_sample.bitSize);
cudaMalloc( (void **) &d_f.arrayPtr, d_f.bitSize);
cudaMalloc( (void **) &d_result.arrayPtr, d_result.bitSize);
//copy the arrays
cudaMemcpy( d_sample.arrayPtr, h_sample.arrayPtr, h_sample.bitSize, cudaMemcpyHostToDevice );
cudaMemcpy( d_f.arrayPtr, h_f.arrayPtr, h_f.bitSize, cudaMemcpyHostToDevice );
cudaMemcpy( d_result.arrayPtr, h_result.arrayPtr, h_result.bitSize, cudaMemcpyHostToDevice );
//copy other values of the structures
d_sample = h_sample;
d_f = h_f;
d_result = h_result;
int numThreadsPerBlock = 4;
dim3 dimGrid(68/numThreadsPerBlock, 68);
dim3 dimBlock(numThreadsPerBlock);
//launch kernel on Device
dilateOnDevice<<< dimGrid, dimBlock >>>(d_sample, d_f, d_result);
// block until the device has completed
cudaThreadSynchronize();
// Check for any CUDA errors
checkCUDAError("kernel invocation");
.....
}
//my program always quits at this point! External Image
my function for checkCUDAerror and my kernel are:
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
system("pause");
exit(EXIT_FAILURE);
}
}
//my kernel:
__global__ void dilateOnDevice(matrix d_sample, matrix d_f, matrix d_result)
{
int i,j,down,across,max,value,resultIndex, cheight, bheight;
i = blockDim.x*blockIdx.x+threadIdx.x;
j = blockDim.y*blockIdx.y+threadIdx.y;
resultIndex = j+i*d_result.width;
d_result.arrayPtr[resultIndex]=0;
for(down=0;down<d_f.height;down++){ //multiplies each column
max = 0;
cheight = down*d_result.width;
bheight = down*d_f.width;
for(across=0;across<d_f.width;across++){ //multiplies each row
value = d_f.arrayPtr[across+bheight]*d_sample.arrayPtr[resultIndex + across+cheight];
if(value>max)
max = value;
}
}
d_result.arrayPtr[resultIndex]=max;
}
I’m trying to write a simple dilation function here…
since checkCUDAerror print out “unknown error” I’m pretty sure the error is in the kernel, but I don’t know where it is?
or is my kernel just retarded?
my CPU version of the code is working, and I just tried to do the same for my kernel (is this the reason my kernel is being retarded??):
void dilateOnHost(matrix a, matrix b, matrix c,int *result){
int i,j,down,across,max,value,resultIndex, cheight, bheight;
for(i=0;i<c.height;i++){ //moves filter down
for (j=0;j<c.width;j++){ //moves filter across
resultIndex = j+i*c.width;
result[resultIndex]=0;
for(down=0;down<b.height;down++){ //multiplies each column
max = 0;
cheight = down*c.width;
bheight = down*b.width;
for(across=0;across<b.width;across++){ //multiplies each row
value = b.arrayPtr[across+bheight]*a.arrayPtr[resultIndex + across+cheight];
if(value>max)
max = value;
}
}
result[resultIndex]=max;
}
}
}
one suspicion I have right now is that I can’t pass structure into the kernel that way.
so, what is the right way to pass structure into the kernel??