First CUDA trial but unexpected behavior
I wanted to write a small function which looks into an array for the greatest element and returns the ID of it.
Unfortunately it doesn't work, so I created a minimal example which shows that my kernel didn't change the values of the parameters given to it.
I don't know why this happens.

main.cpp
[code]
int DATASIZE = 1024;

// Device buffers for the values and their indices.
float *dValues; CUDA_SAFE_CALL( cudaMalloc( (void **)&dValues, sizeof(float) * DATASIZE) );
int *dIndices; CUDA_SAFE_CALL( cudaMalloc( (void **)&dIndices, sizeof(int) * DATASIZE) );

// Matching host buffers.
float *hValues = (float *)malloc(sizeof(float) * DATASIZE);
int *hIndices = (int *)malloc(sizeof(int) * DATASIZE);

printf("Initializing data...\n");

// value[i] = 2*i, so the true maximum (2046) sits at index 1023.
for(int i = 0; i < DATASIZE; i++) {
hValues[i] = i*2.f;
hIndices[i] = i;
}

CUDA_SAFE_CALL( cudaMemcpy(dValues, hValues, sizeof(float) * DATASIZE, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(dIndices, hIndices, sizeof(int) * DATASIZE, cudaMemcpyHostToDevice) );

// Sentinel outputs: if 1337 survives the call, the kernel never wrote them.
float *Val = (float *)malloc(sizeof(float)); *Val = 1337;
int *ID = (int *)malloc(sizeof(int)); *ID = 1337;

ANN::FindMax(hValues, hIndices,
dValues, dIndices,
Val, ID);

CUDA_SAFE_CALL( cudaFree(dValues) );
CUDA_SAFE_CALL( cudaFree(dIndices) );

// Release the host-side allocations too -- the original leaked all four.
free(hValues);
free(hIndices);
free(Val);
free(ID);[/code]

host function:
[code]void FindMax(float *hValues, int *hIndices,
float *dValues, int *dIndices,
float *pMax, int *pID)
{
// Launches NaiveKernel to find the max of dValues on the GPU and returns
// it via pMax/pID; hValues/hIndices are copied back only for debug output.
printf("Running kernel\n");

// Do calculations and find max on the GPU
int numBlocks = 1;
// NOTE(review): dim3(32, 32) launches a 2D block of 1024 threads, but the
// kernel derives its index from threadIdx.x only, so all 32 y-rows collide
// on elements 0..31 and the rest of the array is never touched -- confirm
// whether a 1D block of 32 (or 1024) threads was intended.
dim3 threadsPerBlock(32, 32);

for(int i = 0; i < 8; i++) {
std::cout<<"host: Result is: "<<hValues[i]<<std::endl;
}

// Single-element device outputs for the max value and its index.
float *dMax; CUDA_SAFE_CALL( cudaMalloc( (void **)&dMax, sizeof(float)) );
int *dID; CUDA_SAFE_CALL( cudaMalloc( (void **)&dID, sizeof(int)) );

CUDA_SAFE_CALL( cudaMemcpy(dMax, pMax, sizeof(float), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(dID, pID, sizeof(int), cudaMemcpyHostToDevice) );

NaiveKernel<<<numBlocks, threadsPerBlock>>>(dValues, dIndices, dMax, dID);
// NOTE(review): kernel launches do not return an error directly -- without a
// cudaGetLastError() check here, a failed launch (e.g. invalid configuration,
// or a kernel symbol resolving to a no-op stub) is silent, and the stale
// sentinel values (1337) are simply copied back unchanged below.
cudaThreadSynchronize();

CUDA_SAFE_CALL( cudaMemcpy(pMax, dMax, sizeof(float), cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy(pID, dID, sizeof(int), cudaMemcpyDeviceToHost) );

// BUG: cudaMemcpy takes a size in BYTES -- sizeof(float)/sizeof(int) copies
// back only ONE element of each array, not all DATASIZE of them, so the
// "device:" debug loop below mostly prints stale host data.
CUDA_SAFE_CALL( cudaMemcpy(hValues, dValues, sizeof(float), cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy(hIndices, dIndices, sizeof(int), cudaMemcpyDeviceToHost) );

// Copy the data back to the host. It's just 1 int
for(int i = 0; i < 8; i++) {
std::cout<<"device: Result is: "<<hValues[i]<<std::endl;
}
std::cout<<"Result is "<<*pMax<<" at index: "<<*pID<<std::endl;[/code]
device function:
[code]__global__ static
void NaiveKernel(float *pValues, int *pIndices,
float *pMaxResult, int *pIndex) {
// Debug kernel: stamps sentinel values into the scalar outputs and -1 into
// pValues so the host can verify the kernel actually ran.
// The host launches one dim3(32, 32) block, so flatten BOTH thread
// dimensions; the original used threadIdx.x only, mapping 32 threads onto
// each of elements 0..31 and leaving the rest of the array untouched.
int ID = blockIdx.x * (blockDim.x * blockDim.y)
       + threadIdx.y * blockDim.x + threadIdx.x;

// Let exactly one thread write the scalar outputs instead of having every
// thread in the block race on the same two addresses.
if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0) {
*pMaxResult = 32;
*pIndex = 32;
}
// NOTE(review): no bounds check is possible -- the element count is not
// passed in; the launch must cover exactly the array size. TODO: add an
// int nSize parameter and guard with if (ID < nSize).
pValues[ID] = -1;
//atomicMax(pMaxResult, pValues[ID]);
}[/code]

Output:
[code]Initializing data...
Initializing data...
Running kernel
host: Result is: 0
host: Result is: 2
host: Result is: 4
host: Result is: 6
host: Result is: 8
host: Result is: 10
host: Result is: 12
host: Result is: 14
device: Result is: 0
device: Result is: 2
device: Result is: 4
device: Result is: 6
device: Result is: 8
device: Result is: 10
device: Result is: 12
device: Result is: 14
Result is 1337 at index: 1337
[/code]
I wanted to write a small function which looks into an array for the greatest element and returns the ID of it.

Unfortunately it doesn't work, so i created an minimum example which shows me, that my kernel didn't change the value of the parameters given.

I dont know why this happens?!



main.cpp



int DATASIZE = 1024;



float *dValues; CUDA_SAFE_CALL( cudaMalloc( (void **)&dValues, sizeof(float) * DATASIZE) );

int *dIndices; CUDA_SAFE_CALL( cudaMalloc( (void **)&dIndices, sizeof(int) * DATASIZE) );



float *hValues = (float *)malloc(sizeof(float) * DATASIZE);

int *hIndices = (int *)malloc(sizeof(int) * DATASIZE);



printf("Initializing data...\n");



for(int i = 0; i < DATASIZE; i++) {

hValues[i] = i*2.f;

hIndices[i] = i;

}



CUDA_SAFE_CALL( cudaMemcpy(dValues, hValues, sizeof(float) * DATASIZE, cudaMemcpyHostToDevice) );

CUDA_SAFE_CALL( cudaMemcpy(dIndices, hIndices, sizeof(int) * DATASIZE, cudaMemcpyHostToDevice) );



float *Val = (float *)malloc(sizeof(float)); *Val = 1337;

int *ID = (int *)malloc(sizeof(int)); *ID = 1337;



ANN::FindMax(hValues, hIndices,

dValues, dIndices,

Val, ID);



CUDA_SAFE_CALL( cudaFree(dValues) );

CUDA_SAFE_CALL( cudaFree(dIndices) );




host function:

void FindMax(float *hValues, int *hIndices, 

float *dValues, int *dIndices,

float *pMax, int *pID)

{

printf("Running kernel\n");



// Do calculations and find max on the GPU

int numBlocks = 1;

dim3 threadsPerBlock(32, 32);



for(int i = 0; i < 8; i++) {

std::cout<<"host: Result is: "<<hValues[i]<<std::endl;

}



float *dMax; CUDA_SAFE_CALL( cudaMalloc( (void **)&dMax, sizeof(float)) );

int *dID; CUDA_SAFE_CALL( cudaMalloc( (void **)&dID, sizeof(int)) );



CUDA_SAFE_CALL( cudaMemcpy(dMax, pMax, sizeof(float), cudaMemcpyHostToDevice) );

CUDA_SAFE_CALL( cudaMemcpy(dID, pID, sizeof(int), cudaMemcpyHostToDevice) );



NaiveKernel<<<numBlocks, threadsPerBlock>>>(dValues, dIndices, dMax, dID);

cudaThreadSynchronize();



CUDA_SAFE_CALL( cudaMemcpy(pMax, dMax, sizeof(float), cudaMemcpyDeviceToHost) );

CUDA_SAFE_CALL( cudaMemcpy(pID, dID, sizeof(int), cudaMemcpyDeviceToHost) );



CUDA_SAFE_CALL( cudaMemcpy(hValues, dValues, sizeof(float), cudaMemcpyDeviceToHost) );

CUDA_SAFE_CALL( cudaMemcpy(hIndices, dIndices, sizeof(int), cudaMemcpyDeviceToHost) );



// Copy the data back to the host. It's just 1 int

for(int i = 0; i < 8; i++) {

std::cout<<"device: Result is: "<<hValues[i]<<std::endl;

}

std::cout<<"Result is "<<*pMax<<" at index: "<<*pID<<std::endl;


device function:

__global__ static

void NaiveKernel(float *pValues, int *pIndices,

float *pMaxResult, int *pIndex) {



int ID = blockIdx.x * blockDim.x + threadIdx.x;

float fResult = pValues[ID];



*pMaxResult = 32;

*pIndex = 32;

pValues[ID] = -1;

//atomicMax(pMaxResult, fResult);

}




Output:

Initializing data...

Initializing data...

Running kernel

host: Result is: 0

host: Result is: 2

host: Result is: 4

host: Result is: 6

host: Result is: 8

host: Result is: 10

host: Result is: 12

host: Result is: 14

device: Result is: 0

device: Result is: 2

device: Result is: 4

device: Result is: 6

device: Result is: 8

device: Result is: 10

device: Result is: 12

device: Result is: 14

Result is 1337 at index: 1337

#1
Posted 03/28/2012 08:05 PM   
Hi,
your code looks OK from the distance, so my guess is that the kernel is actually never called. One thing I find highly suspicious is the "static" you used while defining the kernel itself. If you had to use such a declaration to keep your compiler happy (preventing from a "multiple definition error" for example) then it means that your kernel is actually defined elsewhere as a noop (a .h possibly).
This is just a guess, but I encourage you to double-check this, and ultimately to put all your reproducer code into one single source file to see if it solves your issue.
HTH
Hi,

your code looks OK from the distance, so my guess is that the kernel is actually never called. One thing I find highly suspicious is the "static" you used while defining the kernel itself. If you had to use such a declaration to keep your compiler happy (preventing from a "multiple definition error" for example) then it means that your kernel is actually defined elsewhere as a noop (a .h possibly).

This is just a guess but I encourage you to double-check this, ans ultimately to put all your reproducer code into one single source file to see if it solves your issue.

HTH

#2
Posted 03/29/2012 05:03 AM   
[quote name='Gilles_C' date='29 March 2012 - 05:03 AM' timestamp='1332997423' post='1389382']
Hi,
your code looks OK from the distance, so my guess is that the kernel is actually never called. One thing I find highly suspicious is the "static" you used while defining the kernel itself. If you had to use such a declaration to keep your compiler happy (preventing from a "multiple definition error" for example) then it means that your kernel is actually defined elsewhere as a noop (a .h possibly).
This is just a guess but I encourage you to double-check this, ans ultimately to put all your reproducer code into one single source file to see if it solves your issue.
HTH
[/quote]
Thanks for the reply. I will check this.
At the moment I have written a CPU-based neural net library for creating highly asymmetric nets.
My aim now was to create a wrapper to calculate on GPUs.
So I am trying to keep the ugly GPU-specific code hidden inside a CUDA wrapper class.
Thats why i defined a *.cu file with host functions calling __global__ device functions implemented in another *.cu file, which i included in the *.cu file with the host functions.
The host function have a prototype defined in a *.h file. The call of the host function obviously seems to work, cause i got an output.
I actually wonder, why i have to make the __global__ functions static or inlined to avoid compiler errors.
Maybe NVCC doesn't support the inclusion of *.cu files in *.cu files.
But I didn't declare prototypes for my kernels anywhere else, so I don't know what is happening there.
[quote name='Gilles_C' date='29 March 2012 - 05:03 AM' timestamp='1332997423' post='1389382']

Hi,

your code looks OK from the distance, so my guess is that the kernel is actually never called. One thing I find highly suspicious is the "static" you used while defining the kernel itself. If you had to use such a declaration to keep your compiler happy (preventing from a "multiple definition error" for example) then it means that your kernel is actually defined elsewhere as a noop (a .h possibly).

This is just a guess but I encourage you to double-check this, ans ultimately to put all your reproducer code into one single source file to see if it solves your issue.

HTH



Thanks for the reply. I will check this.

ATM i wrote a CPU-processed neuronal net library, to create highly assysmetric nets.

My aim now was to create a wrapper to calculate on GPUs.

So I try to hide the ugly GPU-stuff hidden in a CUDA-class.

Thats why i defined a *.cu file with host functions calling __global__ device functions implemented in another *.cu file, which i included in the *.cu file with the host functions.

The host function have a prototype defined in a *.h file. The call of the host function obviously seems to work, cause i got an output.

I actually wonder, why i have to make the __global__ functions static or inlined to avoid compiler errors.

Maybe NVCC doesn't support the inclusion of *.cu files in *.cu files.

But I didnt declared prototypes for my kernels somewhere else. Dont know what happens there.

#3
Posted 03/29/2012 12:35 PM   
Avoiding static and implementing the kernel in same *.cu file doesn't help.
CUDA seems to fail. Is there any chance to get CUDA working?

[code]
__global__
void NaiveKernel(float *pValues, int *pIndices,
float *pMaxResult, int *pIndex) {
// We are assuming 256 threads per block
// This function is very simple!
// Too many atomic operations on the same address will cause massive slowdown

// NOTE(review): the host launches dim3(32, 32) -- a 2D block of 1024
// threads -- but this index ignores threadIdx.y entirely, so ID only spans
// 0..31 and 32 threads collide on every element that IS touched.
int ID = blockIdx.x * blockDim.x + threadIdx.x;
// NOTE(review): fResult is read but never used (atomicMax is commented out).
float fResult = pValues[ID];

// NOTE(review): every thread writes these two scalars unconditionally --
// a data race; harmless only because all threads write the same constants.
*pMaxResult = 32;
*pIndex = 32;
pValues[ID] = -1;
//atomicMax(pMaxResult, fResult);
}

// Launches NaiveKernel and copies the scalar results back to pMax/pID.
void FindMax(float *hValues, int *hIndices,
float *dValues, int *dIndices,
float *pMax, int *pID)
{
printf("Running kernel\n");

// Do calculations and find max on the GPU
int numBlocks = 1;
// NOTE(review): a 2D block of 32x32 = 1024 threads, while the kernel only
// indexes with threadIdx.x -- see the index note on NaiveKernel above this
// function in the same snippet.
dim3 threadsPerBlock(32, 32);

for(int i = 0; i < 8; i++) {
std::cout<<"host: Result is: "<<hValues[i]<<std::endl;
}

float *dMax; CUDA_SAFE_CALL( cudaMalloc( (void **)&dMax, sizeof(float)) );
int *dID; CUDA_SAFE_CALL( cudaMalloc( (void **)&dID, sizeof(int)) );

CUDA_SAFE_CALL( cudaMemcpy(dMax, pMax, sizeof(float), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(dID, pID, sizeof(int), cudaMemcpyHostToDevice) );

NaiveKernel<<<numBlocks, threadsPerBlock>>>(dValues, dIndices, dMax, dID);
// NOTE(review): still no cudaGetLastError() after the launch -- a failed
// launch remains invisible and the sentinel outputs come back unchanged.
cudaThreadSynchronize();
//..
[/code]
Avoiding static and implementing the kernel in same *.cu file doesn't help.

CUDA seems to fail. Is there any chance to get CUDA working?





__global__

void NaiveKernel(float *pValues, int *pIndices,

float *pMaxResult, int *pIndex) {

// We are assuming 256 threads per block

// This function is very simple!

// Too many atomic operations on the same address will cause massive slowdown



int ID = blockIdx.x * blockDim.x + threadIdx.x;

float fResult = pValues[ID];



*pMaxResult = 32;

*pIndex = 32;

pValues[ID] = -1;

//atomicMax(pMaxResult, fResult);

}



void FindMax(float *hValues, int *hIndices,

float *dValues, int *dIndices,

float *pMax, int *pID)

{

printf("Running kernel\n");



// Do calculations and find max on the GPU

int numBlocks = 1;

dim3 threadsPerBlock(32, 32);



for(int i = 0; i < 8; i++) {

std::cout<<"host: Result is: "<<hValues[i]<<std::endl;

}



float *dMax; CUDA_SAFE_CALL( cudaMalloc( (void **)&dMax, sizeof(float)) );

int *dID; CUDA_SAFE_CALL( cudaMalloc( (void **)&dID, sizeof(int)) );



CUDA_SAFE_CALL( cudaMemcpy(dMax, pMax, sizeof(float), cudaMemcpyHostToDevice) );

CUDA_SAFE_CALL( cudaMemcpy(dID, pID, sizeof(int), cudaMemcpyHostToDevice) );



NaiveKernel<<<numBlocks, threadsPerBlock>>>(dValues, dIndices, dMax, dID);

cudaThreadSynchronize();

//..

#4
Posted 03/29/2012 03:15 PM   
[quote name='dgrat' date='29 March 2012 - 04:15 PM' timestamp='1333034107' post='1389545']
CUDA seems to fail. Is there any chance to get CUDA working?
[/quote]

Hi,
you should try a good exorcism or a voodoo ritual ...

Seriously i noticed different issues:
1. in cudaMemcpy the size is in bytes, so in FindMax you copy back only one element of dValues and dIndices
2. you launch one 2D block of 32x32 thread so in your kernel ID will be in {0, ..., 31}
3. check for error after the kernel invocation
[quote name='dgrat' date='29 March 2012 - 04:15 PM' timestamp='1333034107' post='1389545']

CUDA seems to fail. Is there any chance to get CUDA working?





Hi,

you should try a good exorcism or a voodoo ritual ...



Seriously i noticed different issues:

1. in cudaMemcpy the size is in bytes, so in FindMax you copy back only one element of dValues and dIndices

2. you launch one 2D block of 32x32 thread so in your kernel ID will be in {0, ..., 31}

3. check for error after the kernel invocation

#5
Posted 03/29/2012 10:47 PM   
Scroll To Top