First CUDA trial but unexpected behavior

I wanted to write a small function which looks into an array for the greatest element and returns the ID of it.

Unfortunately it doesn’t work, so i created an minimum example which shows me, that my kernel didn’t change the value of the parameters given.

I dont know why this happens?!

main.cpp

int DATASIZE = 1024;

float *dValues; CUDA_SAFE_CALL( cudaMalloc( (void **)&dValues, sizeof(float) * DATASIZE) );

    int *dIndices; 	CUDA_SAFE_CALL( cudaMalloc( (void **)&dIndices, sizeof(int) * DATASIZE) );

	float *hValues = (float *)malloc(sizeof(float) * DATASIZE);

	int *hIndices = (int *)malloc(sizeof(int) * DATASIZE);

printf("Initializing data...\n");

	for(int i = 0; i < DATASIZE; i++)	{

		hValues[i] = i*2.f;

		hIndices[i] = i;

	}

	CUDA_SAFE_CALL( cudaMemcpy(dValues, hValues, sizeof(float) * DATASIZE, cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(dIndices, hIndices, sizeof(int) * DATASIZE, cudaMemcpyHostToDevice) );

	float *Val = (float *)malloc(sizeof(float)); *Val = 1337;

	int *ID = (int *)malloc(sizeof(int)); *ID = 1337;

	ANN::FindMax(hValues, hIndices,

				 dValues, dIndices,

				 Val, ID);

	CUDA_SAFE_CALL( cudaFree(dValues) );

	CUDA_SAFE_CALL( cudaFree(dIndices) );

host function:

void FindMax(float *hValues, int *hIndices, 

			 float *dValues, int *dIndices, 

			 float *pMax, int *pID) 

{

	printf("Running kernel\n");

	

	// Do calculations and find max on the GPU

	int numBlocks = 1;

	dim3 threadsPerBlock(32, 32);

	

	for(int i = 0; i < 8; i++) {

		std::cout<<"host: Result is: "<<hValues[i]<<std::endl;

	}

	

	float *dMax; 	CUDA_SAFE_CALL( cudaMalloc( (void **)&dMax, sizeof(float)) );

	int *dID; 		CUDA_SAFE_CALL( cudaMalloc( (void **)&dID, sizeof(int)) );

	

	CUDA_SAFE_CALL( cudaMemcpy(dMax, pMax, sizeof(float), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(dID, pID, sizeof(int), cudaMemcpyHostToDevice) );

	

	NaiveKernel<<<numBlocks, threadsPerBlock>>>(dValues, dIndices, dMax, dID);

	cudaThreadSynchronize();

	

	CUDA_SAFE_CALL( cudaMemcpy(pMax, dMax, sizeof(float), cudaMemcpyDeviceToHost) );

	CUDA_SAFE_CALL( cudaMemcpy(pID, dID, sizeof(int), cudaMemcpyDeviceToHost) );

	

	CUDA_SAFE_CALL( cudaMemcpy(hValues, dValues, sizeof(float), cudaMemcpyDeviceToHost) );

	CUDA_SAFE_CALL( cudaMemcpy(hIndices, dIndices, sizeof(int), cudaMemcpyDeviceToHost) );

	

	// Copy the data back to the host. It's just 1 int

	for(int i = 0; i < 8; i++) {

		std::cout<<"device: Result is: "<<hValues[i]<<std::endl;

	}

	std::cout<<"Result is "<<*pMax<<" at index: "<<*pID<<std::endl;

device function:

__global__ static

void NaiveKernel(float *pValues, int *pIndices, 

				 float *pMaxResult, int *pIndex) {

	

	int ID = blockIdx.x * blockDim.x + threadIdx.x;

	float 	fResult = pValues[ID];

	

	*pMaxResult = 32;

	*pIndex = 32;

	pValues[ID] = -1;

	//atomicMax(pMaxResult, fResult);

}

Output:

Initializing data...

Initializing data...

Running kernel

host: Result is: 0

host: Result is: 2

host: Result is: 4

host: Result is: 6

host: Result is: 8

host: Result is: 10

host: Result is: 12

host: Result is: 14

device: Result is: 0

device: Result is: 2

device: Result is: 4

device: Result is: 6

device: Result is: 8

device: Result is: 10

device: Result is: 12

device: Result is: 14

Result is 1337 at index: 1337

Hi,
your code looks OK from the distance, so my guess is that the kernel is actually never called. One thing I find highly suspicious is the “static” you used while defining the kernel itself. If you had to use such a declaration to keep your compiler happy (preventing from a “multiple definition error” for example) then it means that your kernel is actually defined elsewhere as a noop (a .h possibly).
This is just a guess but I encourage you to double-check this, ans ultimately to put all your reproducer code into one single source file to see if it solves your issue.
HTH

Thanks for the reply. I will check this.

ATM i wrote a CPU-processed neuronal net library, to create highly assysmetric nets.

My aim now was to create a wrapper to calculate on GPUs.

So I try to hide the ugly GPU-stuff hidden in a CUDA-class.

Thats why i defined a *.cu file with host functions calling global device functions implemented in another *.cu file, which i included in the *.cu file with the host functions.

The host function have a prototype defined in a *.h file. The call of the host function obviously seems to work, cause i got an output.

I actually wonder, why i have to make the global functions static or inlined to avoid compiler errors.

Maybe NVCC doesn’t support the inclusion of *.cu files in *.cu files.

But I didnt declared prototypes for my kernels somewhere else. Dont know what happens there.

Avoiding static and implementing the kernel in same *.cu file doesn’t help.

CUDA seems to fail. Is there any chance to get CUDA working?

__global__ 

void NaiveKernel(float *pValues, int *pIndices, 

				 float *pMaxResult, int *pIndex) {

	// We are assuming 256 threads per block

	// This function is very simple!

	// Too many atomic operations on the same address will cause  massive slowdown

	

	int ID = blockIdx.x * blockDim.x + threadIdx.x;

	float 	fResult = pValues[ID];

	

	*pMaxResult = 32;

	*pIndex = 32;

	pValues[ID] = -1;

	//atomicMax(pMaxResult, fResult);

}

void FindMax(float *hValues, int *hIndices, 

			 float *dValues, int *dIndices, 

			 float *pMax, int *pID) 

{

	printf("Running kernel\n");

	

	// Do calculations and find max on the GPU

	int numBlocks = 1;

	dim3 threadsPerBlock(32, 32);

	

	for(int i = 0; i < 8; i++) {

		std::cout<<"host: Result is: "<<hValues[i]<<std::endl;

	}

	

	float *dMax; 	CUDA_SAFE_CALL( cudaMalloc( (void **)&dMax, sizeof(float)) );

	int *dID; 		CUDA_SAFE_CALL( cudaMalloc( (void **)&dID, sizeof(int)) );

	

	CUDA_SAFE_CALL( cudaMemcpy(dMax, pMax, sizeof(float), cudaMemcpyHostToDevice) );

	CUDA_SAFE_CALL( cudaMemcpy(dID, pID, sizeof(int), cudaMemcpyHostToDevice) );

	

	NaiveKernel<<<numBlocks, threadsPerBlock>>>(dValues, dIndices, dMax, dID);

	cudaThreadSynchronize();

//..

Hi,

you should try a good exorcism or a voodoo ritual …

Seriously i noticed different issues:

  1. in cudaMemcpy the size is in bytes, so in FindMax you copy back only one element of dValues and dIndices

  2. you launch one 2D block of 32x32 thread so in your kernel ID will be in {0, …, 31}

  3. check for error after the kernel invocation