I wanted to write a small function which looks into an array for the greatest element and returns the ID of it.
Unfortunately it doesn’t work, so i created an minimum example which shows me, that my kernel didn’t change the value of the parameters given.
I dont know why this happens?!
main.cpp
int DATASIZE = 1024;
float *dValues; CUDA_SAFE_CALL( cudaMalloc( (void **)&dValues, sizeof(float) * DATASIZE) );
int *dIndices; CUDA_SAFE_CALL( cudaMalloc( (void **)&dIndices, sizeof(int) * DATASIZE) );
float *hValues = (float *)malloc(sizeof(float) * DATASIZE);
int *hIndices = (int *)malloc(sizeof(int) * DATASIZE);
printf("Initializing data...\n");
for(int i = 0; i < DATASIZE; i++) {
hValues[i] = i*2.f;
hIndices[i] = i;
}
CUDA_SAFE_CALL( cudaMemcpy(dValues, hValues, sizeof(float) * DATASIZE, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(dIndices, hIndices, sizeof(int) * DATASIZE, cudaMemcpyHostToDevice) );
float *Val = (float *)malloc(sizeof(float)); *Val = 1337;
int *ID = (int *)malloc(sizeof(int)); *ID = 1337;
ANN::FindMax(hValues, hIndices,
dValues, dIndices,
Val, ID);
CUDA_SAFE_CALL( cudaFree(dValues) );
CUDA_SAFE_CALL( cudaFree(dIndices) );
host function:
void FindMax(float *hValues, int *hIndices,
float *dValues, int *dIndices,
float *pMax, int *pID)
{
printf("Running kernel\n");
// Do calculations and find max on the GPU
int numBlocks = 1;
dim3 threadsPerBlock(32, 32);
for(int i = 0; i < 8; i++) {
std::cout<<"host: Result is: "<<hValues[i]<<std::endl;
}
float *dMax; CUDA_SAFE_CALL( cudaMalloc( (void **)&dMax, sizeof(float)) );
int *dID; CUDA_SAFE_CALL( cudaMalloc( (void **)&dID, sizeof(int)) );
CUDA_SAFE_CALL( cudaMemcpy(dMax, pMax, sizeof(float), cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(dID, pID, sizeof(int), cudaMemcpyHostToDevice) );
NaiveKernel<<<numBlocks, threadsPerBlock>>>(dValues, dIndices, dMax, dID);
cudaThreadSynchronize();
CUDA_SAFE_CALL( cudaMemcpy(pMax, dMax, sizeof(float), cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy(pID, dID, sizeof(int), cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy(hValues, dValues, sizeof(float), cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy(hIndices, dIndices, sizeof(int), cudaMemcpyDeviceToHost) );
// Copy the data back to the host. It's just 1 int
for(int i = 0; i < 8; i++) {
std::cout<<"device: Result is: "<<hValues[i]<<std::endl;
}
std::cout<<"Result is "<<*pMax<<" at index: "<<*pID<<std::endl;
device function:
__global__ static
void NaiveKernel(float *pValues, int *pIndices,
float *pMaxResult, int *pIndex) {
int ID = blockIdx.x * blockDim.x + threadIdx.x;
float fResult = pValues[ID];
*pMaxResult = 32;
*pIndex = 32;
pValues[ID] = -1;
//atomicMax(pMaxResult, fResult);
}
Output:
Initializing data...
Initializing data...
Running kernel
host: Result is: 0
host: Result is: 2
host: Result is: 4
host: Result is: 6
host: Result is: 8
host: Result is: 10
host: Result is: 12
host: Result is: 14
device: Result is: 0
device: Result is: 2
device: Result is: 4
device: Result is: 6
device: Result is: 8
device: Result is: 10
device: Result is: 12
device: Result is: 14
Result is 1337 at index: 1337