Hi, all:
I am trying to code a program that is used for finding a max number and its associated index in a 2D array.
I set the values to a 2D array, then, copy to a 1D array in device. I coded a global function named find_max to find the max number in the 1D array and its associated index in that 1D array. Outputs are 1D array with initialized "0" in all elements. The "0" in one element of the array is replaced by max number located at the associated index.
After I executed program, outputs of 1D array and index of that max number in the input 1D array is incorrect.
I tested the same implementation by for loop in host (CPU), outputs are correct. So, I think there should be some special considerations in this kind of implementation in GPU.
So, I will greatly appreciate you if you tell me the specials.
Dawn
My complete program is below:
#include “cuda_runtime.h”
#include “cuda.h”
#include “helper_functions.h”
#include “helper_cuda.h”
#include <stdio.h>
#include <stdlib.h>
//3 - linear
#define xsize 101
#define ysize 50
#define size xsize*ysize
// define blocks and threads
#define threds 1024 // # of threads per block
#if size%threds == 0
#define blocks (size/threds) // total # of blocks = total # of threads/threads (per block) you want to send to
#else
#define blocks (size/threds +1)
#endif
global void find_max(float *d_data, float *d_peak, int *d_max) {
int i_max = 0;
float tmp;
tmp = d_data[0];
int idx = blockIdx.x*blockDim.x + threadIdx.x;
// find peak value idx by idx
while (idx < size) {
if (tmp < d_data[idx]) {
i_max = idx;
tmp = d_data[i_max];
}
idx += blockDim.x*gridDim.x;
}
// write peak value and associated index for output
d_peak[i_max] = tmp;
*d_max = i_max;
}
int main()
{
int *d_max;
float *d_x, *d_peak;
float x[xsize][ysize], h_data[xsize][ysize];
// alloc memory for device variables and copy variables from host to device
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(float)*size));
checkCudaErrors(cudaMalloc((void **)&d_peak, sizeof(float)*size));
checkCudaErrors(cudaMalloc((void **)&d_max, sizeof(int)));
checkCudaErrors(cudaMemset(d_max, 0, sizeof(int)));
checkCudaErrors(cudaMemset(d_peak, 0.0f, sizeof(float)*size));
int num = 0;
// generate float data in x[][]
for (int i = 0; i < xsize; i++) {
for (int j = 0; j < ysize; j++) {
num = rand() % 100;
x[i][j] = (float)num;
if (i == 50 && j == 25) {
x[i][j] = 1000.00f;
printf("peak number is set at index: \n");
printf("\n i=%d j=%d \n", i, j);
}
printf("%f ", x[i][j]);
}
printf("\n");
}
// copy x from host to d_x in device
checkCudaErrors(cudaMemcpy(d_x, x, sizeof(float)*size, cudaMemcpyHostToDevice));
int h_max;
find_max <<< blocks, threds >>> (d_x, d_peak, d_max);
checkCudaErrors(cudaMemcpy(&h_max, d_max, sizeof(int), cudaMemcpyDeviceToHost));
printf("\n\n");
// write index of the max number in 1D array
printf("max number index=%d \n", h_max);
printf("\n\n");
checkCudaErrors(cudaMemcpy(h_data, d_peak, sizeof(float)*size, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
// write output of 1D array having the max number
for (int i = 0; i < xsize; i++) {
for (int j = 0; j < ysize; j++) {
if (i == 50 && j == 25) {
printf("\n i=%d j=%d \n", i, j);
}
printf("%f ", h_data[i][j]);
}
printf("\n");
}
// clean buffer
// free
//free(x);
//free(h_data);
cudaFree(d_x);
cudaFree(d_peak);
cudaFree(d_max);
return 0;
}