I want to do simple program of summation of prime no from 1 to 50,000 in CUDA, so I create one kernel to find prime no. and sum of it using reduction. But due to some reasons ,it is giving garbage value of sum as a output. what is wrong in following code?? or should I create two different kernel??
#include<stdio.h>
#include<cuda.h>
#include<time.h>
#define SIZE 10
__global__ void prime(int *ga, int *gb, int *gc) //gb=output, gc=sum
{
int k = threadIdx.x + (blockDim.x * blockIdx.x);
int flag, j;
if (k < SIZE)
{
flag = 0;
for (j = 2; j <= k / 2; j++)
{
if (ga[k] % j == 0)
{
flag = 1;
break;
}
}
if (flag == 0)
{
gb[k] = ga[k];
}
}
int id = threadIdx.x;
extern __shared__ int s[];
s[id] = 0;
__syncthreads();
if (id<SIZE)
{
s[id] = gb[k]; //ga??
}__syncthreads();
if (blockDim.x >= 2048){
if (id < 1024){
s[id] = s[id] + s[id + 512]; //1024??
}
__syncthreads();
}
if (blockDim.x >= 1024){
if (id < 512){
s[id] = s[id] + s[id + 512];
}
__syncthreads();
}
if (blockDim.x >= 512){
if (id < 256)
{
s[id] = s[id] + s[id + 256];
}
__syncthreads();
}
if (blockDim.x >= 256){
if (id < 128)
{
s[id] = s[id] + s[id + 128];
}
__syncthreads();
}
if (blockDim.x >= 128){
if (id < 64)
{
s[id] = s[id] + s[id + 64];
}
__syncthreads();
}
//if this is the last warp
if (id < 32){
if (blockDim.x >= 64)
{
s[id] = s[id] + s[id + 32];
}
if (blockDim.x >= 32)
{
s[id] = s[id] + s[id + 16];
}
if (blockDim.x >= 16)
{
s[id] = s[id] + s[id + 8];
}
if (blockDim.x >= 8)
{
s[id] = s[id] + s[id + 4];
}
if (blockDim.x >= 4)
{
s[id] = s[id] + s[id + 2];
}
if (blockDim.x >= 2)
{
s[id] = s[id] + s[id + 1];
}
}
if (id == 0)
{
atomicAdd(gc, s[0]);
}
}
int main()
{
int i, count = 0, sum = 0;
int *ga, *gb, *gc, *ha, *add=0;
cudaError_t err;
cudaEvent_t start, stop;
float elapseTime;
int blocks, blocksize = 512;
int sh_size = (blocksize * sizeof(int));
ha = (int*)malloc(SIZE*sizeof(int));
add = (int*)malloc(SIZE*sizeof(int));
for (i = 2; i < SIZE; i++)
{
ha[i] = i;
}
if (cudaSuccess != (err = (cudaMalloc((void**)&ga, SIZE*sizeof(int)))))
{
printf("error in cuaMalloc %s", (char *)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
if (cudaSuccess != (err = (cudaMalloc((void**)&gb, SIZE*sizeof(int)))))
{
printf("error in cuaMalloc %s", (char *)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
if (cudaSuccess != (err = (cudaMalloc((void**)&gc, SIZE*sizeof(int)))))
{
printf("error in cuaMalloc %s", (char *)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
blocks = SIZE / blocksize;
if (SIZE% blocksize != 0)
blocks++;
if (cudaSuccess != (err = (cudaMemcpy(ga, ha, SIZE*sizeof(int), cudaMemcpyHostToDevice))))
{
printf("Error in cudaMemcpy H_T_D %s ", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// time of execution
cudaEventCreate(&start);
cudaEventCreate(&stop);
//start recording
cudaEventRecord(start, 0);
prime << < blocks, blocksize, sh_size >> >(ga, gb, gc);
//stop recording
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapseTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaThreadSynchronize();
if (cudaSuccess != (err = cudaMemcpy(ha, gb, SIZE*sizeof(int), cudaMemcpyDeviceToHost)))
{
printf("Error in cudaMemcpy D_T_H %s ", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
if (cudaSuccess != (err = cudaMemcpy(add, gc, SIZE*sizeof(int), cudaMemcpyDeviceToHost)))
{
printf("Error in cudaMemcpy D_T_H %s ", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//printing output
for (int k = 2; k < SIZE; k++)
{
if (ha[k] != 0)
{
printf("\n prime no::%d\t", ha[k]);
// sum = sum + ha[k];
count++;
}
}
printf("\n \nsum of all prime no. is:%d", add);
printf("\n\nTotle prime numbers upto %d is : %d", SIZE, count);
printf("\n\n Total elapsed time on GPU=%lf millisec", elapseTime);
cudaFree(ga);
cudaFree(gb);
cudaFree(gc);
free(ha);
free(add);
getchar();
return 0;
}
specifications:1.graphics card- Gforce 210(1.2 compute capability) 2.CUDA toolkit 6.5version 3.visual studio 2013