Summation of prime numbers in CUDA

I want to write a simple CUDA program that sums the prime numbers from 1 to 50,000, so I created one kernel that finds the primes and then sums them with a reduction. But for some reason it gives a garbage value as the sum. What is wrong in the following code? Or should I create two different kernels?

#include<stdio.h>
#include<cuda.h>
#include<time.h>

#define SIZE 10

__global__ void prime(int *ga, int *gb, int *gc)					//gb=output, gc=sum
{
	int k = threadIdx.x + (blockDim.x * blockIdx.x);
	
	int  flag, j;

	if (k < SIZE)
	{
		flag = 0;
		for (j = 2; j <= k / 2; j++)
		{
			if (ga[k] % j == 0)
			{
				flag = 1;
				break;
			}
		}
		if (flag == 0)
		{
			gb[k] = ga[k];
		}
	}
	int id = threadIdx.x;
	extern __shared__ int s[];
	s[id] = 0;
	__syncthreads();
	if (id<SIZE)
	{
		s[id] = gb[k];			//ga??
	}
	__syncthreads();
	if (blockDim.x >= 2048){
		if (id < 1024){
			s[id] = s[id] + s[id + 512];               //1024??
		}
		__syncthreads();
	}
	if (blockDim.x >= 1024){
		if (id < 512){
			s[id] = s[id] + s[id + 512];
		}
		__syncthreads();
	}

	if (blockDim.x >= 512){
		if (id < 256)
		{
			s[id] = s[id] + s[id + 256];
		}
		__syncthreads();
	}
	if (blockDim.x >= 256){
		if (id < 128)
		{
			s[id] = s[id] + s[id + 128];
		}
		__syncthreads();
	}
	if (blockDim.x >= 128){
		if (id < 64)
		{
			s[id] = s[id] + s[id + 64];
		}
		__syncthreads();
	}
	//if this is the last warp
	if (id < 32){
		if (blockDim.x >= 64)
		{
			s[id] = s[id] + s[id + 32];
		}
		if (blockDim.x >= 32)
		{
			s[id] = s[id] + s[id + 16];
		}
		if (blockDim.x >= 16)
		{
			s[id] = s[id] + s[id + 8];
		}
		if (blockDim.x >= 8)
		{
			s[id] = s[id] + s[id + 4];
		}
		if (blockDim.x >= 4)
		{
			s[id] = s[id] + s[id + 2];
		}
		if (blockDim.x >= 2)
		{
			s[id] = s[id] + s[id + 1];
		}
	}
	
	if (id == 0)
	{
	
		atomicAdd(gc, s[0]);
	}
}

int main()
{
	int i, count = 0, sum = 0;

	int *ga, *gb, *gc, *ha, *add=0;

	cudaError_t err;
	cudaEvent_t start, stop;
	float elapseTime;

	int blocks, blocksize = 512;
	int sh_size = (blocksize * sizeof(int));

	ha = (int*)malloc(SIZE*sizeof(int));
	add = (int*)malloc(SIZE*sizeof(int));

	for (i = 2; i < SIZE; i++)
	{
		ha[i] = i;
	}

	if (cudaSuccess != (err = (cudaMalloc((void**)&ga, SIZE*sizeof(int)))))
	{
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	if (cudaSuccess != (err = (cudaMalloc((void**)&gb, SIZE*sizeof(int)))))
	{
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	if (cudaSuccess != (err = (cudaMalloc((void**)&gc, SIZE*sizeof(int)))))
	{
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	blocks = SIZE / blocksize;
	if (SIZE% blocksize != 0)
		blocks++;

	if (cudaSuccess != (err = (cudaMemcpy(ga, ha, SIZE*sizeof(int), cudaMemcpyHostToDevice))))
	{
		printf("Error in cudaMemcpy H_T_D  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// time of execution
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	//start recording 
	cudaEventRecord(start, 0);

	prime<<<blocks, blocksize, sh_size>>>(ga, gb, gc);

	//stop recording
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	cudaEventElapsedTime(&elapseTime, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	cudaThreadSynchronize();

	if (cudaSuccess != (err = cudaMemcpy(ha, gb, SIZE*sizeof(int), cudaMemcpyDeviceToHost)))
	{
		printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);

	}
	if (cudaSuccess != (err = cudaMemcpy(add, gc, SIZE*sizeof(int), cudaMemcpyDeviceToHost)))
	{
		printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	//printing output
	for (int k = 2; k < SIZE; k++)
	{
		if (ha[k] != 0)
		{
			printf("\n prime no::%d\t", ha[k]);
		//	sum = sum + ha[k];
			count++;
		}
	}

	printf("\n \nsum of all prime no. is:%d", add);

	printf("\n\nTotle prime numbers upto %d is : %d", SIZE, count);

	printf("\n\n Total elapsed time on GPU=%lf millisec", elapseTime);

	cudaFree(ga);
	cudaFree(gb);
	cudaFree(gc);
	free(ha);
	free(add);
	getchar();
	return 0;

}

Specifications: 1. graphics card: GeForce 210 (compute capability 1.2); 2. CUDA Toolkit 6.5; 3. Visual Studio 2013

I resolved the error; here is runnable code for the summation of primes from 1 to 20,000:

#include<stdio.h>
#include<cuda.h>
#define SIZE 20000

__global__ void prime(int *ga, int *gb)				//ga=sum, gb=prime 
{
	int i = threadIdx.x + (blockDim.x * blockIdx.x);
	int k = threadIdx.x;

	int flag, j;
	if (i < SIZE)
	{
		gb[i] = 0;				//clear the slot so non-primes stay 0
		flag = (i < 2) ? 1 : 0;			//0 and 1 are not prime (ga[i] == i)
		for (j = 2; j <= i / 2; j++)
		{
			if (ga[i] % j == 0)
			{
				flag = 1;
				break;
			}
		}
		if (flag == 0)
		{
			gb[i] = ga[i];
		}
	}

	extern __shared__ int arr[];
	arr[k] = 0;
	__syncthreads();
	if (i < SIZE)				//i, not k: the last block has threads past SIZE
	{
		arr[k] = gb[i];
	}
	__syncthreads();

	if (blockDim.x >= 1024){
		if (k < 512){
			arr[k] = arr[k] + arr[k + 512];}
		__syncthreads();
	}

	if (blockDim.x >= 512)
	{
		if (k < 256)
		{arr[k] = arr[k] + arr[k + 256];}
		__syncthreads();
	}
	if (blockDim.x >= 256)
	{
		if (k < 128)
		{arr[k] = arr[k] + arr[k+ 128];}
		__syncthreads();
	}
	if (blockDim.x >= 128)
	{
		if (k < 64)
		{arr[k] = arr[k] + arr[k + 64];}
		__syncthreads();
	}
	
	//last warp: a volatile pointer keeps each lane's store visible to the
	//other lanes without further __syncthreads()
	if (k < 32){
		volatile int *v = arr;
		if (blockDim.x >= 64)
		{v[k] = v[k] + v[k + 32];}

		if (blockDim.x >= 32)
		{v[k] = v[k] + v[k + 16];}

		if (blockDim.x >= 16)
		{v[k] = v[k] + v[k + 8];}

		if (blockDim.x >= 8)
		{v[k] = v[k] + v[k + 4];}

		if (blockDim.x >= 4)
		{v[k] = v[k] + v[k + 2];}

		if (blockDim.x >= 2)
		{v[k] = v[k] + v[k + 1];}
	}

	if (k == 0)
	{
		atomicAdd(ga, arr[0]);
	}
}

int  main()
{
	int count = 0, i, flag, sum = 0;
	int *ga, *gb;					//device: ga = input / sum, gb = primes
	int size = SIZE*sizeof(int);

	int *ha = (int*)malloc(size);			//host input; ha[0] receives the sum
	int *hb = (int*)malloc(size);			//host copy of the primes
	int *cpu1 = (int*)malloc(size);			//CPU reference input
	int *cpu2 = (int*)malloc(size);			//CPU reference primes

	const int blocksize = 512;
	int sh_size = (blocksize * sizeof(int));

	int blocks = SIZE / blocksize;
	if (SIZE % blocksize != 0) blocks++;

	for (int k = 0; k < SIZE; k++)			//0 and 1 are rejected by the primality checks
	{
		cpu1[k] = k;
		ha[k] = k;
		cpu2[k] = 0;
	}

	for (i = 2; i < SIZE; i++)
	{
		flag = 0;
		for (int j = 2; j <= i / 2; j++)
		{
			if (cpu1[i] % j == 0)
			{
				flag = 1; break;
			}
		}
		if (flag == 0)
		{
			cpu2[i] = cpu1[i];
			sum = sum + cpu2[i];
			//printf("%d\t", cpu2[i]);
			count++;
		}
	}
	printf("\n\nTotal prime no. count on CPU = %d\n", count);

	printf("\nAddition on CPU  : %d\n\n", sum);

	cudaError_t err;
	cudaEvent_t start, stop;
	float elapseTime;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	if (cudaSuccess != (err = (cudaMalloc((void**)&ga, size))))
	{
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	if (cudaSuccess != (err = (cudaMalloc((void**)&gb, size))))
	{
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	
	if (cudaSuccess != (err = (cudaMemcpy(ga, ha, size, cudaMemcpyHostToDevice))))
	{
		printf("Error in cudaMemcpy H_T_D  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	cudaEventRecord(start, 0);

	prime<<<blocks, blocksize, sh_size>>>(ga, gb);
	
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	cudaEventElapsedTime(&elapseTime, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	if (cudaSuccess != (err = cudaMemcpy(hb, gb, size, cudaMemcpyDeviceToHost)))
	{
		printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	if (cudaSuccess != (err = cudaMemcpy(ha, ga, size, cudaMemcpyDeviceToHost)))
	{
		printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	
	int gpu_count = 0;
	for (int y = 2; y < SIZE; y++)
	{
		if (hb[y] != 0)
		{
			printf("\t %d\t", hb[y]);		//hb holds the primes
			gpu_count++;
		}
	}
	printf("\n\nTotal prime no. count on GPU = %d", gpu_count);
	printf("\n\nSummation on GPU  : %d\n", ha[0]);		//ha[0] holds the sum
	printf("\nTotal elapsed time on GPU = %f milliseconds\n", elapseTime);
	cudaFree(ga);
	cudaFree(gb);
	free(ha);
	free(hb);
	free(cpu1);
	free(cpu2);
	getchar();
	return 0;
}

I'd think a primality sieve might be a more efficient way of determining which numbers are prime and which aren't.
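
For illustration, here is a minimal host-side Sieve of Eratosthenes (a sketch; the function name sieve and the flag-array layout are my own assumptions, not part of the code above). It does O(n log log n) total work instead of one trial-division loop per number, and the flag array could then be copied to the device and summed with the existing reduction kernel:

#include <string.h>

void sieve(char *is_prime, int n)		//is_prime must hold n + 1 bytes
{
	memset(is_prime, 1, n + 1);
	is_prime[0] = is_prime[1] = 0;		//0 and 1 are not prime
	for (int p = 2; (long long)p * p <= n; p++)
		if (is_prime[p])
			for (int m = p * p; m <= n; m += p)
				is_prime[m] = 0;	//cross off multiples of p
}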

for (j = 2; j <= i / 2; j++)

This trial-division loop is going to introduce some thread divergence, and it is also going to take much longer in the final blocks than in the first ones, leading to a very uneven workload distribution across the device's multiprocessors.
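
One small improvement, even without a sieve, is to stop trial division at the square root instead of i / 2. This is a sketch with my own helper name (is_prime_trial), not part of the original code:

__device__ int is_prime_trial(int n)
{
	if (n < 2) return 0;				//0 and 1 are not prime
	for (int j = 2; (long long)j * j <= n; j++)	//j * j <= n, i.e. j <= sqrt(n)
		if (n % j == 0) return 0;		//found a divisor: composite
	return 1;
}

For n = 50,000 this cuts the worst-case loop from about 25,000 iterations to about 224, which shrinks both the divergence within a warp and the imbalance between early and late blocks.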

Yeah, this technique will make the program more optimized. :)