Cuda Program for finding mulptiple of 3 or 5 and sum of these multiples [SOLVED]

I have used reduction kernel to compute the sum.

Please,Can anyone tell me whats wrong with this : I am getting wrong output

__global__ void compute(int *ga){
        int id = threadIdx.x + (blockDim.x * blockIdx.x);
        extern __shared__ int s[];
        s[id] = MAX;

        if (id<MAX)
        {
            s[id] = ga[id];
        }
        __syncthreads();
            //if (s[id] % 3 == 0 || s[id] % 5 == 0)
            //  s[id] = id;

//applying the reeduction
            if(blockDim.x >= 1024){
                if (id < 512){
                    s[id] = s[id] + s[id + 512];
                }
                __syncthreads();
            }
            __syncthreads();
            if (blockDim.x >= 512){
                if (id < 256)
                {
                    s[id] = s[id] + s[id + 256];
                }
                __syncthreads();
            }
            if (blockDim.x >= 256){
                if (id < 128)
                {
                    s[id] = s[id] + s[id + 128];
                }
                __syncthreads();
            }
            if (blockDim.x >= 128){
                if (id < 64)
                {
                    s[id] = s[id] + s[id + 64];
                }
                __syncthreads();
            }
            //if this is the last warp
            if (id < 32)
            {
                if (blockDim.x >= 64)
                    s[id] = s[id] + s[id + 32];
                if (blockDim.x >= 32)
                    s[id] = s[id] + s[id + 16];
                if (blockDim.x >= 16)
                    s[id] = s[id] + s[id + 8];
                if (blockDim.x >= 8)
                    s[id] = s[id] + s[id + 4];
                if (blockDim.x >= 4)
                    s[id] = s[id] + s[id + 2];
                if (blockDim.x >= 2)
                    s[id] = s[id] + s[id + 1];
            }
            //thread zero will store min of this block i.e. s[0];
            if (id == 0)
            {
                ga[blockIdx.x] = s[0];
            }
    }