Reducing an array to 10 elements by summing the elements whose indices have the same remainder modulo 10

I have a float array with 1 million elements,
and I need to sum the elements whose indices have the same remainder when divided by 10. I found a reduction function, but I need 10 outputs instead of 1. To explain better, I need to parallelize the function below (it is not exactly right… I just wrote it to explain):

for(int i = 0; i < size; i++) {
  if (i%10 == 0)
    output[0] += input[i];

  if (i%10 == 1)
    output[1] += input[i];

  if (i%10 == 2)
    output[2] += input[i];

  if (i%10 == 3)
    output[3] += input[i];

  if (i%10 == 4)
    output[4] += input[i];

  if (i%10 == 5)
    output[5] += input[i];

  if (i%10 == 6)
    output[6] += input[i];

  if (i%10 == 7)
    output[7] += input[i];

  if (i%10 == 8)
    output[8] += input[i];

  if (i%10 == 9)
    output[9] += input[i];   
}
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

/**
 * Sums input[i] into output[i % 10] for every i in [0, size).
 *
 * Each block first accumulates its contribution into a 10-element
 * shared-memory array and then adds those partial sums to `output` with one
 * global atomic per bin. This keeps the heavily contended atomics in shared
 * memory instead of sending every element to one of ten global addresses.
 *
 * Launch expectations: written for a 1-D grid of 1-D blocks (only the .x
 * components of the built-in indices are used). Any grid/block size works,
 * including blocks with fewer than 10 threads. Uses 10 floats of static
 * shared memory per block.
 *
 * @param output  Array of at least 10 floats. The kernel adds to it, so the
 *                caller must initialise it (e.g. to zero) before launching.
 * @param input   Array of at least `size` floats; read only.
 * @param size    Number of elements of `input` to accumulate.
 *
 * Note: floating-point atomics make the summation order, and therefore the
 * last bits of each result, non-deterministic from run to run.
 */
__global__ void reduce_mod10(float* output, const float* input, unsigned int size) {
  constexpr unsigned int kBins = 10;
  __shared__ float partial[kBins];

  // Shared memory is uninitialised; zero the per-block bins. The strided loop
  // covers all bins even when the block has fewer than kBins threads.
  for (unsigned int b = threadIdx.x; b < kBins; b += blockDim.x) {
    partial[b] = 0.0f;
  }
  __syncthreads();

  // Index arithmetic is done in size_t so that neither the starting index nor
  // `i += stride` can wrap around for sizes close to UINT_MAX.
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < size; i += stride) {
    atomicAdd(&partial[i % kBins], input[i]);
  }
  // Every thread reaches this barrier (it is outside the data-dependent loop),
  // so all shared-memory adds are complete before the bins are read below.
  __syncthreads();

  // One global atomic per bin per block.
  for (unsigned int b = threadIdx.x; b < kBins; b += blockDim.x) {
    atomicAdd(output + b, partial[b]);
  }
}

#include <algorithm>
#include <iostream>
#include <numeric>

/**
 * Demo driver: fills a managed array with 0, 1, 2, ..., N-1, launches
 * reduce_mod10 over it and prints the ten per-remainder sums.
 *
 * Every CUDA runtime call and the kernel launch are checked; on failure a
 * message is written to stderr and the process exits with status 1 instead
 * of printing results that were never computed.
 *
 * @return 0 on success, 1 if any CUDA call or the kernel failed.
 */
int main() {
  using namespace std;

  // Reports a failed CUDA runtime call on stderr; returns true on success.
  auto ok = [](cudaError_t status, const char* what) -> bool {
    if (status == cudaSuccess) {
      return true;
    }
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
  };

  const unsigned int N = 1000 * 1000;
  const unsigned int kBins = 10;      // must match the bin count used by reduce_mod10
  const unsigned int kThreads = 256;  // threads per block

  float* input = nullptr;
  if (!ok(cudaMallocManaged(&input, N * sizeof(float)), "cudaMallocManaged(input)")) {
    return 1;
  }
  iota(input, input + N, 0.0f); // 0.0, 1.0, 2.0, ...

  float* output = nullptr;
  if (!ok(cudaMallocManaged(&output, kBins * sizeof(float)), "cudaMallocManaged(output)")) {
    cudaFree(input);
    return 1;
  }
  fill_n(output, kBins, 0.0f); // the kernel accumulates, so start from zero

  // Ceil-div so that every element is covered by at least one thread.
  reduce_mod10<<<(N + kThreads - 1) / kThreads, kThreads>>>(output, input, N);

  // cudaGetLastError() surfaces launch-configuration errors; the synchronize
  // call surfaces faults raised while the kernel was running and also makes
  // the managed output safe to read on the host.
  const bool succeeded = ok(cudaGetLastError(), "reduce_mod10 launch") &&
                         ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  if (succeeded) {
    for ( int i = 0; i < 10; ++i ) {
      cout << i << " : " << output[i] << endl;
    }
  }

  cudaFree(input);
  cudaFree(output);
  cudaDeviceReset();
  return succeeded ? 0 : 1;
}