`cudnnAddTensor` produces results with a large deviation.

I ran the following test code on a K40c; the output is:

ao: 132749.421875 3587.000000 98219272.000000 881821.000000

These values differ from the results I get when computing the same sums on the CPU. What can I do about it, or should I simply ignore this deviation?

float a[]= {132451.2, -0.0001, 98218975.2338, 878234.001};
  float b[]= {298.222, 3587.000004};
  float *ao, *bo;
  cudaMalloc(&ao, sizeof(a));
  cudaMalloc(&bo, sizeof(b));
  cudaMemcpy(ao, a, sizeof(a), cudaMemcpyHostToDevice);
  cudaMemcpy(bo, b, sizeof(b), cudaMemcpyHostToDevice);

 checkCudaErrors(cudaDeviceSynchronize());
  cudnnTensorDescriptor_t at, bt;
float alpha=1., beta=1.;
  cudnnCreate(&cudnnHandle);

        checkCUDNN(cudnnCreateTensorDescriptor(&at));
        checkCUDNN(cudnnCreateTensorDescriptor(&bt));

  cudnnSetTensor4dDescriptor(at, tensorFormat, dataType, 2, 2, 1, 1);
  cudnnSetTensor4dDescriptor(bt, tensorFormat, dataType,  1, 2, 1, 1);
  cudnnAddTensor(cudnnHandle, &alpha, bt, bo, &beta, at, ao);
  outputGPU("ao", ao, 4);