I ran the following test code on a K40c; the output is:
ao: 132749.421875 3587.000000 98219272.000000 881821.000000
These values differ from the results of my CPU implementation. Is there something I can do about it, or should I just ignore this deviation?
// Test of cudnnAddTensor: adds the 2-element tensor b into the 4-element
// tensor a in place on the device, then prints the 4 results.
//
// Every CUDA / cuDNN call is wrapped in checkCudaErrors / checkCUDNN so that a
// failing allocation, copy, or descriptor setup is reported at the call site
// instead of silently producing garbage in the printed output.

// Host inputs. The values are stored as 32-bit floats, so they are rounded to
// the nearest representable float before they ever reach the GPU (a float has
// a 24-bit significand: near 9.8e7 adjacent floats are 8 apart, near 8.8e5
// they are 0.0625 apart).
float a[] = {132451.2f, -0.0001f, 98218975.2338f, 878234.001f};
float b[] = {298.222f, 3587.000004f};

// Device copies of a and b; ao is also the output buffer.
float *ao, *bo;
checkCudaErrors(cudaMalloc(&ao, sizeof(a)));
checkCudaErrors(cudaMalloc(&bo, sizeof(b)));
checkCudaErrors(cudaMemcpy(ao, a, sizeof(a), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(bo, b, sizeof(b), cudaMemcpyHostToDevice));
checkCudaErrors(cudaDeviceSynchronize());

cudnnTensorDescriptor_t at, bt;
// Scaling factors: result = alpha * b + beta * a.
float alpha = 1.0f, beta = 1.0f;
checkCUDNN(cudnnCreate(&cudnnHandle));
checkCUDNN(cudnnCreateTensorDescriptor(&at));
checkCUDNN(cudnnCreateTensorDescriptor(&bt));
// at: n=2, c=2, h=1, w=1 (4 elements); bt: n=1, c=2, h=1, w=1 (2 elements).
// NOTE(review): tensorFormat and dataType are defined outside this snippet;
// the float buffers above assume dataType is CUDNN_DATA_FLOAT -- confirm.
checkCUDNN(cudnnSetTensor4dDescriptor(at, tensorFormat, dataType, 2, 2, 1, 1));
checkCUDNN(cudnnSetTensor4dDescriptor(bt, tensorFormat, dataType, 1, 2, 1, 1));
// In-place add: ao = alpha * bo + beta * ao. bt has n=1, so per the cuDNN
// documentation b is broadcast across the n dimension of a.
checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bt, bo, &beta, at, ao));

// The printed values are the correctly rounded single-precision sums, e.g.
// 98218975.2338 is stored as 98218976 and 98218976 + 298.222 rounds to
// 98219272. A CPU reference computed in double precision will therefore
// differ by up to about one float spacing; compare with a tolerance, or use
// double data on the GPU if that precision is required.
outputGPU("ao", ao, 4);