When I use a non-blocking stream with npp, I seem to get incorrect values using nppiAbsDiff_8u_C3R
.
I attached a minimal example to reproduce the case below. Here’s what I observed:
- When I use nppSetStream(0), I get the correct result
- When I use CUDA_LAUNCH_BLOCKING=1 environment variable, I also get the correct result
- When I use an async stream, I mostly get an incorrect result (sometimes correct).
This smells like a race condition. Additionally, when I investigate with nvvp, I can see my async stream and one other async stream (created by npp I guess), which runs a single kernel ForEachPixelNaive (Screenshot https://ibb.co/pZ1Gx4r).
My system:
// nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130
// nppversion.h
#define NPP_VERSION_MAJOR 10
#define NPP_VERSION_MINOR 0
#define NPP_VERSION_BUILD 130
Here is the output of the sample program:
Async:
Correct: 1
Incorrect: 99
Sync:
Correct: 100
And this is the source code of my program:
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <npp.h>
#include <nppi.h>
#include <stdlib.h>
#include <vector>
inline void check(int status) {
if (status != 0) {
std::cerr << "Return code: " << status << std::endl;
std::abort();
}
}
void test(bool async_stream) {
NppiSize size{5 * 1024, 1024};
const size_t step = size.width * 3;
const size_t bytes = step * size.height;
uint8_t *src1, *src2, *diff;
cudaStream_t stream;
if (async_stream) {
check(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
} else {
stream = 0;
}
check(nppSetStream(stream));
std::vector<uint8_t> ones(bytes, 1);
std::vector<uint8_t> twos(bytes, 2);
std::vector<uint8_t> result(bytes);
check(cudaMalloc(&src1, bytes));
check(cudaMalloc(&src2, bytes));
check(cudaMalloc(&diff, bytes));
std::vector<int> counters{0, 0};
for (int i = 0; i < 100; ++i) {
check(cudaMemcpyAsync(src1, ones.data(), bytes, cudaMemcpyHostToDevice, stream));
check(cudaMemcpyAsync(src2, twos.data(), bytes, cudaMemcpyHostToDevice, stream));
check(cudaMemcpyAsync(diff, twos.data(), bytes, cudaMemcpyHostToDevice, stream));
check(nppiAbsDiff_8u_C3R(src1, step, src2, step, diff, step, size));
check(cudaMemcpyAsync(result.data(), diff, bytes, cudaMemcpyDeviceToHost, stream));
check(cudaStreamSynchronize(stream));
auto is_correct = std::equal(ones.begin(), ones.end(), result.begin());
++counters[static_cast<size_t>(is_correct)];
}
std::cout << "Correct: " << counters[1] << "\nIncorrect: " << counters[0]
<< std::endl;
check(nppSetStream(0));
check(cudaFree(src1));
check(cudaFree(src2));
check(cudaFree(diff));
if (async_stream) {
check(cudaStreamDestroy(stream));
}
}
int main() {
std::cout << "Async:\n";
test(true);
std::cout << "\nSync:\n";
test(false);
}