How to reset CUDA error in driver API

We are using the CUDA 5.5 driver API for online digital signal processing for ASTRON’s LOFAR radio astronomy telescope.
On rare occasions, a CUDA call fails aborting the observation. This may be our fault (bug, or resources acquired too early / released too late), or it could be an ‘ECC uncorrectable’ (seen it once during a test run). Unless seriously buggy, our application could actually deal with this, as we record and aggregate the amount of ‘flagged’ data anyway (or we can delay the observation start time a bit). Subsequent offline processing takes flagged data into account.

The problem is that once disaster strikes, we don’t know how to reset this error using the driver API.
In the runtime API, there is cudaGetLastError(), but I cannot find a driver API equivalent. In the driver API, I’ve seen that after an error, subsequent calls do work, but continue to return the original error. That way, we cannot deal with it.

It would be a pain if we had to recreate the complete context, or even restart the whole process (MPI, yikes). But if that’s the only way…

So how can we reset the CUDA driver API error state?

What about

cudaError_t err=cudaDeviceReset();
if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

Sounds like this could be a bug in cuda, or perhaps indeed something screwed up and propagates the error.

Post some code perhaps ? See if it’s reproduceable with minimal code or so ?

You need to destroy the correspondent context, create a new context and reallocate all device memory. That’s the only way to recover from an error.

[ reply split over 2 posts, as forum cannot handle it in 1 ]

@CudaaduC: cudaDeviceReset() is a runtime API call. I was looking for a driver API call. Also, with cudaDeviceReset() I lose all my contexts / allocations.

@Skybuck: I tested a bit with both APIs and there is more involved. It seems it depends on the operation whether errors stick.

Driver API
Allocations (cuMalloc()) and copying H2D/D2H errors do not stick. Kernel run (and launch) errors (ECC, far out-of-bounds access) do stick. The next cuStreamSynchronize() returns an error that I cannot reset (without dropping the context).

// cu-test.cu
// nvcc -ptx cu-test.cu && nvcc -o cu-test cu-test.cu -lcuda
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <cuda.h>

extern "C" {
__global__ void kfunc(float* data) {
  //float v = data[0];
  float v = data[1024*1024*1024]; // out of bounds
  data[0] = v + 1.0f;
}
}

using std::exit;
using std::memset;
using std::cout;
using std::cerr;
using std::endl;

int main() {
  CUresult r;
  r = cuInit(0);
  if (r != CUDA_SUCCESS) { cerr << "cuInit failed: " << r << endl; exit(1); }

  CUdevice dev;
  int ordinal = 0;
  r = cuDeviceGet(&dev, ordinal);
  if (r != CUDA_SUCCESS) { cerr << "cuDeviceGet failed: " << r << endl; exit(1); }

  CUcontext ctx;
  unsigned int flags = CU_CTX_SCHED_AUTO;
  r = cuCtxCreate(&ctx, flags, dev);
  if (r != CUDA_SUCCESS) { cerr << "cuCtxCreate failed: " << r << endl; exit(1); }

  CUdeviceptr dptr;
  size_t len = 1024*1024;
  r = cuMemAlloc(&dptr, len * sizeof(float));
  if (r != CUDA_SUCCESS) { cerr << "cuMemAlloc failed:" << r << endl; }

  CUdeviceptr dptr2;
  len = 1024ULL*1024*1024*1024; // too large
  r = cuMemAlloc(&dptr2, len * sizeof(float));
  if (r != CUDA_SUCCESS) { cerr << "cuMemAlloc failed (2) (expected): " << r << endl; }

  CUdeviceptr dptr3;
  len = 1024*1024; // works again after previous erroneous alloc
  r = cuMemAlloc(&dptr3, len * sizeof(float));
  if (r != CUDA_SUCCESS) { cerr << "cuMemAlloc failed (3): " << r << endl; }

  float *hptr = new float[32*len];
  memset(hptr, 0, len * sizeof(float));

  CUstream stream;
  r = cuStreamCreate(&stream, 0);
  if (r != CUDA_SUCCESS) { cerr << "cuStreamCreate failed: " << r << endl; exit(1); }

  r = cuMemcpyHtoDAsync(dptr, hptr, len * sizeof(float), stream);
  //r = cuMemcpyHtoDAsync(dptr, hptr, 32*len * sizeof(float), stream); // GPU buffer overflow
  if (r != CUDA_SUCCESS) { cerr << "cuMemcpyHtoDAsync failed: " << r << endl; }

  r = cuStreamSynchronize(stream);
  if (r != CUDA_SUCCESS) { cerr << "cuStreamSynchronize failed (HtoD): " << r << endl; }

  CUmodule kmodule;
  r = cuModuleLoad(&kmodule, "cu-test.ptx");
  if (r != CUDA_SUCCESS) { cerr << "cuModuleLoad failed: " << r << endl; exit(1); }

  CUfunction kfunc;
  r = cuModuleGetFunction(&kfunc, kmodule, "kfunc");
  if (r != CUDA_SUCCESS) { cerr << "cuModuleGetFunction failed: " << r << endl; exit(1); }

  // async launch bad kernel
  void *args = &dptr;
  r = cuLaunchKernel(kfunc, /*gridDim: */1, 1, 1, /*blockDim: */1, 1, 1,
                     /*dynShmemBytes: */0, stream, &args, NULL);
  if (r != CUDA_SUCCESS) { cerr << "cuLaunchKernel failed: " << r << endl; exit(1); }

  r = cuStreamSynchronize(stream);
  if (r != CUDA_SUCCESS) { cerr << "cuStreamSynchronize failed (launch) (expected): " << r << endl; }

  r = cuMemcpyDtoHAsync(hptr, dptr, len * sizeof(float), stream);
  if (r != CUDA_SUCCESS) { cerr << "cuMemcpyDtoHAsync failed: " << r << endl; }

  r = cuStreamSynchronize(stream);
  if (r != CUDA_SUCCESS) { cerr << "cuStreamSynchronize failed (DtoH): " << r << endl; }

  for (unsigned i = 0; i < 16; i++) {
    cout << hptr[i] << " ";
  }
  cout << endl;


  delete[] hptr;
  r = cuMemFree(dptr);
  if (r != CUDA_SUCCESS) { cerr << "cuMemFree failed: " << r << endl; exit(1); }
  // delete stream, context, ...

  return 0;
}

prints

cuMemAlloc failed (2) (expected): 2
cuStreamSynchronize failed (launch) (expected): 719
cuMemcpyDtoHAsync failed: 719
cuStreamSynchronize failed (DtoH): 719
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
cuMemFree failed: 719

So after running an intentionally broken kernel, subsequent D2H transfer and cuMemFree() calls continue to return the (unknown) error. (I believe those operations do in fact succeed, but they return the earlier error as documented.) (If I patch the kernel to be sane), the transfer does work.)

[ part 2 ]

Runtime API
I tried a similar test for the runtime API (now with synchronous calls), but it seems that even there I cannot reset kernel run errors (using cudaGetLastError()):

// cuda-test.cu
// nvcc -o cuda-test cuda-test.cu
#include <cstdlib>
#include <cstring>
#include <iostream>

using std::exit;
using std::memset;
using std::cout;
using std::cerr;
using std::endl;

__global__ void kfunc(float* data) {
  //float v = data[0];
  float v = data[1024*1024*1024]; // out of bounds
  data[0] = v + 1.0f;
}

int main() {
  cudaError_t err;
  float *dptr;
  size_t len = 1024*1024;
  err = cudaMalloc((void **)&dptr, len * sizeof(float));
  if (err != cudaSuccess) { cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << " (" << err << ")" << endl; exit(1); }

  float *hptr = new float[32*len];
  memset(hptr, 0, len * sizeof(float));
  err = cudaMemcpy(dptr, hptr, len * sizeof(float), cudaMemcpyHostToDevice);
  if (err != cudaSuccess) { cerr << "cudaMemcpy (H2D) failed: " << cudaGetErrorString(err) << " (" << err << ")" << endl; exit(1); }

  // launch bad kernel
  int block_dim = 1;
  int grid_dim = 1;
  kfunc<<<grid_dim, block_dim>>>(dptr);
  err = cudaGetLastError();
  if (err != cudaSuccess) { cerr << "kernel launch failed: " << cudaGetErrorString(err) << " (" << err << ")" << endl; exit(1); }
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) { cerr << "cudaDeviceSynchronize failed (launch) (expected): " << cudaGetErrorString(err) << " (" << err << ")" << endl; }
  if (err != cudaSuccess) {
    err = cudaGetLastError();
    if (err != cudaSuccess) { cerr << "resetting last error, which was (expected): " << cudaGetErrorString(err) << " (" << err << ")" << endl; }
    err = cudaGetLastError();
    if (err != cudaSuccess) { cerr << "reset failed: " << cudaGetErrorString(err) << " (" << err << ")" << endl; }
  } 

  err = cudaMemcpy(hptr, dptr, len * sizeof(float), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) { cerr << "cudaMemcpy (D2H) failed: " << cudaGetErrorString(err) << " (" << err << ")" << endl; }

  for (unsigned i = 0; i < 16; i++) {
    cout << hptr[i] << " ";
  }
  cout << endl;

  delete[] hptr;
  err = cudaFree(dptr);
  if (err != cudaSuccess) { cerr << "cudaFree failed: " << cudaGetErrorString(err) << " (" << err << ")" << endl; exit(1); }

  return 0;
}

prints

cudaDeviceSynchronize failed (launch) (expected): unknown error (30)
resetting last error, which was (expected): unknown error (30)
cudaMemcpy (D2H) failed: unknown error (30)
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
cudaFree failed: unknown error (30)

After a broken kernel run (different unknown error code than with driver API), I reset the error this time and that appears to work, but the subsequent transfer and deallocation still return failure, even though I believe they are programmed fine.

I’m not sure this is a bug or not (the documentation does not really specify all semantics and my tests could be broken), but error recovery looks inconvenient.

@miku50052: Yes, I’m afraid kernel run errors can only be cleared with a new context. But if I implement recovery, I hope this is enough (as opposed to complete device reset which resets all contexts).