Thrust : can't reset my device after bad_alloc on thrust::Sort_by_key()

The following program does 3 successive calls to the “test” function where some thrust operations are executed. Each one of these 3 calls provides a different size to the problem:

  • 3,000 for the first call;
  • 300,000,000 for the second call;
  • 3,000 again for the third call.

The second call is expected to fail with a bad_alloc, but the third one should be successful (as is the first one) if I properly cleaned up the status of my GPU. Unfortunately, it also fails. Moreover, successive calls would also result in a failure until I quit my process and start again.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda.h>
#include <thrust/system_error.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

#define CUDA_CALL(x)do { if((x) != cudaSuccess) { return -11;}} while(0)

typedef typename thrust::device_vector<size_t>  tDevVecInt;
typedef typename thrust::device_vector<float>   tDevVecFlt;

struct modSim : public thrust::unary_function<int, int>  
{
    int szMat;
    int p;

    modSim(int in1, int in2)
    {
        this->p = in1;
        this->szMat = in2;
    }
    __host__ __device__ int operator()(const int &x) 
    {
        return (x/szMat)*p+(x%p);
    }
};

 int test(size_t szData)
{

    modSim moduloCol(3, 33);

    CUDA_CALL(cudaSetDevice(0));

    try
    {

        tDevVecFlt devRand(szData);
        tDevVecInt devIndices(szData);
        tDevVecFlt devData(szData);

        thrust::sequence(devRand.begin(), devRand.end());
        thrust::tabulate(devIndices.begin(), devIndices.end(), moduloCol); 
        thrust::sort_by_key(devIndices.begin(), devIndices.end(), devRand.begin()); 

    }
    catch(std::bad_alloc &e)
    {
        std::cout << e.what() << std::endl;
        CUDA_CALL(cudaDeviceReset());
        CUDA_CALL(cudaSetDevice(0));
        return -3;
    }
    catch(thrust::system_error &e)
    {
        std::cout << e.what() << std::endl;
        CUDA_CALL(cudaDeviceReset());
        CUDA_CALL(cudaSetDevice(0));
        return -2;
    }

    CUDA_CALL(cudaDeviceReset());
    return 0;   
}

int main(void)
{

    size_t n;
    int retVal;

    n = 3000;
    retVal = test(n);
    std::cout << retVal << std::endl;

    n = 300000000;
    retVal = test(n);
    std::cout << retVal << std::endl;

    n = 3000;
    retVal = test(n);
    std::cout << retVal << std::endl;

return(0);
}

On my setup (Windows 8, NVIDIA GeForce 820m with 2GB dedicated VRAM, CUDA 7.0 compiled using Visual Studio 2010), I get this:

  • first call with N = 3,000 succeeds;
  • second call with N = 300,000,000 fails with the exception bad allocation: out of memory;
  • third call with N = 3,000 fails with a thrust::system error : after cub_::DeviceRadixSort::SortPairs(1): out of memory.

So the output looks like this:

0
bad allocation: out of memory
-3
after cub_::DeviceRadixSort::SortPairs(1): out of memory
-2

As mentioned above, the third call shouldn’t have failed as it is identical to the successful first call.

This failure seems to be a consequence of the previous call (the one that issued a bad alloc) but I cleaned everything up after the bad alloc with a cudaDeviceReset() and a cudaSetDevice().

Despite the cleaning instructions, the device is not back into a functional state and I don’t understand why.

If I did something wrong, what would by the proper way to restore the GPU to a functional state after the first failure without ending my process?

Does anyone reproduce this behavior? Is there something obvious I’m missing? Thanks for your help.

An update was provided in the cross-posting:

http://stackoverflow.com/questions/32121827/cant-restore-my-gpu-after-bad-alloc-with-cudadevicereset-from-the-cuda-libr