The following program makes three successive calls to the “test” function, which executes some Thrust operations. Each of these three calls passes a different problem size:
- 3,000 for the first call;
- 300,000,000 for the second call;
- 3,000 again for the third call.
The second call is expected to fail with a bad_alloc, but the third one should succeed (as the first one does) if I have properly cleaned up the state of my GPU. Unfortunately, it also fails. Moreover, any further calls also fail until I quit my process and start again.
#include <stdio.h>
#include <iostream>
#include <new>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/system_error.h>
#include <thrust/tabulate.h>
// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the source location and the runtime's error string to
// stderr and makes the ENCLOSING function return -11.
// Only usable inside functions whose return type is int.
#define CUDA_CALL(x)                                                          \
    do {                                                                      \
        const cudaError_t cudaCallStatus_ = (x);                              \
        if (cudaCallStatus_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cudaCallStatus_));                     \
            return -11;                                                       \
        }                                                                     \
    } while (0)
// Device vector of size_t values; test() uses it for the sort keys.
// (Despite the "Int" in the name, the element type is size_t.)
// `typename` is not needed here: no dependent name is involved, and the
// keyword is rejected outside templates by pre-C++11 compilers.
typedef thrust::device_vector<size_t> tDevVecInt;
// Device vector of float values; test() uses it for the sorted payload.
typedef thrust::device_vector<float> tDevVecFlt;
// Unary functor that maps an integer index x to (x / szMat) * p + (x % p).
// The call operator is usable from both host and device code.
struct modSim : public thrust::unary_function<int, int>
{
    int szMat;  // divisor applied to the index (second constructor argument)
    int p;      // multiplier and modulus (first constructor argument)

    // Stores in1 as p and in2 as szMat.
    modSim(int in1, int in2) : szMat(in2), p(in1) {}

    // Returns (x / szMat) * p + (x % p) using integer arithmetic.
    __host__ __device__ int operator()(const int &x)
    {
        const int quotient = x / szMat;
        const int remainder = x % p;
        return quotient * p + remainder;
    }
};
// Brings the CUDA runtime back to a usable state after a Thrust operation
// has thrown.
// Returns 0 on success, or -11 (through CUDA_CALL) if the reset or the
// device re-selection fails.
static int recoverDevice()
{
    // cudaGetLastError() returns the last error recorded by the runtime and
    // resets it to cudaSuccess. A failed allocation leaves such an error
    // recorded; if it is not cleared here, a later Thrust call that checks
    // the last error reports the stale "out of memory" even though its own
    // work succeeded.
    (void)cudaGetLastError();
    CUDA_CALL(cudaDeviceReset());
    CUDA_CALL(cudaSetDevice(0));
    return 0;
}

// Allocates three device vectors of szData elements, fills them and sorts
// the float values by the generated keys.
//
// Returns:
//    0  on success
//   -2  if a thrust::system_error was thrown
//   -3  if a std::bad_alloc was thrown (e.g. the vectors do not fit)
//  -11  if a CUDA runtime call (set device / reset) failed
int test(size_t szData)
{
    modSim moduloCol(3, 33);
    CUDA_CALL(cudaSetDevice(0));
    try
    {
        tDevVecFlt devRand(szData);
        tDevVecInt devIndices(szData);
        // Allocated but not otherwise used; kept so the memory footprint of
        // the call is unchanged.
        tDevVecFlt devData(szData);
        thrust::sequence(devRand.begin(), devRand.end());
        thrust::tabulate(devIndices.begin(), devIndices.end(), moduloCol);
        thrust::sort_by_key(devIndices.begin(), devIndices.end(), devRand.begin());
    }
    // The vectors are locals of the try block, so they have already been
    // destroyed when a handler runs; resetting the device there is safe.
    catch (const std::bad_alloc &e)
    {
        std::cout << e.what() << std::endl;
        const int recoveryStatus = recoverDevice();
        return recoveryStatus != 0 ? recoveryStatus : -3;
    }
    catch (const thrust::system_error &e)
    {
        std::cout << e.what() << std::endl;
        const int recoveryStatus = recoverDevice();
        return recoveryStatus != 0 ? recoveryStatus : -2;
    }
    CUDA_CALL(cudaDeviceReset());
    return 0;
}
// Runs test() with a small, a very large and again a small problem size,
// printing the return code of each run on its own line.
int main(void)
{
    const size_t problemSizes[] = { 3000, 300000000, 3000 };
    const size_t runCount = sizeof(problemSizes) / sizeof(problemSizes[0]);

    for (size_t run = 0; run < runCount; ++run)
    {
        const int status = test(problemSizes[run]);
        std::cout << status << std::endl;
    }
    return 0;
}
On my setup (Windows 8, NVIDIA GeForce 820m with 2GB dedicated VRAM, CUDA 7.0 compiled using Visual Studio 2010), I get this:
- first call with N = 3,000 succeeds;
- second call with N = 300,000,000 fails with the exception bad allocation: out of memory;
- third call with N = 3,000 fails with a thrust::system_error: after cub_::DeviceRadixSort::SortPairs(1): out of memory.
So the output looks like this:
0
bad allocation: out of memory
-3
after cub_::DeviceRadixSort::SortPairs(1): out of memory
-2
As mentioned above, the third call shouldn’t have failed as it is identical to the successful first call.
This failure seems to be a consequence of the previous call (the one that issued a bad alloc) but I cleaned everything up after the bad alloc with a cudaDeviceReset() and a cudaSetDevice().
Despite these cleanup calls, the device is not back in a functional state and I don’t understand why.
If I did something wrong, what would be the proper way to restore the GPU to a functional state after the first failure without ending my process?
Can anyone reproduce this behavior? Is there something obvious I’m missing? Thanks for your help.