I am running a simple test program that calls thrust::copy_if() in a loop. While it runs, the host (CPU) thread hangs. This happens sometimes after many millions of loop iterations, and sometimes after only a few thousand. I tested the code on four types of GTX GPUs, and the problem shows up only on some models:
GTX780Ti - tested on 5 GTX780Ti cards. No problem.
Titan Kepler - No problem.
GTX980 - tested on 6 GTX980 cards. All hung.
TITAN Maxwell (12 GB RAM) - tested on 2 cards. Both hung.
Below is the source code, which consists of two source files: main.cpp and copyIfWrapper.cu.
Here is main.cpp, which initializes the device pointers and loops on copyIfWrapper():
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
void copyIfWrapper(float *d_in, float *d_out, unsigned int length, float thresh);
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " err: " << cudaGetErrorString(status) << "\n";
    return false;
}

/*
 * Stress test: allocates two device float buffers, fills the input buffer
 * with zeros, then calls copyIfWrapper() in an endless loop while printing
 * the iteration counter.
 *
 * Usage: <program> <devid>
 *
 * Returns EXIT_FAILURE when the device id argument is missing or when any
 * CUDA setup call fails. Once setup succeeds the loop never exits on its
 * own; the process has to be interrupted externally.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        std::cout << "Format: " << argv[0] << " <devid>\n";
        return EXIT_FAILURE;
    }

    const int devId = std::atoi(argv[1]);
    std::cout << "devId: " << devId << "\n";

    // Stop at the first failed setup call: continuing would only run the
    // test against an invalid device or unallocated pointers.
    if (!cudaCallOk(cudaSetDevice(devId), "cudaSetDevice") ||
        !cudaCallOk(cudaDeviceReset(), "cudaDeviceReset")) {
        return EXIT_FAILURE;
    }

    const unsigned int arraySize = 400000;
    const size_t arrayBytes = arraySize * sizeof(float);

    float *d_outVal = NULL;
    float *d_inVal = NULL;
    if (!cudaCallOk(cudaMalloc((void **) &d_outVal, arrayBytes), "cudaMalloc")) {
        return EXIT_FAILURE;
    }
    if (!cudaCallOk(cudaMalloc((void **) &d_inVal, arrayBytes), "cudaMalloc")) {
        cudaFree(d_outVal);
        return EXIT_FAILURE;
    }

    // Prepare values for the copy - the program hangs regardless of the value here.
    // The vector releases the host staging buffer automatically.
    std::vector<float> hostBuf(arraySize, 0.0f);
    if (!cudaCallOk(cudaMemcpy(d_inVal, hostBuf.data(), arrayBytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy")) {
        cudaFree(d_inVal);
        cudaFree(d_outVal);
        return EXIT_FAILURE;
    }

    const float m_thresh = 0.34f; // m_param->value("PP::MIP:InputImageThresh");
    unsigned int debugLoopCounter = 0;

    // Endless on purpose: the test runs until the hang reproduces or the
    // process is killed, so the device buffers are never explicitly freed.
    while (true) {
        // debug prints:
        std::cout << "\r" << debugLoopCounter++;
        std::cout << " m_maxVoxCount.length " << arraySize << std::flush;
        copyIfWrapper(d_inVal, d_outVal, arraySize, m_thresh);
    }
}
Next is copyIfWrapper.cu:
// Unary predicate for thrust::copy_if: selects values strictly greater than
// a threshold.
struct aboveThresh_predicate
{
    float thresh; // a value is selected only when it is strictly greater than this

    // Defaults to 0.24f, the threshold that was previously hard-coded in
    // operator(), so default construction keeps working as before.
    __host__ __device__
    explicit aboveThresh_predicate(float t = 0.24f) : thresh(t) {}

    // Single-precision compare; a bare double literal here would promote the
    // comparison to double on the device.
    __host__ __device__
    bool operator()(const float val) const
    {
        return val > thresh;
    }
};

/*
 * Copies every element of d_in[0 .. length) that is strictly greater than
 * `thresh` to the front of d_out, preserving order.
 *
 * d_in   device pointer to `length` input floats
 * d_out  device pointer with room for up to `length` floats
 * length number of input elements
 * thresh selection threshold (previously ignored in favour of a hard-coded 0.24)
 *
 * The end-of-output iterator returned by thrust::copy_if is discarded, so the
 * caller is not told how many elements were written.
 */
void copyIfWrapper(float *d_in, float *d_out, unsigned int length, float thresh) {
    thrust::device_ptr<float> d_inVPtr(d_in);
    thrust::device_ptr<float> d_outPtr(d_out);
    // The input doubles as its own stencil, so the plain (non-stencil)
    // overload expresses the same selection.
    (void) thrust::copy_if(d_inVPtr, d_inVPtr + length, d_outPtr,
                           aboveThresh_predicate(thresh));
}
Here is the backtrace taken while the CPU thread is hung:
#14 in main () at main.cpp: 49
#13 in copyIfWrapper()
#12 in thrust::device_ptr(float) thrust::system::cuda::detail::copy_if<thrust::system....etc..
#11 in thrust::system::cuda::detail::trivial_copy_n<thrust::system::cuda::detail::tag, ....etc
#10 in cudaMemcpy () from libcudart.so.6.5
#9 in ?? () from libcudart.so.6.5
#8 in cuMemcpyDtoH_v2 () from libcuda.so.1
#7 in ?? () from libcuda.so.1
#6 in ?? () from libcuda.so.1
#5 in ?? () from libcuda.so.1
#3 in ?? () from libcuda.so.1
#2 in ?? () from libcuda.so.1
#1 in clock_gettime () from librt.so.1
#0 in clock_gettime ()
Note: At main.cpp line 40, the copied value is set to 0, which is below the threshold (0.24). Still, any other value makes it hang too.
I would be grateful for any ideas about this behavior.
Thanks,
Ronen Halevy.