I have some issues porting my CUDA software that works perfectly using the CUDA toolkit 5.0 and computing for architecture sm_20.
I tried to port it for Maxwell (Geforce GTX 960 and 970). I tried CUDA toolkit 6.5, where cubin works fine and ptx makes the driver to crash. I tried CUDA toolkit 7.0, where both cubin and ptx version crash the drive.
Then I simplified my code in order to find the problem. The following code has exactly the same behaviour as my software. I think it should work (and it works in some configurations).
Could anyone tell me if he/she can reproduce the problem or if there is anything wrong in my code?
I would be very helpful.
My configuration is :
Gainward Phantom Geforce GTX 960
Windows 7 Home Premium (SP1) (64bits)
NVIDIA Graphic driver version : 350.12
CUDA toolkits 6.5 and 7.0
I compile in 64 bits.
The code below simply adds one for each task to a global result.
maxwell.cu :
// Using CUDA toolkit 6.5 for GTX 970
// This code compiled with nvcc as cubin works in sm_52
// This code compiled with nvcc compiled as ptx then in cubin from this ptx works in sm_52
// This code compiled with nvcc as ptx and loaded directly by the driver causes a driver reboot.
#define BLOCK_HEIGHT 2
// Scheduler
__device__ unsigned int persistentTaskIndex;
__device__ int getTaskID()
{
__shared__ volatile int persistentBlockStart[BLOCK_HEIGHT];
if(threadIdx.x == 0)
persistentBlockStart[threadIdx.y] = atomicAdd(&persistentTaskIndex, warpSize);
int taskID = persistentBlockStart[threadIdx.y] + threadIdx.x;
return taskID;
}
// Atomic add for double. Implementation provided by NVIDIA
__forceinline__ __device__ double atomicAdd(double * address, double val)
{
unsigned long long int * address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do
{
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
__device__ int g_taskCount;
__device__ double * o_result;
extern "C" __global__ void accumulationKernel()
{
int taskCount = g_taskCount;
int taskID;
#define ACCUMULATE_WARP_SIZE 32 // Must be power of two
#define ACCUMULATE_BLOCK_HEIGHT 32
__shared__ double reduction[ACCUMULATE_WARP_SIZE][ACCUMULATE_BLOCK_HEIGHT];
while (1)
{
taskID = getTaskID();
reduction[threadIdx.y][threadIdx.x] = 0;
if (taskID >= taskCount)
return;
int outputPosition = 0;
#define DOUBLE_REDUCTION(OUTPUT, INPUT) \
reduction[threadIdx.y][threadIdx.x] = INPUT; \
if(threadIdx.x < 16) reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+16]; \
if(threadIdx.x < 8) reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+8]; \
if(threadIdx.x < 4) reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+4]; \
if(threadIdx.x < 2) reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+2]; \
if(threadIdx.x < 1) \
{\
reduction[threadIdx.y][0] += reduction[threadIdx.y][1]; \
atomicAdd(&OUTPUT[outputPosition], reduction[threadIdx.y][0]);\
}
DOUBLE_REDUCTION(o_result, 1.0)
}
}
main.cpp :
// CUDA header
#include <cuda.h>
#include <stdio.h>
static CUdevice cuDevice;
static CUcontext cuContext;
static CUmodule cuModule;
static CUfunction kernelFunction;
static CUresult error;
static CUdeviceptr buffer_dPtr;
CUdeviceptr o_result_dPtr;
// CUDA tools
#define CUDA_CHECK_ERROR(err) safeCudaCall(err, __FILE__, __LINE__)
CUresult safeCudaCall( CUresult err, const char *file, const int line )
{
#define CU_SAFE_CALL_OUTPUT(x) case x: printf("Driver API error in %s, line %i : "#x,file, line); printf("\n"); break;
if( CUDA_SUCCESS != err)
{
switch(err)
{
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_VALUE);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_OUT_OF_MEMORY);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_INITIALIZED);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_DEINITIALIZED);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NO_DEVICE);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_DEVICE);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_IMAGE );
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_CONTEXT);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_MAP_FAILED);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNMAP_FAILED);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ARRAY_IS_MAPPED);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ALREADY_MAPPED);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NO_BINARY_FOR_GPU);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ALREADY_ACQUIRED );
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED_AS_ARRAY);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED_AS_POINTER);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ECC_UNCORRECTABLE);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNSUPPORTED_LIMIT);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_SOURCE);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_FILE_NOT_FOUND);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED);
// CU_SAFE_CALL_OUTPUT(CUDA_ERROR_OPERATING_SYSTEM);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_HANDLE);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_FOUND);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_READY);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_FAILED);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_TIMEOUT);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNKNOWN);
}
}
return err;
}
template <typename T> void setGlobal(CUmodule cuModule, const char * name, T value)
{
CUdeviceptr dPtr;
size_t size;
CUDA_CHECK_ERROR(cuModuleGetGlobal(&dPtr, &size, cuModule, name));
if (size != sizeof(T))
printf("Warning : Global variable %s size does not match type size", name);
CUDA_CHECK_ERROR(cuMemcpyHtoD(dPtr, &value, sizeof(T)));
}
static int blockSize[3];
static int gridSize[2];
void initFunction(CUfunction function)
{
int multiProcessorCount;
cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice);
int blockCount = 1;
int warpSize = 32;
int maxThreadsPerBlock;
cuFuncGetAttribute(&maxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
int blockHeight = maxThreadsPerBlock / warpSize;
gridSize[0] = multiProcessorCount;
gridSize[1] = blockCount;
blockSize[0] = warpSize;
blockSize[1] = blockHeight;
blockSize[2] = 1;
}
int main(void)
{
CUDA_CHECK_ERROR(cuInit(0));
int deviceCount;
CUDA_CHECK_ERROR(cuDeviceGetCount(&deviceCount));
CUDA_CHECK_ERROR(cuDeviceGet(&cuDevice, 0));
unsigned int flags = 0;
CUDA_CHECK_ERROR(cuCtxCreate(&cuContext, flags, cuDevice));
char kernelFileName[2048];
////////////////////////////////////////////////////////
// CHANGE THE CUDA MODULE HERE
// cubin works and ptx cause the driver to crash
////////////////////////////////////////////////////////
sprintf(kernelFileName, "maxwell.ptx");
// sprintf(kernelFileName, "maxwell.cubin");
// sprintf(kernelFileName, "maxwell_from_ptx.cubin");
CUDA_CHECK_ERROR(cuModuleLoad(&cuModule, kernelFileName));
CUDA_CHECK_ERROR(cuModuleGetFunction(&kernelFunction, cuModule, "accumulationKernel"));
initFunction(kernelFunction);
int fragmentCount = 10000;
setGlobal<int>(cuModule, "g_taskCount", fragmentCount);
int resultSize = 1;
cuMemAlloc(&o_result_dPtr, resultSize * sizeof(double));
setGlobal<CUdeviceptr>(cuModule, "o_result", o_result_dPtr);
CUDA_CHECK_ERROR(cuMemsetD32(o_result_dPtr, 0, resultSize * 2));
//-- Launch accumulation kernel
#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
setGlobal<unsigned int>(cuModule, "persistentTaskIndex", 0);
CUDA_CHECK_ERROR(cuParamSetSize(kernelFunction, 0));
CUDA_CHECK_ERROR(cuFuncSetBlockShape(kernelFunction, blockSize[0], blockSize[1], 1));
CUDA_CHECK_ERROR(cuLaunchGrid(kernelFunction, gridSize[0], gridSize[1]));
CUDA_CHECK_ERROR(cuCtxSynchronize());
double result;
CUDA_CHECK_ERROR(cuMemcpyDtoH(&result, o_result_dPtr, sizeof(double)));
printf("The result is %f and should be %f \n", result, (float)fragmentCount);
}