[Solved] CUDA code works on Kepler but not on Maxwell

I am having trouble porting my CUDA software, which works perfectly with CUDA toolkit 5.0 when compiled for architecture sm_20.

I tried to port it to Maxwell (GeForce GTX 960 and 970). With CUDA toolkit 6.5, the cubin works fine but the ptx makes the driver crash. With CUDA toolkit 7.0, both the cubin and the ptx versions crash the driver.

Then I simplified my code in order to find the problem. The following code has exactly the same behaviour as my software. I think it should work (and it works in some configurations).

Could anyone tell me whether they can reproduce the problem, or whether there is anything wrong in my code?
It would be very helpful.

My configuration is:
Gainward Phantom GeForce GTX 960
Windows 7 Home Premium SP1 (64-bit)
NVIDIA graphics driver version: 350.12
CUDA toolkits 6.5 and 7.0

I compile for 64-bit.

The code below simply adds one to a global result for each task.

maxwell.cu :

// Using CUDA toolkit 6.5 for a GTX 970
// This code compiled with nvcc to a cubin works on sm_52.
// This code compiled with nvcc to ptx, then to a cubin from that ptx, works on sm_52.
// This code compiled with nvcc to ptx and loaded directly by the driver causes a driver reboot.

#define BLOCK_HEIGHT 2

// Scheduler
__device__ unsigned int persistentTaskIndex;

__device__ int getTaskID()
{
	__shared__  volatile int persistentBlockStart[BLOCK_HEIGHT];

	if(threadIdx.x == 0)
		persistentBlockStart[threadIdx.y] = atomicAdd(&persistentTaskIndex, warpSize);

	int taskID = persistentBlockStart[threadIdx.y] + threadIdx.x;
		
	return taskID;
}

// Atomic add for double. Implementation provided by NVIDIA
__forceinline__ __device__ double atomicAdd(double * address, double val)
{
	unsigned long long int * address_as_ull = (unsigned long long int*)address;
	unsigned long long int old = *address_as_ull, assumed;
	do 
	{
		assumed = old;
		old = atomicCAS(address_as_ull, assumed, 
		__double_as_longlong(val + 
		__longlong_as_double(assumed)));
	} while (assumed != old);

	return __longlong_as_double(old);
}

__device__ int g_taskCount;
__device__ double * o_result;

extern "C" __global__ void accumulationKernel()
{
	int taskCount = g_taskCount;
	int taskID;

#define ACCUMULATE_WARP_SIZE 32 // Must be power of two
#define ACCUMULATE_BLOCK_HEIGHT 32

	__shared__ double reduction[ACCUMULATE_WARP_SIZE][ACCUMULATE_BLOCK_HEIGHT]; 
	
	while (1)
	{
		taskID = getTaskID();

		reduction[threadIdx.y][threadIdx.x] = 0;
	
		if (taskID >= taskCount)
			return;

		int outputPosition = 0;

#define DOUBLE_REDUCTION(OUTPUT, INPUT) \
	reduction[threadIdx.y][threadIdx.x] = INPUT; \
	if(threadIdx.x < 16) reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+16]; \
	if(threadIdx.x < 8) reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+8]; \
	if(threadIdx.x < 4) reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+4]; \
	if(threadIdx.x < 2) reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+2]; \
	if(threadIdx.x < 1) \
	{\
		reduction[threadIdx.y][0] += reduction[threadIdx.y][1]; \
		atomicAdd(&OUTPUT[outputPosition], reduction[threadIdx.y][0]);\
	}
		
		DOUBLE_REDUCTION(o_result, 1.0)
	}
}

main.cpp :

// CUDA header 
#include <cuda.h>

#include <stdio.h>

static CUdevice cuDevice;
static CUcontext cuContext;
static CUmodule cuModule;
static CUfunction kernelFunction;
static CUresult error;

static CUdeviceptr buffer_dPtr;
CUdeviceptr o_result_dPtr;

// CUDA tools
#define CUDA_CHECK_ERROR(err) safeCudaCall(err, __FILE__, __LINE__)

CUresult safeCudaCall( CUresult err, const char *file, const int line )
{	
#define CU_SAFE_CALL_OUTPUT(x) case x: printf("Driver API error in %s, line %i : "#x,file, line); printf("\n"); break;

	if( CUDA_SUCCESS != err)
	{
		switch(err)
		{
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_VALUE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_OUT_OF_MEMORY);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_INITIALIZED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_DEINITIALIZED);						
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NO_DEVICE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_DEVICE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_IMAGE );
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_CONTEXT);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_MAP_FAILED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNMAP_FAILED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ARRAY_IS_MAPPED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ALREADY_MAPPED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NO_BINARY_FOR_GPU);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ALREADY_ACQUIRED );
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED_AS_ARRAY);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED_AS_POINTER);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ECC_UNCORRECTABLE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNSUPPORTED_LIMIT);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_SOURCE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_FILE_NOT_FOUND);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED);
//				CU_SAFE_CALL_OUTPUT(CUDA_ERROR_OPERATING_SYSTEM);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_HANDLE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_FOUND);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_READY);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_FAILED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_TIMEOUT);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNKNOWN);
		}
	}
	
	return err;
}

template <typename T> void setGlobal(CUmodule cuModule, const char * name, T value)
{
	CUdeviceptr dPtr;
	size_t size;
	
	CUDA_CHECK_ERROR(cuModuleGetGlobal(&dPtr, &size, cuModule, name));
	
	if (size != sizeof(T))
		printf("Warning : Global variable %s size does not match type size", name);
	
	CUDA_CHECK_ERROR(cuMemcpyHtoD(dPtr, &value, sizeof(T)));
}

static int blockSize[3];
static int gridSize[2];

void initFunction(CUfunction function)
{
	int multiProcessorCount;
	cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice);

	int blockCount = 1;
	int warpSize = 32;

	int maxThreadsPerBlock;
	cuFuncGetAttribute(&maxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

	int blockHeight = maxThreadsPerBlock / warpSize;
	gridSize[0] = multiProcessorCount;
	gridSize[1] = blockCount;
	blockSize[0] = warpSize;
	blockSize[1] = blockHeight;
	blockSize[2] = 1;	
}

int main(void)
{
	CUDA_CHECK_ERROR(cuInit(0));

	int deviceCount;
	CUDA_CHECK_ERROR(cuDeviceGetCount(&deviceCount));
	
	CUDA_CHECK_ERROR(cuDeviceGet(&cuDevice, 0));

	unsigned int flags = 0;
	CUDA_CHECK_ERROR(cuCtxCreate(&cuContext, flags, cuDevice));
	
	char kernelFileName[2048];
	////////////////////////////////////////////////////////
	//	CHANGE THE CUDA MODULE HERE
	// cubin works and ptx causes the driver to crash
	////////////////////////////////////////////////////////
	sprintf(kernelFileName, "maxwell.ptx");
//	sprintf(kernelFileName, "maxwell.cubin");
//	sprintf(kernelFileName, "maxwell_from_ptx.cubin");

	CUDA_CHECK_ERROR(cuModuleLoad(&cuModule, kernelFileName));
	
	CUDA_CHECK_ERROR(cuModuleGetFunction(&kernelFunction, cuModule, "accumulationKernel"));
	
	initFunction(kernelFunction);
	
	int fragmentCount = 10000;
	setGlobal<int>(cuModule, "g_taskCount", fragmentCount);
		
	int resultSize = 1;
	cuMemAlloc(&o_result_dPtr, resultSize * sizeof(double));

	setGlobal<CUdeviceptr>(cuModule, "o_result", o_result_dPtr);

	CUDA_CHECK_ERROR(cuMemsetD32(o_result_dPtr, 0, resultSize * 2));

	//-- Launch accumulation kernel 
#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
	setGlobal<unsigned int>(cuModule, "persistentTaskIndex", 0);
		
	CUDA_CHECK_ERROR(cuParamSetSize(kernelFunction, 0));
	
	CUDA_CHECK_ERROR(cuFuncSetBlockShape(kernelFunction, blockSize[0], blockSize[1], 1));
	
	CUDA_CHECK_ERROR(cuLaunchGrid(kernelFunction, gridSize[0], gridSize[1]));

	CUDA_CHECK_ERROR(cuCtxSynchronize());

	double result;
	CUDA_CHECK_ERROR(cuMemcpyDtoH(&result, o_result_dPtr, sizeof(double)));

	printf("The result is %f and should be %f \n", result, (float)fragmentCount);
}

what is a “shared volatile int” …?

volatile forces every access to actually go to shared memory. Otherwise the compiler may keep the value in a register, so the data is not actually shared between threads. This behaviour appeared with Kepler.
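
For illustration (this snippet is not from the original code, and it assumes a single 32-thread warp), the kind of warp-synchronous step where it matters:

__device__ double firstReductionStep(double partial)   // hypothetical helper, just to show the pattern
{
	__shared__ volatile double s[32];          // volatile: every access really touches shared memory
	s[threadIdx.x] = partial;
	if (threadIdx.x < 16)
		s[threadIdx.x] += s[threadIdx.x + 16]; // without volatile, the right-hand side may be a stale register copy
	return s[threadIdx.x];
}

There is no barrier between the store and the load; the code relies on the warp executing in lockstep, and volatile is the only thing stopping the compiler from caching s in registers.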

i would have thought that a proper barrier, like __syncthreads(), would be sufficient to ensure the required visibility of the data
if this is so, the implication would be that, if a volatile declaration is necessary, it is likely due to poor synchronization otherwise
volatile then takes on the synchronization function of the missing barrier/ synchronization call

When I run your code with

cuda-memcheck --tool racecheck ./main

It reports 7 hazards. You might want to investigate these. Compile your .cu module with -lineinfo

It will probably also help if you manually inline your DOUBLE_REDUCTION macro.
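
For reference, the kind of build I mean (assuming sm_52 and the file names from your post):

nvcc -arch=sm_52 -lineinfo -ptx   maxwell.cu -o maxwell.ptx
nvcc -arch=sm_52 -lineinfo -cubin maxwell.cu -o maxwell.cubin
cuda-memcheck --tool racecheck ./main

With -lineinfo, racecheck can report source file and line numbers for the hazards instead of only SASS addresses.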

First of all, thanks for your answers. They have been useful, even if I am not fully satisfied.

@txbob : I did not think of using cuda-memcheck. It helped me find a mistake in my example concerning the block height used for my kernel and in my reduction. In main.cpp I changed it to 2, as defined in the .cu.

However, the problem remains after these modifications.

racecheck also gives me several hazards during my reduction, when reading and writing my shared buffer. However, I do not understand why, given that normally all threads of a warp should be at the same instruction at the same time (or waiting, if in different branches). In my code there should not be any read/write conflicts within a warp, unless some optimisations I am not aware of are performed.

Anyway, I tried using __syncthreads() during my reduction, as advised by little_jimmy. It seems to solve the problem. BUT it is a block synchronisation, while I think my algorithm should only need warp synchronisation, so I expect a loss of performance.

Is it a mistake to assume that all threads of a warp are on the same instruction?

My new code is :

maxwell.cu :

#include <stdio.h> //for printf, needed also for CUDA

#define BLOCK_HEIGHT 2

// Scheduler
__device__ unsigned int persistentTaskIndex;

__device__ int getTaskID()
{
	__shared__  volatile int persistentBlockStart[BLOCK_HEIGHT];

	if(threadIdx.x == 0)
		persistentBlockStart[threadIdx.y] = atomicAdd(&persistentTaskIndex, warpSize);
		
	int taskID = persistentBlockStart[threadIdx.y] + threadIdx.x;
		
	return taskID;
}

// Atomic add for double. Implementation provided by NVIDIA
__forceinline__ __device__ double atomicAdd(double * address, double val)
{
	unsigned long long int * address_as_ull = (unsigned long long int*)address;
	unsigned long long int old = *address_as_ull, assumed;
	do 
	{
		assumed = old;
		old = atomicCAS(address_as_ull, assumed, 
		__double_as_longlong(val + 
		__longlong_as_double(assumed)));
	} while (assumed != old);

	return __longlong_as_double(old);
}

__device__ int g_taskCount;
__device__ double * o_result;

extern "C" __global__ void accumulationKernel()
{
	int taskCount = g_taskCount;
	int taskID;

#define ACCUMULATE_WARP_SIZE 32 // Must be power of two
#define ACCUMULATE_BLOCK_HEIGHT 2

	__shared__ double reduction[ACCUMULATE_BLOCK_HEIGHT][ACCUMULATE_WARP_SIZE]; 
	
	while (1)
	{
		taskID = getTaskID();

		reduction[threadIdx.y][threadIdx.x] = 0;
		__syncthreads();
	
		if (taskID >= taskCount)
			return;

		int outputPosition = 0;

		reduction[threadIdx.y][threadIdx.x] = 1.0;
		__syncthreads();
		if(threadIdx.x < 16) 
			reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+16];
		__syncthreads();	
		if(threadIdx.x < 8) 
			reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+8];
		__syncthreads();
		if(threadIdx.x < 4) 
			reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+4];
		__syncthreads();
		if(threadIdx.x < 2) 
			reduction[threadIdx.y][threadIdx.x] += reduction[threadIdx.y][threadIdx.x+2];
		__syncthreads();
		if(threadIdx.x < 1)
		{
			reduction[threadIdx.y][0] += reduction[threadIdx.y][1];
			atomicAdd(&o_result[outputPosition], reduction[threadIdx.y][0]);
		}
		__syncthreads();
	}
}

main.cpp :

// CUDA header 
#include <cuda.h>

#include <stdio.h>

static CUdevice cuDevice;
static CUcontext cuContext;
static CUmodule cuModule;
static CUfunction kernelFunction;
static CUresult error;

static CUdeviceptr buffer_dPtr;
CUdeviceptr o_result_dPtr;

// CUDA tools
#define CUDA_CHECK_ERROR(err) safeCudaCall(err, __FILE__, __LINE__)

CUresult safeCudaCall( CUresult err, const char *file, const int line )
{	
#define CU_SAFE_CALL_OUTPUT(x) case x: printf("Driver API error in %s, line %i : "#x,file, line); printf("\n"); break;

	if( CUDA_SUCCESS != err)
	{
		switch(err)
		{
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_VALUE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_OUT_OF_MEMORY);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_INITIALIZED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_DEINITIALIZED);						
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NO_DEVICE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_DEVICE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_IMAGE );
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_CONTEXT);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_MAP_FAILED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNMAP_FAILED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ARRAY_IS_MAPPED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ALREADY_MAPPED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NO_BINARY_FOR_GPU);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ALREADY_ACQUIRED );
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED_AS_ARRAY);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_MAPPED_AS_POINTER);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_ECC_UNCORRECTABLE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNSUPPORTED_LIMIT);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_SOURCE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_FILE_NOT_FOUND);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED);
//				CU_SAFE_CALL_OUTPUT(CUDA_ERROR_OPERATING_SYSTEM);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_INVALID_HANDLE);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_FOUND);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_NOT_READY);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_FAILED);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_TIMEOUT);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
			CU_SAFE_CALL_OUTPUT(CUDA_ERROR_UNKNOWN);
		}
	}
	
	return err;
}


template <typename T> void setGlobal(CUmodule cuModule, const char * name, T value)
{
	CUdeviceptr dPtr;
	size_t size;
	
	CUDA_CHECK_ERROR(cuModuleGetGlobal(&dPtr, &size, cuModule, name));
	
	if (size != sizeof(T))
		printf("Warning : Global variable %s size does not match type size", name);
	
	CUDA_CHECK_ERROR(cuMemcpyHtoD(dPtr, &value, sizeof(T)));
}

static int blockSize[3];
static int gridSize[2];

void initFunction(CUfunction function)
{
	int multiProcessorCount;
	cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice);

	int warpSize = 32;

	int maxThreadsPerBlock;
	cuFuncGetAttribute(&maxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

	int blockHeight = 2;
	int blockCount = maxThreadsPerBlock / warpSize / blockHeight;
	gridSize[0] = multiProcessorCount;
	gridSize[1] = blockCount;
	blockSize[0] = warpSize;
	blockSize[1] = blockHeight;
	blockSize[2] = 1;	
}

int main(void)
{
	CUDA_CHECK_ERROR(cuInit(0));

	int deviceCount;
	CUDA_CHECK_ERROR(cuDeviceGetCount(&deviceCount));
	
	CUDA_CHECK_ERROR(cuDeviceGet(&cuDevice, 0));

	unsigned int flags = 0;
	CUDA_CHECK_ERROR(cuCtxCreate(&cuContext, flags, cuDevice));
	
	char kernelFileName[2048];
	////////////////////////////////////////////////////////
	//	CHANGE THE CUDA MODULE HERE
	// cubin works and ptx causes the driver to crash
	////////////////////////////////////////////////////////
//	sprintf(kernelFileName, "maxwell.ptx");
	sprintf(kernelFileName, "maxwell.cubin");
//	sprintf(kernelFileName, "maxwell_from_ptx.cubin");

	CUDA_CHECK_ERROR(cuModuleLoad(&cuModule, kernelFileName));
	
	CUDA_CHECK_ERROR(cuModuleGetFunction(&kernelFunction, cuModule, "accumulationKernel"));
	
	initFunction(kernelFunction);
	
	int fragmentCount = 10000;
	setGlobal<int>(cuModule, "g_taskCount", fragmentCount);
		
	int resultSize = 1;
	cuMemAlloc(&o_result_dPtr, resultSize * sizeof(double));

	setGlobal<CUdeviceptr>(cuModule, "o_result", o_result_dPtr);

	CUDA_CHECK_ERROR(cuMemsetD32(o_result_dPtr, 0, resultSize * 2));

	//-- Launch accumulation kernel 
#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
	setGlobal<unsigned int>(cuModule, "persistentTaskIndex", 0);
		
	CUDA_CHECK_ERROR(cuParamSetSize(kernelFunction, 0));
	
	CUDA_CHECK_ERROR(cuFuncSetBlockShape(kernelFunction, blockSize[0], blockSize[1], 1));
	
	CUDA_CHECK_ERROR(cuLaunchGrid(kernelFunction, gridSize[0], gridSize[1]));

	CUDA_CHECK_ERROR(cuCtxSynchronize());

	double result;
	CUDA_CHECK_ERROR(cuMemcpyDtoH(&result, o_result_dPtr, sizeof(double)));

	printf("The result is %f and should be %f \n", result, (float)fragmentCount);
}

Your previous code (and current code) does not have the volatile keyword on the reduction variable. So with that proviso, the __syncthreads() calls are definitely necessary.

It should be possible to construct a warp level reduction that uses a volatile shared memory variable, and does not require the use of __syncthreads(). There are many examples of this:

https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf

I’m quite sure it works.
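
Roughly the pattern from that document, adapted to your two-dimensional shared array (a sketch, not a drop-in replacement for your kernel):

__device__ void warpReduce(volatile double * row)   // row = reduction[threadIdx.y]
{
	if (threadIdx.x < 16) row[threadIdx.x] += row[threadIdx.x + 16];
	if (threadIdx.x <  8) row[threadIdx.x] += row[threadIdx.x +  8];
	if (threadIdx.x <  4) row[threadIdx.x] += row[threadIdx.x +  4];
	if (threadIdx.x <  2) row[threadIdx.x] += row[threadIdx.x +  2];
	if (threadIdx.x <  1) row[threadIdx.x] += row[threadIdx.x +  1];
}

Each warp calls it on its own row after every thread has stored its value; lane 0 then holds the warp's sum in row[0], with no __syncthreads() anywhere in between.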

Note that this construct:

if (taskID >= taskCount)
	return;

...
__syncthreads();

is potentially illegal.
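
One way to avoid it (a sketch only, reusing the globals from your module and assuming your 32x2 block) is to make the exit decision uniform across the block, so either every thread returns or every thread reaches the barriers:

extern "C" __global__ void accumulationKernelUniformExit()   // hypothetical variant, not your original scheduler
{
	__shared__ double reduction[2][32];    // 2 warps of 32 threads, as in your launch configuration
	__shared__ unsigned int blockStart;

	int taskCount = g_taskCount;

	while (1)
	{
		// one thread grabs a block-sized batch of tasks (instead of one batch per warp)
		if (threadIdx.x == 0 && threadIdx.y == 0)
			blockStart = atomicAdd(&persistentTaskIndex, blockDim.x * blockDim.y);
		__syncthreads();

		// the condition is identical for every thread of the block,
		// so returning here never strands other threads at a barrier
		if (blockStart >= (unsigned int)taskCount)
			return;

		int taskID = blockStart + threadIdx.y * blockDim.x + threadIdx.x;

		// out-of-range threads contribute 0 but still execute every __syncthreads()
		reduction[threadIdx.y][threadIdx.x] = (taskID < taskCount) ? 1.0 : 0.0;
		__syncthreads();

		// ... reduction steps as in your kernel, each followed by __syncthreads() ...
	}
}

The cost is that tasks are handed out per block rather than per warp, so the two warps of a block always advance together.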

“It should be possible to construct a warp level reduction that uses a volatile shared memory variable, and does not require the use of __syncthreads(). There are many examples of this”

agreed. however, i am not entirely convinced that a volatile declaration would be any better than a __syncthreads()
for one, i would think that it implies that a thread writing to the volatile variable, must do so immediately, without the option of shifting the instruction around with respect to the next __syncthreads() barrier
i suppose it depends on the exact context of use

“BUT, it is a block synchronisation and what I think my algorithm should only need warp synchronisation”

a block synchronization would automatically equate to a warp synchronization, when the size of the block equals a warp
however, i am not sure that your block size equals a warp, even though you generally reduce for/ over a warp only
hence, i am not entirely convinced that you require warp synchronization only, across the board

a volatile declaration may save you here (and i do not think you actually need a __syncthreads() here):

if(threadIdx.x < z) ; z [16, …, 2]

but you certainly need a __syncthreads(), and surely not a volatile, before:

if (threadIdx.x < 1)
{
	reduction[threadIdx.y][0] += reduction[threadIdx.y][1];
	atomicAdd(&o_result[outputPosition], reduction[threadIdx.y][0]);
}

as this part requires block synchronization, not warp synchronization

not so?

Actually, I forgot the volatile keyword in my example. But if I declare the reduction array volatile and remove the __syncthreads() calls, I would expect my warp reduction to work. However, it crashes the driver.

Do you think there could be a bug?

I use a block height of two warps. Actually, in previous versions of CUDA, I had issues when using a block of only one warp. I would have to test that again.

As you advised, I made some tests with fewer __syncthreads() calls, and the example seems to work when I only perform a __syncthreads() after the reduction. However, I do not really understand why, nor whether it will work in my real code, as I need to perform several successive reductions.

I do not really know what the impact of __syncthreads() on performance will be in a real use case.

i thought you needed a __syncthreads before:

if (threadIdx.x < 1)

as your generally divergent warps converge at this point, but i misread that particular section

txbob rightly pointed out the issue around if (taskID >= taskCount)

consider:

__shared__ int persistentBlockStart[BLOCK_HEIGHT]; // no volatile

while (1)
{
	if (threadIdx.x == 0)
	{
		reduction[threadIdx.y][threadIdx.x] = 0;
		persistentBlockStart[threadIdx.y] = atomicAdd(&persistentTaskIndex, warpSize);
	}

	__syncthreads(); // may not be necessary

	int taskID;

	if (persistentBlockStart[threadIdx.y] < taskCount)
	{
		taskID = persistentBlockStart[threadIdx.y] + threadIdx.x;
	}
	else
	{
		break;
	}

	if (taskID < taskCount)
	{
		// main reduction block
	}

	if (threadIdx.x < 1)
	{
		reduction[threadIdx.y][0] += reduction[threadIdx.y][1];
		atomicAdd(&o_result[outputPosition], reduction[threadIdx.y][0]);
	}
}

i think the compiler would feel ‘less lost’, and you should register fewer races

and given that your block warps generally diverge, and remain divergent, i wonder whether you would not be better off using a 1-dimensional block with a multi-dimensional grid - 2 blocks of a warp each, instead of 1 block of 2 warps
it would provide more synchronization leniency to warps that generally execute independently of each other
you simply need to replace all threadIdx.y with blockIdx.x, i would think
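
roughly what i mean, as a sketch only (untested, reusing your globals and your double atomicAdd; the block becomes 32x1 and the grid gets twice as many blocks, so the threadIdx.y index simply disappears, since shared memory is per block):

extern "C" __global__ void accumulationKernel1D()   // hypothetical 1-D variant
{
	int taskCount = g_taskCount;

	__shared__ volatile int persistentBlockStart;      // one warp per block now
	__shared__ volatile double reduction[32];

	while (1)
	{
		if (threadIdx.x == 0)
			persistentBlockStart = atomicAdd(&persistentTaskIndex, warpSize);

		int taskID = persistentBlockStart + threadIdx.x;

		reduction[threadIdx.x] = 0;                    // exiting lanes leave a 0 behind
		if (taskID >= taskCount)
			return;                                    // no barriers here, so a per-thread return is fine

		reduction[threadIdx.x] = 1.0;
		if (threadIdx.x < 16) reduction[threadIdx.x] += reduction[threadIdx.x + 16];
		if (threadIdx.x <  8) reduction[threadIdx.x] += reduction[threadIdx.x +  8];
		if (threadIdx.x <  4) reduction[threadIdx.x] += reduction[threadIdx.x +  4];
		if (threadIdx.x <  2) reduction[threadIdx.x] += reduction[threadIdx.x +  2];
		if (threadIdx.x <  1) atomicAdd(&o_result[0], reduction[0] + reduction[1]);
	}
}

no __syncthreads() at all, since a block is now a single warp and the volatile keyword carries the visibility - whether the compiler bug you suspect still bites this form is of course another question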

We provided an example to NVIDIA and it was confirmed to be a bug. It has been fixed in the CUDA 7.5 release: the same code as before now works as expected, both as ptx and as cubin.

Thanks everyone.