NVIDIA OpenCL driver 352.84 crashes when dereferencing a pointer pointing to unaligned address

I post this opencl problem on this section because I searched ‘opencl’ on the forums and only found it appears in the description of this board: Nsight, Visual Studio Edition.

For a __global ulong* pointer, if it points to an address which is NOT 8-bytes aligned, NVIDIA OpenCL driver 352.84 (newest version currently) will make the application crash. I attached the host and device side code below. When the value of p_is_aligned_8 displays F(false), the application will print “Error in clEnqueueMapBuffer (-36)”. If you press the space key to run again, the application crashes. Sometimes the OS pops up a window which says, ‘NVIDIA Windows Kernel Mode Driver, Version 352.84 has stopped responding and recovered successfully’.

The OpenCL 2.0 C specification describes pointer casting as below:
6.2.5 Pointer Casting
Pointers to old and new types may be cast back and forth to each other. Casting a pointer to a new type represents an unchecked assertion that the address is correctly aligned. The developer will also need to know the endianness of the OpenCL device and the endianness of the data to determine how the scalar and vector data elements are stored in memory.

That means the crash due to dereferencing a pointer pointing to an unaligned address may not be seen as a bug. But the AMD and Intel OpenCL platforms can run the code successfully on their GPUs. Considering OpenCL 2.0 SVM and CUDA Unified Memory, mixed data structures inside a single memory allocation block require that a ulong* pointer be usable at any address. Thus the crash is really unacceptable. Currently I have to either use two uint reads to replace one ulong read, or change the host side code to align pointers (adjusting a custom C++ allocator to be 8-bytes aligned, wasting nearly 20% of memory). Either workaround deteriorates the program's performance badly.

The host side code is as follows:

// OpenCLTest.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include <fstream>
#include <iostream>
#include <cstring>
#include <conio.h>
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>

const char KERNEL_FILE[] = "../../OpenCLTest/OpenCL_kernel.cl";
const size_t NUM_PARALLEL = 8640;

// Read the whole file at `filepath` into a NUL-terminated heap buffer.
// On success `size` receives the file length in bytes and the caller owns
// the returned buffer (release it with ::free). Returns NULL if the file
// cannot be opened or the buffer cannot be allocated.
const char* get_file_content(const char* filepath, size_t& size)
{
	std::ifstream ifs(filepath, std::ifstream::binary | std::ifstream::in);
	if (!ifs)
		return NULL;

	// Determine the file size by seeking to the end, then rewind.
	ifs.seekg(0, std::ifstream::end);
	size = ifs.tellg();
	ifs.seekg(0, std::ifstream::beg);

	// +1 for the terminating NUL appended below.
	char* contents = (char*) ::malloc(size + 1);
	if (contents == NULL)
		return NULL;
	ifs.read(contents, size);
	// Bug fix: the original wrote `contents = '\0';`, which assigns the
	// character NUL to the POINTER (nulling it, leaking the buffer, and
	// returning NULL). The intent is to NUL-terminate the buffer.
	contents[size] = '\0';
	ifs.close();

	return contents;
}

// Enumerate every GPU device on every OpenCL platform, build the kernel
// source from KERNEL_FILE and launch `prepareNodes` once per device.
// Build and runtime failures surface as cl::Error exceptions
// (__CL_ENABLE_EXCEPTIONS) and are reported to stderr.
void runOpenCL()
{
	size_t length = 0;
	const char* contents = get_file_content(KERNEL_FILE, length);
	// Robustness fix: the original forwarded a NULL pointer and an
	// uninitialized `length` into cl::Program::Sources when the kernel
	// file could not be opened.
	if (contents == NULL) {
		std::cerr << "Cannot open kernel file: " << KERNEL_FILE << std::endl;
		return;
	}
	cl::Program::Sources source(1, std::make_pair(contents, length));

	std::vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	for (auto& platform : platforms) {
		std::vector<cl::Device> devices;
		try {
			platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
		}
		catch (cl::Error&) {
			continue; // this platform exposes no GPU devices
		}
		for (auto& device : devices) {
			// Max work-items per dimension; [0] sizes the local range below.
			auto sizesItem = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();

			cl::Context context(device);
			cl::Program program(context, source);
			try {
				program.build();
				typedef cl::make_kernel<cl::Buffer&, size_t> KernelType;
				KernelType kernel(program, "prepareNodes");

				cl::CommandQueue queue(context, device);
				// Round NUM_PARALLEL up to a whole multiple of the
				// work-group size so the global range divides evenly.
				size_t work_size = (NUM_PARALLEL + sizesItem[0] - 1) / sizesItem[0] * sizesItem[0];
				cl::NDRange globalNDR(work_size);
				cl::NDRange localNDR(sizesItem[0]);

				// Host scratch buffer the kernel reads at offset 4 — the
				// unaligned-ulong access this repro demonstrates.
				char work_buffer[200];
				cl::Buffer bufferWork(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(work_buffer), work_buffer);

				cl::EnqueueArgs arg(queue, cl::NullRange, globalNDR, localNDR);
				kernel(arg, bufferWork, NUM_PARALLEL);
				// Blocking map forces completion of the kernel; this is
				// where the -36 error described above is observed.
				char* buffer = (char*)queue.enqueueMapBuffer(bufferWork, CL_TRUE, CL_MAP_READ, 0, sizeof(work_buffer));
				queue.enqueueUnmapMemObject(bufferWork, buffer);
			}
			catch (cl::Error& e) {
				std::cerr << "Error in " << e.what() << " (" << e.err() << ")" << std::endl;
				std::cerr << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
			}
		}
		std::cout << std::endl;
	}

	::free((void*)contents);
}

// Entry point: run the OpenCL repro once, then re-run it on every space
// key press until the user quits with Q or q. Any other key is ignored.
int _tmain(int argc, _TCHAR* argv[])
{
	runOpenCL();
	std::cout << "Press space key to run repeatly, or Q(q) to exit ..." << std::endl;
	for (;;) {
		const char key = _getch();
		if (key == 'q' || key == 'Q')
			break;
		if (key == ' ')
			runOpenCL();
		// other keys: keep polling
	}
	return 0;
}

The device side code (OpenCL_kernel.cl) is:

// Repro kernel: work-item 0 loads a ulong through a pointer offset 4 bytes
// into `objects`, so the load is misaligned whenever `objects` itself is
// 8-byte aligned. The cl_nv_pragma_unroll guard restricts the repro to
// NVIDIA devices (AMD/Intel handle the unaligned load).
__kernel void prepareNodes(__global uchar* objects, ulong NUM_PARALLEL)
{
	size_t GID = get_global_id(0);
	if (GID == 0) {
#ifdef cl_nv_pragma_unroll //screen AMD and Intel OpenCL platform devices
		// +4 guarantees p is NOT 8-byte aligned when objects is.
		__global uchar* p = objects + 4;
		char isAligned = (((ulong) p) & 7) == 0? 'T' : 'F';
		printf("p_is_aligned_8:%c objects=%lld p=%lld\n", isAligned, objects, p);
		// Casting to __global ulong* is an unchecked alignment assertion
		// (OpenCL C spec 6.2.5); this dereference is where the fault occurs.
		ulong p_ = *(__global ulong*) p;
//		if (p_ == 1) printf("ONE\n"); else printf("TWO\n");
		// Consume p_ so the compiler cannot optimize the load away
		// (see note 2 below: with both `if (p_ == 1)` lines removed the
		// error disappears).
		if (p_ == 1) objects[199] = 'X';
#endif
	}
}

GPU: GeForce GTX850M
OS: Windows 10 preview
Driver(from GPU-Z): nvlddmkm 9.18.13.5284 (ForceWave 352.84)/Win8 64

Note:
1,CL/cl.hpp is not included in NVIDIA CUDA 7 release package. It can be found in official site: https://www.khronos.org/registry/cl/api/1.2/cl.hpp
2,If both ‘if (p_ == 1)’ lines are commented out, the error disappears. It seems the NVIDIA OpenCL compiler removes ‘ulong p_ = *(__global ulong*) p;’ as an optimization once its result is unused
3,if p_is_aligned_8 is not F, consider change ‘__global uchar* p’ to be ‘objects + 8’

I would like to know whether NVIDIA is willing to fix this, and when. Any comments are appreciated.