Copy data from ID3D11Resource to CUDA buffer gives grey output.

Hi, I am trying to take a screenshot from a Unity Computer game and hardware accelerated encode it to JPEG.

Therefore I want access to the GPU memory of Unity, which uses DirectX11 internally.

The entire program runs, but the output is a completely grey image of the requested size. I get no errors. What could this mean?

In the copy pass, I copy the RGB values byte by byte and if I override the colors, I do get that color as output, so apparently the texture I am reading is malfunctioning or for some reason really grey.

My code (stripped for simplification) is as follows:

int	EncodeJPEGState_CopyRGBDataFromGPUMemory(EncodeJPEGState *EJSTate, const int flipVertical, const int width, const int height, ID3D11Resource* pD3DResource, int &error)
{
	CUgraphicsResource	outCUDAResource = NULL;
	CUresult cuda_error = CUDA_SUCCESS;
	cudaError_t			CUDA_error = cudaSuccess;

	D3D11_RESOURCE_DIMENSION pResourceDimension;
	pD3DResource->GetType(&pResourceDimension);
	if (pResourceDimension != D3D11_RESOURCE_DIMENSION_TEXTURE2D) {
		error = -5;
		return error;
	}

	cuda_error = cuGraphicsD3D11RegisterResource(&outCUDAResource, pD3DResource, /*CU_GRAPHICS_REGISTER_FLAGS_NONE*/CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST | CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER); // 

	// from: http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPHICS.html#axzz4hziyhyPs
	cuda_error = cuGraphicsMapResources(1, &outCUDAResource, 0);	// last argument is stream, but 0 has no definition like CU_STREAM_LEGACY has.

	CUarray pArray = NULL;
	/*
	From: https://stackoverflow.com/questions/14927524/read-cudaarray-in-device-code
	CUDA arrays are opaque memory layouts optimized for texture fetching. They are one dimensional, two dimensional, or three-dimensional and composed of elements,
	each of which has 1, 2 or 4 components that may be signed or unsigned 8 , 16 or 32 bit integers, 16 bit floats, or 32 bit floats.
	CUDA arrays are only accessible by kernels through texture fetching as described in Texture Memory or surface reading and writing as described in Surface Memory

	So It is an array of pixels! What we want :-)!
	*/
	// CUresult cuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned int  arrayIndex, unsigned int  mipLevel)
	cuda_error = cuGraphicsSubResourceGetMappedArray(&pArray, outCUDAResource, 0, 0);	// must be 0,0 at end. values with 1 don't work: five error 1: CUDA_ERROR_INVALID_VALUE

	CUDA_error = CUDA_texture_copy(pArray, EJSTate->RGBDataCUDA, width, height);

	cuda_error = cuGraphicsUnmapResources(1, &outCUDAResource, 0);	// last argument is stream, but 0 has no definition like CU_STREAM_LEGACY has.

	//This reference count will be decremented when this resource is unregistered through cuGraphicsUnregisterResource().
	cuda_error = cuGraphicsUnregisterResource(outCUDAResource);

	return 0;
}

// Simple copy kernel
__global__ void copyKernel(cudaSurfaceObject_t inputSurfObj, Npp8u *outputRGB, int width, int height)
{
	// Calculate surface coordinates
	unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
	unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
	if (x < width && y < height) {
		uchar4 data;
		// Read from input surface
		surf2Dread(&data, inputSurfObj, x * 4, y);
		// Write to output surface
		outputRGB[(y*width + x) * 3] = data.w;			// R
		outputRGB[(y*width + x) * 3 + 1] = data.x;		// G
		outputRGB[(y*width + x) * 3 + 2] = data.y;		// B
	}
}

cudaError_t	CUDA_texture_copy(CUarray Source_CUarray, Npp8u *Dest_RGBDataCUDA, int width, int height)
{
	// Specify surface
	struct cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(resDesc));

	resDesc.resType = cudaResourceTypeArray;
	resDesc.res.array.array = (cudaArray_t)Source_CUarray;	// WARNING Source_CUarray = CUarray!
	// from: https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html
	// The types CUarray and struct cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other.
	// In order to use a CUarray in a CUDA Runtime API function which takes a struct cudaArray *, it is necessary to explicitly cast the CUarray to a struct cudaArray *.
	
	cudaSurfaceObject_t inputSurfObj = 0;
	cudaError_t			CUDA_error = cudaCreateSurfaceObject(&inputSurfObj, &resDesc);

	if (CUDA_error != cudaSuccess) {
		return CUDA_error;
	}

	// Invoke kernel
	dim3 dimBlock(16, 16);
	dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
	copyKernel <<<dimGrid, dimBlock>>> (inputSurfObj, Dest_RGBDataCUDA, width, height);

	// Wait for GPU to finish before accessing on host
	CUDA_error = cudaDeviceSynchronize();

	// Destroy surface object
	cudaDestroySurfaceObject(inputSurfObj);

	return CUDA_error;
}

OK, I solved the problem. The problem was in Unity. I had to call Texture2D.Apply() before using the ID3D11Resource as described here: Pass a RenderTexture to a plugin with zero copies - Unity Answers

As a sidenote, I also discovered that there is a standard function to copy texture data to CUDA-memory: cudaMemcpyFromArray, but this doesn’t help you if you also need to rearrange the RGBA components.