How to use CUDA in dll?

Hello,

My goal is to use CUDA through a stub library that encapsulates the CUDA-specific functions, methods and data. This library will reside in a DLL and will be loaded dynamically via the Windows API functions LoadLibrary and GetProcAddress. I am using the Visual Studio 2010 C++ compiler to build the CUDA DLL, but the rest of the application is built with another compiler. That means I cannot use heap memory inside the CUDA DLL (i.e. malloc, new, or anything else that lives outside stack memory; even global variables cause memory corruption), so it is literally a stub library.

However, first I want to write a test program consisting of CUDAhost.exe and CUDAdevice.dll, both compiled with Visual Studio 2010. The program draws a texture to the screen, but first the image data is copied from readGLTexture to viewGLTexture by CUDA. My problem is that I cannot get the program working: the window stays black, and I cannot find the error. I'm not sure whether this is possible at all or whether I should choose a completely different solution. I hope you can help me; any suggestions are appreciated. The source code is below:

CUDAhost.cpp:

#include "stdafx.h"

const unsigned int window_width  = 512;
const unsigned int window_height = 512;

GLuint viewGLTexture;
GLuint readGLTexture;
void* cViewCudaResource;
void* cReadCudaResource;
HINSTANCE dll;
typedef void (*KAYNNISTACUDA)(unsigned int& readGLTexture, void* &cReadCudaResource, unsigned int& viewGLTexture, void* &cViewCudaResource);
KAYNNISTACUDA kaynnistaCUDA;
typedef void (*PIIRRAKUVA)(void* &cReadCudaResource, void* &cViewCudaResource);
PIIRRAKUVA piirraKuva;

bool lataaTekstuuri(const wchar_t* tiedosto, GLuint& numero) {

  FILE* tiedostosta;
  BITMAPFILEHEADER bitmapFileHeader;
  BITMAPINFOHEADER bitmapInfoHeader;
  unsigned char *bittiKartta;
  unsigned char siirto;
  wchar_t polku[45]={0};
  int leveys;
  int korkeus;

//prepare file path
  wcsncat(polku, L"Tekstuurit\\", 45);
  wcsncat(polku, tiedosto, 45);
  wcsncat(polku, L".bmp", 45);

//open BMP file
  tiedostosta=_wfopen(polku, L"rb");
  if (tiedostosta==NULL) {
    return false;
  }

//read bmp file header and sequre it is bmp file
  fread(&bitmapFileHeader, sizeof(BITMAPFILEHEADER), 1, tiedostosta);
  if (bitmapFileHeader.bfType != 0x4D42) {
    fclose(tiedostosta);
    return false;
  }

//read bmp info header and move to the beginning of image data
  fread(&bitmapInfoHeader, sizeof(BITMAPINFOHEADER), 1, tiedostosta);
  fseek(tiedostosta, bitmapFileHeader.bfOffBits, SEEK_SET);

//allocate memory space
  bittiKartta=(unsigned char*)malloc(bitmapInfoHeader.biSizeImage);
  if (!bittiKartta) {
	free(bittiKartta);
	bittiKartta=NULL;
    fclose(tiedostosta);
    return false;
  }

//read image
  fread(bittiKartta, 1, bitmapInfoHeader.biSizeImage, tiedostosta);
  if (bittiKartta==NULL) {
	free(bittiKartta);
	bittiKartta=NULL;
    fclose(tiedostosta);
    return false;
  }

//rearrange bgr to rgb
  for (int i=0; i<bitmapInfoHeader.biSizeImage; i+=3) {
    siirto=bittiKartta[i];
    bittiKartta[i]=bittiKartta[i+2];
    bittiKartta[i+2]=siirto;
  }

//query image width and height
  leveys=bitmapInfoHeader.biWidth;
  korkeus=abs(bitmapInfoHeader.biHeight);

//close bmp file
  fclose(tiedostosta);
  glGetError();

//create OpenGL texture
  glGenTextures(1, &numero);
  glBindTexture(GL_TEXTURE_2D, numero);
  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, leveys, korkeus, 0, GL_RGB, GL_UNSIGNED_BYTE, bittiKartta);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

//free temporary buffer
  free(bittiKartta);
  bittiKartta=NULL;

//if success, return true
  if (0==glGetError()) {
    return true;
  } else {
    return false;
  }
}

void initGLandCUDA(int argc, char* argv[]) {

	glutInit(&argc, argv);
	glutInitDisplayMode(GLUT_RGBA);
	glutInitWindowSize(window_width, window_height);
	glutCreateWindow("CUDA GL Interop");

	glewInit();

	glEnable(GL_TEXTURE_2D);
	bool onnistuiko=lataaTekstuuri(L"Tex", readGLTexture);

	glGenTextures(1, &viewGLTexture);
	glBindTexture(GL_TEXTURE_2D, viewGLTexture);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, window_width, window_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
	glBindTexture(GL_TEXTURE_2D, 0);

	dll=LoadLibraryW(L"CUDAdevice.dll");
	if (dll) {
		kaynnistaCUDA=(KAYNNISTACUDA)GetProcAddress(dll, "kaynnistaCUDA");
		kaynnistaCUDA(readGLTexture, cReadCudaResource, viewGLTexture, cViewCudaResource);
	}
}    

void renderFrame() {
	if (dll) {
		piirraKuva=(PIIRRAKUVA)GetProcAddress(dll, "piirraKuva");
		piirraKuva(cReadCudaResource, cViewCudaResource);
	}
	glBindTexture(GL_TEXTURE_2D, viewGLTexture);
	{
		glBegin(GL_QUADS);
		{
			glTexCoord2f(0.0f, 0.0f); glVertex2f(-1.0f, -1.0f);
			glTexCoord2f(1.0f, 0.0f); glVertex2f(+1.0f, -1.0f);
			glTexCoord2f(1.0f, 1.0f); glVertex2f(+1.0f, +1.0f);
			glTexCoord2f(0.0f, 1.0f); glVertex2f(-1.0f, +1.0f);
		}
		glEnd();
	}
	glBindTexture(GL_TEXTURE_2D, 0);
	glFinish();
}

int _tmain(int argc, _TCHAR* argv[])
{
	initGLandCUDA(argc, reinterpret_cast<char**>(argv));
	glutDisplayFunc(renderFrame);
	glutMainLoop();
	return 0;
}

dllmain.cpp:

#include "stdafx.h"

BOOL APIENTRY DllMain( HMODULE hModule,
                       DWORD  ul_reason_for_call,
                       LPVOID lpReserved
					 )
{
	switch (ul_reason_for_call)
	{
	case DLL_PROCESS_ATTACH:
	case DLL_THREAD_ATTACH:
	case DLL_THREAD_DETACH:
	case DLL_PROCESS_DETACH:
		break;
	}
	return TRUE;
}

//this function is used to setup CUDA
void kaynnistaCUDA(unsigned int& readGLTexture, void* &cReadCudaResource, unsigned int& viewGLTexture, void* &cViewCudaResource) {

	struct cudaGraphicsResource* viewCudaResource;
	struct cudaGraphicsResource* readCudaResource;
	cudaError cVirhe;

	cudaGLSetGLDevice(0);
	cVirhe=cudaGraphicsGLRegisterImage(&viewCudaResource, viewGLTexture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
	cVirhe=cudaGraphicsGLRegisterImage(&readCudaResource, readGLTexture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore);
	cReadCudaResource=reinterpret_cast<void*>(readCudaResource);
	cViewCudaResource=reinterpret_cast<void*>(viewCudaResource);
}    

//this function is used to draw texture image via CUDA
void piirraKuva(void* &cReadCudaResource, void* &cViewCudaResource) {

	cudaError cVirhe;

	struct cudaGraphicsResource* viewCudaResource=reinterpret_cast<cudaGraphicsResource*>(cReadCudaResource);
	struct cudaGraphicsResource* readCudaResource=reinterpret_cast<cudaGraphicsResource*>(cViewCudaResource);
	cudaArray *readCudaArray;
	cudaArray *viewCudaArray;

	cVirhe=cudaGraphicsMapResources(1, &readCudaResource);
	cVirhe=cudaGraphicsMapResources(1, &viewCudaResource);
	cVirhe=cudaGraphicsSubResourceGetMappedArray(&readCudaArray, readCudaResource, 0, 0);
	cVirhe=cudaGraphicsSubResourceGetMappedArray(&viewCudaArray, viewCudaResource, 0, 0);
	callCUDAKernel(readCudaArray, viewCudaArray);
	cudaGraphicsUnmapResources(1, &viewCudaResource);
	cudaStreamSynchronize(0);
}

ohjelmat.cu:

#include "stdafx.h"
#include "ohjelmat.h"

texture<uchar4, cudaTextureType2D, cudaReadModeElementType> readCudaTextureObject;
surface<void, cudaSurfaceType2D> viewCudaSurfaceObject;


__global__ void renderingKernel() {

	unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
	unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

	uchar4 dd=tex2D(readCudaTextureObject, x, y);

	surf2Dwrite(dd, viewCudaSurfaceObject, x*sizeof(dd), y, cudaBoundaryModeZero);
}


void callCUDAKernel(cudaArray *readCudaArray, cudaArray *viewCudaArray) {

	cudaError cVirhe;

	cVirhe=cudaBindTextureToArray(readCudaTextureObject, readCudaArray);
	cVirhe=cudaBindSurfaceToArray(viewCudaSurfaceObject, viewCudaArray);
	dim3 block(256, 1, 1);
	dim3 grid(2, 512, 1);
	renderingKernel<<<grid, block>>>();
	cudaPeekAtLastError();
	cudaDeviceSynchronize();
}

PS. Don't worry about the lataaTekstuuri(const wchar_t* tiedosto, GLuint& numero) function; I know it works. I have also set the paths to the CUDA headers, libraries, sources and binaries, added cudart.lib to the additional dependencies, and set the code generation to compute_20,sm_21.