CUDA -OpenGL Interop

Hi All!

I’ve got a problem with using renderbuffer in CUDA kernel. My task is:

  1. Render image in some position to renderbuffer.

  2. Get image and process it pixel-by-pixel.

So I create and bind all buffers:

rbo = new GLRBO(GL_RGBA, mesh->size.Max[0] - mesh->size.Min[0], mesh->size.Max[1] - mesh->size.Min[1]);

    fbo = new GLFBO(GL_COLOR_ATTACHMENT0, rbo->GetId());

    vao = new GLVAO(1, &BufferIds[0]);

    vao->Bind();

    vbo = new GLVBO(2, &BufferIds[1]);

    vbo->Bind(GL_ARRAY_BUFFER);

    vbo->SetData(GL_ARRAY_BUFFER, sizeof(GLVertex)*(mesh->nVertices+1), vertices, GL_DYNAMIC_DRAW);

    vbo->Bind(GL_ELEMENT_ARRAY_BUFFER);

    vbo->SetData(GL_ELEMENT_ARRAY_BUFFER, sizeof(GLuint)*mesh->nElements, indices, GL_DYNAMIC_DRAW);

    vao->Unbind();

    //register in CUDA

cudaSetDevice(0);

cudaGLSetGLDevice(0)

    cudaGraphicsResource* resource;

    cudaGraphicsGLRegisterImage(&resource, rbo->GetId(), GL_RENDERBUFFER, cudaGraphicsMapFlagsReadOnly);

cudaGraphicsMapResources(1, &resource, 0);

// ... Render image

void GLMain::_RenderScene()

{

    ++FrameCount;

    //render at maximum resolution to render buffer

    glViewport(0, 0, rboWidth, rboHeight);

if(!bDone)

    {

        fbo->Bind();

        glClearColor(0.0, 0.0, 0.0, 1.0);

        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

        DrawMesh();

        fbo->BindReadDraw();

        cudaArray* array;

        cudaGraphicsSubResourceGetMappedArray(&array, resource, 0, 0);

        size_t size = sizeof(float)*rboWidth*rboHeight;

        float* initData = (float*)malloc(size*4);

        memset(initData, 0, size*4);

        ProcessImage(ar, initData, rboWidth, rboHeight);

    }

//Render to screen

}

//...... Process_Image.cu

texture <float4, cudaTextureType2D, cudaReadModeElementType> tex;

extern "C" void ProcessImage(cudaArray* in, float* _outAngle, float* _outIntence, float* _inData, int w, int h )

{   

    cudaBindTextureToArray(tex, in);

    size_t size = sizeof(float)*w*h;

    float* inData;

    cudaMalloc(&inData, size*4);

    dim3 threads(16, 16);

    dim3 blocks(w/threads.x, h/threads.y);

    cuProcessKernel<<<blocks, threads>>>(inData, w, h);

    cudaThreadSynchronize();

    cudaUnbindTexture(tex);

    cudaMemcpy(_inData, inData, size*4, cudaMemcpyDeviceToHost);

    cudaFree(outAngle);

    cudaFree(outIntence);

}

//...Kernel

__global__ void cuProcessKernel(float* initialData, int w, int h)

{

    int	x = blockIdx.x*blockDim.x + threadIdx.x;

    int	y = blockIdx.y*blockDim.y + threadIdx.y;

    int	idx = y * w + x;

    float4 pix = tex2D(tex, x, y);

    initialData[idx*4] = pix.x;

    initialData[idx*4+1] = pix.y;

    initialData[idx*4+2] = pix.z;

    initialData[idx*4+3] = pix.w;

}

So I’m simply trying to get per-pixel image in float4 format. But instead I get some garbage in the output array. When I make all white image it returns incorrect values. Could someone tell me where is my mistake ?

System: OS Win 7 x64 SP1, Core i7 2600/GTX570/16Gb, CUDA Toolkit 4.2, devdriver_4.2_winvista-win7_64_301.32_general.