Hi,
I’m coding a FreeFrame plugin for a host application (VVVV → http://vvvv.org). I want to use CUDA in my plugin source code, e.g. to perform a simple background subtraction (subtracting a reference image from the current camera frame). With the simple code below, I can see in the host application’s render window that the reference background image is subtracted from the original camera image. But there are heavy tearing artifacts I cannot explain. Any suggestions on how to fix this? (Switching VSync on in the NVIDIA control panel, category “Global 3D Settings”, didn’t fix the problem.)
My system:
Windows XP Home SP3
Intel Core 2 Duo (2.00 GHz)
2GB RAM
Microsoft Visual Studio 2008 Standard Edition
GeForce 8600GT, 512MB
graphics driver: CUDA 180.60 beta
SDK: NVIDIA_SDK10_CUDA_2.10.1126.1520.exe
monitor resolution: 1280x1024, 32 bit @ 85 Hz
Here is a code snippet:
[codebox]////// File main.cpp ////////
…
//pFrame contains the current camera frame. The VVVV host application indirectly calls this method (which is part of the plugin DLL). The modified data in pFrame is then rendered by the host application (using DirectX).
DWORD plugClass::processFrame24Bit(LPVOID pFrame)
{
…
EnterCriticalSection(&CriticalSection);
…
//pSrc, pDest, m_pBackgroundReference are OpenCV images of type IplImage*
processBackgroundSubtraction((unsigned char*)pSrc->imageData, (unsigned char*)pDest->imageData,
(unsigned char*)m_pBackgroundReference->imageData, numImageDataElements, buffer);
…
LeaveCriticalSection(&CriticalSection);
…
}
//////////////////////////////[/codebox]
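One tearing hypothesis worth ruling out on the host side first: if VVVV renders pFrame on another thread while processFrame24Bit() is still writing into it, the render can pick up a half-updated frame. Below is a minimal sketch (my assumption, not the original code): the CUDA result is finished in the private pDest buffer and only then published to pFrame in a single memcpy. FF_SUCCESS is the usual FreeFrame return value; adjust to your plugin skeleton.
[codebox]//sketch (hypothetical): publish the finished result to pFrame in one block copy,
//so the host never renders a partially processed frame
DWORD plugClass::processFrame24Bit(LPVOID pFrame)
{
EnterCriticalSection(&CriticalSection);
//... (fill pSrc from pFrame as in the original code) ...
//run the CUDA pass into the private pDest buffer first...
processBackgroundSubtraction((unsigned char*)pSrc->imageData, (unsigned char*)pDest->imageData,
(unsigned char*)m_pBackgroundReference->imageData, numImageDataElements, buffer);
//...then copy the complete frame to the host-visible buffer in one go
memcpy(pFrame, pDest->imageData, numImageDataElements);
LeaveCriticalSection(&CriticalSection);
return FF_SUCCESS;
}[/codebox]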
[codebox]/////// File BackgroundSubtraction.cu ////////////
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <string.h> //for strcpy()/strcat() used in the error reporting below
__global__ void backgroundSubtractionOnDevice(unsigned char* current, const unsigned char* reference, int count)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < count)
{
int value = current[idx] - reference[idx];
current[idx] = (value < 0) ? 0 : (unsigned char) value;
}
}
//TODO:
//resolve tearing artifacts (sync issue? see the notes after this listing)
//optimise:
//1) upload the reference image only when it changes (see the sketch below)
//2) allocate/deallocate GPU memory only when the input image format (width, height, or depth) changes (see the sketch below)
//3) what are the best values for blockSize and numBlocks?
//4) use shared memory for faster data access
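//--- sketch (not part of the original file): one hedged way to tackle TODO items 1) and 2).
//--- Assumes hypothetical module-level state; processBackgroundSubtraction() below could then
//--- reuse s_currentDev/s_referenceDev instead of calling cudaMalloc()/cudaFree() every frame.
static unsigned char* s_currentDev = 0;
static unsigned char* s_referenceDev = 0;
static size_t s_allocatedSize = 0;
static bool s_referenceDirty = true; //hypothetical flag: set by the host code whenever the reference image changes
static void ensureDeviceBuffers(size_t size, const unsigned char* pReference)
{
if (size != s_allocatedSize)
{
//image format changed: re-allocate the device buffers (cudaFree(0) is a safe no-op)
cudaFree(s_currentDev);
cudaFree(s_referenceDev);
cudaMalloc((void **) &s_currentDev, size);
cudaMalloc((void **) &s_referenceDev, size);
s_allocatedSize = size;
s_referenceDirty = true;
}
if (s_referenceDirty)
{
//upload the reference image only when it actually changed
cudaMemcpy(s_referenceDev, pReference, size, cudaMemcpyHostToDevice);
s_referenceDirty = false;
}
}
//--- end sketch ---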
extern "C"
void processBackgroundSubtraction(const unsigned char* pSrc, unsigned char* pDest,
const unsigned char* pBackgroundReference, int count, char* error)
{
unsigned char* current_dev;
unsigned char* reference_dev;
size_t size = count * sizeof(unsigned char);
//upload to GPU
cudaMalloc((void **) &current_dev, size);
cudaMalloc((void **) &reference_dev, size);
cudaMemcpy(current_dev, pSrc, size, cudaMemcpyHostToDevice);
cudaMemcpy(reference_dev, pBackgroundReference, size, cudaMemcpyHostToDevice);
//background subtraction on GPU
int blockSize = 128;
int numBlocks = (count + blockSize - 1) / blockSize; //integer ceiling division, no <math.h> needed
backgroundSubtractionOnDevice <<< numBlocks, blockSize >>> (current_dev, reference_dev, count);
//download from GPU
cudaMemcpy(pDest, current_dev, size, cudaMemcpyDeviceToHost);
cudaFree(current_dev);
cudaFree(reference_dev);
//report an error if any of the preceding CUDA calls failed (cudaGetLastError() returns the last error from any call)
cudaError_t cudaErrorCode = cudaGetLastError();
if (cudaErrorCode != cudaSuccess)
{
strcpy(error, "CUDA processBackgroundSubtraction() failed, error = ");
char cNumber[128];
sprintf(cNumber, "%d", (int)cudaErrorCode); //sprintf() instead of the non-standard itoa()
strcat(error, cNumber);
}
}
///////////////////////////////////////////////////////[/codebox]
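On the sync question in the TODO list: kernel launches are asynchronous, but the blocking cudaMemcpy() that follows the launch waits until the kernel has finished, so pDest is already complete when processBackgroundSubtraction() returns; the tearing is therefore more likely a host-side presentation issue than a missing device sync. To verify, here is a minimal sketch (CUDA 2.x runtime API; the CHECK macro is my own helper, not part of the SDK) that makes the synchronization explicit and reports the first failing call instead of checking only once at the end. On TODO items 3) and 4): for a memory-bound kernel like this, a blockSize of 128-256 (a multiple of 32) is a sensible default, and shared memory is unlikely to help because every byte is read exactly once.
[codebox]#include <cuda_runtime.h>
#include <stdio.h>
//hypothetical helper macro, not part of the CUDA SDK
#define CHECK(call) do { \
cudaError_t e = (call); \
if (e != cudaSuccess) \
fprintf(stderr, "CUDA error %d (%s) at %s:%d\n", \
(int)e, cudaGetErrorString(e), __FILE__, __LINE__); \
} while (0)
//usage inside processBackgroundSubtraction():
//	CHECK(cudaMemcpy(current_dev, pSrc, size, cudaMemcpyHostToDevice));
//	backgroundSubtractionOnDevice<<<numBlocks, blockSize>>>(current_dev, reference_dev, count);
//	CHECK(cudaGetLastError());      //catches launch configuration errors
//	CHECK(cudaThreadSynchronize()); //CUDA 2.x API: waits for the kernel, surfaces runtime faults
//	CHECK(cudaMemcpy(pDest, current_dev, size, cudaMemcpyDeviceToHost));[/codebox]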