Hi AastaLLL,
Yes, a memory copy is taking place. Please find the code snippets below:
Code snippet using zero-copy memory:
// Zero-copy setup: the current-frame buffer and the output buffer are each
// allocated as mapped host memory (cudaHostAllocMapped), and a device pointer
// is then obtained for each one with cudaHostGetDevicePointer.
// NOTE(review): 2048 is presumably the padded row pitch of a 1920-pixel-wide
// plane — confirm it matches params.pitch[0], which sizes the copy in the loop.
int fsize1 = 2048 * 1080 ; // for 1920x1080 resolution
// Assigned inside the frame loop from mmap() of the NvBuffer fd.
char *m_datamem;
// Host pointer and device pointer for the current-frame buffer.
char *m_hcurrframe;
char *m_dcurrframe;
// Host pointer and device pointer for the buffer passed as the first argument
// to diff10() — presumably its output; confirm against diff10's definition.
char *m_hcudaout;
char *m_dcudaout;
// Each call's return code is stored in an int; none of them is checked in this snippet.
int alloc1 = cudaHostAlloc((void **)&m_hcurrframe, fsize1, cudaHostAllocMapped);
int getPtr1 = cudaHostGetDevicePointer((void **)&m_dcurrframe, (void *) m_hcurrframe, 0);
int alloc2 = cudaHostAlloc((void **)&m_hcudaout, fsize1, cudaHostAllocMapped);
int getPtr2 = cudaHostGetDevicePointer((void **)&m_dcudaout, (void *) m_hcudaout, 0);
while (true)
{
// Acquire a Frame.
UniqueObj<Frame> frame(iFrameConsumer->acquireFrame());
IFrame *iFrame = interface_cast<IFrame>(frame);
if (!iFrame)
break;
// Get the Frame's Image.
Image *image = iFrame->getImage();
EGLStream::NV::IImageNativeBuffer *iImageNativeBuffer
= interface_cast<EGLStream::NV::IImageNativeBuffer>(image);
TEST_ERROR_RETURN(!iImageNativeBuffer, "Failed to create an IImageNativeBuffer");
int fd = iImageNativeBuffer->createNvBuffer(Argus::Size {m_framesize.width, m_framesize.height},
NvBufferColorFormat_YUV420, NvBufferLayout_Pitch, &status);
if (status != STATUS_OK)
TEST_ERROR_RETURN(status != STATUS_OK, "Failed to create a native buffer");
#if 1
cudaSetDeviceFlags(cudaDeviceMapHost);
NvBufferParams params;
NvBufferGetParams(fd, &params);
cout<<"params.pitch[0] : "<< params.pitch[0] <<endl;
cout<<"params.offset[0] : "<< params.offset[0] <<endl;
int fsize = params.pitch[0] * m_framesize.height ;
m_datamem = (char *)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, params.offset[0]);
struct timeval tp1;
gettimeofday(&tp1, NULL);
long start1 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;
int copy1 = cudaMemcpy (m_dcurrframe,m_datamem,fsize,cudaMemcpyHostToDevice) ;
cout<<endl<<"copy1 : "<<copy1 <<endl;
float timediff = diff10(m_dcudaout,m_dcurrframe, m_framesize.width, m_framesize.height,params.pitch[0]);
printf("Finished diff operation after %f ms.\n", timediff);
cudaDeviceSynchronize();
gettimeofday(&tp1, NULL);
long end1 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;
long deltaT = end1 - start1;
cout<< "Time Taken : " << deltaT <<" ms"<<endl;
Code snippet using unified memory:
// Unified-memory setup: the current-frame buffer is allocated with
// cudaMallocManaged, while the output buffer is still allocated as mapped host
// memory (cudaHostAllocMapped) with a device pointer obtained for it.
// NOTE(review): 2048 is presumably the padded row pitch of a 1920-pixel-wide
// plane — confirm it matches params.pitch[0], which sizes the copy in the loop.
int fsize1 = 2048 * 1080 ; // for 1920x1080 resolution
// Assigned inside the frame loop from mmap() of the NvBuffer fd.
char *m_datamem;
// Declared but never assigned or read in this snippet.
char *m_hcurrframe;
// Managed pointer for the current-frame buffer.
char *m_dcurrframe;
// Host pointer and device pointer for the buffer passed as the first argument
// to diff10() — presumably its output; confirm against diff10's definition.
char *m_hcudaout;
char *m_dcudaout;
// Each call's return code is stored in an int; none of them is checked in this snippet.
int alloc1 = cudaMallocManaged(&m_dcurrframe, fsize1);
int alloc2 = cudaHostAlloc((void **)&m_hcudaout, fsize1, cudaHostAllocMapped);
int getPtr2 = cudaHostGetDevicePointer((void **)&m_dcudaout, (void *) m_hcudaout, 0);
while (true)
{
// Acquire a Frame.
UniqueObj<Frame> frame(iFrameConsumer->acquireFrame());
IFrame *iFrame = interface_cast<IFrame>(frame);
if (!iFrame)
break;
// Get the Frame's Image.
Image *image = iFrame->getImage();
EGLStream::NV::IImageNativeBuffer *iImageNativeBuffer
= interface_cast<EGLStream::NV::IImageNativeBuffer>(image);
TEST_ERROR_RETURN(!iImageNativeBuffer, "Failed to create an IImageNativeBuffer");
int fd = iImageNativeBuffer->createNvBuffer(Argus::Size {m_framesize.width, m_framesize.height},
NvBufferColorFormat_YUV420, NvBufferLayout_Pitch, &status);
if (status != STATUS_OK)
TEST_ERROR_RETURN(status != STATUS_OK, "Failed to create a native buffer");
#if 1
cudaSetDeviceFlags(cudaDeviceMapHost);
NvBufferParams params;
NvBufferGetParams(fd, &params);
cout<<"params.pitch[0] : "<< params.pitch[0] <<endl;
cout<<"params.offset[0] : "<< params.offset[0] <<endl;
int fsize = params.pitch[0] * m_framesize.height ;
m_datamem = (char *)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, params.offset[0]);
struct timeval tp1;
gettimeofday(&tp1, NULL);
long start1 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;
int copy1 = cudaMemcpy (m_dcurrframe,m_datamem,fsize,cudaMemcpyHostToDevice) ;
cout<<endl<<"copy1 : "<<copy1 <<endl;
float timediff = diff10(m_dcudaout,m_dcurrframe, m_framesize.width, m_framesize.height,params.pitch[0]);
printf("Finished diff operation after %f ms.\n", timediff);
cudaDeviceSynchronize();
gettimeofday(&tp1, NULL);
long end1 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;
long deltaT = end1 - start1;
cout<< "Time Taken : " << deltaT <<" ms"<<endl;
The time taken by the CUDA kernel when memory is allocated using cudaMallocManaged (unified memory) is usually more than the time taken when it is allocated using cudaHostAlloc (zero-copy memory).
Thanks.