convert ARGB (RGB32) to YUV

I am trying to encode raw ARGB camera video with the NVENC SDK 8.0. My project is built in Visual Studio 2013 and linked against CUDA Toolkit 8.0. My issue is that I am having trouble converting ARGB to YUV, which is needed for encoding. I am trying to use the NPP library, as you can see in the following code, but it does not work (the compressed output video is an empty green image). Can someone please help me understand what is wrong with the code? It would be even better to get a suggestion for doing this conversion with the NVIDIA SDK or CUDA (what I actually need is to convert ARGB to NV12).

// Converts one packed 8-bit, 4-channel host frame into three planar 4:2:0
// planes with NPP (nppiRGBToYCrCb420_8u_AC4P3R) and copies the planes back to
// host memory.
//
// pRGB   - host pointer to the source frame: height rows of width 4-byte
//          pixels, rows tightly packed (width*4 bytes per row).
// yuv    - three host destination planes, rows tightly packed. yuv[0] receives
//          width*height bytes; yuv[1] and yuv[2] each receive
//          (width/2)*(height/2) bytes.
// width  - frame width in pixels; must be positive and even.
// height - frame height in pixels; must be positive and even.
//
// Returns NV_ENC_SUCCESS when all three planes were written, otherwise
// NV_ENC_ERR_GENERIC (invalid argument, allocation failure, copy failure or
// NPP conversion failure). All device memory is released before returning.
//
// NOTE(review): despite the function name, the result is three separate
// planes, not NV12 (which has a single interleaved chroma plane) - confirm
// what the caller expects.
// NOTE(review): the NPP "RGB ... AC4" variant presumably expects R,G,B,A byte
// order; a capture buffer laid out as B,G,R,A may need the BGR variant -
// verify against the camera format.
NVENCSTATUS CNvEncoderLowLatency::ConvertRGBToNV12NPP(unsigned char *pRGB, unsigned char *yuv[3],int width, int height)
{
	if (pRGB == NULL || yuv == NULL || yuv[0] == NULL || yuv[1] == NULL || yuv[2] == NULL)
		return NV_ENC_ERR_GENERIC;

	// 4:2:0 chroma planes are half resolution in both directions.
	if (width <= 0 || height <= 0 || (width % 2) != 0 || (height % 2) != 0)
		return NV_ENC_ERR_GENERIC;

	const int    chromaWidth  = width / 2;
	const int    chromaHeight = height / 2;
	const size_t srcRowBytes  = (size_t)width * 4;

	NVENCSTATUS status = NV_ENC_ERR_GENERIC;

	// cudaMallocPitch writes a full size_t; a 32-bit variable behind a cast
	// would be overrun on 64-bit builds.
	size_t rgbPitch = 0;
	Npp8u* d_rgb    = NULL;

	int    d_steps[3] = { 0, 0, 0 };
	Npp8u* d_ptrs[3]  = { NULL, NULL, NULL };

	do
	{
		if (cudaMallocPitch((void**)&d_rgb, &rgbPitch, srcRowBytes, (size_t)height) != cudaSuccess)
			break;

		d_ptrs[0] = nppiMalloc_8u_C1(width, height, &d_steps[0]);
		d_ptrs[1] = nppiMalloc_8u_C1(chromaWidth, chromaHeight, &d_steps[1]);
		d_ptrs[2] = nppiMalloc_8u_C1(chromaWidth, chromaHeight, &d_steps[2]);
		if (d_ptrs[0] == NULL || d_ptrs[1] == NULL || d_ptrs[2] == NULL)
			break;

		// The device rows are rgbPitch bytes apart, the host rows srcRowBytes
		// apart, so the upload has to be a 2D copy.
		if (cudaMemcpy2D(d_rgb, rgbPitch, pRGB, srcRowBytes, srcRowBytes, (size_t)height,
		                 cudaMemcpyHostToDevice) != cudaSuccess)
			break;

		// The ROI is expressed in pixels, not bytes.
		NppiSize ROI = { width, height };

		if (nppiRGBToYCrCb420_8u_AC4P3R(d_rgb, (int)rgbPitch, d_ptrs, d_steps, ROI) != NPP_SUCCESS)
			break;

		// nppiMalloc may pad rows as well; strip the padding while downloading.
		if (cudaMemcpy2D(yuv[0], (size_t)width, d_ptrs[0], (size_t)d_steps[0],
		                 (size_t)width, (size_t)height, cudaMemcpyDeviceToHost) != cudaSuccess)
			break;
		if (cudaMemcpy2D(yuv[1], (size_t)chromaWidth, d_ptrs[1], (size_t)d_steps[1],
		                 (size_t)chromaWidth, (size_t)chromaHeight, cudaMemcpyDeviceToHost) != cudaSuccess)
			break;
		if (cudaMemcpy2D(yuv[2], (size_t)chromaWidth, d_ptrs[2], (size_t)d_steps[2],
		                 (size_t)chromaWidth, (size_t)chromaHeight, cudaMemcpyDeviceToHost) != cudaSuccess)
			break;

		status = NV_ENC_SUCCESS;
	} while (false);

	// Release everything that was allocated, on success and on failure alike.
	for (int plane = 0; plane < 3; ++plane)
	{
		if (d_ptrs[plane] != NULL)
			nppiFree(d_ptrs[plane]);
	}
	if (d_rgb != NULL)
		cudaFree(d_rgb);

	return status;
}

Why are you not using the built-in API in NVENC which accepts RGB inputs?

Do you mean using NV_ENC_BUFFER_FORMAT_ARGB as EncodeInputBuffer.bufferFmt?
Thanks.

Yes.

OK, but do I need to copy the frames I grabbed from the camera to CUDA memory using cuMemcpy (the source being the RGBA frame buffer and the destination a CUdeviceptr) before encoding?
Must the RGBA frame buffer be allocated with cuMemAllocHost only, and do I need to worry about switching contexts with cuCtxPushCurrent and cuCtxPopCurrent?

Should the code be something like this?

cuMemAllocPitch( CUdeviceptr , pPitch …)

cuCtxPushCurrent(cuCtx)
cuMemAllocHost frame grabber buffer
cuCtxPopCurrent(&cuCtx)

capture frame
cuMemcpy frame (pitch w*4) to destination CUdeviceptr , pPitch

encode

thanks for your help

Take a look at CNvEncoderCudaInterop::ConvertYUVToNV12(…) in NvEncoderCudaInterop.cpp of the SDK. It does something very similar (YUV to NV12); the only difference in your case is that you need a simple copy instead of any processing.

In general, you would do something like:

cuCtxPushCurrent(cuCtx)
cuMemAllocPitch( CUdeviceptr , pPitch …)
cuMemAllocHost frame grabber buffer
capture frame
cuMemcpy frame (pitch w*4) to destination CUdeviceptr , pPitch
cuCtxPopCurrent(&cuCtx)

encode

Make sure that the encoder is called after you pop the context, since the encoder internally pushes it again.

Hope this helps.

I did exactly as you suggested, and the H.264-encoded frames look very stretched and green.

Why? What am I missing? :(

I am working on a GTX 965M, driver 384.76, NVENC SDK 8, Windows 7 64-bit.

Here are the detailed steps you should follow:

#1 Init Stage
a)Create the cuda context

cuCtxCreate();

b)Allocate host memory for holding the RGB data from camera using

cuMemAllocHost();

c)Allocate device memory for sending the data to encoder using

cuMemAllocPitch();

d)Pop the current ctx from host thread

cuCtxPopCurrent();

#2 Encoder Init
a)Create the encoder and pass the cuda context created in 1a
b)Register the RGB device memory created in 1c

#3 Copy the RGB frames to GPU

cuCtxPushCurrent(); // push the cuda context created in step 1a
cuMemcpy2D(); // host to device
cuCtxPopCurrent(); // pop the current context before calling the encoder

Pass the RGB frame to encoder using EncodeFrame api

#4 Destroy the encoder and cuda resources

cuCtxPushCurrent();
cuMemFree();
cuMemFreeHost();
cuCtxPopCurrent();
cuCtxDestroy();

Are you doing this and still seeing problems?

Thanks for your good advice.
Yes, I am doing the same.
It is getting better, but it is still not perfect:
now I get a good-quality image, but it is divided into four 480x1080 pictures instead of one 1920x1080 picture.

This is the current implementation, based on the NvEncoderCudaInterop sample (SDK 8):

InitCuda : cuCtxCreate(&m_cuContext, 0, cuDevice));

m_pNvHWEncoder->Initialize : NvEncOpenEncodeSessionEx(m_cuContext, deviceType);

–InitCaptureDriver

m_pNvHWEncoder->CreateEncoder: m_pEncodeAPI->nvEncInitializeEncoder(m_hEncoder, &m_stCreateEncodeParams)

AllocateIOBuffers:
CCudaAutoLock cuLock(m_cuContext);
__cu(cuMemAllocHost((void **)&m_argb, uInputWidth*uInputHeight * 4)); //ARGB capture frame
__cu(cuMemAllocPitch(&m_stEncodeBuffer[i].stInputBfr.pNV12devPtr, (size_t *)&m_stEncodeBuffer[i].stInputBfr.uNV12Stride, uInputWidth, uInputHeight * 4, 16)); //ARGB device input

m_stEncodeBuffer[i].stInputBfr.bufferFmt = NV_ENC_BUFFER_FORMAT_ARGB;

m_pNvHWEncoder->NvEncRegisterResource(NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR, (void*)m_stEncodeBuffer[i].stInputBfr.pNV12devPtr,
uInputWidth, uInputHeight, m_stEncodeBuffer[i].stInputBfr.uNV12Stride, &m_stEncodeBuffer[i].stInputBfr.nvRegisteredResource, NV_ENC_BUFFER_FORMAT_ARGB);

m_pNvHWEncoder->NvEncCreateBitstreamBuffer(BITSTREAM_BUFFER_SIZE, &m_stEncodeBuffer[i].stOutputBfr.hBitstreamBuffer);

for (int frm = m_encodeConfig.startFrameIdx; frm <= m_encodeConfig.endFrameIdx; frm++)
{

– capture frame (m_argb)

cuCtxPushCurrent(m_cuContext);

	CUDA_MEMCPY2D copyParam;
	memset(&copyParam, 0, sizeof(copyParam));
	copyParam.dstMemoryType = CU_MEMORYTYPE_DEVICE;
	copyParam.dstDevice = pEncodeBuffer->stInputBfr.pNV12devPtr;
	copyParam.dstPitch = pEncodeBuffer->stInputBfr.uNV12Stride;
	copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
	copyParam.srcHost = m_argb;
	copyParam.srcPitch = m_encodeConfig.width * 4;
	copyParam.WidthInBytes = m_encodeConfig.width;
	copyParam.Height = m_encodeConfig.height;
	__cu(cuMemcpy2D(&copyParam));

cuCtxPopCurrent(&m_cuContext);

m_pNvHWEncoder->NvEncMapInputResource(pEncodeBuffer->stInputBfr.nvRegisteredResource, &pEncodeBuffer->stInputBfr.hInputSurface);

m_pNvHWEncoder->NvEncEncodeFrame(pEncodeBuffer, NULL, m_encodeConfig.width, m_encodeConfig.height);

m_pNvHWEncoder->ProcessOutput(pEncodeBuffer);

m_pNvHWEncoder->NvEncUnmapInputResource(pEncodeBuffer->stInputBfr.hInputSurface);

1. __cu(cuMemAllocPitch(&m_stEncodeBuffer[i].stInputBfr.pNV12devPtr, (size_t *)&m_stEncodeBuffer[i].stInputBfr.uNV12Stride, uInputWidth, uInputHeight * 4, 16)); //ARGB device input
----->
__cu(cuMemAllocPitch(&m_stEncodeBuffer[i].stInputBfr.pNV12devPtr, (size_t *)&m_stEncodeBuffer[i].stInputBfr.uNV12Stride, uInputWidth * 4, uInputHeight, 4)); //ARGB device input
Why is uNV12Stride 2*width when the element size is 4 bytes? It should be 4*width.

2.copyParam.WidthInBytes = m_encodeConfig.width;
------>
copyParam.WidthInBytes = m_encodeConfig.width*4;

you will get the whole picture!

Also
copyParam.WidthInBytes = m_encodeConfig.width *4
Cheers!

Looking at your code, I see that you got the ROI wrong. The ROI specifies the region of the image you want to convert, in pixels: from 0 to width and from 0 to height. It is not a pitch, so it is not measured in bytes. But your code is:

NppiSize ROI = { width*4, height };

You should change to:

NppiSize ROI = { width, height };

Here is my code for converting from RGB or RGBA to YUV420p:

void convert_rgb_to_yuv420(const unsigned char* rgb, unsigned width, unsigned height, unsigned channels, unsigned char* yuv) {
	//
	Npp8u* srcMemory;
	Npp8u* dstMemory;

	if (!rgb || !yuv || (width <= 0) || (height <= 0) || (channels < 3) || (channels > 4))
		return;

	cudaMalloc((void **)&dstMemory, width*height*3/2);
	if (!dstMemory)
		return;

	cudaMalloc((void **)&srcMemory, width*height*channels);
	if (!srcMemory) {
		cudaFree(dstMemory);
		return;
	}

	cudaMemcpy(srcMemory, rgb, width*height*channels, cudaMemcpyHostToDevice);
	NppiSize roi = {width, height};
	Npp8u* dstBuff[3] = {dstMemory, dstMemory + width*height, dstMemory + width*height*5/4 };
	int dstSteps[3] = {width, width/2, width/2};

	if (channels ==3) {
		nppiRGBToYCbCr420_8u_C3P3R(srcMemory, width*channels, dstBuff, dstSteps, roi);
	} else {
		nppiRGBToYCrCb420_8u_AC4P3R(srcMemory, width*channels, dstBuff, dstSteps, roi);
	}

	cudaMemcpy(yuv, dstMemory, width*height*3/2, cudaMemcpyDeviceToHost);
	cudaFree(dstMemory);
	cudaFree(srcMemory);
}