Hello.
When I run the CUDA histogram sample of the Argus lib at 4K analysis, the Argus daemon service consumes too much memory - about 1 GB - at the time of submitting the first request. The problem is getting bigger for me because I need 3 parallel CUDA consumers for three different sensors, which run the same code. So again, for every thread, the Argus daemon service consumes about 1 GB of memory at the time of the first request. And then I am getting a segmentation fault due to lack of memory. I don't know if this is a normal situation for the Argus daemon, and if so, what I have to do in order to fix this. Here is the code that every thread executes:
Initialization:
/**
 * Per-thread initialization: brings up a CUDA context, creates an Argus
 * output stream, connects CUDA as an EGLStream consumer, and builds the
 * capture request used by acquireFrame().
 *
 * @return true on success; false on any initialization failure (the thread
 *         must not proceed to capture in that case).
 */
bool CudaConsumerThread::threadInitialize()
{
    CUresult cuResult;
    threadId = pthread_self();

    // Global variables
    g_cudaContext = 0;
    if (!initCUDA(&g_cudaContext)) {
        ACQUISITION_ERROR("Failed to initialize CUDA");
        return false;
    }

    ACQUISITION_PRINT("Creating stream settings");
    this->streamSettings = UniqueObj<OutputStreamSettings>(
        m_iCaptureSession->createOutputStreamSettings());
    this->iStreamSettings = interface_cast<IOutputStreamSettings>(streamSettings);
    // Bug fix: the original silently continued with unconfigured (default)
    // stream settings when this cast failed; treat it as fatal instead.
    if (!this->iStreamSettings) {
        ACQUISITION_ERROR("Failed to create OutputStreamSettings");
        return false;
    }
    this->iStreamSettings->setPixelFormat(PIXEL_FMT_YCbCr_420_888);
    this->iStreamSettings->setResolution(m_streamSize);

    outputStream = UniqueObj<OutputStream>(
        m_iCaptureSession->createOutputStream(streamSettings.get()));
    iStream = interface_cast<IStream>(outputStream);
    // Bug fix: iStream was dereferenced below (getEGLStream) without a null
    // check; a failed createOutputStream would have crashed here.
    if (!iStream) {
        ACQUISITION_ERROR("Failed to create OutputStream");
        return false;
    }

    ACQUISITION_PRINT("Connecting CUDA to OutputStream as an EGLStream consumer");
    cuResult = cuEGLStreamConsumerConnect(&cudaConnection, iStream->getEGLStream());
    if (cuResult != CUDA_SUCCESS)
    {
        ACQUISITION_ERROR("Unable to connect CUDA as a consumer from EGLStream (CUresult "
                          + std::string(getCudaErrorString(cuResult)) + ").");
        return false;
    }

    ACQUISITION_PRINT("Create capture request");
    this->request = UniqueObj<Request>(m_iCaptureSession->createRequest());
    this->iRequest = interface_cast<IRequest>(request);
    // Bug fix: the original only logged this failure and then dereferenced a
    // null iRequest on the very next line — a guaranteed segfault. Bail out.
    if (!this->iRequest)
    {
        ACQUISITION_ERROR("Failed to create Request");
        return false;
    }
    this->iRequest->enableOutputStream(outputStream.get());

    return true;
}
Submit request:
bool CudaConsumerThread::acquireFrame()
{
/* Submit request */
struct timespec tstart={0,0}, tend={0,0};
double time;
clock_gettime(CLOCK_REALTIME, &tstart);
Argus::Status status;
const uint64_t ONE_SECOND = 1000000000;
uint32_t result = this->m_iCaptureSession->capture(request.get(), ONE_SECOND, &status);
if (result == 0)
{
ORIGINATE_ERROR("Failed to submit capture request (status %x)", status);
return false;
}
clock_gettime(CLOCK_REALTIME, &tend);
time = ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) -
((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec);
ACQUISITION_TIME("Capture time", time);
/* Acquire frame */
CUresult cuResult;
CUgraphicsResource cudaResource = 0;
CUstream cudaStream = 0;
cuResult = cuEGLStreamConsumerAcquireFrame(&cudaConnection, &cudaResource, &cudaStream, -1);
clock_gettime(CLOCK_REALTIME, &tstart);
if (cuResult != CUDA_SUCCESS)
{
ACQUISITION_ERROR("Unable to acquire an image frame from the EGLStream with CUDA as a consumer (CUresult " + std::string(getCudaErrorString(cuResult)) + ").");
return false;
}
clock_gettime(CLOCK_REALTIME, &tend);
time = ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) -
((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec);
ACQUISITION_TIME("Acquisition time", time);
/* Move frame to GPU */
clock_gettime(CLOCK_REALTIME, &tstart);
CUeglFrame cudaEGLFrame;
cuResult = cuGraphicsResourceGetMappedEglFrame(&cudaEGLFrame, cudaResource, 0, 0);
if (cuResult != CUDA_SUCCESS)
{
ACQUISITION_ERROR("Unable to get the CUDA EGL frame (CUresult "
+ std::string(getCudaErrorString(cuResult)) + ").");
return false;
}
// Print the information contained in the CUDA EGL frame structure.
PROPAGATE_ERROR(printCUDAEGLFrame(cudaEGLFrame));
if ((cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_PLANAR) &&
(cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR) &&
(cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_PLANAR) &&
(cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR))
{
ORIGINATE_ERROR("Only YUV color formats are supported");
}
if (cudaEGLFrame.cuFormat != CU_AD_FORMAT_UNSIGNED_INT8)
ORIGINATE_ERROR("Only 8-bit unsigned int formats are supported");
CUDA_RESOURCE_DESC cudaResourceDesc;
memset(&cudaResourceDesc, 0, sizeof(cudaResourceDesc));
cudaResourceDesc.resType = CU_RESOURCE_TYPE_ARRAY;
cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[0];
CUsurfObject cudaSurfObj = 0;
cuResult = cuSurfObjectCreate(&cudaSurfObj, &cudaResourceDesc);
if (cuResult != CUDA_SUCCESS)
{
ACQUISITION_PRINT("Unable to create the surface object \
(CUresult " + std::string(getCudaErrorString(cuResult)) + ").");
return false;
}
cuResult = cuSurfObjectDestroy(cudaSurfObj);
if (cuResult != CUDA_SUCCESS)
{
ACQUISITION_PRINT("Unable to destroy the surface object \
(CUresult " + std::string(getCudaErrorString(cuResult)) + ").");
}
cuResult = cuEGLStreamConsumerReleaseFrame(&cudaConnection, cudaResource, &cudaStream);
if (cuResult != CUDA_SUCCESS)
{
ACQUISITION_PRINT("Unable to release the last frame acquired from the EGLStream \
(CUresult " + std::string(getCudaErrorString(cuResult)) + ").");
return false;
}
return true;
}
Any help would be appreciated. Thank you.
Hi,
We are trying to reproduce this issue on our environment.
May I know the difference between your implementation and the cudaHistogram sample?
Could we reproduce this issue directly with the official sample?
Thanks.
Hi AastaLL,
The first part of my post concerns the official cudaHistogram sample itself:
When I run the cuda histogram sample of argus lib at 4k analysis, the argus daemon service consumes too much memory - about 1 GB - at the time of submitting the first request.
Do you observe the same behavior?
Hi,
We are checking this internally.
Will update information with you later.
Thanks.
Update:
I tried the sample on a different camera (IMX214; the previous one was the IMX377). The memory problem remains. So we can assume that this is not a driver issue.
Hi,
We try this with IMX274 4K input and the memory taken is 579Mb, far from 1G.
There is some difference between us. Could you also test the pure cudaHistogram on your environment?
More, could you check your application with nvprof and share the data with us?
[url]https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview[/url]
Thanks.
Hello and sorry for the delay.
I checked again the memory with the cudaHistogram sample and the memory taken is ~590MB. Still, the memory that the sample takes is too much.
I also checked the cudaHistogram with the nvprof tool with argument –print-gpu-trace and the results in the first frame are
Executing Argus Sample: argus_cudahistogram
Argus Version: 0.96.2 (multi-process)
Creating output stream
Initializing CUDA
==4332== NVPROF is profiling process 4332, command: ./argus_cudahistogram
==4332== Warning: Unified Memory Profiling is not supported on the underlying platform. System requirements for unified memory can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
Connecting CUDA to OutputStream as an EGLStream consumer
Submitting a capture request
Acquiring an image from the EGLStream
CUeglFrame:
width: 4096
height: 2304
depth: 0
pitch: 0
planeCount: 2
numChannels: 1
frameType: array
colorFormat: YUV420 semi-planar
cuFormat: uint8
Calculating histogram with 64 bins...
Finished after 2.443386 ms.
Result 0:
0: 197235 192853 199573 203184 206839 215397 225270 228707
8: 237797 256634 275134 291443 300190 305268 318036 334733
16: 321257 285956 262867 240898 202215 126716 68228 39064
24: 26959 23373 19118 17701 19318 16849 13873 13229
32: 14123 15619 15845 16957 19431 19697 19384 22744
40: 28023 34379 39993 42523 45603 52921 64098 83242
48: 110477 133518 152565 169731 176417 183695 198226 205360
56: 222309 238742 240651 244449 251456 247803 229733 211876
==4332== Profiling application: ./argus_cudahistogram
==4332== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
37.2977s 2.3288ms (16 16 1) (32 4 1) 13 256B 0B - - - - NVIDIA Tegra X1 1 7 void histogram_smem_atomics<int=1024, int=64>(unsigned __int64, unsigned int, unsigned int, unsigned int*) [121]
37.3000s 13.073us (2 1 1) (128 1 1) 28 0B 0B - - - - NVIDIA Tegra X1 1 7 void histogram_smem_accum<int=1024, int=64>(unsigned int const *, int, unsigned int*) [126]
37.3001s 1.3030us - - - - - 256B 187.37MB/s Device Pageable NVIDIA Tegra X1 1 7 [CUDA memcpy DtoH]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
And this is the output of the nvprof with no arguments
==4465== Profiling application: ./argus_cudahistogram
==4465== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 99.44% 2.5820ms 1 2.5820ms 2.5820ms 2.5820ms void histogram_smem_atomics<int=1024, int=64>(unsigned __int64, unsigned int, unsigned int, unsigned int*)
0.50% 12.917us 1 12.917us 12.917us 12.917us void histogram_smem_accum<int=1024, int=64>(unsigned int const *, int, unsigned int*)
0.06% 1.6140us 1 1.6140us 1.6140us 1.6140us [CUDA memcpy DtoH]
API calls: 59.70% 249.90ms 1 249.90ms 249.90ms 249.90ms cuEGLStreamConsumerAcquireFrame
39.09% 163.64ms 1 163.64ms 163.64ms 163.64ms cuCtxCreate
0.61% 2.5528ms 1 2.5528ms 2.5528ms 2.5528ms cudaEventSynchronize
0.36% 1.4899ms 2 744.93us 337.60us 1.1523ms cudaMalloc
0.06% 249.06us 2 124.53us 105.52us 143.54us cudaFree
0.04% 171.40us 2 85.701us 15.312us 156.09us cudaEventRecord
0.04% 170.31us 1 170.31us 170.31us 170.31us cuEGLStreamConsumerConnectWithFlags
0.03% 111.40us 2 55.701us 38.280us 73.123us cudaLaunch
0.02% 96.769us 1 96.769us 96.769us 96.769us cudaMemcpy
0.02% 89.061us 94 947ns 416ns 30.104us cuDeviceGetAttribute
0.01% 42.551us 1 42.551us 42.551us 42.551us cuSurfObjectCreate
0.00% 15.000us 1 15.000us 15.000us 15.000us cuSurfObjectDestroy
0.00% 12.760us 2 6.3800us 885ns 11.875us cudaConfigureCall
0.00% 11.563us 2 5.7810us 3.2300us 8.3330us cudaEventCreate
0.00% 10.313us 1 10.313us 10.313us 10.313us cuEGLStreamConsumerReleaseFrame
0.00% 10.000us 1 10.000us 10.000us 10.000us cuDeviceTotalMem
0.00% 9.9480us 7 1.4210us 364ns 6.4060us cudaSetupArgument
0.00% 9.7910us 2 4.8950us 2.1870us 7.6040us cudaEventDestroy
0.00% 5.8320us 4 1.4580us 885ns 2.9680us cuDeviceGetCount
0.00% 5.5200us 1 5.5200us 5.5200us 5.5200us cudaEventElapsedTime
0.00% 3.3850us 1 3.3850us 3.3850us 3.3850us cuGraphicsResourceGetMappedEglFrame
0.00% 2.8630us 3 954ns 729ns 1.0930us cuDeviceGet
0.00% 2.8120us 1 2.8120us 2.8120us 2.8120us cuInit
0.00% 2.1870us 1 2.1870us 2.1870us 2.1870us cuDriverGetVersion
0.00% 1.4580us 1 1.4580us 1.4580us 1.4580us cuDeviceGetName
FYI The cudaHistogram consumes
~= 1GB with imx377 in full resolution and JetPack 3.1
~= 590MB with imx214 in full resolution and JetPack 3.3
Hi,
We don’t have imx377 sensor.
Could you help us test it with JetPack3.3?
We would like to narrow down the issue is from sensor or package version.
Thanks.
Hi,
I am trying to understand the situation here. When I work without the cuda connection, everything is ok in 4k resolution, no memory consumption. The problem occurs when you connect the argus producer with the cuda consumer like the cudaHistogram sample does, whatever the sensor is. In my mind this can’t be a sensor issue.
However if you believe that any test in imx377 can help I am willing to help.
Hi,
After checking, this is a known problem.
Root cause is there are lots of buffers behind this sample.
We are discussing the possibility to lower the buffer amount.
Will update information with you if any progress.
Thanks.
Hi,
So currently, there is no way to use 3 cameras at 4K resolution with GPU processing on Jetson TX1. How do you propose to continue?
My plan was not only to use 3 cameras, but to go to 6 in the near future. If there is no solution soon I will have to redesign my system. Can you give me a timeline for fixing this bug?
Is there any chance to use Jetson TX2 for the project that I described ?
Hi,
It may help since TX2 have twice memory size.
We will check this internally and update information with you.
Thanks.
Hi,
TX1 doesn’t support (3x) 4K camera.
For your use-case, it’s recommended to use Jetson AGX Xavier, which supports (2x) 4Kp60.
[url]https://developer.nvidia.com/embedded/buy/jetson-xavier-devkit[/url]
Thanks.