Libargus crashing with CUDA-OpenGL interop

Hi, I have built a 6-camera CSI capture application based on the libargus samples, and it works well. I have also removed all the sample dependencies and build it separately with my own makefile. Now I would like to display the panorama generated with CUDA using a render application I built, which works fine on its own; it handles the CUDA-OpenGL interop using freeglut. However, I get a segmentation fault during the capture stage when I integrate it with libargus. I have checked the OpenGL box sample program, but that uses OpenGL ES and EGL. Any help would be appreciated. This is my makefile:

# Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# OS info
# Lower-cased kernel name as reported by uname (e.g. "linux").
OSLOWER := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")

# Machine architecture as reported by uname, with i386 normalised to i686.
OS_ARCH := $(shell uname -m | sed -e "s/i386/i686/")

# Take command line flags that override any of these settings
# (e.g. `make ARMv8=1`). Conditional bodies are indented with spaces, not
# tabs, so they can never be mistaken for recipe lines.
ifeq ($(i386),1)
  OS_ARCH := i686
endif

ifeq ($(x86_64),1)
  OS_ARCH := x86_64
endif

ifeq ($(ARMv7),1)
  OS_ARCH := armv7l
endif

ifeq ($(ARMv8),1)
  OS_ARCH := aarch64
endif

# Multiarch directory name used in the header and library search paths below.
# It was referenced but never assigned, which collapsed those paths to
# e.g. "/usr/lib//tegra". Override on the command line for other ABIs.
TEGRA_ARMABI ?= aarch64-linux-gnu

# Specify the logical root directory for headers and libraries.

# From JPEG Makefile
# Native builds on an aarch64 host use the host root; any other host is
# treated as a cross-compile and must supply TARGET_ROOTFS.
ifeq ($(shell uname -m), aarch64)
  TARGET_ROOTFS :=
else
  ifeq ($(TARGET_ROOTFS),)
    $(error Please specify the target rootfs path if you are cross-compiling)
  endif
endif

ifneq ($(TARGET_ROOTFS),)
CPPFLAGS += --sysroot=$(TARGET_ROOTFS)
LDFLAGS += \
	-Wl,-rpath-link=$(TARGET_ROOTFS)/lib/$(TEGRA_ARMABI) \
	-Wl,-rpath-link=$(TARGET_ROOTFS)/usr/lib/$(TEGRA_ARMABI) \
	-Wl,-rpath-link=$(TARGET_ROOTFS)/usr/lib/$(TEGRA_ARMABI)/tegra
endif

CPPFLAGS += \
	-I"$(TARGET_ROOTFS)/usr/include/$(TEGRA_ARMABI)" \
	-I"../../include"

LDFLAGS += \
	-L"$(TARGET_ROOTFS)/usr/lib/$(TEGRA_ARMABI)" \
	-L"$(TARGET_ROOTFS)/usr/lib/$(TEGRA_ARMABI)/tegra"

CXXFLAGS += -std=c++0x

ifneq ($(VIBRANTE_TOOLCHAIN_SYSROOT),)
  CCFLAGS += --sysroot="$(VIBRANTE_TOOLCHAIN_SYSROOT)"
endif

# Configuration-specific build flags
# `make dbg=1` selects an unoptimised build with debug info; TARGET also
# names the object sub-directory.
ifeq ($(dbg),1)
  CCFLAGS += -g
  TARGET := debug
else
  CCFLAGS += -O3 -DNDEBUG
  TARGET := release
endif


# pkg-config module describing the CUDA runtime. Override to build against a
# different toolkit version, e.g. `make CUDART_PKG=cudart-9.0`.
CUDART_PKG ?= cudart-8.0

EXTERNAL_CFLAGS := $(shell pkg-config --cflags $(CUDART_PKG))
EXTERNAL_LIBS := $(shell pkg-config --libs $(CUDART_PKG))

# Header search path: CUDA runtime first, then the project's own directories.
INCLUDES := $(EXTERNAL_CFLAGS)
INCLUDES += -I./include
INCLUDES += -I./utils
INCLUDES += -I./argus/include

# Link line. Order matters to the linker, so it is kept as it was.
LIBRARIES := -L"$(PKG_CONFIG_SYSROOT_DIR)/usr/lib"
LIBRARIES += -lpthread -lrt -lm -lv4l2 -lGL -lGLEW -lglut
LIBRARIES += /usr/lib/aarch64-linux-gnu/tegra/libargus.so

ifneq ($(VIBRANTE_TOOLCHAIN_SYSROOT),)
  LIBRARIES += -L"$(VIBRANTE_TOOLCHAIN_SYSROOT)/usr/lib"
endif

ifneq ($(PKG_CONFIG_SYSROOT_DIR),)
  ifeq ($(ARMv7),1)
    LIBRARIES += -Wl,-rpath-link="$(PKG_CONFIG_SYSROOT_DIR)/lib/arm-linux-gnueabihf"
    LIBRARIES += -Wl,-rpath-link="$(PKG_CONFIG_SYSROOT_DIR)/usr/lib"
    LIBRARIES += -Wl,-rpath-link="$(PKG_CONFIG_SYSROOT_DIR)/usr/lib/arm-linux-gnueabihf"
  endif
endif

LIBRARIES += /usr/lib/aarch64-linux-gnu/tegra/libcuda.so

LIBRARIES += $(EXTERNAL_LIBS)

# add CUDA to runtime path
# Only a leading "-L" is stripped from each word (the old $(subst) also
# removed "-L" occurring inside a path). One -rpath option is emitted per
# directory, and none at all when pkg-config reports nothing, instead of an
# empty "-Wl,-rpath=".
comma := ,
CUDA_LIB_PATH := $(strip $(patsubst -L%,%,$(shell pkg-config --libs-only-L $(CUDART_PKG))))
LDFLAGS += -Wl,--allow-shlib-undefined -pthread
LDFLAGS += $(addprefix -Wl$(comma)-rpath=,$(CUDA_LIB_PATH))

# show libraries used by linker in debug mode
ifeq ($(dbg),1)
  LDFLAGS += -Wl,--trace
endif

# Sources are discovered in the makefile's own directory only.
CPP_FILES := $(wildcard *.cpp)
C_FILES := $(wildcard *.c)
CU_FILES := $(wildcard *.cu)
OBJ_DIR := obj/$(TARGET)
# Pre-built utility objects; this glob is expanded by make where the variable
# is used as a prerequisite. It always points at the release build of utils,
# regardless of TARGET.
OBJ_FILES_UTILS := ./utils/obj/release/*.o

OBJ_FILES_CPP := $(addprefix $(OBJ_DIR)/,$(notdir $(CPP_FILES:.cpp=.o)))
OBJ_FILES_C := $(addprefix $(OBJ_DIR)/,$(notdir $(C_FILES:.c=.o)))
OBJ_FILES_CU := $(addprefix $(OBJ_DIR)/,$(notdir $(CU_FILES:.cu=.o)))

OUTPUT_DIR := ./
#OUTPUT_DIR := ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))

################################################################################

# Target rules

# CUDA compiler driver; override if nvcc is not on PATH
# (e.g. `make NVCC=/usr/local/cuda/bin/nvcc`).
NVCC ?= nvcc

# These names are commands, not files; without this declaration a stray file
# called "clean" or "run" would silently disable the target.
.PHONY: all build run clean cuda_clean

all: build

build: $(OUTPUT_DIR)/PanoStitch

# Object directory is an order-only prerequisite of every compile rule.
$(OBJ_DIR):
	mkdir -p $@

$(OBJ_DIR)/%.o: %.cpp | $(OBJ_DIR)
	$(CXX) $(INCLUDES) $(CCFLAGS) $(CXXFLAGS) -o $@ -c $<

$(OBJ_DIR)/%.o: %.c | $(OBJ_DIR)
	$(CC) $(INCLUDES) $(CCFLAGS) -std=c99 -o $@ -c $<

# CUDA_INCLUDE is not assigned in this makefile; it expands to nothing unless
# supplied on the command line or in the environment.
$(OBJ_DIR)/%.o: %.cu | $(OBJ_DIR)
	$(NVCC) -c $(CUDA_INCLUDE) $< -o $@

$(OUTPUT_DIR)/PanoStitch: $(OBJ_FILES_CPP) $(OBJ_FILES_C) $(OBJ_FILES_CU) $(OBJ_FILES_UTILS) | $(OUTPUT_DIR)
	$(CXX) $(LDFLAGS) $(CCFLAGS) $(CXXFLAGS) -o $@ $^ $(LIBRARIES)

$(OUTPUT_DIR):
	mkdir -p $@

# The binary path is used as-is: prefixing "./" broke absolute OUTPUT_DIR
# overrides.
run: build
	$(OUTPUT_DIR)/PanoStitch

# C/C++ objects and the binary. CUDA objects are removed separately by
# cuda_clean.
clean:
	$(RM) $(OBJ_FILES_CPP) $(OBJ_FILES_C)
	$(RM) $(OUTPUT_DIR)/PanoStitch

cuda_clean:
	$(RM) $(OBJ_FILES_CU)

Hi tejas95,

Please send us your sample that can reproduce this issue.

Also, we have had other topics where using OpenGL on Tegra led to errors.

I would suggest moving to GLES/EGL first (see the MMAPI samples), as we have more resources for it.

Hi WayneWWW,
I have now switched to GLES for rendering, and I am able to build it together with Argus. However, I am facing a peculiar issue: the capture pipeline (Argus) abruptly hangs without any errors. Everything works fine when I comment out renderobj.RenderCudaBuffer(). The code that renders the CUDA buffer was written using the simpleGLES CUDA sample.

/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <cuda_render.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <Argus/Argus.h>
#include <unistd.h>
#include "Error.h"
#include "UniquePointer.h"


#include <cudaEGL.h>

#include "CUDAHelper.h"
#include <pthread.h>
#include <sys/time.h>
#include "CudaUtilities.h"



// printf wrappers that prefix each message with the side that emitted it.
// The first argument must be a string literal, because it is concatenated
// with the prefix at compile time.
#define PRODUCER_PRINT(...) printf("PRODUCER: " __VA_ARGS__)
#define CONSUMER_PRINT(...) printf("CONSUMER: " __VA_ARGS__)
using namespace Argus;
// Number of cameras (capture sessions, output streams, consumer threads).
#define NUM_CAMERAS 1
// NOTE(review): SENSOR_MODE is not referenced anywhere in the code shown
// here; execute() indexes the sensor-mode list with a literal instead.
#define SENSOR_MODE 2
// Resolution requested for every output stream and used for the render window.
#define PREVIEW_WIDTH 1920
#define PREVIEW_HEIGHT 1080

// Wraps a CUDA runtime call and reports a failure together with the source
// location of the call site.
#define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }

/**
 * Reports a failed CUDA runtime call on stderr and optionally terminates.
 *
 * @param code   Result returned by the CUDA runtime call.
 * @param file   Source file of the call site.
 * @param line   Source line of the call site.
 * @param abort  When true (the default), exit the process using the CUDA
 *               error code as the exit status.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    // The message used to be printed twice by a duplicated fprintf.
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}



// Constants
// Resolution applied to every output stream via setResolution().
static const Size     STREAM_SIZE (PREVIEW_WIDTH, PREVIEW_HEIGHT);
// Number of frames each consumer thread acquires before it stops.
static const uint32_t FRAME_COUNT = 2000;

// Global variables

// Multiplied by 1,000,000 and passed to usleep() in execute(), i.e. a
// duration in seconds.
#define CAPTURE_TIME 100

namespace ArgusSamples
{
	
/**
 * State shared between the render loop in execute() and the per-camera
 * consumer threads. Every pointer member is an array with one element per
 * camera, allocated by InitCudaConsumer().
 */
typedef struct CudaConsumerHandle
{
    // Number of cameras, i.e. the length of every array below.
    int m_numCameras;

    // CUDA context filled in by initCUDA() and released by cleanupCUDA().
    CUcontext g_cudaContext = 0;

    // EGLStream endpoints and the per-camera CUDA consumer bookkeeping.
    IStream **outputStreams;
    CUresult *cuResult;                      // last driver-API result per camera
    CUeglStreamConnection *cudaConnection;
    CUgraphicsResource *cudaResource;
    CUstream *cudaStream;
    CUeglFrame *cudaEGLFrame;

    // Device buffers that receive the copied planes of the latest frame.
    uint8_t **cuda_buffers;
    // Allocated by InitCudaConsumer() but not otherwise used in this file.
    cudaArray **cuda_arrays;

    // Frame dimensions in pixels.
    int m_width;
    int m_height;

    // One mutex per camera.
    pthread_mutex_t *lock;
} CudaConsumerHandle;

// Single shared instance used by execute() and the consumer threads.
CudaConsumerHandle cudaConsumer;

	
/**
 * Prepares the shared consumer state for a set of camera output streams.
 *
 * Allocates the per-camera bookkeeping arrays, initialises one mutex per
 * camera, initialises CUDA through initCUDA(), connects CUDA as the EGLStream
 * consumer of every stream, and allocates one device buffer of
 * width * height * 3 / 2 bytes per camera.
 *
 * @param p              Handle to initialise.
 * @param OutputStreams  Array of num_cameras stream interfaces; the pointers
 *                       are copied, not owned.
 * @param num_cameras    Number of cameras/streams.
 * @param width          Frame width in pixels.
 * @param height         Frame height in pixels.
 * @return true on success. On failure the error is reported through
 *         ORIGINATE_ERROR / PROPAGATE_ERROR, which return from this function.
 *         Resources acquired before the failure are not released here.
 */
bool InitCudaConsumer(CudaConsumerHandle *p, IStream ** OutputStreams, int num_cameras, int width, int height)
{
    if (p == NULL || OutputStreams == NULL || num_cameras <= 0 || width <= 0 || height <= 0)
        ORIGINATE_ERROR("Invalid arguments passed to InitCudaConsumer");

    p->m_numCameras = num_cameras;
    p->m_width = width;
    p->m_height = height;

    // calloc instead of malloc, so every per-camera slot (results, stream
    // handles, frame descriptors, buffer pointers) starts zeroed rather than
    // holding indeterminate values.
    const size_t count = (size_t)num_cameras;
    p->outputStreams  = (IStream **)calloc(count, sizeof(IStream *));
    p->lock           = (pthread_mutex_t *)calloc(count, sizeof(pthread_mutex_t));
    p->cudaConnection = (CUeglStreamConnection *)calloc(count, sizeof(CUeglStreamConnection));
    p->cudaResource   = (CUgraphicsResource *)calloc(count, sizeof(CUgraphicsResource));
    p->cudaStream     = (CUstream *)calloc(count, sizeof(CUstream));
    p->cuResult       = (CUresult *)calloc(count, sizeof(CUresult));
    p->cudaEGLFrame   = (CUeglFrame *)calloc(count, sizeof(CUeglFrame));
    p->cuda_buffers   = (uint8_t **)calloc(count, sizeof(uint8_t *));
    p->cuda_arrays    = (cudaArray **)calloc(count, sizeof(cudaArray *));

    if (!p->outputStreams || !p->lock || !p->cudaConnection || !p->cudaResource ||
        !p->cudaStream || !p->cuResult || !p->cudaEGLFrame || !p->cuda_buffers ||
        !p->cuda_arrays)
    {
        ORIGINATE_ERROR("Out of memory while allocating per-camera consumer state");
    }

    for (int i = 0; i < num_cameras; i++)
    {
        p->outputStreams[i] = OutputStreams[i];
        pthread_mutex_init(&p->lock[i], NULL);
    }

    PROPAGATE_ERROR(initCUDA(&p->g_cudaContext));

    printf("Connecting CUDA to OutputStream as an EGLStream consumer\n");

    for (int i = 0; i < num_cameras; i++)
    {
        p->cuResult[i] = cuEGLStreamConsumerConnect(&p->cudaConnection[i], p->outputStreams[i]->getEGLStream());
        if (p->cuResult[i] != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to connect CUDA to EGLStream as a consumer (CUresult %s)",
                getCudaErrorString(p->cuResult[i]));
        }
    }

    // One luma plane plus two quarter-size chroma planes per frame. Integer
    // arithmetic gives the same size as the former "* 1.5" expression.
    const size_t frameBytes = (size_t)width * (size_t)height * 3 / 2;
    for (int i = 0; i < num_cameras; i++)
    {
        cudaError_t err = cudaMalloc((void **)&p->cuda_buffers[i], frameBytes);
        if (err != cudaSuccess)
        {
            ORIGINATE_ERROR("Unable to allocate the frame buffer for camera %d (%s)",
                i, cudaGetErrorString(err));
        }
    }

    // The function is declared bool but previously fell off the end here.
    return true;
}

/**
 * Disconnects CUDA from every camera's EGLStream and releases the CUDA
 * context through cleanupCUDA().
 *
 * @param p  Handle previously set up by InitCudaConsumer().
 * @return true when every stream was disconnected and the context cleaned
 *         up. The first failing step is reported through ORIGINATE_ERROR /
 *         PROPAGATE_ERROR, which return from this function immediately.
 */
bool CudaConsumerDisconnect(CudaConsumerHandle *p)
{
    printf("Cleaning up cuda consumer\n");

    // Disconnect the Argus producer from the stream.
    /// @todo: This is a WAR for a bug in cuEGLStreamConsumerDisconnect (see bug 200239336).

    for (int i = 0; i < p->m_numCameras; i++)
    {
        //outputStream[i].reset();

        p->cuResult[i] = cuEGLStreamConsumerDisconnect(&p->cudaConnection[i]);
        if (p->cuResult[i] != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to disconnect CUDA as a consumer from EGLStream (CUresult %s)",
                getCudaErrorString(p->cuResult[i]));
        }
    }

    PROPAGATE_ERROR(cleanupCUDA(&p->g_cudaContext));

    // The function is declared bool but previously fell off the end here.
    return true;
}

void *CudaConsumerThreadRun(void *index)
{
	int camera_index = *((int *)index);
	CudaConsumerHandle *p = (CudaConsumerHandle *)(&cudaConsumer);
	
	CONSUMER_PRINT("Waiting for Argus producer to connect to output stream.\n");
		printf("Camera Index = %d ", camera_index);
fflush(stdout);
		
	p->outputStreams[camera_index]->waitUntilConnected();

	fflush(stdout);
	
	struct timeval tv1, tv2;
	double time = 0.0;
	 
	gettimeofday(&tv1, NULL);
	for(unsigned int frame = 0; frame < FRAME_COUNT; ++frame)
    {
        /*
         * For simplicity this example submits a capture then waits for an output.
         * This pattern will not provide the best possible performance as the camera
         * stack runs in a pipeline, it is best to keep submitting as many captures as
         * possible prior to waiting for the result.
         */
      
		
     
		//printf("Acquiring an image from the EGLStream\n");
		fflush(stdout);

     
        printf("Frame %d\n", frame);
        fflush(stdout);
		pthread_mutex_lock(&p->lock[camera_index]);
		p->cuResult[camera_index] = cuEGLStreamConsumerAcquireFrame(&p->cudaConnection[camera_index], &p->cudaResource[camera_index], &p->cudaStream[camera_index], -1);
		 pthread_mutex_unlock(&p->lock[camera_index]);
		 
		if (p->cuResult[camera_index] != CUDA_SUCCESS)
		{
				ORIGINATE_ERROR("Unable to acquire an image frame from the EGLStream with CUDA as a "
					"consumer (CUresult %s).", getCudaErrorString(p->cuResult[camera_index]));
		}
		
		
        // Get the CUDA EGL frame.
      
        
			
			pthread_mutex_lock(&p->lock[camera_index]);
		
			p->cuResult[camera_index] = cuGraphicsResourceGetMappedEglFrame(&p->cudaEGLFrame[camera_index], p->cudaResource[camera_index], 0, 0);
			
			pthread_mutex_unlock(&p->lock[camera_index]);
			
			if (p->cuResult[camera_index] != CUDA_SUCCESS)
			{
				ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
					getCudaErrorString(p->cuResult[camera_index]));
			}

			// Print the information contained in the CUDA EGL frame structure.
			//PROPAGATE_ERROR(printCUDAEGLFrame(p->cudaEGLFrame[i]));

			if ((p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_PLANAR) &&
				(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR) &&
				(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_PLANAR) &&
				(p->cudaEGLFrame[camera_index].eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR))
			{
				ORIGINATE_ERROR("Only YUV color formats are supported");
			}
			if (p->cudaEGLFrame[camera_index].cuFormat != CU_AD_FORMAT_UNSIGNED_INT8)
				ORIGINATE_ERROR("Only 8-bit unsigned int formats are supported");
		
		
				
				cudaArray *arr_y =  (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[0];
				cudaArray *arr_u =  (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[1];
				cudaArray *arr_v =  (struct cudaArray *)p->cudaEGLFrame[camera_index].frame.pArray[2];
			
			
			
			
				int u_offset = p->m_width * p->m_height;
				int v_offset = p->m_width * p->m_height * 1.25;
				
				
				  pthread_mutex_lock(&p->lock[camera_index]);
				gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index], arr_y, 0, 0, p->m_width * p->m_height , cudaMemcpyDeviceToDevice));
				gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index] + u_offset, arr_u, 0, 0, p->m_width * p->m_height * 0.25 , cudaMemcpyDeviceToDevice));
				gpuErrchk(cudaMemcpyFromArray(p->cuda_buffers[camera_index] + v_offset, arr_v, 0, 0, p->m_width * p->m_height * 0.25 , cudaMemcpyDeviceToDevice));
		
				pthread_mutex_unlock(&p->lock[camera_index]);
			
			
			
			
		
			
		
		
		
		
		pthread_mutex_lock(&p->lock[camera_index]);
		
		p->cuResult[camera_index] = cuEGLStreamConsumerReleaseFrame(&p->cudaConnection[camera_index], p->cudaResource[camera_index], &p->cudaStream[camera_index]);
		

		 pthread_mutex_unlock(&p->lock[camera_index]);
		if (p->cuResult[camera_index] != CUDA_SUCCESS)
		{
			ORIGINATE_ERROR("Unable to release the last frame acquired from the EGLStream "
					"(CUresult %s).", getCudaErrorString(p->cuResult[camera_index]));
		}
		
    }
    gettimeofday(&tv2, NULL);
    time = (tv2.tv_sec - tv1.tv_sec)*1000 + (tv2.tv_usec - tv1.tv_usec)/(1000.0);
    time/=1000.0;
    
    printf("FPS = %f Cam Index = %d \n", FRAME_COUNT/time, camera_index);
    fflush(stdout);
	
	

}

 bool execute()
{
	  /****** Panorama *******/
	


	// Create the CameraProvider object
    UniqueObj<CameraProvider> cameraProvider(CameraProvider::create());
    ICameraProvider *iCameraProvider = interface_cast<ICameraProvider>(cameraProvider);
    if (!iCameraProvider)
        ORIGINATE_ERROR("Failed to create CameraProvider");

	
    // Get the camera devices.
    std::vector<CameraDevice*> cameraDevices;
    iCameraProvider->getCameraDevices(&cameraDevices);
    if (cameraDevices.size() < NUM_CAMERAS)
        ORIGINATE_ERROR("Not enough cameras available");

	UniqueObj<CaptureSession>captureSession[NUM_CAMERAS];
	ICaptureSession *iCaptureSession[NUM_CAMERAS];
	
	for(int i = 0 ; i < NUM_CAMERAS ; i++)
	{
   
       captureSession[i] = UniqueObj<CaptureSession>(iCameraProvider->createCaptureSession(cameraDevices[i]));
       
	   iCaptureSession[i] = interface_cast<ICaptureSession>(captureSession[i]);
		if (!iCaptureSession[i])
        ORIGINATE_ERROR("Failed to create CaptureSession");
	}

	CameraDevice *cameraDevice = cameraDevices[0];
	ICameraProperties *iCameraProperties = interface_cast<ICameraProperties>(cameraDevice);
	std::vector<SensorMode*> sensorModes;
	iCameraProperties->getSensorModes(&sensorModes);
	if (sensorModes.size() == 0)
		ORIGINATE_ERROR("Failed to get sensor modes");
	
	SensorMode *sensorMode = sensorModes[1];
	
	
    printf("Creating output streams for cuda\n");
    
		
    UniqueObj<OutputStreamSettings> streamSettings[NUM_CAMERAS];
    IOutputStreamSettings *iStreamSettings[NUM_CAMERAS];
    
    for(int i = 0 ; i < NUM_CAMERAS ; i++)
    {
		streamSettings[i] = UniqueObj<OutputStreamSettings>(iCaptureSession[i]->createOutputStreamSettings());
		iStreamSettings[i]	= interface_cast<IOutputStreamSettings>(streamSettings[i]);
		if (iStreamSettings[i])
		{
			iStreamSettings[i]->setPixelFormat(PIXEL_FMT_YCbCr_420_888);
			iStreamSettings[i]->setResolution(STREAM_SIZE);
		}
	}
    UniqueObj<OutputStream> outputStream[NUM_CAMERAS];
    IStream *iStream[NUM_CAMERAS];
 
    for(int i = 0 ; i < NUM_CAMERAS ; i++)
    {
		outputStream[i] = UniqueObj<OutputStream>(iCaptureSession[i]->createOutputStream(streamSettings[i].get()));
		iStream[i] = interface_cast<IStream>(outputStream[i]);
        if (!iStream[i])
         ORIGINATE_ERROR("Failed to create OutputStream");
    }
	
	
	  
    pthread_t ConsumerThread[NUM_CAMERAS];
	InitCudaConsumer(&cudaConsumer, iStream, NUM_CAMERAS, PREVIEW_WIDTH, PREVIEW_HEIGHT);
	pthread_attr_t attr_;
    cpu_set_t cpus_;
    pthread_attr_init(&attr_);




 
    
    

 


    // Initialize and connect CUDA as the EGLStream consumer.
   


 
    // Create capture request and enable output stream.
    
    UniqueObj<Request> request[NUM_CAMERAS];
    IRequest *iRequest[NUM_CAMERAS];
    
    for(int i = 0 ; i < NUM_CAMERAS ; i++)
    {
    
		request[i] = UniqueObj<Request>(iCaptureSession[i]->createRequest());
		iRequest[i] = interface_cast<IRequest>(request[i]);
		if (!iRequest[i])
			ORIGINATE_ERROR("Failed to create Request");
		ISourceSettings *sourceSettings  = interface_cast<ISourceSettings>(iRequest[i]->getSourceSettings());

		sourceSettings->setSensorMode(sensorMode);
		
		iRequest[i]->enableOutputStream(outputStream[i].get());
	}
    
  
     // Submit some captures and calculate the histogram with CUDA
     
 

		uint8_t *cuda_I420_buffer[NUM_CAMERAS];
		uint8_t *cuda_I420_buffer_y[NUM_CAMERAS];
		uint8_t *cuda_I420_buffer_uv[NUM_CAMERAS];
	
    
		int num[] = {0, 1, 2, 3, 4, 5};
		for(int i = 0 ; i < NUM_CAMERAS ; i++)
		{
			CPU_ZERO(&cpus_);
			CPU_SET( (i%3)+1, &cpus_);
			pthread_attr_setaffinity_np(&attr_, sizeof(cpu_set_t), &cpus_);
			pthread_create(&ConsumerThread[i], &attr_, CudaConsumerThreadRun, (void *)(&num[i]));
		}
		
		CudaConsumerHandle *p = &cudaConsumer;
		int cnt = 1;
		struct timeval tv1, tv2;
		
		uint8_t *cuda_render_buffer;
		cudaMalloc((void **)&cuda_render_buffer, 4 * PREVIEW_WIDTH * PREVIEW_HEIGHT);
		CudaGLES renderobj;
	
	
		
		
			
		renderobj.InitCudaGLES(PREVIEW_WIDTH, PREVIEW_HEIGHT, "Pano Render"); 
			
			for(int i = 0 ; i < NUM_CAMERAS ; i++)
			{
				Argus::Status status;
				const uint64_t ONE_SECOND = 10000;
				uint32_t result = iCaptureSession[i]->repeat(request[i].get());
 
			}
			usleep(1000000);
			while(1)
			{
			  
     
			
			
			cnt++;
			printf("Frame Rendered = %d\n", cnt);
			fflush(stdout);
			
		
		
		
			for(int i = 0; i < NUM_CAMERAS ; i++)
				pthread_mutex_lock(&p->lock[i]);
			
			gettimeofday(&tv1, NULL);
			for(int i = 0 ; i < NUM_CAMERAS ; i++)
			{
				cuda_I420_buffer[i] = p->cuda_buffers[i];
				cuda_I420_buffer_y[i] = cuda_I420_buffer[i];
				cuda_I420_buffer_uv[i] = cuda_I420_buffer[i] + PREVIEW_HEIGHT*PREVIEW_WIDTH;
			}
			
			
			
			CudaI420_To_RGBA_Wrapper(cuda_render_buffer, cuda_I420_buffer_y[0], cuda_I420_buffer_uv[0], PREVIEW_WIDTH, PREVIEW_HEIGHT, 32, 8);
			
			
			renderobj.RenderCudaBuffer(cuda_render_buffer);
			
			for(int i = 0; i < NUM_CAMERAS ; i++)
				pthread_mutex_unlock(&p->lock[i]);
			
			 
			gpuErrchk(cudaDeviceSynchronize());
			
			gettimeofday(&tv2, NULL);	
			double pipeline = (tv2.tv_sec - tv1.tv_sec)*1000 + (tv2.tv_usec - tv1.tv_usec)/(1000.0);
			printf("Pipeline execution time = %f\n", pipeline);
					
		
			
			
			
			
			
			
			
			
		}
 
	
		
		


        usleep(CAPTURE_TIME*1000000);
     
		for(int i = 0 ; i  < NUM_CAMERAS ; i++)
		{
			iCaptureSession[i]->stopRepeat();
			iCaptureSession[i]->waitForIdle();
		}
		
		for(int i = 0 ; i < NUM_CAMERAS ; i++)
		{
			iStream[i]->disconnect();	
			pthread_join(ConsumerThread[i], NULL);
		}
		 
     
	

  
        
		for(int i = 0 ; i < NUM_CAMERAS ; i++)
			outputStream[i].reset();
	
		CudaConsumerDisconnect(&cudaConsumer);
  
       
    


		printf("Done\n");

    return true;
}

}; // namespace ArgusSamples

/**
 * Program entry point: runs the pipeline and maps its boolean result onto
 * the process exit status. Command-line arguments are not used.
 */
int main(int argc, const char *argv[])
{
    const bool succeeded = ArgusSamples::execute();
    return succeeded ? EXIT_SUCCESS : EXIT_FAILURE;
}

This is the GLES class, written using the sample code.
I have checked that this code works separately, without Argus, and it works fine. Any suggestions would be greatly appreciated. The purpose of this renderer is to render a CUDA buffer directly, without copying it to the CPU.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <stdarg.h>
#include <unistd.h>
#include <X11/Xlib.h>
#include <X11/Xutil.h>


#include "cuda_render.h"
/**
 * Writes a printf-style message to stderr and terminates the process with
 * exit status 1. Does not return.
 */
void error_exit(const char* format, ... )
{
  va_list ap;

  va_start(ap, format);
  (void)vfprintf(stderr, format, ap);
  va_end(ap);

  exit(1);
}

#include "graphics_interface.c"

// Checks the CUDA runtime's last recorded error; on failure prints the line
// number and error string, resets the device and exits with status 1.
// cudaDeviceReset() replaces the deprecated cudaThreadExit().
#define checkCUDAError() \
{ \
  cudaError_t res = cudaGetLastError();\
  if (res != cudaSuccess)\
    {\
      fprintf(stderr, "Line %d: CUDA Error: %s\n", \
              __LINE__, cudaGetErrorString(res));\
      cudaDeviceReset();\
      exit(1);\
    }\
}



/**
 * Reads a GLSL source file, compiles it as a shader of the given type and
 * attaches the result to a program object.
 *
 * Terminates the process if the file cannot be opened or read, or if
 * compilation fails (the compiler log is printed first).
 *
 * @param new_shaderprogram  Program object to attach the compiled shader to.
 * @param filename           Path of the GLSL source file.
 * @param shaderType         Shader type passed to glCreateShader.
 */
void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram, const char *filename, GLenum shaderType)
{
  FILE *file = fopen(filename,"rb"); // open shader text file
  if (!file)
    error_exit("Filename %s does not exist\n", filename);

  /* get the size of the file and read it */
  long fileSize = -1;
  if (fseek(file, 0, SEEK_END) == 0)
    fileSize = ftell(file);
  // ftell reports failure as -1, which used to flow straight into the
  // allocation size.
  if (fileSize < 0 || fseek(file, 0, SEEK_SET) != 0)
    {
      fclose(file);
      error_exit("Could not determine the size of %s\n", filename);
    }

  // One extra zeroed byte keeps the buffer NUL-terminated.
  char *data = (char*)calloc((size_t)fileSize + 1, sizeof(char));
  if (!data)
    {
      fclose(file);
      error_exit("Out of memory while reading %s\n", filename);
    }

  size_t res = fread(data, 1, (size_t)fileSize, file);
  fclose(file);
  if (res != (size_t)fileSize)
    {
      free(data);
      error_exit("Could not read %s\n", filename);
    }

  GLint size = (GLint)fileSize;
  GLuint shader = glCreateShader(shaderType);
  glShaderSource(shader, 1, (const GLchar**)&data, &size);
  glCompileShader(shader);

  GET_GLERROR(0);
  GLint compile_success = 0;
  glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success);
  GET_GLERROR(0);

  if (compile_success == GL_FALSE)
    {
      printf("Compilation of %s failed!\n Reason:\n", filename);

      GLint maxLength = 0;
      glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength);

      // Heap buffer instead of a variable-length array, which had zero
      // length whenever the driver reported no log.
      if (maxLength > 0)
        {
          char *errorLog = (char*)calloc((size_t)maxLength, sizeof(char));
          if (errorLog)
            {
              glGetShaderInfoLog(shader, maxLength, &maxLength, errorLog);
              printf("%s", errorLog);
              free(errorLog);
            }
        }

      glDeleteShader(shader);
      free(data);
      exit(1);
    }

  glAttachShader(new_shaderprogram, shader);
  glDeleteShader(shader); // good to do?

  free(data);
}

/**
 * Builds a GLSL program from a vertex and a fragment shader file.
 *
 * Either filename may be NULL, in which case that stage is skipped.
 * Terminates the process if linking fails (the link log is printed first);
 * compile failures terminate inside readAndCompileShaderFromGLSLFile.
 *
 * @param vshader_filename  Path of the vertex shader source, or NULL.
 * @param fshader_filename  Path of the fragment shader source, or NULL.
 * @return Handle of the linked program object.
 */
GLuint ShaderCreate(const char *vshader_filename, const char *fshader_filename)
{
  // "%s" with a NULL pointer is undefined behaviour, and both names may be
  // NULL, so substitute a printable placeholder.
  const char *vname = vshader_filename ? vshader_filename : "(null)";
  const char *fname = fshader_filename ? fshader_filename : "(null)";

  printf("Loading GLSL shaders %s %s\n", vname, fname);

  GLuint new_shaderprogram = glCreateProgram();

  GET_GLERROR(0);
  if (vshader_filename)
    readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename, GL_VERTEX_SHADER);

  GET_GLERROR(0);
  if (fshader_filename)
    readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename, GL_FRAGMENT_SHADER);

  GET_GLERROR(0);

  glLinkProgram(new_shaderprogram);

  GET_GLERROR(0);
  GLint link_success = GL_FALSE;
  glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success);

  if (link_success == GL_FALSE)
    {
      printf("Linking of %s with %s failed!\n Reason:\n", vname, fname);

      // The handle is a program object, so its log must be queried with the
      // glGetProgram* calls. The glGetShader* calls used previously are
      // specified for shader objects only.
      GLint maxLength = 0;
      glGetProgramiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength);

      if (maxLength > 0)
        {
          char *errorLog = (char*)calloc((size_t)maxLength, sizeof(char));
          if (errorLog)
            {
              glGetProgramInfoLog(new_shaderprogram, maxLength, &maxLength, errorLog);
              printf("%s", errorLog);
              free(errorLog);
            }
        }

      exit(EXIT_FAILURE);
    }

  return new_shaderprogram;
}



void CudaGLES::InitCudaGLES(int width, int height, char *window_name)
{
  
  graphics_setup_window(0,0, width, height, (const char *)window_name);	
	
  char *GL_version=(char *)glGetString(GL_VERSION);
  char *GL_vendor=(char *)glGetString(GL_VENDOR);
  char *GL_renderer=(char *)glGetString(GL_RENDERER);
  
  printf("Version: %s\n", GL_version);
  printf("Vendor: %s\n", GL_vendor);
  printf("Renderer: %s\n", GL_renderer);

  m_width = width;
  m_height = height;

  
    // initialize buffer object
  glGenBuffers(1, &m_vbo);
  glBindBuffer(GL_ARRAY_BUFFER, m_vbo);
	

  unsigned int size = width * height * 4 * sizeof(float);
  glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
  glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0);
  glEnableVertexAttribArray(0); 
  
  cudaGraphicsGLRegisterBuffer(&m_position, m_vbo, cudaGraphicsMapFlagsNone);
  
  glGenBuffers(1, &m_vbo);
  glBindBuffer(GL_ARRAY_BUFFER, m_vbo);


  size = width * height * 4 * sizeof(float);
  glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
  glVertexAttribPointer((GLuint)1, 4, GL_FLOAT, GL_FALSE, 0, 0);
  glEnableVertexAttribArray(1); 
  
  cudaGraphicsGLRegisterBuffer(&m_color, m_vbo, cudaGraphicsMapFlagsNone);

  // GLSL stuff
  char *vertex_shader_path = sdkFindFilePath("/home/ubuntu/TX1-Pano/gles_render/mesh.vert.glsl", NULL);
  char *fragment_shader_path = sdkFindFilePath("/home/ubuntu/TX1-Pano/gles_render/mesh.frag.glsl", NULL);

  if (vertex_shader_path == NULL || fragment_shader_path == NULL)
  {
    printf("Error finding shader file\n");
    exit(EXIT_FAILURE);
  }

  m_shader = ShaderCreate(vertex_shader_path, fragment_shader_path);
//  GET_GLERROR(0);

  free(vertex_shader_path);
  free(fragment_shader_path);
  
  glUseProgram(m_shader);
  
  glClear( GL_COLOR_BUFFER_BIT );
  graphics_swap_buffers();
  
}
	
	
	
	
	






/**
 * Writes one vertex per pixel: a position in normalised device coordinates
 * and a colour taken from the RGBA input image.
 *
 * @param pos         Output positions, width * height float4 elements.
 * @param cuda_input  Input pixels, width * height uchar4 elements; x, y and z
 *                    are read as red, green and blue.
 * @param color       Output colours, width * height float4 elements.
 * @param width       Image width in pixels.
 * @param height      Image height in pixels.
 */
__global__ void simple_vbo_kernel(float4 *pos, uchar4 *cuda_input, float4 *color, unsigned int width, unsigned int height)
{
    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so threads past the image
    // edge must not touch memory.
    if (x >= width || y >= height)
        return;

    const unsigned int idx = y*width + x;

    // calculate uv coordinates
    float u = x / (float) width;
    float v = y / (float) height;
    u = u*2.0f - 1.0f;
    v = v*2.0f - 1.0f;

    const uchar4 pixel = cuda_input[idx];

    // Scale the 8-bit channels to [0, 1].
    float r = pixel.x;
    float g = pixel.y;
    float b = pixel.z;

    r/=255.0;
    g/=255.0;
    b/=255.0;

    // write output vertex
    pos[idx] = make_float4(u, v, 0, 1.0f);
    color[idx] = make_float4(r, g, b, 1.0f);
}


/**
 * Launches simple_vbo_kernel over the whole image with 8x8 thread blocks.
 *
 * The grid is sized with a rounded-up division, so widths or heights that
 * are not multiples of 8 are covered completely; the kernel discards the
 * surplus threads.
 */
void launch_kernel(float4 *pos, uchar4 *cuda_input, float4 * color, unsigned int mesh_width,
                   unsigned int mesh_height)
{
    // execute the kernel
    dim3 block(8, 8, 1);
    dim3 grid((mesh_width + block.x - 1) / block.x,
              (mesh_height + block.y - 1) / block.y, 1);
    simple_vbo_kernel<<< grid, block>>>(pos, cuda_input, color, mesh_width, mesh_height);
}

void runCuda(struct cudaGraphicsResource **position_resource, struct cudaGraphicsResource **color_resource, uchar4 *rgba_buff, int width, int height)
{
    // map OpenGL buffer object for writing from CUDA
    float4 *dptr, *color;
    cudaGraphicsMapResources(1, position_resource, 0);
    cudaGraphicsMapResources(1, color_resource, 0);
    size_t num_bytes;
    cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, *position_resource);
    cudaGraphicsResourceGetMappedPointer((void **)&color, &num_bytes, *color_resource);
   
	
	launch_kernel(dptr, rgba_buff, color, width, height);

    cudaGraphicsUnmapResources(1, position_resource, 0);
    cudaGraphicsUnmapResources(1, color_resource, 0);
}

GLuint mesh_shader = 0;



void CudaGLES::RenderCudaBuffer(unsigned char *buff)
{
	uchar4 *rgba_buff = reinterpret_cast<uchar4 *>(buff);
    

    // run CUDA kernel to generate vertex  positions and color
    runCuda(&m_position, &m_color, rgba_buff, m_width, m_height);

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
  
    glDrawArrays(GL_POINTS, 0, m_width * m_height);

    glFinish();
    
    usleep(1000);
			
	graphics_swap_buffers();
}
/**
 * Shuts the renderer down: unregisters both CUDA graphics resources, then
 * closes the window.
 */
void CudaGLES::stop()
{
    // Release the CUDA side of the interop first ...
    cudaGraphicsUnregisterResource(m_position);
    cudaGraphicsUnregisterResource(m_color);

    // ... then close window and destroy OpenGL ES context
    graphics_close_window();
}
/*
int main()
{
	int width = 1920;
	int height = 1080;
	
	unsigned char *h, *d;
	h = (unsigned char *)malloc(width * height * 4);
	cudaMalloc((void **)&d, width*height*4);
	
	FILE *f;
	f = fopen("input.yuv", "r");
	fread(h, width*height*4, 1, f);
	fclose(f);
	cudaMemcpy(d, h, width*height*4, cudaMemcpyHostToDevice);
	
	CudaGLES obj;
	obj.InitCudaGLES(width, height, "Render Pano");
	for(int i = 0 ;  i < 200 ; i++)
	{
		obj.RenderCudaBuffer(d);
		//usleep(1000);
	}
	return 0;
}*/

Hi tejas95,

Could you send me your sample code with makefile? If you don’t want to reveal it, please use private message.

I just checked your code. It currently works with only one camera and no CUDA kernel, and renders the result from Argus directly to GLES, right?

Yes.

There are actually two kernels: one for converting I420 to RGBA, and another for generating position coordinates for the shaders while rendering. Please check the rendering code as well. Both the Argus and GLES code work completely fine independently.

Can it be something related to lack of graphics resources?

Could you share the error log?

As I said, I was not getting any errors. After a few frames of capturing, Argus just hangs.

tejas95,

Could you send me a package of your project? I would like to try it internally for debug.

Thanks.

Hi WayneWWW, how do you want me to send you the files?

I have attached the tar file which includes a makefile to build the application.
Thanks
TX1-app.tar.gz (576 KB)

Thanks! I’ll update after finding root cause or other solution.

Hi tejas95,

I think you can refer to the following code in the MMAPI samples to see how to get from Argus to a dma_buf fd and then to CUDA.

Then, use cudaGraphicsEGLRegisterImage to share your cuda buffer with EGLImage and send it to EGLrenderer.

Samples:
09_camera_jpeg_capture (uses createNvBuffer for dma_buf)
10_camera_recording
v4l2_cuda
and common/classes/NvEglRenderer.cpp

Hi WayneWWW,

Thanks for the suggestion. I will definitely try it out. However it would be really helpful to know the cause of the previous issue .

Thanks

I have tried to use argus + cuda +EGLrender as a sample. The following is the code snippet of some cuda API.

for (unsigned int frame = 0; frame < options.frameCount; ++frame)
    {
        /*
         * For simplicity this example submits a capture then waits for an output.
         * This pattern will not provide the best possible performance as the camera
         * stack runs in a pipeline, it is best to keep submitting as many captures as
         * possible prior to waiting for the result.
         */
        printf("Submitting a capture request\n");
        {
            const uint64_t ONE_SECOND = 1000000000;
            // Collect the Argus status code so that a failed submission reports
            // the actual reason instead of a hard-coded 0.
            Argus::Status status = Argus::STATUS_OK;
            // capture() returns 0 when the request could not be submitted.
            uint32_t result = iCaptureSession->capture(request.get(), ONE_SECOND, &status);
            if (result == 0)
                ORIGINATE_ERROR("Failed to submit capture request (status %x)", status);
        }

        printf("Acquiring an image from the EGLStream\n");
        CUgraphicsResource cudaResource = 0;
        CUstream cudaStream = 0;
        cuResult = cuEGLStreamConsumerAcquireFrame(&cudaConnection, &cudaResource, &cudaStream, -1);
        if (cuResult != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to acquire an image frame from the EGLStream with CUDA as a "
                "consumer (CUresult %s).", getCudaErrorString(cuResult));
        }

        // Get the CUDA EGL frame.
        CUeglFrame cudaEGLFrame;
        cuResult = cuGraphicsResourceGetMappedEglFrame(&cudaEGLFrame, cudaResource, 0, 0);
        if (cuResult != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
                getCudaErrorString(cuResult));
        }

        // Print the information contained in the CUDA EGL frame structure.
        //PROPAGATE_ERROR(printCUDAEGLFrame(cudaEGLFrame));

        if ((cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_PLANAR) &&
            (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR) &&
            (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_PLANAR) &&
            (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR))
        {
            ORIGINATE_ERROR("Only YUV color formats are supported");
        }
        if (cudaEGLFrame.cuFormat != CU_AD_FORMAT_UNSIGNED_INT8)
            ORIGINATE_ERROR("Only 8-bit unsigned int formats are supported");

	//printf(" cudaEGLFrame.frameType = %d\n ", cudaEGLFrame.frameType);
  
        struct cudaResourceDesc cudaResourceDesc[2];
        for(unsigned int i = 0 ; i < cudaEGLFrame.planeCount ; i++)
	{
            memset(&cudaResourceDesc[i], 0, sizeof(struct cudaResourceDesc));
            cudaResourceDesc[i].resType = cudaResourceTypeArray;
            cudaResourceDesc[i].res.array.array = (cudaArray_t)cudaEGLFrame.frame.pArray[i];
	}

	struct cudaTextureDesc cudaTextureDesc[2];
        for(unsigned int i = 0 ; i < cudaEGLFrame.planeCount ; i++)
	{
            memset(&cudaTextureDesc[i], 0, sizeof(struct cudaTextureDesc));
	    cudaTextureDesc[i].filterMode = cudaFilterModeLinear;
	    cudaTextureDesc[i].readMode = cudaReadModeNormalizedFloat;
	    cudaTextureDesc[i].normalizedCoords = 1;
	}

        cudaTextureObject_t cudaTexObj[2] = {0};

	for(int i = 0; i < 2; i++)
            CUDA_CHECK(cudaCreateTextureObject(&cudaTexObj[i], &cudaResourceDesc[i], &cudaTextureDesc[i], NULL));


	int fd = -1;

	if(NvBufferCreate(&fd, 1920, 1080, NvBufferLayout_BlockLinear,  NvBufferColorFormat_NV12))
	{
            ORIGINATE_ERROR("NvBufferCreate failed\n");
	}


	egl_display = renderer->getEGLDisplay();

        if (!egl_display || EGL_NO_DISPLAY == egl_display)
	{
            ORIGINATE_ERROR("Unable to get egl display\n");
	}

        EGLImageKHR dstEglImage = NvEGLImageFromFd(egl_display, fd);

	if(!dstEglImage)
            ORIGINATE_ERROR("Unable to get EGLImage from fd\n");

        CUgraphicsResource dstcuResource = 0;

        cuResult = cuGraphicsEGLRegisterImage(&dstcuResource, dstEglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

	if(cuResult != CUDA_SUCCESS)
	{
            ORIGINATE_ERROR("Unable to register CUDA resource (CUresult %s)",
            getCudaErrorString(cuResult));

	}

        CUeglFrame dst_cudaEGLFrame;
	memset(&dst_cudaEGLFrame, 0, sizeof(dst_cudaEGLFrame));
	cuResult = cuGraphicsResourceGetMappedEglFrame(&dst_cudaEGLFrame, dstcuResource, 0, 0);

	//PROPAGATE_ERROR(printCUDAEGLFrame(dst_cudaEGLFrame));

        if (cuResult != CUDA_SUCCESS)
        {
            ORIGINATE_ERROR("Unable to get the DST CUDA EGL frame (CUresult %s).",
                getCudaErrorString(cuResult));
        }

	//create SurfaceObject as Dst
        struct cudaResourceDesc  dst_cudaResourceDesc[2];
	for(int i = 0; i < 2; i++)
	{
            memset(&dst_cudaResourceDesc[i], 0, sizeof(struct cudaResourceDesc));
            dst_cudaResourceDesc[i].resType = cudaResourceTypeArray;
            dst_cudaResourceDesc[i].res.array.array = (cudaArray_t)dst_cudaEGLFrame.frame.pArray[i];
	}
	cudaSurfaceObject_t dst_cudaSurfObj[2] = {0};

	for(int i = 0; i < 2; i++)
            CUDA_CHECK(cudaCreateSurfaceObject(&dst_cudaSurfObj[i], &dst_cudaResourceDesc[i]));


        for (unsigned int i = 0; i < 2; ++i) {
            gpuImageScaling(cudaTexObj[i], dst_cudaSurfObj[i], dst_cudaEGLFrame.width, dst_cudaEGLFrame.height, dst_cudaEGLFrame.numChannels);  //modify this as your cuda kerenl.
        }

        renderer->render(fd);

tejas95,

I suspect there may be some problem with your format. Is it a YUV420 one? Could you print out the details of your EGL frame?