CUDA kernel sometimes fails to launch

Platform: Jetson TK1

I have written a simple CUDA kernel to add two (3-channel) matrices element by element, but I am seeing strange behavior that I can't explain: sometimes the program runs fine, and sometimes execution halts at the kernel call. This appears to be random (or at least I think it is).

I have set CUDA_LAUNCH_BLOCKING=1 so that I can benchmark the kernel's performance.
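
(As I understand it, CUDA events can also be used to time just the kernel, without relying on CUDA_LAUNCH_BLOCKING; roughly something like this, using the same variables as in the code below:)

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
add_matrix<<<grid,block>>>(imageMain_d, imageLogo_d, imageout_d, imageMain.step);
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait for the kernel to finish
float ms = 0;
cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds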

Secondly, when the kernel does execute properly, it takes ~8 ms. Isn't that a lot just to add two 1920x1080x3 images? At this rate I would hardly gain anything by porting my other image-processing algorithms to the GPU. Or am I missing something?
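
For what it's worth, my back-of-the-envelope estimate (assuming the kernel is purely memory-bound): it reads two 1920x1080x3 images and writes one, i.e. roughly 3 x 6.2 MB ≈ 18.7 MB of traffic per launch. At 8 ms that works out to only about 2.3 GB/s, which seems well below what the TK1's memory should sustain (on the order of 15 GB/s peak, if I'm not mistaken), so I suspect something else is going on.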

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/calib3d/calib3d.hpp>
#include <opencv2/gpu/gpumat.hpp>
#include <opencv2/gpu/devmem2d.hpp>

#include <iostream>
#include <limits>
#include <numeric>
#include <stdio.h>
using namespace cv;
using namespace std;
using namespace cv::gpu;
#define HOM_MAT_TYPE double

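// Debug helper: print the matrix contents to stdout (not called from main())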
__host__
void display_mat(Mat imageout)
{
	int i , j;
	unsigned char *ptr;
	ptr = imageout.ptr();
	for (i = 0 ; i < imageout.rows; i++)
	{
		for (j = 0 ; j < imageout.cols ; j++)
		{
			//Vec3b tmp = imageout.at<Vec3b>(i,j);
			//cout << (int)tmp(0)<< " " <<(int)tmp(1)<< " " <<(int)tmp(2) <<endl;
			cout <<(int)ptr[i*imageout.step + j*3]<< " ";
		}
	}
}

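// CPU reference: add the two images pixel by pixel on the host, for comparison with the GPU version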
__host__
void add_matrix(Mat &m1, Mat &m2, Mat &out)
{
	int rows = m1.rows;
	int cols = m1.cols;
	int i,j;
	Vec3b tmp;
	unsigned char* ptr1 = m1.ptr() ;
	unsigned char* ptr2 = m2.ptr();
	unsigned char* ptr3 = out.ptr();
	int gray_tid = 0x00;
	int grayWidthStep = m1.step;

	for( j = 0 ; j < rows ; j++)
	{
		for( i = 0 ; i < cols ; i++)
		{
			gray_tid  = j * grayWidthStep + (i*3);

			ptr3[gray_tid] = (1.00* ptr1[gray_tid]) + (1.00* ptr2[gray_tid]);
			ptr3[gray_tid + 1] = (1.00* ptr1[gray_tid + 1]) + (1.00* ptr2[gray_tid + 1]);
			ptr3[gray_tid + 2] = (1.00* ptr1[gray_tid + 2]) + (1.00* ptr2[gray_tid + 2]);
		}
	}
}

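// GPU kernel: one thread per pixel; each thread adds the three color channels of its pixel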
__global__ void add_matrix(char *m1, char *m2, char * outm, int grayWidthStep)
{

	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y * blockDim.y + threadIdx.y;
	
	const int gray_tid  = j * grayWidthStep + (i*3);
	if(i > 1920)
		return;
	if(j > 1080)
		return;

	//printf (" SOMEHTING ");
	outm[gray_tid] = (1.00* m1[gray_tid]) + (1.00* m2[gray_tid]);
	outm[gray_tid + 1] = (1.00* m1[gray_tid + 1]) + (1.00* m2[gray_tid + 1]);
	outm[gray_tid + 2] = (1.00* m1[gray_tid + 2]) + (1.00* m2[gray_tid + 2]);

	return;
}

__host__
void fill_matrix(Mat &m)
{
	
	m = Scalar(127,1,1);
}

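// Test driver: fill two 1920x1080 3-channel images, add them on the GPU and on the CPU, and time both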
int main(int argc, char ** argv)
{

	Mat imageMain = Mat(1080,1920,CV_8UC3);
	Mat imageLogo = Mat(1080,1920,CV_8UC3);
	Mat imageout = Mat(1080,1920,CV_8UC3);

	char * imageMain_d, *imageLogo_d, *imageout_d;
 	int i , j;	 
 	int64 e;
 	
 	float ee;
 	
 	int iterations = 100;
 	float sum = 0;
 	
 	
 	int input_size = imageMain.step * imageMain.rows;

 	fill_matrix(imageMain);
 	fill_matrix(imageLogo);
 	
 	cudaMalloc((char **)&imageMain_d,input_size);
 	cudaMalloc((char **)&imageLogo_d,input_size);
 	cudaMalloc((char **)&imageout_d,input_size);

	cudaMemcpy(imageMain_d, imageMain.ptr(), input_size, cudaMemcpyHostToDevice);
	cudaMemcpy(imageLogo_d, imageLogo.ptr(), input_size, cudaMemcpyHostToDevice);
	
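	// 32x16 threads per block, 60x68 blocks: 1920 threads across and 1088 threads down for a 1920x1080 image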
	dim3 block(32, 16, 1);
	dim3 grid(60, 68, 1);
	//dim3 grid((imageMain.cols + block.x - 1)/block.x, (imageMain.rows + block.y - 1)/block.y);

	cout <<"STARTING with "<<imageMain.step<<" Step . and Cols : "<< imageMain.cols <<"\n"<<endl;
	e = getTickCount();
	//add_matrix<<<grid,block>>>(imageMain_d,imageLogo_d,imageout_d,imageMain.step);
	ee = ( getTickCount() - e) / getTickFrequency() ;
	cout << ee<<endl;
	cout <<"DONE\n"<<endl;
	
	cout <<"Copy done \n"<<endl;
	
	
 	for (i  = 0 ; i < iterations; i ++)
 	{
 		cout <<"Iteration "<<i<<endl;	
	 	e = getTickCount();
	 	add_matrix<<<grid,block>>>(imageMain_d,imageLogo_d,imageout_d,imageMain.step);
	 
	 	ee = ( getTickCount() - e) / getTickFrequency() ;
	 	sum += ( ee);
	 	
	 	 
 	}
 	cout <<endl;
 	cudaDeviceSynchronize();
	cudaMemcpy(imageout.ptr(),imageout_d,input_size,cudaMemcpyDeviceToHost);
	cudaFree(imageout_d);
	cudaFree(imageMain_d);
	cudaFree(imageLogo_d);
 	cout <<"GPU : " <<sum/iterations<<endl;
 	imwrite("imageout_gpu.png",imageout);

 	sum = 0; e = 0 ; ee = 0;
	for (i  = 0 ; i < iterations ; i ++)
 	{
	 	e = getTickCount();
	 	//add_matrix<<<grid,block>>>(imageMain_d,imageLogo_d,imageout_d,imageMain.step);
	 	add_matrix(imageMain,imageLogo,imageout);
	 	ee = ( getTickCount() - e) / getTickFrequency() ;
	 	sum += ( ee);
	 	//cout <<ee<<endl;
	 	//cout << (getTickCount() - e ) / getTickFrequency()<<endl;
	 	 
 	}

 	cout <<"CPU : " <<sum/iterations<<endl;
 	imwrite("imageout_cpu.png",imageout);
    //imshow("stitched_warp",imageMain);
    //imshow("stitched_custom",imageout);
    //waitKey(0);	
	return 0;
}

There was an out-of-bounds indexing problem, and I was passing a host-memory integer to the CUDA kernel without first moving it to device memory.
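
For anyone who runs into the same thing, here is a minimal sketch of how the kernel can be guarded (the width/height parameters and the >= comparisons are the relevant part; the exact names are my own):

__global__ void add_matrix(char *m1, char *m2, char *outm, int cols, int rows, int step)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;   // column
	int j = blockIdx.y * blockDim.y + threadIdx.y;   // row

	// >= so the last valid row/column is rows-1 / cols-1;
	// the extra threads from rounding the grid up simply exit
	if (i >= cols || j >= rows)
		return;

	const int idx = j * step + (i * 3);              // byte offset of this 3-channel pixel
	outm[idx]     = m1[idx]     + m2[idx];
	outm[idx + 1] = m1[idx + 1] + m2[idx + 1];
	outm[idx + 2] = m1[idx + 2] + m2[idx + 2];
}

// launched with a grid derived from the actual image size:
dim3 block(32, 16, 1);
dim3 grid((imageMain.cols + block.x - 1) / block.x, (imageMain.rows + block.y - 1) / block.y, 1);
add_matrix<<<grid,block>>>(imageMain_d, imageLogo_d, imageout_d, imageMain.cols, imageMain.rows, imageMain.step);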