Platform: Jetson TK1
I have written as simple CUDA kernel to add two (3-channel) matrices index by index. But I am experiencing strange behavior and I can’t figure out why it is so. The behavior is that sometimes this program runs fine but sometimes the program executing gets halt at the kernel call. This is random (or i think it is).
I have CUDA_LAUNCH_BLOCKING=1 so that I can get benchmark for the kernel performance.
Secondly, when the kernel is executed properly, the time taken is ~8ms. Isn’t it a bit much to just add two 1920x1080x3 images ? With this performance, I would hardly be able to optimize my other image processing algorithms to GPU. Or am I missing something ?
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/calib3d/calib3d.hpp"
#include <opencv2/gpu/gpumat.hpp>
#include <opencv2/gpu/devmem2d.hpp>
#include <iostream>
#include <limits>
#include <numeric>
#include <stdio.h>
using namespace cv;
using namespace std;
using namespace cv::gpu;
#define HOM_MAT_TYPE double
__host__
void display_mat(Mat imageout)
{
int i , j;
unsigned char *ptr;
ptr = imageout.ptr();
for (i = 0 ; i < imageout.rows; i++)
{
for (j = 0 ; j < imageout.cols ; j++)
{
//Vec3b tmp = imageout.at<Vec3b>(i,j);
//cout << (int)tmp(0)<< " " <<(int)tmp(1)<< " " <<(int)tmp(2) <<endl;
cout <<(int)ptr[i*imageout.rows + j]<< " ";
}
}
}
__host__
void add_matrix(Mat &m1, Mat &m2, Mat &out)
{
int rows = m1.rows;
int cols = m1.cols;
int i,j;
Vec3b tmp;
unsigned char* ptr1 = m1.ptr() ;
unsigned char* ptr2 = m2.ptr();
unsigned char* ptr3 = out.ptr();
int gray_tid = 0x00;
int grayWidthStep = m1.step;
for( j = 0 ; j < rows ; j ++)
{
for( i = 0 ; i < cols ; i+=3)
{
gray_tid = j * grayWidthStep + (i*3);
ptr3[gray_tid] = (1.00* ptr1[gray_tid]) + (1.00* ptr2[gray_tid]);
ptr3[gray_tid + 1] = (1.00* ptr1[gray_tid + 1]) + (1.00* ptr2[gray_tid + 1]);
ptr3[gray_tid + 2] = (1.00* ptr1[gray_tid + 2]) + (1.00* ptr2[gray_tid + 2]);
}
}
}
__global__ void add_matrix(char *m1, char *m2, char * outm, int grayWidthStep)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
const int gray_tid = j * grayWidthStep + (i*3);
if(i > 1920)
return;
if(j > 1080)
return;
//printf (" SOMEHTING ");
outm[gray_tid] = (1.00* m1[gray_tid]) + (1.00* m2[gray_tid]);
outm[gray_tid + 1] = (1.00* m1[gray_tid + 1]) + (1.00* m2[gray_tid + 1]);
outm[gray_tid + 2] = (1.00* m1[gray_tid + 2]) + (1.00* m2[gray_tid + 2]);
return;
}
__host__
void fill_matrix(Mat &m)
{
m = Scalar(127,1,1);
}
int main(int argc, char ** argv)
{
Mat imageMain = Mat(1080,1920,CV_8UC3);
Mat imageLogo = Mat(1080,1920,CV_8UC3);
Mat imageout = Mat(1080,1920,CV_8UC3);
char * imageMain_d, *imageLogo_d, *imageout_d;
int i , j;
int64 e;
float ee;
int iterations = 100;
float sum = 0;
int input_size = imageMain.step * imageMain.rows;
fill_matrix(imageMain);
fill_matrix(imageLogo);
cudaMalloc((char **)&imageMain_d,input_size);
cudaMalloc((char **)&imageLogo_d,input_size);
cudaMalloc((char **)&imageout_d,input_size);
cudaMemcpy(imageMain_d, imageMain.ptr(), input_size, cudaMemcpyHostToDevice);
cudaMemcpy(imageLogo_d, imageLogo.ptr(), input_size, cudaMemcpyHostToDevice);
dim3 block(32, 16, 1);
dim3 grid(60, 68, 1);
//dim3 grid((imageMain.cols + block.x - 1)/block.x, (imageMain.rows + block.y - 1)/block.y);
cout <<"STARTING with "<<imageMain.step<<" Step . and Cols : "<< imageMain.cols <<"\n"<<endl;
e = getTickCount();
//add_matrix<<<grid,block>>>(imageMain_d,imageLogo_d,imageout_d,imageMain.step);
ee = ( getTickCount() - e) / getTickFrequency() ;
cout << ee<<endl;
cout <<"DONE\n"<<endl;
cout <<"Copy done \n"<<endl;
for (i = 0 ; i < iterations; i ++)
{
cout <<"Iteration "<<i<<endl;
e = getTickCount();
add_matrix<<<grid,block>>>(imageMain_d,imageLogo_d,imageout_d,imageMain.step);
ee = ( getTickCount() - e) / getTickFrequency() ;
sum += ( ee);
}
cout <<endl;
cudaDeviceSynchronize();
cudaMemcpy(imageout.ptr(),imageout_d,input_size,cudaMemcpyDeviceToHost);
cudaFree(imageout_d);
cudaFree(imageMain_d);
cudaFree(imageLogo_d);
cout <<"GPU : " <<sum/iterations<<endl;
imwrite("imageout_gpu.png",imageout);
sum = 0; e = 0 ; ee = 0;
for (i = 0 ; i < iterations ; i ++)
{
e = getTickCount();
//add_matrix<<<grid,block>>>(imageMain_d,imageLogo_d,imageout_d,imageMain.step);
add_matrix(imageMain,imageLogo,imageout);
ee = ( getTickCount() - e) / getTickFrequency() ;
sum += ( ee);
//cout <<ee<<endl;
//cout << (getTickCount() - e ) / getTickFrequency()<<endl;
}
cout <<"CPU : " <<sum/iterations<<endl;
imwrite("imageout_cpu.png",imageout);
//imshow("stitched_warp",imageMain);
//imshow("stitched_custom",imageout);
//waitKey(0);
return 0;
}