Hi all,
I am executing Gray to BGRA conversion code on GPU using Zero Copy Pipeline and Standard CUDA pipeline (where we use cudaMemcpy for copying data from CPU to GPU.) . I read online on http://arrayfire.com/zero-copy-on-tegra-k1/ that Zero copy takes less time as compared to Standard CUDA pipeline. But in this case , the time has increased by 10x . I am not able to find where its going wrong . It says that memcpy is not required but without using it i’m not able to generate output. What might be the problem ?
Any help on this topic would be appreciated.
Attached below is the code using Zero Copy .
Code :
#include
#include<stdio.h>
#include<string.h>
#include<opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>
#include<cuda_runtime.h>
#include “device_launch_parameters.h”
#include “Profile.h”
//#define PROFILE
#define CUDA 1
global void kernel(unsigned char *d_output , unsigned char *d_input, int width, int height)
{
int rows = (blockIdx.x * blockDim.x + threadIdx.x);
int cols = (blockIdx.y * blockDim.y + threadIdx.y) ;
int index = rows * width + cols ;
d_output[3 * index ] = d_input[3 * index];
d_output[3 * index +1] = d_input[3 * index + 1];
d_output[3 * index +2] = d_input[3 * index + 2];
d_output[4 * index +3] = 255; //Alpha value
}
using namespace profile;
Profile* m_pTimer = Profile::getInstance();
void GrayToBGRA(const cv::Mat& input, cv::Mat& output, unsigned char *dev_output, unsigned char *dev_input, int sizeIp, int sizeOp)
{
dim3 numThreadsPerBlock(8,8);
dim3 numBlocks(input.rows/numThreadsPerBlock.x, input.cols/numThreadsPerBlock.y);
#ifdef PROFILE
m_pTimer->StartTimer(QUERYAVG);
#endif
//cudaMemcpy( dev_input, input.ptr(), sizeIp, cudaMemcpyHostToDevice );
#ifdef PROFILE
m_pTimer->StopTimer(QUERYAVG);
#endif
#ifdef PROFILE
m_pTimer->StartTimer(QUERYKERNEL);
#endif
//cudaSetDevice(1);
#ifdef CUDA
//GPU timer code
float time;
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
#endif
kernel<<< numBlocks, numThreadsPerBlock >>>( dev_output, dev_input, input.cols, input.rows);
#ifdef CUDA
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop); //time taken in kernel call calculated
cudaEventDestroy(start);
#endif
cudaDeviceSynchronize();
// cudaThreadSynchronize();
#ifdef PROFILE
m_pTimer->StopTimer(QUERYKERNEL);
#endif
#ifdef PROFILE
m_pTimer->StartTimer(QUERYMEMCPY2);
#endif
m_pTimer->StopTimer(QUERYMEMCPY2);
printf("\n\nTime taken is %f (ms)\n",time);
}
int main()
{
cudaDeviceProp prop;
int whichDevice;
cudaGetDevice(&whichDevice);
cudaGetDeviceProperties(&prop,whichDevice);
if (prop.canMapHostMemory !=1)
{
printf("Device Cannot Map Memory ");
return 0;
}
unsigned char *d_output, *d_input;
cv::Mat input = cv::imread( "/home/ubuntu/Neha/GrayToBGRA/latest1.jpeg" );
if(input.empty())
{
std::cout<<"Image Not Found!"<<std::endl;
std::cin.get();
return -1;
}
//Create output image
cv::Mat output(input.rows,input.cols,CV_8UC4);
cudaSetDeviceFlags(cudaDeviceMapHost);
unsigned char *h_in = input.data;
unsigned char *h_out= output.data;
const int size_input = input.cols * 3 * input.rows;
const int size_output = output.cols * 4 * output.rows;
printf("size_input=%d\n",size_input);
printf("size_output=%d\n",size_output);
//Allocate device memory
#ifdef PROFILE
m_pTimer->StartTimer(QUERYIPM);
#endif
cudaHostAlloc((void **)&h_in, size_input, cudaHostAllocMapped);
cudaHostAlloc((void **)&h_out, size_output, cudaHostAllocMapped);
cudaMemcpy(h_in,input.data,size_input,cudaMemcpyHostToDevice);
cudaHostGetDevicePointer((void **)&d_input, (void *) h_in , 0);
cudaHostGetDevicePointer((void **)&d_output, (void *) h_out, 0);
//cudaMalloc(&d_input,size_input);
#ifdef PROFILE
m_pTimer->StopTimer(QUERYIPM);
#endif
//cudaMalloc(&d_output,size_output);
#ifdef PROFILE
m_pTimer->StartTimer(QUERYTOTAL);
#endif
for(int i=0;i<1;i++)
{
GrayToBGRA(input,output,d_output,d_input,size_input,size_output);
}
#ifdef PROFILE
m_pTimer->StopTimer(QUERYTOTAL);
#endif
cudaMemcpy(output.data,h_out,size_input,cudaMemcpyDeviceToHost);
//printf("Horizontal Flip After\n");
//Show the input and output
cv::imshow("Input",input);
cv::waitKey(0);
cv::imshow("Output",output);
//Wait for key press
cv::waitKey(0);
cout <<"Total Time: "<< (m_pTimer->getTotalTime(QUERYTOTAL)/500)<<endl;
cout <<"Total Time for Cudamalloc:"<< (m_pTimer->getTotalTime(QUERYIPM))<<endl;
cout <<"Total Time for cudamemcpy"<< (m_pTimer->getTotalTime(QUERYAVG)/500)<<endl;
cout <<"Total Time for kernel:"<< (m_pTimer->getTotalTime(QUERYKERNEL)/500)<<endl;
cout <<"Total Time for cudamemcpyafter:"<< (m_pTimer->getTotalTime(QUERYMEMCPY2)/500)<<endl;
cudaFree(h_out);
cudaFree(h_in);
return 0;
}
Also attaching nvprof stats for reference.
==10116== Profiling result:
Time(%) Time Calls Avg Min Max Name
87.26% 63.516ms 1 63.516ms 63.516ms 63.516ms kernel(unsigned char*, unsigned char*, int, int)
12.74% 9.2751ms 2 4.6375ms 3.4132ms 5.8619ms [CUDA memcpy HtoH]
==10116== API calls:
Time(%) Time Calls Avg Min Max Name
75.47% 255.06ms 2 127.53ms 4.5387ms 250.52ms cudaHostAlloc
18.79% 63.507ms 1 63.507ms 63.507ms 63.507ms cudaEventSynchronize
5.42% 18.330ms 2 9.1648ms 4.8008ms 13.529ms cudaMemcpy
0.10% 335.33us 83 4.0400us 1.0000us 127.83us cuDeviceGetAttribute
0.07% 220.42us 1 220.42us 220.42us 220.42us cudaGetDeviceProperties
0.05% 180.17us 1 180.17us 180.17us 180.17us cudaLaunch
0.04% 130.17us 2 65.083us 61.667us 68.500us cudaEventRecord
0.01% 48.081us 2 24.040us 8.4160us 39.665us cudaEventCreate
0.01% 25.750us 1 25.750us 25.750us 25.750us cudaDeviceSynchronize
0.01% 24.917us 1 24.917us 24.917us 24.917us cudaGetDevice
0.01% 23.167us 2 11.583us 6.3330us 16.834us cudaHostGetDevicePointer
0.01% 20.583us 1 20.583us 20.583us 20.583us cudaEventElapsedTime
0.00% 13.833us 1 13.833us 13.833us 13.833us cudaSetDeviceFlags
0.00% 10.500us 4 2.6250us 1.5830us 3.1660us cudaSetupArgument
0.00% 7.9170us 1 7.9170us 7.9170us 7.9170us cudaConfigureCall
0.00% 7.7500us 2 3.8750us 1.7500us 6.0000us cuDeviceGetCount
0.00% 7.5000us 1 7.5000us 7.5000us 7.5000us cudaEventDestroy
0.00% 3.9170us 1 3.9170us 3.9170us 3.9170us cuDeviceTotalMem
0.00% 3.3330us 2 1.6660us 1.4160us 1.9170us cuDeviceGet
0.00% 2.5000us 1 2.5000us 2.5000us 2.5000us cuDeviceGetName
Original Code without Zero Copy Pipeline. (Using memcpy / CudaMalloc)
Code:
#include
#include<stdio.h>
#include<opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>
#include<cuda_runtime.h>
#include “device_launch_parameters.h”
#include “Profile.h”
//#define PROFILE 0
#define CUDA 1
global void GrayToBGRA(unsigned char *d_output , unsigned char *d_input, int width, int height)
{
int rows = (blockIdx.x * blockDim.x + threadIdx.x);
int cols = (blockIdx.y * blockDim.y + threadIdx.y) ;
int index = rows * width + cols ;
d_output[4 * index ] = d_input[3 * index];
d_output[4 * index +1] = d_input[3 * index + 1];
d_output[4 * index +2] = d_input[3 * index + 2];
d_output[4 * index +3] = 255; //Alpha value
}
using namespace profile;
Profile* m_pTimer = Profile::getInstance();
void GrayToBGRA(const cv::Mat& input, cv::Mat& output, unsigned char *dev_output, unsigned char *dev_input, int sizeIp, int sizeOp)
{
dim3 numThreadsPerBlock(8,8);
dim3 numBlocks(input.rows/numThreadsPerBlock.x, input.cols/numThreadsPerBlock.y);
#ifdef PROFILE
m_pTimer->StartTimer(QUERYAVG);
#endif
cudaMemcpy( dev_input, input.ptr(), sizeIp, cudaMemcpyHostToDevice );
#ifdef PROFILE
m_pTimer->StopTimer(QUERYAVG);
#endif
#ifdef PROFILE
m_pTimer->StartTimer(QUERYKERNEL);
#endif
//cudaSetDevice(1);
#ifdef CUDA
//GPU timer code
float time;
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
#endif
GrayToBGRA<<< numBlocks, numThreadsPerBlock >>>( dev_output, dev_input, input.cols, input.rows);
#ifdef CUDA
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop); //time taken in kernel call calculated
cudaEventDestroy(start);
#endif
//cudaThreadSynchronize();
#ifdef PROFILE
m_pTimer->StopTimer(QUERYKERNEL);
#endif
#ifdef PROFILE
m_pTimer->StartTimer(QUERYMEMCPY2);
#endif
cudaMemcpy( output.ptr(), dev_output, sizeOp, cudaMemcpyDeviceToHost );
#ifdef PROFILE
m_pTimer->StopTimer(QUERYMEMCPY2);
#endif
printf("\n\nTime taken is %f (ms)\n",time);
}
int main()
{
unsigned char *d_output, *d_input;
//cudaFree(0);
cv::Mat input = cv::imread( "/home/ubuntu/Neha/GrayToBGRA/latest1.jpeg" );
if(input.empty())
{
std::cout<<"Image Not Found!"<<std::endl;
std::cin.get();
return -1;
}
//Create output image
cv::Mat output(input.rows,input.cols,CV_8UC4);
const int size_input = input.cols * 3 * input.rows;
const int size_output = output.cols * 4 * output.rows;
printf("size_input=%d\n",size_input);
printf("size_output=%d\n",size_output);
//Allocate device memory
#ifdef PROFILE
m_pTimer->StartTimer(QUERYIPM);
#endif
cudaMalloc(&d_input,size_input);
#ifdef PROFILE
m_pTimer->StopTimer(QUERYIPM);
#endif
cudaMalloc(&d_output,size_output);
#ifdef PROFILE
m_pTimer->StartTimer(QUERYTOTAL);
#endif
for(int i=0;i<1;i++)
{
GrayToBGRA(input,output,d_output,d_input,size_input,size_output);
}
#ifdef PROFILE
m_pTimer->StopTimer(QUERYTOTAL);
#endif
//printf("Horizontal Flip After\n");
//Show the input and output
cv::imshow("Input",input);
cv::waitKey(0);
cv::imshow("Output",output);
//Wait for key press
cv::waitKey(0);
cout <<"Total Time: "<< (m_pTimer->getTotalTime(QUERYTOTAL)/500)<<endl;
cout <<"Total Time for Cudamalloc:"<< (m_pTimer->getTotalTime(QUERYIPM))<<endl;
cout <<"Total Time for cudamemcpy"<< (m_pTimer->getTotalTime(QUERYAVG)/500)<<endl;
cout <<"Total Time for kernel:"<< (m_pTimer->getTotalTime(QUERYKERNEL)/500)<<endl;
cout <<"Total Time for cudamemcpyafter:"<< (m_pTimer->getTotalTime(QUERYMEMCPY2)/500)<<endl;
cudaFree(d_output);
cudaFree(d_input);
return 0;
}
Nvprof result :
==10133== Profiling result:
Time(%) Time Calls Avg Min Max Name
41.17% 6.4118ms 1 6.4118ms 6.4118ms 6.4118ms [CUDA memcpy DtoH]
40.73% 6.3435ms 1 6.3435ms 6.3435ms 6.3435ms GrayToBGRA(unsigned char*, unsigned char*, int, int)
18.11% 2.8205ms 1 2.8205ms 2.8205ms 2.8205ms [CUDA memcpy HtoD]
==10133== API calls:
Time(%) Time Calls Avg Min Max Name
92.87% 259.62ms 2 129.81ms 13.468ms 246.15ms cudaMalloc
4.63% 12.947ms 2 6.4735ms 3.7474ms 9.1996ms cudaMemcpy
2.27% 6.3467ms 1 6.3467ms 6.3467ms 6.3467ms cudaEventSynchronize
0.11% 316.75us 83 3.8160us 1.0830us 116.17us cuDeviceGetAttribute
0.05% 142.83us 1 142.83us 142.83us 142.83us cudaLaunch
0.03% 79.251us 2 39.625us 38.167us 41.084us cudaEventRecord
0.02% 51.084us 2 25.542us 7.3340us 43.750us cudaEventCreate
0.01% 17.583us 1 17.583us 17.583us 17.583us cudaEventElapsedTime
0.00% 13.417us 4 3.3540us 1.2500us 8.1670us cudaSetupArgument
0.00% 8.1660us 2 4.0830us 2.3330us 5.8330us cuDeviceGetCount
0.00% 7.4170us 1 7.4170us 7.4170us 7.4170us cudaEventDestroy
0.00% 4.9160us 1 4.9160us 4.9160us 4.9160us cudaConfigureCall
0.00% 4.3330us 1 4.3330us 4.3330us 4.3330us cuDeviceTotalMem
0.00% 3.0000us 2 1.5000us 1.2500us 1.7500us cuDeviceGet
0.00% 2.6660us 1 2.6660us 2.6660us 2.6660us cuDeviceGetName
There is a huge difference b/w the time of the kernel launching functions for both the Codes, though the code inside the kernel is same.