Regarding Usage of Zero Copy on TX1 to improve performance

Hi all,
I am executing Gray to BGRA conversion code on the GPU using a Zero Copy pipeline and the standard CUDA pipeline (where we use cudaMemcpy to copy data from CPU to GPU). I read online at http://arrayfire.com/zero-copy-on-tegra-k1/ that zero copy takes less time than the standard CUDA pipeline, but in this case the time has increased by 10x. I am not able to find where it's going wrong. The article says that memcpy is not required, but without using it I'm not able to generate output. What might be the problem?

Any help on this topic would be appreciated.

Attached below is the code using Zero Copy .

Code :

#include <iostream>
#include <stdio.h>
#include <string.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "Profile.h"

//#define PROFILE
#define CUDA 1

// Expands a 3-channel BGR image into a 4-channel BGRA image (alpha forced to 255).
// Launch layout: blockIdx.x/threadIdx.x walk rows, blockIdx.y/threadIdx.y walk columns.
// BUG FIXED: the original stored the colour channels at 3*index into the 4-byte-per-pixel
// output buffer (only the alpha store used the 4*index stride), corrupting the BGRA result.
// Also restored the `__global__` qualifier (the forum rendering stripped the underscores)
// and added a bounds guard so image dimensions need not be multiples of the block size.
__global__ void kernel(unsigned char *d_output, unsigned char *d_input, int width, int height)
{
	int row = blockIdx.x * blockDim.x + threadIdx.x;
	int col = blockIdx.y * blockDim.y + threadIdx.y;

	// Guard against threads that fall outside the image.
	if (row >= height || col >= width)
		return;

	int index = row * width + col;

	d_output[4 * index]     = d_input[3 * index];      // B
	d_output[4 * index + 1] = d_input[3 * index + 1];  // G
	d_output[4 * index + 2] = d_input[3 * index + 2];  // R
	d_output[4 * index + 3] = 255;                     // opaque alpha
}

using namespace profile;

// Singleton profiler (project-local Profile.h) used to time the memcpy/kernel phases below.
Profile* m_pTimer = Profile::getInstance();

// Launches the Gray->BGRA kernel using the zero-copy (mapped pinned memory) pipeline.
// dev_input/dev_output are device aliases obtained via cudaHostGetDevicePointer, so no
// explicit host<->device copies are performed here. sizeIp/sizeOp are unused in this path.
void GrayToBGRA(const cv::Mat& input, cv::Mat& output, unsigned char *dev_output, unsigned char *dev_input, int sizeIp, int sizeOp)
{
	dim3 numThreadsPerBlock(8, 8);
	// NOTE(review): integer division silently drops the partial blocks at the right/bottom
	// edges when rows/cols are not multiples of 8; once the kernel bounds-checks, switch
	// to ceil-division ((n + block - 1) / block) to cover the whole image.
	dim3 numBlocks(input.rows / numThreadsPerBlock.x, input.cols / numThreadsPerBlock.y);

#ifdef PROFILE
	m_pTimer->StartTimer(QUERYAVG);
#endif
	// Zero-copy pipeline: the input already lives in mapped pinned memory, so the
	// host-to-device staging copy is intentionally absent.
	//cudaMemcpy( dev_input, input.ptr(), sizeIp, cudaMemcpyHostToDevice );
#ifdef PROFILE
	m_pTimer->StopTimer(QUERYAVG);
#endif

#ifdef PROFILE
	m_pTimer->StartTimer(QUERYKERNEL);
#endif
#ifdef CUDA
	// GPU event timer around the kernel launch.
	float time = 0.0f; // initialised so the printf below never reads garbage
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
#endif
	kernel<<< numBlocks, numThreadsPerBlock >>>( dev_output, dev_input, input.cols, input.rows);
#ifdef CUDA
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop); // kernel time in ms
	cudaEventDestroy(start);
	// BUG FIXED: the stop event was never destroyed (per-call resource leak).
	cudaEventDestroy(stop);
#endif
	// Ensure the kernel has finished before the caller reads the mapped output buffer.
	cudaDeviceSynchronize();

#ifdef PROFILE
	m_pTimer->StopTimer(QUERYKERNEL);
#endif

#ifdef PROFILE
	m_pTimer->StartTimer(QUERYMEMCPY2);
	// BUG FIXED: StopTimer(QUERYMEMCPY2) originally ran unconditionally while its
	// StartTimer was PROFILE-guarded, unbalancing the profiler when PROFILE is undefined.
	m_pTimer->StopTimer(QUERYMEMCPY2);
#endif

	printf("\n\nTime taken is %f (ms)\n", time);
}

int main()
{
	// Zero-copy requires a device that can map host memory into the GPU address space.
	cudaDeviceProp prop;
	int whichDevice;
	cudaGetDevice(&whichDevice);
	cudaGetDeviceProperties(&prop, whichDevice);
	if (prop.canMapHostMemory != 1)
	{
		printf("Device Cannot Map Memory ");
		return 0;
	}

	unsigned char *d_output, *d_input;
	cv::Mat input = cv::imread( "/home/ubuntu/Neha/GrayToBGRA/latest1.jpeg" );

	if (input.empty())
	{
		std::cout<<"Image Not Found!"<<std::endl;
		std::cin.get();
		return -1;
	}

	// Output image is 4-channel BGRA.
	cv::Mat output(input.rows, input.cols, CV_8UC4);

	// Must be set before any mapped allocations are made.
	cudaSetDeviceFlags(cudaDeviceMapHost);

	// imread() returns a 3-channel (BGR) image here: 3 bytes/pixel in, 4 bytes/pixel out.
	const int size_input = input.cols * 3 * input.rows;
	const int size_output = output.cols * 4 * output.rows;
	printf("size_input=%d\n", size_input);
	printf("size_output=%d\n", size_output);

	// Allocate mapped (zero-copy) pinned buffers and fetch their device-side aliases.
	// BUG FIXED: the original aliased h_in/h_out to the cv::Mat buffers and then
	// immediately overwrote those pointers with cudaHostAlloc — the aliases were dead code.
	unsigned char *h_in = NULL;
	unsigned char *h_out = NULL;
#ifdef PROFILE
	m_pTimer->StartTimer(QUERYIPM);
#endif
	cudaHostAlloc((void **)&h_in,  size_input,  cudaHostAllocMapped);
	cudaHostAlloc((void **)&h_out, size_output, cudaHostAllocMapped);
	// Stage the image into the pinned buffer. Both pointers are host memory, so a plain
	// memcpy suffices — the original cudaMemcpy(..., cudaMemcpyHostToDevice) showed up
	// as a slow [CUDA memcpy HtoH] in the nvprof trace.
	memcpy(h_in, input.data, size_input);
	cudaHostGetDevicePointer((void **)&d_input,  (void *) h_in , 0);
	cudaHostGetDevicePointer((void **)&d_output, (void *) h_out, 0);
#ifdef PROFILE
	m_pTimer->StopTimer(QUERYIPM);
#endif

	const int iterations = 1; // keep the averaging divisors below in sync with this count
#ifdef PROFILE
	m_pTimer->StartTimer(QUERYTOTAL);
#endif
	for (int i = 0; i < iterations; i++)
	{
		GrayToBGRA(input, output, d_output, d_input, size_input, size_output);
	}
#ifdef PROFILE
	m_pTimer->StopTimer(QUERYTOTAL);
#endif

	// BUG FIXED: the original copied only size_input bytes of the 4-channel result
	// (and tagged a host-to-host copy as DtoH); the BGRA image is size_output bytes
	// of host-visible mapped memory, so a plain memcpy of the full size is correct.
	memcpy(output.data, h_out, size_output);

	// Show the input and output; wait for a key press after each.
	cv::imshow("Input", input);
	cv::waitKey(0);
	cv::imshow("Output", output);
	cv::waitKey(0);

	// BUG FIXED: averages were divided by a hard-coded 500 although the loop above
	// runs `iterations` times.
	std::cout << "Total Time: " << (m_pTimer->getTotalTime(QUERYTOTAL) / iterations) << std::endl;
	std::cout << "Total Time for Cudamalloc:" << (m_pTimer->getTotalTime(QUERYIPM)) << std::endl;
	std::cout << "Total Time for cudamemcpy" << (m_pTimer->getTotalTime(QUERYAVG) / iterations) << std::endl;
	std::cout << "Total Time for kernel:" << (m_pTimer->getTotalTime(QUERYKERNEL) / iterations) << std::endl;
	std::cout << "Total Time for cudamemcpyafter:" << (m_pTimer->getTotalTime(QUERYMEMCPY2) / iterations) << std::endl;

	// BUG FIXED: cudaHostAlloc memory must be released with cudaFreeHost, not cudaFree.
	cudaFreeHost(h_out);
	cudaFreeHost(h_in);
	return 0;
}

Also attaching nvprof stats for reference.

==10116== Profiling result:
Time(%) Time Calls Avg Min Max Name
87.26% 63.516ms 1 63.516ms 63.516ms 63.516ms kernel(unsigned char*, unsigned char*, int, int)
12.74% 9.2751ms 2 4.6375ms 3.4132ms 5.8619ms [CUDA memcpy HtoH]

==10116== API calls:
Time(%) Time Calls Avg Min Max Name
75.47% 255.06ms 2 127.53ms 4.5387ms 250.52ms cudaHostAlloc
18.79% 63.507ms 1 63.507ms 63.507ms 63.507ms cudaEventSynchronize
5.42% 18.330ms 2 9.1648ms 4.8008ms 13.529ms cudaMemcpy
0.10% 335.33us 83 4.0400us 1.0000us 127.83us cuDeviceGetAttribute
0.07% 220.42us 1 220.42us 220.42us 220.42us cudaGetDeviceProperties
0.05% 180.17us 1 180.17us 180.17us 180.17us cudaLaunch
0.04% 130.17us 2 65.083us 61.667us 68.500us cudaEventRecord
0.01% 48.081us 2 24.040us 8.4160us 39.665us cudaEventCreate
0.01% 25.750us 1 25.750us 25.750us 25.750us cudaDeviceSynchronize
0.01% 24.917us 1 24.917us 24.917us 24.917us cudaGetDevice
0.01% 23.167us 2 11.583us 6.3330us 16.834us cudaHostGetDevicePointer
0.01% 20.583us 1 20.583us 20.583us 20.583us cudaEventElapsedTime
0.00% 13.833us 1 13.833us 13.833us 13.833us cudaSetDeviceFlags
0.00% 10.500us 4 2.6250us 1.5830us 3.1660us cudaSetupArgument
0.00% 7.9170us 1 7.9170us 7.9170us 7.9170us cudaConfigureCall
0.00% 7.7500us 2 3.8750us 1.7500us 6.0000us cuDeviceGetCount
0.00% 7.5000us 1 7.5000us 7.5000us 7.5000us cudaEventDestroy
0.00% 3.9170us 1 3.9170us 3.9170us 3.9170us cuDeviceTotalMem
0.00% 3.3330us 2 1.6660us 1.4160us 1.9170us cuDeviceGet
0.00% 2.5000us 1 2.5000us 2.5000us 2.5000us cuDeviceGetName

Original Code without Zero Copy Pipeline. (Using memcpy / CudaMalloc)

Code:

#include
#include<stdio.h>
#include<opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>
#include<cuda_runtime.h>
#include “device_launch_parameters.h”
#include “Profile.h”

//#define PROFILE 0
#define CUDA 1

// Expands a 3-channel BGR image into a 4-channel BGRA image (alpha forced to 255).
// Launch layout: blockIdx.x/threadIdx.x walk rows, blockIdx.y/threadIdx.y walk columns.
// Restored the `__global__` qualifier (the forum rendering stripped the underscores)
// and added a bounds guard so image dimensions need not be multiples of the block size.
__global__ void GrayToBGRA(unsigned char *d_output, unsigned char *d_input, int width, int height)
{
	int row = blockIdx.x * blockDim.x + threadIdx.x;
	int col = blockIdx.y * blockDim.y + threadIdx.y;

	// Guard against threads that fall outside the image.
	if (row >= height || col >= width)
		return;

	int index = row * width + col;

	d_output[4 * index]     = d_input[3 * index];      // B
	d_output[4 * index + 1] = d_input[3 * index + 1];  // G
	d_output[4 * index + 2] = d_input[3 * index + 2];  // R
	d_output[4 * index + 3] = 255;                     // opaque alpha
}

using namespace profile;

// Singleton profiler (project-local Profile.h) used to time the memcpy/kernel phases below.
Profile* m_pTimer = Profile::getInstance();

// Launches the Gray->BGRA kernel using the standard pipeline: HtoD staging copy,
// kernel launch on device memory, DtoH result copy. sizeIp/sizeOp are the input and
// output buffer sizes in bytes.
void GrayToBGRA(const cv::Mat& input, cv::Mat& output, unsigned char *dev_output, unsigned char *dev_input, int sizeIp, int sizeOp)
{
	dim3 numThreadsPerBlock(8, 8);
	// NOTE(review): integer division silently drops the partial blocks at the right/bottom
	// edges when rows/cols are not multiples of 8; once the kernel bounds-checks, switch
	// to ceil-division ((n + block - 1) / block) to cover the whole image.
	dim3 numBlocks(input.rows / numThreadsPerBlock.x, input.cols / numThreadsPerBlock.y);

#ifdef PROFILE
	m_pTimer->StartTimer(QUERYAVG);
#endif
	// Stage the input image into device memory.
	cudaMemcpy( dev_input, input.ptr(), sizeIp, cudaMemcpyHostToDevice );
#ifdef PROFILE
	m_pTimer->StopTimer(QUERYAVG);
#endif

#ifdef PROFILE
	m_pTimer->StartTimer(QUERYKERNEL);
#endif
#ifdef CUDA
	// GPU event timer around the kernel launch.
	float time = 0.0f; // initialised so the printf below never reads garbage
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
#endif
	GrayToBGRA<<< numBlocks, numThreadsPerBlock >>>( dev_output, dev_input, input.cols, input.rows);
#ifdef CUDA
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop); // kernel time in ms
	cudaEventDestroy(start);
	// BUG FIXED: the stop event was never destroyed (per-call resource leak).
	cudaEventDestroy(stop);
#endif
#ifdef PROFILE
	m_pTimer->StopTimer(QUERYKERNEL);
#endif

#ifdef PROFILE
	m_pTimer->StartTimer(QUERYMEMCPY2);
#endif
	// Blocking DtoH copy — also serves as the synchronization point for the kernel.
	cudaMemcpy( output.ptr(), dev_output, sizeOp, cudaMemcpyDeviceToHost );
#ifdef PROFILE
	m_pTimer->StopTimer(QUERYMEMCPY2);
#endif
	printf("\n\nTime taken is %f (ms)\n", time);
}

int main()
{
	unsigned char *d_output, *d_input;
	cv::Mat input = cv::imread( "/home/ubuntu/Neha/GrayToBGRA/latest1.jpeg" );

	if (input.empty())
	{
		std::cout<<"Image Not Found!"<<std::endl;
		std::cin.get();
		return -1;
	}

	// Output image is 4-channel BGRA.
	cv::Mat output(input.rows, input.cols, CV_8UC4);

	// imread() returns a 3-channel (BGR) image here: 3 bytes/pixel in, 4 bytes/pixel out.
	const int size_input = input.cols * 3 * input.rows;
	const int size_output = output.cols * 4 * output.rows;
	printf("size_input=%d\n", size_input);
	printf("size_output=%d\n", size_output);

	// Allocate device memory for input and output.
#ifdef PROFILE
	m_pTimer->StartTimer(QUERYIPM);
#endif
	cudaMalloc(&d_input, size_input);
#ifdef PROFILE
	m_pTimer->StopTimer(QUERYIPM);
#endif
	cudaMalloc(&d_output, size_output);

	const int iterations = 1; // keep the averaging divisors below in sync with this count
#ifdef PROFILE
	m_pTimer->StartTimer(QUERYTOTAL);
#endif
	for (int i = 0; i < iterations; i++)
	{
		GrayToBGRA(input, output, d_output, d_input, size_input, size_output);
	}
#ifdef PROFILE
	m_pTimer->StopTimer(QUERYTOTAL);
#endif

	// Show the input and output; wait for a key press after each.
	cv::imshow("Input", input);
	cv::waitKey(0);
	cv::imshow("Output", output);
	cv::waitKey(0);

	// BUG FIXED: averages were divided by a hard-coded 500 although the loop above
	// runs `iterations` times.
	std::cout << "Total Time: " << (m_pTimer->getTotalTime(QUERYTOTAL) / iterations) << std::endl;
	std::cout << "Total Time for Cudamalloc:" << (m_pTimer->getTotalTime(QUERYIPM)) << std::endl;
	std::cout << "Total Time for cudamemcpy" << (m_pTimer->getTotalTime(QUERYAVG) / iterations) << std::endl;
	std::cout << "Total Time for kernel:" << (m_pTimer->getTotalTime(QUERYKERNEL) / iterations) << std::endl;
	std::cout << "Total Time for cudamemcpyafter:" << (m_pTimer->getTotalTime(QUERYMEMCPY2) / iterations) << std::endl;

	cudaFree(d_output);
	cudaFree(d_input);
	return 0;
}

Nvprof result :

==10133== Profiling result:
Time(%) Time Calls Avg Min Max Name
41.17% 6.4118ms 1 6.4118ms 6.4118ms 6.4118ms [CUDA memcpy DtoH]
40.73% 6.3435ms 1 6.3435ms 6.3435ms 6.3435ms GrayToBGRA(unsigned char*, unsigned char*, int, int)
18.11% 2.8205ms 1 2.8205ms 2.8205ms 2.8205ms [CUDA memcpy HtoD]

==10133== API calls:
Time(%) Time Calls Avg Min Max Name
92.87% 259.62ms 2 129.81ms 13.468ms 246.15ms cudaMalloc
4.63% 12.947ms 2 6.4735ms 3.7474ms 9.1996ms cudaMemcpy
2.27% 6.3467ms 1 6.3467ms 6.3467ms 6.3467ms cudaEventSynchronize
0.11% 316.75us 83 3.8160us 1.0830us 116.17us cuDeviceGetAttribute
0.05% 142.83us 1 142.83us 142.83us 142.83us cudaLaunch
0.03% 79.251us 2 39.625us 38.167us 41.084us cudaEventRecord
0.02% 51.084us 2 25.542us 7.3340us 43.750us cudaEventCreate
0.01% 17.583us 1 17.583us 17.583us 17.583us cudaEventElapsedTime
0.00% 13.417us 4 3.3540us 1.2500us 8.1670us cudaSetupArgument
0.00% 8.1660us 2 4.0830us 2.3330us 5.8330us cuDeviceGetCount
0.00% 7.4170us 1 7.4170us 7.4170us 7.4170us cudaEventDestroy
0.00% 4.9160us 1 4.9160us 4.9160us 4.9160us cudaConfigureCall
0.00% 4.3330us 1 4.3330us 4.3330us 4.3330us cuDeviceTotalMem
0.00% 3.0000us 2 1.5000us 1.2500us 1.7500us cuDeviceGet
0.00% 2.6660us 1 2.6660us 2.6660us 2.6660us cuDeviceGetName

There is a huge difference between the times of the kernel-launching functions for both codes, even though the code inside the kernel is the same.

Hi Aditya_K,

Regarding the article [url]http://arrayfire.com/zero-copy-on-tegra-k1/[/url] from 2014 stating that zero-copy is faster than cudaMalloc: this article is misleading and over-generalizes the zero-copy case. It is not really accurate.
Zero copy is only faster in some cases where the access pattern does not benefit from caches.

Zero-copy memory on Tegra is uncached on both the CPU and the GPU, so every access by the CUDA kernel goes to DRAM. If the kernel repeatedly accesses the same memory locations, then memory allocated with cudaMalloc is likely to be faster.

Thanks