I have a question about the Nsight profile of my cuFFT code.
I wrote a very simple 1D cuFFT sample and checked its profile with Nsight.
My 1D cuFFT code is shown below.
It is a very simple 1D cuFFT program that uses pageable memory and Unified Memory.
The code runs without any problem.
But I have one question about the Nsight profile.
We can see “Cuda Event Create” and “Cuda Free” at the beginning of the Nsight profile.
They took a very long time: more than 500 ms.
Why did they need such a long time?
I guess that the CUDA API and runtime perform various GPU setup steps at the first cuFFT operation,
and hence they need a long time (more than 500 ms).
Is that right?
I would really appreciate it if anyone could reply soon.
//****************************************************************************
#define _USE_MATH_DEFINES
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#define NX 2048
#define BATCH 1
// Elapsed-time slots in milliseconds, one per measured phase.
// Slots 0-3 are written by both the pageable and the unified-memory sections.
float elapsed_time0_ms = 0.0f, elapsed_time1_ms = 0.0f, elapsed_time2_ms = 0.0f;
float elapsed_time3_ms = 0.0f, elapsed_time4_ms = 0.0f, elapsed_time5_ms = 0.0f;

// General-purpose loop index kept at file scope.
int i;

// Start/stop event pairs for the six phases of the pageable-memory section.
cudaEvent_t start0, stop0, start1, stop1, start2, stop2;
cudaEvent_t start3, stop3, start4, stop4, start5, stop5;

// Start/stop event pairs for the four phases of the unified-memory section.
cudaEvent_t ustart0, ustop0, ustart1, ustop1;
cudaEvent_t ustart2, ustop2, ustart3, ustop3;
// Abort with a diagnostic when a CUDA runtime call does not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaStatus_ = (call);                                     \
        if (cudaStatus_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cudaStatus_));                         \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Abort with a diagnostic when a cuFFT call does not return CUFFT_SUCCESS.
#define CUFFT_CHECK(call)                                                     \
    do {                                                                      \
        cufftResult cufftStatus_ = (call);                                    \
        if (cufftStatus_ != CUFFT_SUCCESS) {                                  \
            fprintf(stderr, "cuFFT error %s:%d: code %d\n", __FILE__,         \
                    __LINE__, (int)cufftStatus_);                             \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Records `stop` on the default stream, waits for it to complete, and returns
// the time in milliseconds elapsed since `start` was recorded.
static float finishTiming(cudaEvent_t start, cudaEvent_t stop)
{
    float ms = 0.0f;
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    return ms;
}

// Runs a 1D complex-to-complex forward cuFFT of NX*BATCH points twice, first
// with pageable host memory plus an explicit device buffer, then with unified
// (managed) memory, and prints the event-measured time of every phase.
//
// Returns 0 on success. Any failing CUDA/cuFFT call terminates the process
// with EXIT_FAILURE; a failed host allocation returns EXIT_FAILURE.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // Every timing event declared at file scope, so creation and destruction
    // can be done in a loop instead of twenty separate calls.
    cudaEvent_t *const timingEvents[] = {
        &start0,  &stop0,  &start1,  &stop1,  &start2,  &stop2,
        &start3,  &stop3,  &start4,  &stop4,  &start5,  &stop5,
        &ustart0, &ustop0, &ustart1, &ustop1,
        &ustart2, &ustop2, &ustart3, &ustop3
    };
    const size_t eventCount = sizeof(timingEvents) / sizeof(timingEvents[0]);

    // Size in bytes of one signal buffer (host, device, or managed).
    const size_t bytes = sizeof(cufftComplex) * NX * BATCH;

    // CUDA Time Measure Event Make
    // NOTE: these are the first CUDA runtime calls of the program. The runtime
    // initializes itself lazily on first use, so a profiler will typically
    // attribute that one-time start-up cost to the first call made here.
    for (size_t e = 0; e < eventCount; ++e) {
        CUDA_CHECK(cudaEventCreate(timingEvents[e]));
    }

    // Pageable Memory cuFFT Memory, Handle
    cufftHandle plan;
    cufftComplex *devPtr = NULL;
    cufftComplex *data = NULL;

    // Unified Memory cuFFT Memory, Handle
    cufftHandle uplan;
    cufftComplex *udata = NULL;

    for (int LOOP = 0; LOOP < 2; ++LOOP) {
        elapsed_time0_ms = 0.0f;
        elapsed_time1_ms = 0.0f;
        elapsed_time2_ms = 0.0f;
        elapsed_time3_ms = 0.0f;
        elapsed_time4_ms = 0.0f;
        elapsed_time5_ms = 0.0f;

        ////
        // 1D FFT Pageable Memory Sample Code
        ////
        printf("LOOP:%2d\n", LOOP);

        // Phase 0: host + device allocation
        CUDA_CHECK(cudaEventRecord(start0, 0));
        data = (cufftComplex *)malloc(bytes);
        if (data == NULL) {
            fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
            return EXIT_FAILURE;
        }
        CUDA_CHECK(cudaMalloc((void **)&devPtr, bytes));
        elapsed_time0_ms = finishTiming(start0, stop0);
        printf("Pageable Memory cuFFT Time\n\n");
        printf("cudaMallloc Time: %8.2f ms\n", elapsed_time0_ms);

        // Input data: real sine wave with a period of 256 samples
        for (int idx = 0; idx < NX * BATCH; idx++) {
            data[idx].x = (float)(1000.0f * sin(2.0 * M_PI / 256.0f * idx));
            data[idx].y = 0.0f;
        }

        // Phase 1: host -> device copy
        CUDA_CHECK(cudaEventRecord(start1, 0));
        CUDA_CHECK(cudaMemcpy(devPtr, data, bytes, cudaMemcpyHostToDevice));
        elapsed_time1_ms = finishTiming(start1, stop1);
        printf("cudaMemcpy HostToDevice Time: %8.2f ms\n", elapsed_time1_ms);

        // Phase 2: 1D FFT plan creation
        CUDA_CHECK(cudaEventRecord(start2, 0));
        CUFFT_CHECK(cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH));
        elapsed_time2_ms = finishTiming(start2, stop2);
        printf("Pageable FFT Handle Make Time: %8.2f ms\n", elapsed_time2_ms);

        // Phase 3: in-place forward FFT on the device buffer
        CUDA_CHECK(cudaEventRecord(start3, 0));
        CUFFT_CHECK(cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD));
        elapsed_time3_ms = finishTiming(start3, stop3);
        printf("Pageable cufftExec Time: %8.2f ms\n", elapsed_time3_ms);

        // Phase 4: device -> host copy of the result
        CUDA_CHECK(cudaEventRecord(start4, 0));
        CUDA_CHECK(cudaMemcpy(data, devPtr, bytes, cudaMemcpyDeviceToHost));
        elapsed_time4_ms = finishTiming(start4, stop4);
        // This phase is the device-to-host copy, so the label says so.
        printf("cudaMemcpy DeviceToHost Time: %8.2f ms\n", elapsed_time4_ms);

        // Debug dump of the result; must stay before free(data) below.
        // printf("Normal Memory FFT Result\n");
        // for (int idx = 0; idx < NX * BATCH; idx++) {
        //     printf("data[%d] %lf %lf\n", idx, data[idx].x, data[idx].y);
        // }

        // Phase 5: release plan, host buffer and device buffer
        CUDA_CHECK(cudaEventRecord(start5, 0));
        CUFFT_CHECK(cufftDestroy(plan));
        free(data);
        data = NULL;
        CUDA_CHECK(cudaFree(devPtr));
        devPtr = NULL;
        elapsed_time5_ms = finishTiming(start5, stop5);
        printf("Pageable Memory FFT End Process: %8.2f ms\n\n", elapsed_time5_ms);

        ////
        // 1D FFT Unified Memory Sample Code
        ////

        // Phase 0: managed allocation
        CUDA_CHECK(cudaEventRecord(ustart0, 0));
        CUDA_CHECK(cudaMallocManaged(&udata, bytes));
        elapsed_time0_ms = finishTiming(ustart0, ustop0);
        printf("Unified Memory cuFFT Time\n\n");
        printf("cudaMalllocManaged Time: %8.2f ms\n", elapsed_time0_ms);

        // Input data: real sine wave with a period of 64 samples
        for (int idx = 0; idx < NX * BATCH; idx++) {
            udata[idx].x = (float)(1000.0f * sin(2.0 * M_PI / 64.0 * idx));
            udata[idx].y = 0.0f;
        }

        // Phase 1: FFT plan creation for the unified-memory run
        CUDA_CHECK(cudaEventRecord(ustart1, 0));
        CUFFT_CHECK(cufftPlan1d(&uplan, NX, CUFFT_C2C, BATCH));
        elapsed_time1_ms = finishTiming(ustart1, ustop1);
        printf("Unified FFT Handle Make: %8.2f ms\n", elapsed_time1_ms);

        // Phase 2: in-place forward FFT on the managed buffer
        CUDA_CHECK(cudaEventRecord(ustart2, 0));
        CUFFT_CHECK(cufftExecC2C(uplan, udata, udata, CUFFT_FORWARD));
        elapsed_time2_ms = finishTiming(ustart2, ustop2);
        printf("Unified cufft Excecute Time: %8.2f ms\n", elapsed_time2_ms);

        // Debug dump of the result; must stay before cudaFree(udata) below.
        // printf("Unifyed Memory FFT Result\n");
        // for (int idx = 0; idx < NX * BATCH; idx++) {
        //     printf("udata[%d] %lf %lf\n", idx, udata[idx].x, udata[idx].y);
        // }

        // Phase 3: release managed buffer and plan
        CUDA_CHECK(cudaEventRecord(ustart3, 0));
        CUDA_CHECK(cudaFree(udata));
        udata = NULL;
        CUFFT_CHECK(cufftDestroy(uplan));
        elapsed_time3_ms = finishTiming(ustart3, ustop3);
        printf("Unified Memory FFT End Process: %8.2f ms\n\n", elapsed_time3_ms);
    } // LOOP END

    // Time Measure Event Release (pageable and unified)
    for (size_t e = 0; e < eventCount; ++e) {
        CUDA_CHECK(cudaEventDestroy(*timingEvents[e]));
    }

    // GPU Reset
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}