Optimizing cuFFT Program Help

Hello, I have a CUDA program that performs an FFT on 2D images captured continuously from a Point Grey camera, and I am running into speed issues. The goal of this program is to perform the FFT as fast as or faster than the camera's frame rate. If you look at the time it takes to perform the actual FFT, it is outstandingly fast (~3 ms); however, a lot of extra time is being eaten up either destroying the plan or freeing the memory. The conversion from OpenCV Mat to C++ array is also slow, but I can fix that later. I have two cases:

  1. Destroying the plan takes a long time, but freeing the memory doesn’t.
  2. I try to bypass having to create and destroy the plan every single time my main.cpp calls this cuda function by creating separate functions that will only be called once to destroy or create the plan, but as a consequence freeing the memory takes longer.

Would anyone be able to explain this or offer possible solutions?

#include <cufft.h>
#include <cuda.h>

#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>

#include "header.h"

int fastft(int width, int height, float data[0],double t,double capturetime)
{
//Create Variables
double t1 = (double)cv::getTickCount();
cufftHandle plan;
cufftComplex *transform;
t1 = ((double)cv::getTickCount() - t1)/cv::getTickFrequency();

//Allocate memory
double t2 = (double)cv::getTickCount();
cudaMallocManaged((void**)&data, sizeof(cufftReal)*width*height); 
cudaMallocManaged((void**)&transform, sizeof(cufftComplex)*width*height);
t2 = ((double)cv::getTickCount() - t2)/cv::getTickFrequency();

//Create Plan
double t3 = (double)cv::getTickCount();
cufftPlan2d(&plan, width, height, CUFFT_R2C);
t3 = ((double)cv::getTickCount() - t3)/cv::getTickFrequency();

//Execute fft
double t4 = (double)cv::getTickCount();
cufftExecR2C(plan, data, transform);
t4 = ((double)cv::getTickCount() - t4)/cv::getTickFrequency(); 

//Destroy plan
double t5 = (double)cv::getTickCount();
cufftDestroy(plan);
t5 = ((double)cv::getTickCount() - t5)/cv::getTickFrequency();

//Free memory
double t6 = (double)cv::getTickCount();
cudaFree(data);
t6 = ((double)cv::getTickCount() - t6)/cv::getTickFrequency();

double t7 = (double)cv::getTickCount();
cudaFree(transform);
t7 = ((double)cv::getTickCount() - t7)/cv::getTickFrequency();

std::cout<<" " <<std::endl;
std::cout<<"OpenCV Mat --> C++ 2D Array: "<< t<<std::endl;
std::cout<<"Create Variables: "<< t1<<std::endl;
std::cout<<"Allocate Memory: "<< t2<<std::endl;
std::cout<<"Create FFT Plan: "<< t3<<std::endl;
std::cout<<"Execute FFT: "<< t4<<std::endl;
std::cout<<"Destroy Plan: "<< t5<<std::endl;
std::cout<<"Free memory (image): "<< t6<<std::endl;
std::cout<<"Free memory (transform): "<< t7<<std::endl;
std::cout<<" "<<std::endl;
std::cout<<"Total FFT Time: "<< (t+t1+t2+t3+t4+t5+t6+t7)<<std::endl;
std::cout<<"Capture Time: "<< capturetime<<std::endl;
std::cout<<"Capture Time + Total FFT Time: "<< (t+t1+t2+t3+t4+t5+t6+t7+capturetime)<<std::endl;
std::cout<<" "<<std::endl;
return 0;
}

OpenCV Mat → C++ 2D Array: 0.0219364
Create Variables: 3.2e-08
Allocate Memory: 0.000723165
Create FFT Plan: 0.0058155
Execute FFT: 0.00345333
Destroy Plan: 0.0474761
Free memory (image): 0.00062883
Free memory (transform): 0.000860861

Total FFT Time: 0.0808942
Capture Time: 0.00185315
Capture Time + Total FFT Time: 0.0827473

#include <cufft.h>
#include <cuda.h>


#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>

#include <iostream>
#include <stdlib.h>
#include <stdio.h>

#include "header.h"

// File-level cuFFT plan handle: written by createplan(), used by fastft(),
// and released by destroyplan().
cufftHandle plan;

int destroyplan()
{
//Destroy plan
double t5 = (double)cv::getTickCount();
cufftDestroy(plan);
t5 = ((double)cv::getTickCount() - t5)/cv::getTickFrequency();
return 0;
}

int createplan(int width,int height)
{
//Create Plan
double t3 = (double)cv::getTickCount();
cufftPlan2d(&plan, width, height, CUFFT_R2C);
t3 = ((double)cv::getTickCount() - t3)/cv::getTickFrequency();
return 0;
}

int fastft(int width, int height, float data[0],double t,double capturetime)
{
//Create Variables
double t1 = (double)cv::getTickCount();
cufftComplex *transform;
t1 = ((double)cv::getTickCount() - t1)/cv::getTickFrequency();

//Allocate memory
double t2 = (double)cv::getTickCount();
cudaMallocManaged((void**)&data, sizeof(cufftReal)*width*height); 
cudaMallocManaged((void**)&transform, sizeof(cufftComplex)*width*height);
t2 = ((double)cv::getTickCount() - t2)/cv::getTickFrequency();
//Execute fft
double t4 = (double)cv::getTickCount();
cufftExecR2C(plan, data, transform);
t4 = ((double)cv::getTickCount() - t4)/cv::getTickFrequency(); 
//Free memory
double t6 = (double)cv::getTickCount();
cudaFree(data);
t6 = ((double)cv::getTickCount() - t6)/cv::getTickFrequency();
double t7 = (double)cv::getTickCount();
cudaFree(transform);
t7 = ((double)cv::getTickCount() - t7)/cv::getTickFrequency();

std::cout<<" " <<std::endl;
std::cout<<"OpenCV Mat --> C++ 2D Array: "<< t<<std::endl;
std::cout<<"Create Variables: "<< t1<<std::endl;
std::cout<<"Allocate Memory: "<< t2<<std::endl;
std::cout<<"Execute FFT: "<< t4<<std::endl;
std::cout<<"Free memory (image): "<< t6<<std::endl;
std::cout<<"Free memory (transform): "<< t7<<std::endl;
std::cout<<" "<<std::endl;
std::cout<<"Total FFT Time: "<< (t+t1+t2+t4+t6+t7)<<std::endl;
std::cout<<"Capture Time: "<< capturetime<<std::endl;
std::cout<<"Capture Time + Total FFT Time: "<< (t+t1+t2+t4+t6+t7+capturetime)<<std::endl;
std::cout<<" "<<std::endl;
return 0;
}

OpenCV Mat → C++ 2D Array: 0.0229877
Create Variables: 6.4e-08
Allocate Memory: 0.000779996
Execute FFT: 0.00222748
Free memory (image): 0.0346805
Free memory (transform): 0.000997083

Total FFT Time: 0.0616728
Capture Time: 0.00227941
Capture Time + Total FFT Time: 0.0639522

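Don’t destroy the plan; reuse it. Create it once before the capture loop and destroy it once when the program exits.
Don’t free the memory; reuse it. Allocate the input and output buffers once and keep them for every frame.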