Recently I upgraded my Linux box (with a GTX 750 Ti) from CUDA 6.5 to 7.0 and noticed a performance decrease in cuFFT.
For example, the simple program below reports an average runtime of 0.550 ms with CUDA 6.5 and 0.770 ms with CUDA 7.0.
The difference is reproducible: downgrading to 6.5 brings back the 0.550 ms, and upgrading to 7.0 again yields 0.770 ms.
Here is the code:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>
#include <cufft.h>
#define PI 3.141592653589793
#define NFFT 128
#define BATCH 1000
#define NRUNS 1000
/* Current time of day in seconds, as reported by gettimeofday():
 * the whole-second field plus the microsecond field scaled to seconds. */
inline double seconds()
{
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_seconds = (double)now.tv_sec;
    const double fractional_seconds = (double)now.tv_usec * 1.e-6;
    return whole_seconds + fractional_seconds;
}
/* Abort with a diagnostic when a CUDA runtime call does not return cudaSuccess. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_err_ = (call);                                     \
        if (cuda_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cuda_err_));                         \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/* Abort with a diagnostic when a cuFFT call does not return CUFFT_SUCCESS. */
#define CUFFT_CHECK(call)                                                   \
    do {                                                                    \
        cufftResult cufft_err_ = (call);                                    \
        if (cufft_err_ != CUFFT_SUCCESS) {                                  \
            fprintf(stderr, "cuFFT error %s:%d: status %d\n", __FILE__,     \
                    __LINE__, (int)cufft_err_);                             \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Benchmark driver: fills BATCH complex signals of length NFFT on the host,
 * uploads them once, then runs NRUNS+1 iterations of
 *   plan creation -> output allocation -> batched forward C2C FFT -> teardown
 * and prints the average wall-clock time per iteration (first iteration
 * excluded as warm-up).
 *
 * Every CUDA / cuFFT status is checked so that a failing call cannot
 * silently produce a meaningless timing.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or API error.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    CUDA_CHECK(cudaSetDevice(0));

    const int n = BATCH * NFFT;
    const size_t bytes = (size_t)n * sizeof(cufftComplex);

    /* Host input: one complex exponential cycle per NFFT samples. */
    cufftComplex* array = (cufftComplex*)malloc(bytes);
    if (array == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++)
    {
        array[i].x = (float)cos(2.*PI*i/NFFT);
        array[i].y = (float)sin(2.*PI*i/NFFT);
    }

    cufftComplex* d_array = NULL;
    CUDA_CHECK(cudaMalloc(&d_array, bytes));
    CUDA_CHECK(cudaMemcpy(d_array, array, bytes, cudaMemcpyHostToDevice));

    /*
     * NOTE: the timed region below is intentionally the same as in the
     * original benchmark: it includes cufftPlanMany, cudaMalloc, the
     * transform itself, cufftDestroy and cudaFree. The reported average is
     * therefore the cost of the whole sequence, not of the FFT alone. To time
     * only the transform, create the plan and the output buffer once outside
     * the loop and time just cufftExecC2C + cudaDeviceSynchronize.
     */
    double runtime = 0.0;
    for (int i = 0; i < NRUNS + 1; i++)
    {
        const double t0 = seconds();

        cufftHandle plan;
        int nfft[] = {NFFT};
        CUFFT_CHECK(cufftPlanMany(&plan, 1, nfft, NULL, 1, NFFT, NULL, 1, NFFT, CUFFT_C2C, BATCH));

        cufftComplex* d_fft = NULL;
        CUDA_CHECK(cudaMalloc(&d_fft, bytes));

        CUFFT_CHECK(cufftExecC2C(plan, d_array, d_fft, CUFFT_FORWARD));
        /* The transform is launched asynchronously: wait for it explicitly
         * (this also surfaces execution errors) instead of relying on the
         * implicit synchronization of the teardown calls. */
        CUDA_CHECK(cudaDeviceSynchronize());

        CUFFT_CHECK(cufftDestroy(plan));
        CUDA_CHECK(cudaFree(d_fft));

        const double t1 = seconds();

        /* Iteration 0 is a warm-up and is excluded from the average. */
        if (i > 0)
        {
            runtime += t1 - t0;
        }
    }
    printf("Avg. runtime (s): %.6f\n", runtime/NRUNS);

    CUDA_CHECK(cudaFree(d_array));
    free(array);
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}
Compiled with:
nvcc -arch=sm_50 -lcufft -o test test.cu
Supposedly, cuFFT should be faster in 7.0:
http://devblogs.nvidia.com/parallelforall/cuda-7-release-candidate-feature-overview/
(see “cuFFT Performance Improvements”)
Am I doing something wrong? Or am I forgetting something?