cuFFT cufftPlan1d and cufftExecR2C issues

Hi Guys,

I created the following code:

#include <cmath>
#include <stdio.h>
#include <cufft.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

void cufft_1d_r2c(float* idata, int Size, float* odata) {
    // Input data in GPU memory
    float *gpu_idata;
    // Output data in GPU memory
    cufftComplex *gpu_odata;
    // Temp output in host memory
    cufftComplex host_signal;
    
    // Allocate space for the data in the GPU's memory
    cudaMalloc((void**)&gpu_idata, Size*sizeof(float));
    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to allocate memory for gpu_idata.\n");
        return;
    }
    
    // Allocate space for the calculated fourier coefficients in the GPU's memory
    cudaMalloc((void**)&gpu_odata, sizeof(cufftComplex)*Size);
    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to allocate memory for gpu_odata.\n");
        return;
    }
    
    // Copy from host memory to GPU's memory
    cudaMemcpy(gpu_idata, idata, Size*sizeof(float), cudaMemcpyHostToDevice);
    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Could not copy idata into gpu memory.\n");
        return;
    }
    
    // Create the FFT calculation plan    
    cufftHandle Plan;
    if(cufftPlan1d(&Plan, Size, CUFFT_R2C, 1) != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: Plan creation failed.\n");
        return;
    }
    
    // Execute the FFT calculation plan on the GPU
    if(cufftExecR2C(Plan, (cufftReal*)gpu_idata, (cufftComplex*)gpu_odata) != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: ExecR2C failed.\n");
        return;
    }
    
    // Copy calculated fourier coefficients back to host memory
    cudaMemcpy(host_signal, gpu_odata,  Size*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Could not copy gpu_odata into host memory.\n");
        return;
    }
    
    // Print results
    float tmp;
    for (int i=0; i<Size; i++) {
        printf("[%d]: %f + %fi\n", i, host_signal[i].x, host_signal[i].y);
    }
    
    if (cudaDeviceSynchronize() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to synchronize.\n");
        return;
    }
    
    cudaFree(gpu_idata);
    cudaFree(gpu_odata);
    cufftDestroy(Plan);
    
    return;
}

The idata parameter is just a float array, and Size is just it’s size.

If I pass in the following array:
[1,2,3,4,5,6]

I convert it to float before I pass:
cuFFT Input Data[0]: 1.000000.
cuFFT Input Data[1]: 2.000000.
cuFFT Input Data[2]: 3.000000.
cuFFT Input Data[3]: 4.000000.
cuFFT Input Data[4]: 5.000000.
cuFFT Input Data[5]: 6.000000.

I get something like this as the result of the FFT:
[0]: 21.000000 + 0.000000i
[1]: -3.000000 + 5.196153i
[2]: -3.000000 + 1.732051i
[3]: -3.000000 + 0.000000i
[4]: -4.000000 + 0.000000i
[5]: -335898466061976380462931376697054855168.000000 + -333239989787996945065453338629242880000.000000i

But it should be:
[0]: 21.000000 + 0.000000i
[1]: -3.000000 + 5.196153i
[2]: -3.000000 + 1.732051i
[3]: -3.000000 + 0.000000i
[4]: -3.0000 - 1.7321i
[5]: -3.0000 - 5.1962i

So it seems to me the first half of the result is OK, but not the second half.

My guess is that there might be something to do with the data sizes in the memory.

Any help would be really appreciated.

Zoltan

If you want results that are immediately understandable, use C2C not R2C transforms (even with real input data). Then you’ll likely get the output data you expect.

With an R2C transform, the output data only consists of the non-redundant data. Please read this section of the cufft documentation carefully:

[url]http://docs.nvidia.com/cuda/cufft/index.html#data-layout[/url]

“Finally, R2C demands an input array ( X 1 , X 2 , … , X N ) of real values and returns an array ( x 1 , x 2 , … , x [N/2] + 1 ) of non-redundant complex elements.”

and

“The real-to-complex transform is implicitly a forward transform. … For out-of-place transforms, input and output sizes match the logical transform size N and the non-redundant size [N/2]+ 1 , respectively.”

Also search the cufft document for “hermitian symmetry” and study online resources for hermitian symmetry associated with the output of a fourier transform where the input consists of only real data.

1 Like

Hi,

Thank you for the great suggestion!

I tried to modify this to use C2C plan, here is the progress:

#include <cmath>
#include <stdio.h>
#include <cufft.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

void cufft_1d_c2c(float* idata, int Size, float* odata) {
    // Create cufftComplex from float
    cufftComplex *idata_c = new cufftComplex;
    for (int i = 0; i < Size; ++i)
    {
        idata_c[i].x = sin(idata[i] * (float)i / Size);
        idata_c[i].y = cos(idata[i] * (float)i / Size);
    }
    
    // Input data in GPU memory
    cufftComplex *gpu_idata;
    // Output data in GPU memory
    cufftComplex *gpu_odata;
    // Temp output in host memory
    cufftComplex host_signal;
    
    // Allocate space for the data in the GPU's memory
    cudaMalloc((void**)&gpu_idata, Size*sizeof(cufftComplex));
    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to allocate memory for gpu_idata.\n");
        return;
    }
    
    // Allocate space for the calculated fourier coefficients in the GPU's memory
    cudaMalloc((void**)&gpu_odata, sizeof(cufftComplex)*Size);
    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to allocate memory for gpu_odata.\n");
        return;
    }
    
    // Copy from host memory to GPU's memory
    cudaMemcpy(gpu_idata, idata_c, Size*sizeof(cufftComplex), cudaMemcpyHostToDevice);
    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Could not copy idata into gpu memory.\n");
        return;
    }
    
    // Create the FFT calculation plan    
    cufftHandle Plan;
    if(cufftPlan1d(&Plan, Size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: Plan creation failed.\n");
        return;
    }
    
    // Execute the FFT calculation plan on the GPU
    if(cufftExecC2C(Plan, gpu_idata, (cufftComplex*)gpu_odata, CUFFT_FORWARD) != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: Execr2C failed.\n");
        return;
    }
    
    // Copy calculated fourier coefficients back to host memory
    cudaMemcpy(host_signal, gpu_odata,  Size*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Could not copy gpu_odata into host memory.\n");
        return;
    }
    
    // Print data
    float tmp;
    for (int i=0; i<Size; i++) {
        printf("[%d]: %f + %fi\n", i, host_signal[i].x, host_signal[i].y);
    }
    
    // Synchronize the device
    if (cudaDeviceSynchronize() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to synchronize.\n");
        return;
    }
    
    cudaFree(gpu_idata);
    cudaFree(gpu_odata);
    cufftDestroy(Plan);
}

The result looks a bit better but for some reasons the result does not look correct.

Input array:
cuFFT Input Data[0]: 1.000000.
cuFFT Input Data[1]: 2.000000.
cuFFT Input Data[2]: 3.000000.
cuFFT Input Data[3]: 4.000000.
cuFFT Input Data[4]: 5.000000.
cuFFT Input Data[5]: 6.000000.

cuFFT result:
[0]: 0.928471 + 1.371101i
[1]: 0.340154 + 0.243559i
[2]: 0.154339 + -0.029811i
[3]: 0.373335 + -0.253844i
[4]: 1.645083 + 0.410269i
[5]: -3.441382 + 4.258726i

So that one is wrong, here is what it should be:
[0]: 21.0000 + 0.0000i
[1]: -3.0000 + 5.1962i
[2]: -3.0000 + 1.7321i
[3]: -3.0000 + 0.0000i
[4]: -3.0000 - 1.7321i
[5]: -3.0000 - 5.1962i

What am I missing?
Thank you,
Zoltan

This code, unlike your previous code, is using the input data to build a sine/cosine sequence instead. I’m not sure why you think it should be reflective of the transform of

1 2 3 4 5 6

??

This construct:

for (int i = 0; i < Size; ++i)
    {
        idata_c[i].x = sin(idata[i] * (float)i / Size);
        idata_c[i].y = cos(idata[i] * (float)i / Size);
    }

does not exist in your first code. Why do you think they should produce the same output?

There are CUDA sample codes that cover the basic mechanics of doing FFTs:

http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-cufft

You might want to study the sample codes.

Thanks, I removed sin and cos. It works now!