2D CUFFT wrong result

Hi

I’m tring to use CUFFT to compute fft of 2D data, but the result is wrong

Any problem on the following code?

const int row = 4;
	const int col = 4;
	float A[row][col] = 
	{{1,2,3,4},
	{5,6,7,8},
	{9,10,11,12},
	{13,14,15,16}};
	cufftHandle plan; 
	cufftReal *d_idata, *d_odata;
	cufftComplex *h_odata;
	cudaMalloc((void**)&d_idata, sizeof(cufftReal)*row*col); 
	cudaMalloc((void**)&d_odata, sizeof(cufftComplex)*row*col);
	cudaMallocHost((void**)&h_odata, sizeof(cufftComplex)*row*col); 
	cudaMemcpy(d_idata,A,sizeof(cufftReal)*row*col,cudaMemcpyHostToDevice);
	checkCudaErrors(cufftPlan2d(&plan, row,col, CUFFT_R2C));
	checkCudaErrors(cufftExecR2C(plan, (cufftReal*)d_idata, (cufftComplex*)d_odata));  
	cudaMemcpy(h_odata,d_odata,sizeof(cufftComplex)*row*col,cudaMemcpyDeviceToHost);
	PrintComplex(row,col,h_odata);

input data:
1.0 2.0 3.0 4.0
5.0 6.0 7.0 8.0
9.0 10.0 11.0 12.0
13.0 14.0 15.0 16.0

The correct result data should be:
Real part
136.0 -8.0 -8.0 -8.0
-32.0 0.0 0.0 0.0
-32.0 0.0 0.0 0.0
-32.0 0.0 0.0 0.0
i part
0.0 8.0 0.0 -8.0
32.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0
-32.0 0.0 0.0 0.0

My wrong result:
Real part
136.0 -8.0 -8.0 -32.0
0.0 0.0 -32.0 0.0
0.0 -32.0 0.0 0.0
0.0 0.0 0.0 0.0
i part
0.0 8.0 0.0 32.0
0.0 0.0 0.0 0.0
0.0 -32.0 0.0 0.0
0.0 0.0 0.0 0.0

Help me plz.

Thanks

You’re getting tripped up by CUFFT symmetry.

CUFFT R2C and C2R transforms exploit (complex conjugate, i.e. hermitian) symmetry (not the same as a hermitian matrix) in the complex data to reduce the amount of data required/produced. Although you don’t show your print function, it’s evident from your printout that you’re not taking this into account.

A simpler alternative is to use CUFFT C2C transforms instead.

If you wish to use R2C and/or C2R, you’ll need to take into account the symmetry patterns that CUFFT uses. For a 1D case, it is pretty straightforward and is described here:

http://docs.nvidia.com/cuda/cufft/index.html#data-layout

For a 2D case, the 2nd dimension (only) is reduced due to symmetry. The reduction is to a size of N/2 + 1, so 2nd dimension of 4 will only have 3 columns of output:

http://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional

note that the symmetry in this case is in each row.

The following modification of your code demonstrates proper handling of data in either the C2C case or the R2C case, just add -DUSE_C2C to your compile command line to see the C2C behavior:

$ cat t734.cu
#include <cufft.h>
#include <stdio.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

void Print2DComplex(int rows, int cols, cufftComplex *data, bool cufft_symmetry = false){

        int sym_cols = cols;
        if (cufft_symmetry) sym_cols = cols/2 + 1;
        printf("Real Part: \n");
        for (int i = 0; i < rows; i++){
          for (int j = 0; j < cols; j++)
            if (j>=sym_cols)
              printf("%f ", data[i*sym_cols+(cols-j)].x);
            else
              printf("%f ", data[i*sym_cols+j].x);
          printf("\n");}
        printf("Imag Part: \n");
        for (int i = 0; i < rows; i++){
          for (int j = 0; j < cols; j++)
            if (j>=sym_cols)
              printf("%f ", -data[i*sym_cols+(cols-j)].y); // complex (hermitian) symmetry
            else
              printf("%f ", data[i*sym_cols+j].y);
          printf("\n");}
}

int main(){

        const int row = 4;
        const int col = 4;
        float A[row][col] =
        {{ 1, 2, 3, 4},
         { 5, 6, 7, 8},
         { 9,10,11,12},
         {13,14,15,16}};
        cufftHandle plan;
        cufftReal *d_idata;
        cufftComplex *h_odata, *d_odata;
        cudaMalloc((void**)&d_idata, sizeof(cufftComplex)*row*col);
        cudaMalloc((void**)&d_odata, sizeof(cufftComplex)*row*col);
        cudaMemset(d_idata, 0, sizeof(cufftComplex)*row*col);
        cudaMemset(d_odata, 0, sizeof(cufftComplex)*row*col);
        cudaMallocHost((void**)&h_odata, sizeof(cufftComplex)*row*col);
        bool symmetric_data = false;
#ifdef USE_C2C
        cudaMemcpy2D(d_idata, sizeof(cufftComplex), A, sizeof(float), sizeof(float), row*col, cudaMemcpyHostToDevice);
        if ((cufftPlan2d(&plan, row,col, CUFFT_C2C))!= CUFFT_SUCCESS) {printf("cufft plan error\n"); return -1;}
        if ((cufftExecC2C(plan, (cufftComplex*)d_idata, (cufftComplex*)d_odata, CUFFT_FORWARD))!=CUFFT_SUCCESS) {printf("cufft exec error\n"); return -1 ;}
#else
        cudaMemcpy(d_idata,A,sizeof(cufftReal)*row*col,cudaMemcpyHostToDevice);
        if ((cufftPlan2d(&plan, row,col, CUFFT_R2C))!= CUFFT_SUCCESS) {printf("cufft plan error\n"); return -1;}
        if ((cufftExecR2C(plan, (cufftReal*)d_idata, (cufftComplex*)d_odata))!=CUFFT_SUCCESS) {printf("cufft exec error\n"); return -1 ;}
        symmetric_data = true;
#endif
        cudaMemcpy(h_odata,d_odata,sizeof(cufftComplex)*row*col,cudaMemcpyDeviceToHost);
        cudaCheckErrors("some error");
        Print2DComplex(row,col,h_odata, symmetric_data);
        return 0;
}
$ nvcc -o t734 t734.cu -lcufft
$ ./t734
Real Part:
136.000000 -8.000005 -8.000000 -8.000005
-32.000000 0.000000 -0.000001 0.000000
-32.000000 -0.000000 0.000000 -0.000000
-32.000000 0.000000 -0.000001 0.000000
Imag Part:
-0.000001 8.000001 0.000001 -8.000001
32.000000 -0.000000 0.000001 0.000000
0.000000 0.000000 -0.000000 -0.000000
-32.000000 0.000000 -0.000001 -0.000000
$ nvcc -DUSE_C2C -o t734 t734.cu -lcufft
$ ./t734
Real Part:
136.000000 -8.000000 -8.000000 -8.000000
-32.000000 0.000000 0.000000 0.000000
-32.000000 0.000000 0.000000 0.000000
-32.000000 0.000000 0.000000 0.000000
Imag Part:
0.000000 8.000000 0.000000 -8.000000
32.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000
-32.000000 0.000000 0.000000 0.000000
$

If you don’t like the “complexity” associated with R2C, I suggest you just use C2C transforms.
However, in many cases, from a cufft execution standpoint, the R2C transform will be faster than the corresponding (real-to-complex) C2C transform.

Thanks a lot, txbob.

The result is correct.
:D

Hi there,

I was having a heck of a time getting a basic Image->R2C->C2R->Image test working and found my way here. The first (most frustrating) problem is that the second C2R destroys its source image, so it’s not valid to print the FFT after transforming it back to an image.

The second part was correctly interpreting the symmetric data like this post describes, but I found a bug. For this to work properly I had to use the following index in Print2DComplex for the second half of the image:

((height - y) % height) * sym_width + (width - x)

or using rows/cols:

((rows- i) % rows) * sym_cols + (cols - j)

I’ve attached an FFT of “Lena” image for illustration of the difference. Note that this is just the log(magnitude) so technically I haven’t verified both real/complex components here.


I should add, in practice I won’t be needing the second half of the FFT when using R2C, but since I found this bug I thought I would add it here for posterity.

Yep.

Hello,

I know this is an old post, anyway, I’ll give it a try.
I can run this test (t734.cu) as it is; however, when I try to modify it splitting it into functions or files I cannot make it run. I have 3 functions/files, including main, allocates and exec.

Firstly, I declare the : plan , *d_idata, *d_odata & *h_odata as globals. I changed A to be dynamic and in the main function allocate and initialize it.

Secondly, allocates is called, to allocate/initialize all the global variables and creating the plan.

Finally, exec is called to execute the R2C fft.

Unfortunately, I got illegal memory access error and out of bounds when using compute-sanitizer.

Here is the complete minimal example using functions, but same error I got using files.

#include <cufft.h>
#include <stdio.h>

cufftHandle plan;
cufftReal *d_idata=NULL;
cufftComplex *h_odata=NULL, *d_odata=NULL;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool  abort=true)
{
   if (code != cudaSuccess) 
   {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);            
        if (abort) exit(code);
   }
}

void Print2DComplex(int rows, int cols, cufftComplex *data, bool cufft_symmetry = false){

        int sym_cols = cols;
        if (cufft_symmetry) sym_cols = cols/2 + 1;
        printf("Real Part: \n");
        for (int i = 0; i < rows; i++){
          for (int j = 0; j < cols; j++)
            if (j>=sym_cols)
              printf("%f ", data[i*sym_cols+(cols-j)].x);
            else
              printf("%f ", data[i*sym_cols+j].x);
          printf("\n");}
        printf("Imag Part: \n");
        for (int i = 0; i < rows; i++){
          for (int j = 0; j < cols; j++)
            if (j>=sym_cols)
              printf("%f ", -data[i*sym_cols+(cols-j)].y); // complex (hermitian) symmetry
            else
              printf("%f ", data[i*sym_cols+j].y);
          printf("\n");}
}

void allocates(cufftHandle *plan, cufftReal *d_idata, cufftComplex *h_odata, cufftComplex *d_odata, const int row, const int col, float *A)
{
        gpuErrchk(cudaMalloc((void**)&d_idata, sizeof(cufftComplex)*row*col));
        gpuErrchk(cudaMalloc((void**)&d_odata, sizeof(cufftComplex)*row*col));
        gpuErrchk(cudaMemset(d_idata, 0, sizeof(cufftComplex)*row*col));
        gpuErrchk(cudaMemset(d_odata, 0, sizeof(cufftComplex)*row*col));
        gpuErrchk(cudaMallocHost((void**)&h_odata, sizeof(cufftComplex)*row*col));
        gpuErrchk(cudaMemcpy(d_idata,A,sizeof(cufftReal)*row*col,cudaMemcpyHostToDevice));

        if ((cufftPlan2d(plan, row,col, CUFFT_R2C))!= CUFFT_SUCCESS) {printf("cufft plan error\n"); exit(-1);}
}

void exec(cufftHandle plan, cufftReal *d_idata, cufftComplex *h_odata, cufftComplex *d_odata, const int row, const int col)
{
        if ((cufftExecR2C(plan, (cufftReal*)d_idata, (cufftComplex*)d_odata))!=CUFFT_SUCCESS) {printf("cufft exec error\n"); exit(-1);}
        gpuErrchk(cudaDeviceSynchronize());
        gpuErrchk(cudaMemcpy(h_odata,d_odata,sizeof(cufftComplex)*row*col,cudaMemcpyDeviceToHost));
        gpuErrchk(cudaDeviceSynchronize());
}

int main()
{

        const int row = 4;
        const int col = 4;
/*
        double A[row][col] =
        {{ 1, 2, 3, 4},
         { 5, 6, 7, 8},
         { 9,10,11,12},
         {13,14,15,16}};
*/
        float *A = (float*)calloc(row*col , sizeof(float));
        int j=0;
        for(int i=1; i <= (row*col); i++)
           A[j++] = i;

        allocates(&plan, d_idata, h_odata, d_odata, row, col, A);

        //bool symmetric_data = false;


        exec(plan, d_idata, h_odata, d_odata, row, col);
        //symmetric_data = true;

//        Print2DComplex(row,col,h_odata, symmetric_data);
        return 0;
}

I compiled it with: nvcc t734-cufft-R2C-functions-nvidia-forum.cu -o t734-cufft-R2C-functions-nvidia-forum -lcufft

But I got:
GPUassert: an illegal memory access was encountered t734-cufft-R2C-functions-nvidia-forum.cu 56

I tried the --device-c option compiling them when the functions were on files, without any luck.

I guess that the d_idata & d_odata aren’t visible/accessible on exec function/file, so how can I make them accessible to exec function/file?

Could you help me please.

Thanks

I just figured it out using separate functions, compiling using -dc & properly including the header files

That won’t work. A function uses pass-by-value. When you allocate d_idata in the function, it is operating on a local copy of d_idata due to pass by value. This is a common programming mistake and doesn’t really have anything to do with CUDA or CUFFT. There are numerous forum articles pertaining to the same programming error. With a bit of searching you can find them. Here is one example.