I have involved the cufft library, but I find some error with it!
when (nx, ny, nz)=(225, 329, 1499), the result is error!
when nz=1499, the result is error at most.
but when (nx, ny, nz)=(210, 324, 1500),(210, 324, 1500), (210, 324, 1515), (500, 500, 1500), the result is OK.
when nz<800, the result is OK.
the follow is my involving cufft methods:
1. in place, padding method
2. 3d
3. the first involving forward R2C, the second involving INVERSE C2R.
the code as follows:
I used the Tesla K40m, the memroy is 12GB
My compile option is
nvcc -arch=compute_35 -code=sm_35 -o cufft_in_place cufft_in_place.cu -lcudart -lcufft
#include <iostream>
#include <malloc.h>
#include <cuda.h>
#include <cufft.h>
#include <cuda_runtime_api.h>
const int Z=2700;
const int M=700;
const int N=560;
using namespace std;
int main(){
int Nh=(N/2+1);
//float *B=(float*)malloc(sizeof(float)*Z*M*N);
float *A=(float*)malloc(sizeof(float)*Z*M*2*Nh);
for(int k=0; k<Z; k++)
for(int i=0; i<M; i++)
for(int j=0; j<N; j++){
A[k*2*Nh*M+i*2*Nh+j]=(i*N+j+1);///(float)RAND_MAX;
}
cout<<"raw data"<<endl;
cout<<"A[0]= "<<A[0]<<endl;
cout<<"A[100]= "<<A[100]<<endl;
cout<<"A[1310]= "<<A[1310]<<endl;
cout<<"A[2320]= "<<A[2320]<<endl;
//for(int k=0; k<Z; k++)
// for(int i=0; i<M; i++){
// for(int j=0; j<2*Nh; j++)
// cout<<A[k*M*Nh*2+i*Nh*2+j]<<" ";
// cout<<endl;
// }
cufftComplex* A_dev;
cufftReal *a_dev;
//cudaMalloc((void**)&a_dev, sizeof(float)*Z*M*N);
cudaMalloc((void**)&a_dev, sizeof(float)*Z*M*2*Nh);
cudaMemcpy(a_dev, A, sizeof(float)*Z*M*2*Nh, cudaMemcpyHostToDevice);
A_dev=(cufftComplex*)a_dev;
cufftHandle plan_forward;
cufftPlan3d(&plan_forward, Z, M, N, CUFFT_R2C);
cufftExecR2C(plan_forward, a_dev, A_dev);
cudaMemcpy(A, A_dev, sizeof(float)*Z*M*2*Nh, cudaMemcpyDeviceToHost);
cout<<"inv forward"<<endl;
cout<<"A[0]= "<<A[0]<<endl;
cout<<"A[100]= "<<A[100]<<endl;
cout<<"A[1310]= "<<A[1310]<<endl;
cout<<"A[2320]= "<<A[2320]<<endl;
//for(int k=0; k<Z; k++)
//for(int i=0; i<M; i++){
// for(int j=0; j<2*Nh; j++)
// cout<<A[k*2*Nh*M+i*2*Nh+j]<<" ";
// cout<<endl;
//}
memset(A, 0, sizeof(float)*Z*M*2*Nh);
cufftHandle plan_backward;
cufftPlan3d(&plan_backward, Z, M, N, CUFFT_C2R);
cufftExecC2R(plan_backward, A_dev, a_dev);
cudaMemcpy(A, a_dev, sizeof(float)*Z*M*2*Nh, cudaMemcpyDeviceToHost);
cout<<"inv backward"<<endl;
cout<<"A[0]= "<<A[0]/((float)(Z*M*N))<<endl;
cout<<"A[100]= "<<A[100]/((float)(Z*M*N))<<endl;
cout<<"A[1310]= "<<A[1310]/((float)(Z*M*N))<<endl;
cout<<"A[2320]= "<<A[2320]/((float)(Z*M*N))<<endl;
//for(int k=0; k<Z; k++)
//for(int i=0; i<M; i++){
// for(int j=0; j<2*Nh; j++)
// cout<<A[k*2*Nh*M+i*2*Nh+j]/(float)(Z*M*N)<<" ";
// cout<<endl;
//}
free(A);
cudaFree(a_dev);
return 0;
}
why ? I really need help!
cufft_in_place.cu (2.37 KB)