I would like to have some clarifications on GPU computing accuracy and determinism.
Basically, I’m trying to implement a 1-D Fourier transform by a cublasCgemm routine call and to compile it as a Matlab mex function. So, I have two kernels: one to calculate the vector to be transformed and one to calculate the Fourier kernel matrix. Then, I have the mex function calling the two kernels and the cublasCgemm function.
Here is the excitation filling kernel.
[codebox]// *******************************
// * KERNEL - EXCITATION FILLING *
// *******************************
global void Excitation_Filling(float *d_X, float *d_Y, float *d_Z, float d_alfa, float d_z0, float *d_x_terzo_vett, float *d_y_terzo_vett, float *d_z_terzo_vett, float *d_r_vett, float d_theta_vett, float d_fase_vett, cuFloatComplex *d_campo_feed, int dimx, int dimy, float beta, float mfatt)
{
/* Evaluates the thread indices */
int idx = __umul24(blockIdx.x,BLOCK_SIZE_X) + threadIdx.x;
int idy = __umul24(blockIdx.y,BLOCK_SIZE_Y) + threadIdx.y;
/* Evaluates the matrix filling index */
int id_vett = idx*dimx+idy;
if (idx < dimx && idy < dimy) {
d_x_terzo_vett[id_vett]=-d_X[id_vett];
d_y_terzo_vett[id_vett]=__fadd_rn(__fmul_rn(d_Y[id_vett],cos
f(d_alfa)),__fmul_rn(__fadd_rn(d_Z[id_vett],-(d_z0)),sinf(d_alfa)));
d_z_terzo_vett[id_vett]=__fadd_rn(__fmul_rn(d_Y[id_vett],sin
f(d_alfa)),-__fmul_rn(__fadd_rn(d_Z[id_vett],-(d_z0)),cosf(d_alfa)));
d_r_vett[id_vett]=sqrtf(_fmul_rn(d_x_terzo_vett[id_vett],d
x_terzo_vett[id_vett])+_fmul_rn(d_y_terzo_vett[id_vett],d_y
terzo_vett[id_vett])+__fmul_rn(d_z_terzo_vett[id_vett],d_z_te
rzo_vett[id_vett]));
d_theta_vett[id_vett]=asinf(__fdividef(sqrtf(__fadd_rn(__fmu
l_rn(d_x_terzo_vett[id_vett],d_x_terzo_vett[id_vett]),
__fmul_rn(d_y_terzo_vett[id_vett],d_y_terzo_vett[id_vett])))
,d_r_vett[id_vett]));
d_campo_feed[id_vett]=make_cuFloatComplex(cosf(__fmul_rn(bet
a,d_r_vett[id_vett])),-sinf(__fmul_rn(beta,d_r_vett[id_vett])));
d_campo_feed[id_vett]=cuCdivf(d_campo_feed[id_vett],make_cuF
loatComplex(d_r_vett[id_vett],0.0f));
d_campo_feed[id_vett]=cuCmulf(d_campo_feed[id_vett],make_cuF
loatComplex(powf(cosf(d_theta_vett[id_vett]),mfatt),0.0f));
d_campo_feed[id_vett]=cuCmulf(d_campo_feed[id_vett],make_cuF
loatComplex(cosf(d_fase_vett[id_vett]),sinf(d_fase_vett[id_ve
tt])));
}
}
[/codebox]
Here is the Fourier kernel matrix filling kernel
[codebox]// **********************************
// * KERNEL - KERNEL MATRIX FILLING *
// **********************************
global void Kernel_Matrix_Filling(float *d_X, float *d_Y, float *d_Z, float *d_u_vett, float *d_v_vett, cuFloatComplex *d_Kernel_Matrix,
int dimx, int dimy, int dimu)
{
/* Evaluates the thread indices */
int idx = __umul24(blockIdx.x,BLOCK_SIZE_X) + threadIdx.x;
int idy = __umul24(blockIdx.y,BLOCK_SIZE_Y) + threadIdx.y;
/* Evaluates the matrix filling index */
int id_vett = idx*dimu+idy;
if (idx < dimx*dimy && idy < dimu) {
d_Kernel_Matrix[id_vett]=make_cuFloatComplex(cosf(__fadd_rn(
__fmul_rn(d_u_vett[idy],d_X[idx]),__fmul_rn(d_v_vett[idy],d_Y
[idx]))),
sinf(__fadd_rn(__fmul_rn(d_u_vett[idy],d_X[idx]),__fmul_rn(d
_v_vett[idy],d_Y[idx]))));
}
}
[/codebox]
Since the beginning, I had accuracy issues, which were mitigated by use of __fadd_rn and __fmul_rn. However, the final accuracy I get is 1e-3 or 1e-4, which is not enough for my application. I’m using single precision since I have a GT 230M card.
What could be the source of such large errors ? Use of cos/sin/sqrt functions ? Synchronization ?
Strangely, the result changes when I change blocksize and dimgrid, the faster the algorithm runs, the poorer the result is. I have separately tested the cublasCgemm routine call and checked it returns with a satisfactory accuracy.