OK.
I am processing a 3D array slice by slice.
Here’s the kernel:
__global__ void pKernel(double* p, double* pz, double* pzz, double* sourceFn, double* ir, int iDim, int jDim, int kDim, int si, int sj, int sk, int ri, int rj, int rk, int n, int k)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
double wR = 1.0;
double pCoef = 1.0/3.0;
// Solve for pressure
if ((i>0) && (i<iDim-1) && (j>0) && (j<jDim-1) && (k>0) && (k<kDim-1))
{
// We are inside the domain
p[ijk] = (pCoef*(pz[ip] + pz[im] + pz[jp] + pz[jm] + pz[kp] + pz[km])) - pzz[ijk];
}
if (k==0) { p[ijk] = (1.0+wR)*pz[kp] - wR*pzz[ijk]; } // Down Boundary
if (k==kDim-1) { p[ijk] = (1.0+wR)*pz[km] - wR*pzz[ijk]; } // Up Boundary
if (j==0) { p[ijk] = (1.0+wR)*pz[jp] - wR*pzz[ijk]; } // Front Boundary
if (j==jDim-1) { p[ijk] = (1.0+wR)*pz[jm] - wR*pzz[ijk]; } // Back Boundary
if (i==0) { p[ijk] = (1.0+wR)*pz[ip] - wR*pzz[ijk]; } // Left Boundary
if (i==iDim-1) { p[ijk] = (1.0+wR)*pz[im] - wR*pzz[ijk]; } // Right Boundary
// Inject Source
if ((i == si) && (j == sj) && (k == sk)) {
p[sourcePos] = p[sourcePos] + sourceFn[n] + sourceFn[n-2];
}
// Collect at Reciever
if ((i == ri) && (j == rj) && (k == rk)) {
ir[n] = p[recPos]; }
// Wait for all
__syncthreads();
}
with the striding macros defined as:
#define ijk (((i) * (jDim) * (kDim)) + ((j) * (kDim)) + (k))
#define im (((i-1) * (jDim) * (kDim)) + ((j) * (kDim)) + (k))
#define ip (((i+1) * (jDim) * (kDim)) + ((j) * (kDim)) + (k))
#define jm (((i) * (jDim) * (kDim)) + ((j-1) * (kDim)) + (k))
#define jp (((i) * (jDim) * (kDim)) + ((j+1) * (kDim)) + (k))
#define km (((i) * (jDim) * (kDim)) + ((j) * (kDim)) + (k-1))
#define kp (((i) * (jDim) * (kDim)) + ((j) * (kDim)) + (k+1))
#define sourcePos (((si) * (jDim) * (kDim)) + ((sj) * (kDim)) + (sk))
#define recPos (((ri) * (jDim) * (kDim)) + ((rj) * (kDim)) + (rk))
And on the host side:
// Loop through time
for (int n=2; n<ns; n++) {
// Execute Pressure Kernel
cudaPrintfInit();
// Loop through k-dimension tiles
for (int k=0; k<kDim; k++)
{
pKernel<<<Dg, Db>>>(pCuda, pzCuda, pzzCuda, sourceFnCuda, irCuda, iDim, jDim, kDim, si, sj, sk, ri, rj, rk, n, k);
cutilCheckMsg("Kernel execution failed\n");
cudaPrintfDisplay(stdout, true);
}
cudaThreadSynchronize();
cudaPrintfEnd();
// Update temporal memory
cudaMemcpy(pzzCuda, pzCuda, dataSize, cudaMemcpyDeviceToDevice);
cudaMemcpy(pzCuda, pCuda, dataSize, cudaMemcpyDeviceToDevice);
cudaMemset((void**)pCuda, 0, dataSize);
// Status
printf("GPU Iteration n = %i out of %i \r", n, ns);
} // end n
And this is the reference code for the CPU:
// Run the Simulation on CPU
for (int n=2; n<ns; n++)
{
for (int i=0; i<iDim; i++)
{
for (int j=0; j<jDim; j++)
{
for (int k=0; k<kDim; k++)
{
//// Where are we?
if ((i>0) && (i<iDim-1) && (j>0) && (j<jDim-1) && (k>0) && (k<kDim-1))
{
// We are inside the domain
p[i][j][k] = (pCoef*(pz[i+1][j][k] + pz[i-1][j][k] + pz[i][j+1][k] + pz[i][j-1][k] + pz[i][j][k+1] + pz[i][j][k-1])) - pzz[i][j][k];
}
if (k==0) { p[i][j][k] = ((1+wR)*pz[i][j][k+1]) - wR*pzz[i][j][k]; } // Down Boundary
if (k==kDim-1) { p[i][j][k] = ((1+wR)*pz[i][j][k-1]) -wR*pzz[i][j][k]; } // Up Boundary
if (j==0) { p[i][j][k] = ((1+wR)*pz[i][j+1][k]) -wR*pzz[i][j][k]; } // Front Boundary
if (j==jDim-1) { p[i][j][k] = ((1+wR)*pz[i][j-1][k]) -wR*pzz[i][j][k]; } // Back Boundary
if (i==0) { p[i][j][k] = ((1+wR)*pz[i+1][j][k]) -wR*pzz[i][j][k]; } // Left Boundary
if (i==iDim-1) { p[i][j][k] = ((1+wR)*pz[i-1][j][k]) + (-wR*pzz[i][j][k]); } // Right Boundary
} // end k
} // end j
} // end i
// Insert Source
p[si][sj][sk] = p[si][sj][sk] + sourceFn[n] - sourceFn[n-2];
// Collect at Receiver
ir[n] = p[ri][rj][rk];
// Update temporal memory
for (int i=0; i<iDim; i++)
{
for (int j=0; j<jDim; j++)
{
for (int k=0; k<kDim; k++)
{
pzz[i][j][k] = pz[i][j][k];
pz[i][j][k] = p[i][j][k];
}
}
}
// Status
printf("Iteration n = %i out of %i \r", n, ns);
} // end n