I’ve instrumented the code as below using the isnan/grabbing the tiger by the tail method, and I came to the same lines of code that tera pointed out (the gem: I really don’t understand it either) as being the “injection point” for nan into the computation stream.
Here is the first part of the output I get, on CUDA 8.0.61, CentOS 7, Tesla K20X:
$ ./t956 2>&1 |more
25.398417 25.398417 25.398417
24575.970703 0.499999
16384.000000 -3.186736
0 1.000987 inf inf 0.999671
t956.cu:102: void my_assert(__nv_bool): block: [8,0,0], thread: [221,0,0] Assert
ion `cond` failed.
t956.cu:102: void my_assert(__nv_bool): block: [8,0,0], thread: [93,0,0] Asserti
on `cond` failed.
t956.cu:102: void my_assert(__nv_bool): block: [53,0,0], thread: [95,0,0] Assert
ion `cond` failed.
t956.cu:102: void my_assert(__nv_bool): block: [8,0,0], thread: [29,0,0] Asserti
on `cond` failed.
t956.cu:102: void my_assert(__nv_bool): block: [8,0,0], thread: [31,0,0] Asserti
on `cond` failed.
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, inf, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: 0.000000, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, 0.000000, 0.000000, 0.000000, 0.000000
nan58: 0.000000, 0.000000, 0.000000, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, 0.000000, 0.000000, 0.000000, 0.000000
nan58: 0.000000, 0.000000, 0.000000, 0.000000, 0.000000
Here is the “instrumented” code:
$ cat t956.cu
// version 01.08.16
// Pair potential
// nvcc -arch=sm_20 -ptxas-options=-v
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <fstream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <assert.h>
using namespace std;
#define threads 256
#define tpb 32
#define newtpb 864
const int nx=16;
const int ny=16;
const int nz=16;
const int n=4*nx*ny*nz;
const int blocks = (int)((n + threads - 1) / threads);
float rho=1.00;
float lc=pow(4.0/rho,1.0/3.0);
float lx=nx*lc;
float ly=ny*lc;
float lz=nz*lc;
float rv=1.0*2.50;
const float rc=2.50;
const float rc2=rc*rc;
const float csr=0.0010;
const float qp=0.010;
const float dt=0.010;
const float cv=dt/2.0;
const float cf=dt*dt/2.0;
const int mx=(int)(lx/rv);
const int my=(int)(ly/rv);
const int mz=(int)(lz/rv);
const int nmaxc=tpb;
const int ncel=mx*my*mz;
float cx=lx/(float)(mx);
float cy=ly/(float)(my);
float cz=lz/(float)(mz);
__constant__ float rcd;
__constant__ float rc2d;
__constant__ float csrd;
__constant__ float qpd;
__constant__ float dtd;
__constant__ float cvd;
__constant__ float cfd;
__constant__ int mxd;
__constant__ int myd;
__constant__ int mzd;
__constant__ int nmaxcd;
__constant__ int nceld;
//===========================================================================
// kernel function: particle positions in fcc
//===========================================================================
void c_fcc(float d,float3 l,float3 *r)
{
float d2=d/2.0;
float3 l2={l.x/2.0, l.y/2.0, l.z/2.0};
int s=0;
for (int k=0;k<nz;k++)
{ for (int i=0;i<nx;i++)
{ for (int j=0;j<ny;j++)
{ r[s].x=-l2.x+float(i)*d+d2;
r[s].y=-l2.y+float(j)*d+d2;
r[s].z=-l2.z+float(k)*d+d2;
s=s+1;
r[s].x=-l2.x+float(i)*d+d;
r[s].y=-l2.y+float(j)*d+d;
r[s].z=-l2.z+float(k)*d+d2;
s=s+1;
r[s].x=-l2.x+float(i)*d+d;
r[s].y=-l2.y+float(j)*d+d2;
r[s].z=-l2.z+float(k)*d+d;
s=s+1;
r[s].x=-l2.x+float(i)*d+d2;
r[s].y=-l2.y+float(j)*d+d;
r[s].z=-l2.z+float(k)*d+d;
s=s+1;
} } }
for (int i=0;i<s;i++)
{ r[i].x=(r[i].x>=l2.x?r[i].x-=l.x:r[i].x=r[i].x);
r[i].x=(r[i].x<-l2.x?r[i].x+=l.x:r[i].x=r[i].x);
r[i].y=(r[i].y>=l2.y?r[i].y-=l.y:r[i].y=r[i].y);
r[i].y=(r[i].y<-l2.y?r[i].y+=l.y:r[i].y=r[i].y);
r[i].z=(r[i].z>=l2.z?r[i].z-=l.z:r[i].z=r[i].z);
r[i].z=(r[i].z<-l2.z?r[i].z+=l.z:r[i].z=r[i].z);
}
}
//===========================================================================
// kernel function: initial velocities
//===========================================================================
__device__ void my_assert(bool cond){ assert(cond);}
void c_vel(int seed, float temp, float3 *v)
{
float3 sumv;
sumv.x=0.0;
sumv.y=0.0;
sumv.z=0.0;
srand(seed);
for (int i=0;i<n;i++)
{ v[i].x=(float)rand()/(float)RAND_MAX-0.5;
sumv.x+=v[i].x;
v[i].y=(float)rand()/(float)RAND_MAX-0.5;
sumv.y+=v[i].y;
v[i].z=(float)rand()/(float)RAND_MAX-0.5;
sumv.z+=v[i].z;
}
sumv.x=sumv.x/float(n);
sumv.y=sumv.y/float(n);
sumv.z=sumv.z/float(n);
float3 sumv2;
sumv2.x=0.0;
sumv2.y=0.0;
sumv2.z=0.0;
for (int i=0;i<n;i++)
{ v[i].x-=sumv.x;
v[i].y-=sumv.y;
v[i].z-=sumv.z;
sumv2.x+=v[i].x*v[i].x;
sumv2.y+=v[i].y*v[i].y;
sumv2.z+=v[i].z*v[i].z;
}
float fs=sqrt(3.0*float(n)*temp/(sumv2.x+sumv2.y+sumv2.z));
for (int i=0;i<n;i++)
{ v[i].x=v[i].x*fs;
v[i].y=v[i].y*fs;
v[i].z=v[i].z*fs;
}
}
//===========================================================================
// kernel function: total forces on CPU
//===========================================================================
void cpu_forces(float3 l2, float3 l, float3 *r, float3 *a)
{
for (int i=0;i<n;i++)
{ float3 bi=r[i];
float3 ai={0.0, 0.0, 0.0};
for (int j=0;j<n;j++)
{ float3 bj=r[j];
float3 rij;
rij.x = bi.x - bj.x;
rij.y = bi.y - bj.y;
rij.z = bi.z - bj.z;
rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
if (rij2<rc2&i!=j)
{ float absrij=sqrt(rij2);
float rmrc= absrij-rc;
float rmrc2=rmrc*rmrc;
float rmrc4=rmrc2*rmrc2;
float irij2 = 1.0/rij2;
float irij6 = irij2 * irij2 * irij2;
float ur = 4.0 * irij6 * (irij6-1.0);
float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
float sr=rmrc4/(csr+rmrc4);
float s1r=4.0*csr*rmrc*rmrc2/((csr+rmrc4)*(csr+rmrc4)*absrij);
float fij=-(ur*s1r+u1r*sr);
ai.x+= rij.x * fij;
ai.y+= rij.y * fij;
ai.z+= rij.z * fij;
} }
a[i]=ai;
} }
//===========================================================================
// device function: pair interactions
//===========================================================================
__device__ float4
bodyBodyInteractionx(float3 l2, float3 l, float3 bi, float3 bj, float4 ai)
{
float3 rij;
rij.x = bi.x - bj.x;
rij.y = bi.y - bj.y;
rij.z = bi.z - bj.z;
rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
if (rij2<rc2d)
{ float irij2=1.0/rij2;
float irij6 = irij2 * irij2 * irij2;
float fij = 48.0 * irij6 * (irij6 - 0.50) * irij2;;
ai.x+= rij.x * fij;
ai.y+= rij.y * fij;
ai.z+= rij.z * fij;
ai.w+= rij2 * fij;
}
return ai;
}
//===========================================================================
// device function: pair interactions
//===========================================================================
__device__ float4
bodyBodyInteraction(float3 l2, float3 l, float3 bi, float3 bj, float4 ai)
{
float3 rij;
rij.x = bi.x - bj.x;
rij.y = bi.y - bj.y;
rij.z = bi.z - bj.z;
rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
if (rij2<rc2d)
{ float irij2=1.0/rij2;
float irij6 = irij2 * irij2 * irij2;
float absrij = sqrt(rij2);
float rmrc= absrij-rcd;
float rmrc2=rmrc*rmrc;
float rmrc4=rmrc2*rmrc2;
float ur = 4.0 * irij6 * (irij6-1.0);
float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
float sr = rmrc4/(csrd+rmrc4);
float s1r = 4.0*csrd*rmrc*rmrc2/((csrd+rmrc4)*(csrd+rmrc4)*absrij);
float fij= -(ur*s1r+u1r*sr);
ai.x+= rij.x * fij;
ai.y+= rij.y * fij;
ai.z+= rij.z * fij;
ai.w+= rij2 * fij;
}
return ai;
}
__device__ float4
newbodyBodyInteraction(float3 l2, float3 l, float3 bi, float3 bj)
{
float3 rij;
float4 ai;
rij.x = bi.x - bj.x;
rij.y = bi.y - bj.y;
rij.z = bi.z - bj.z;
rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
if (rij2<rc2d)
{ float irij2=1.0/rij2;
float irij6 = irij2 * irij2 * irij2;
float absrij = sqrt(rij2);
float rmrc= absrij-rcd;
float rmrc2=rmrc*rmrc;
float rmrc4=rmrc2*rmrc2;
float ur = 4.0 * irij6 * (irij6-1.0);
float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
float sr = rmrc4/(csrd+rmrc4);
float s1r = 4.0*csrd*rmrc*rmrc2/((csrd+rmrc4)*(csrd+rmrc4)*absrij);
float fij= -(ur*s1r+u1r*sr);
ai.x= rij.x * fij;
ai.y= rij.y * fij;
ai.z= rij.z * fij;
ai.w= rij2 * fij;
}
return ai;
}
//===========================================================================
// device function: forces in GPU
//===========================================================================
__global__ void
calculate_forces(float3 l2, float3 l, float3 *r_d, float3 *a_d, float *w_d)
{
__shared__ float3 shPosition[threads];
float3 myPosition;
float4 acc={0.0, 0.0, 0.0, 0.0};
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
myPosition = r_d[gtid];
for (int tile = 0; tile < blocks; tile++)
{
int idx = tile * threads + threadIdx.x;
shPosition[threadIdx.x] = r_d[idx];
__syncthreads();
for (int j = 0; j < threads; j++)
{ if(gtid!=j + tile * threads)
{
acc = bodyBodyInteraction(l2 ,l ,myPosition, shPosition[j], acc);
}
}
__syncthreads();
}
// Save the result in global memory for the integration step.
// float4 acc3 = {acc.x, acc.y, acc.z, acc.w};
a_d[gtid].x = acc.x;
a_d[gtid].y = acc.y;
a_d[gtid].z = acc.z;
w_d[gtid] = acc.w;
}
//===========================================================================
// cuda: first step velocity Verlet's integration
//===========================================================================
__global__ void intstep1(float3 l, float3 l2, float *cr_d,
float3 *r_d, float3 *v_d, float3 *a_d)
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
float3 ri, vi, ai;
ri=r_d[i];
if (isnan(ri.x)) {printf("nan40\n"); my_assert(0);}
if (isnan(ri.y)) {printf("nan41\n"); my_assert(0);}
if (isnan(ri.z)) {printf("nan42\n"); my_assert(0);}
ai=a_d[i];
if (isnan(ai.x)) {printf("nan43\n"); my_assert(0);}
if (isnan(ai.y)) {printf("nan44\n"); my_assert(0);}
if (isnan(ai.z)) {printf("nan45\n"); my_assert(0);}
vi=v_d[i];
if (isnan(vi.x)) {printf("nan46\n"); my_assert(0);}
if (isnan(vi.y)) {printf("nan47\n"); my_assert(0);}
if (isnan(vi.z)) {printf("nan48\n"); my_assert(0);}
float cri=cr_d[i];
if (isnan(cri)) {printf("nan49\n"); my_assert(0);}
ri.x+=cri*(vi.x*dtd+ai.x*cfd);
if (isnan(ri.x)) {printf("nan52\n"); my_assert(0);}
ri.y+=cri*(vi.y*dtd+ai.y*cfd);
if (isnan(ri.y)) {printf("nan53\n"); my_assert(0);}
ri.z+=cri*(vi.z*dtd+ai.z*cfd);
if (isnan(ri.z)) {printf("nan54\n"); my_assert(0);}
vi.x+=cri*ai.x*cvd;
if (isnan(vi.x)) {printf("nan55\n"); my_assert(0);}
vi.y+=cri*ai.y*cvd;
if (isnan(vi.y)) {printf("nan56\n"); my_assert(0);}
vi.z+=cri*ai.z*cvd;
if (isnan(vi.y)) {printf("nan57\n"); my_assert(0);}
float my_rix = ri.x;
float my_l2x = l2.x;
float my_lx = l.x;
ri.x=(ri.x>=l2.x?ri.x-=l.x:ri.x=ri.x);
if (isnan(ri.x)) {printf("nan58: %f, %f, %f, %f, %f\n", my_rix, my_l2x, my_lx, float(my_rix-=my_lx), float(my_rix=my_rix)); my_assert(0);}
ri.x=(ri.x<-l2.x?ri.x+=l.x:ri.x=ri.x);
if (isnan(ri.x)) {printf("nan59\n"); my_assert(0);}
ri.y=(ri.y>=l2.y?ri.y-=l.y:ri.y=ri.y);
if (isnan(ri.y)) {printf("nan60\n"); my_assert(0);}
ri.y=(ri.y<-l2.y?ri.y+=l.y:ri.y=ri.y);
if (isnan(ri.y)) {printf("nan61\n"); my_assert(0);}
ri.z=(ri.z>=l2.z?ri.z-=l.z:ri.z=ri.z);
if (isnan(ri.z)) {printf("nan62\n"); my_assert(0);}
ri.z=(ri.z<-l2.z?ri.z+=l.z:ri.z=ri.z);
if (isnan(ri.z)) {printf("nan63\n"); my_assert(0);}
r_d[i]=ri;
v_d[i]=vi;
}
//===========================================================================
// cuda: second step of velocity Verlet's integration
//===========================================================================
__global__ void intstep2(float *cr_d,float3 *v_d, float3 *a_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
float3 vi=v_d[i];
float3 ai=a_d[i];
float cri=cr_d[i];
vi.x+=cri*ai.x*cvd;
vi.y+=cri*ai.y*cvd;
vi.z+=cri*ai.z*cvd;
v_d[i]=vi;
}
//===========================================================================
// cuda: first step velocity Verlet's integration
//===========================================================================
__global__ void intstep1z(float sfr, float3 l, float3 l2, float *cr_d,
float3 *r_d, float3 *v_d, float3 *a_d)
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
float3 ri, vi, ai;
ri=r_d[i];
ai=a_d[i];
vi=v_d[i];
float cri=cr_d[i];
ri.z=((sfr-1.0)*cri+1.0)*ri.z;
ri.x+=cri*(vi.x*dtd+ai.x*cfd);
ri.y+=cri*(vi.y*dtd+ai.y*cfd);
ri.z+=cri*(vi.z*dtd+ai.z*cfd);
vi.x+=cri*ai.x*cvd;
vi.y+=cri*ai.y*cvd;
vi.z+=cri*ai.z*cvd;
ri.x=(ri.x>=l2.x?ri.x-=l.x:ri.x=ri.x);
ri.x=(ri.x<-l2.x?ri.x+=l.x:ri.x=ri.x);
ri.y=(ri.y>=l2.y?ri.y-=l.y:ri.y=ri.y);
ri.y=(ri.y<-l2.y?ri.y+=l.y:ri.y=ri.y);
ri.z=(ri.z>=l2.z?ri.z-=l.z:ri.z=ri.z);
ri.z=(ri.z<-l2.z?ri.z+=l.z:ri.z=ri.z);
r_d[i]=ri;
v_d[i]=vi;
}
//===========================================================================
// kernel function: write configuration
//===========================================================================
void c_confout
(int ist,int i,float3 l,float3 *r)
{
char filename[256];
ofstream file;
sprintf(filename,"conf%d.%d",ist,i);
file.open(filename);
{ file << l.x << '\t' << l.y << '\n';}
for (int k=0;k<n;k++)
{
{file << r[k].x << '\t' << r[k].y << '\t' << r[k].z << '\n';}
}
file.close();
}
//===========================================================================
// kernel function: write velocities
//===========================================================================
void c_velout
(int ist, int i, float3 *v)
{
char filename[256];
ofstream file;
sprintf(filename,"vel%d.%d",ist,i);
file.open(filename);
for (int k=0;k<n;k++)
{
{file << v[k].x << '\t' << v[k].y << '\t' << v[k].z << '\n';}
}
file.close();
}
//===========================================================================
// kernel function: read configuration
//===========================================================================
void c_confin
(int ist, int i, float3 l, float3 *r)
{
char filename[256];
ifstream file;
sprintf(filename,"conf%d.%d",ist,i);
file.open(filename);
file >> l.x >> l.y >> l.z ;
for (int k=0;k<n;k++)
{
file >> r[k].x >> r[k].y >> r[k].z ;
}
file.close();
}
//===========================================================================
// kernel function: read velocities
//===========================================================================
void c_velin
(int ist, int i, float3 *v)
{
char filename[256];
ifstream file;
sprintf(filename,"vel%d.%d",ist,i);
file.open(filename);
for (int k=0;k<n;k++)
{
file >> v[k].x >> v[k].y >> v[k].z;
}
file.close();
}
//===========================================================================
// cuda: (twice) the particles kinetic energy
//===========================================================================
__global__ void ekini(float3 *v_d,float *k_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
k_d[i]=v_d[i].x*v_d[i].x+v_d[i].y*v_d[i].y+v_d[i].z*v_d[i].z;
}
//===========================================================================
// device function: pair interactions
//===========================================================================
__device__ float
bodyBodyPotential(float3 l2, float3 l, float3 bi, float3 bj, float ui)
{
float3 rij;
ui=0.0;
rij.x = bi.x - bj.x;
rij.y = bi.y - bj.y;
rij.z = bi.z - bj.z;
rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
if (rij2<rc2d)
{ float irij2 = 1.0/rij2;
float irij6 = irij2 * irij2 * irij2;
float rmrc = sqrt(rij2)-rcd;
float rmrc2 = rmrc*rmrc;
float rmrc4 = rmrc2*rmrc2;
float sr = rmrc4/(csrd+rmrc4);
float uij = 4.0 * sr * irij6 * (irij6-1.0);
ui+= uij;
}
return ui;
}
//===========================================================================
// device function: forces in GPU
//===========================================================================
__global__ void
epoti(float3 l2, float3 l, float3 *r_d, float *u_d)
{
__shared__ float3 shPosition[threads];
float3 myPosition;
float ui = 0.0;
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
myPosition = r_d[gtid];
for (int tile = 0; tile < blocks; tile++)
{
int idx = tile * threads + threadIdx.x;
shPosition[threadIdx.x] = r_d[idx];
__syncthreads();
for (int j = 0; j < threads; j++)
{ if(gtid!=j + tile * threads)
{
ui = bodyBodyPotential(l2 ,l ,myPosition, shPosition[j], ui);
}
}
__syncthreads();
}
// Save the result in global memory
u_d[gtid] = ui;
}
//===========================================================================
// device function: sum elements of vector a (reduction)
//===========================================================================
__global__ void sumvec(float *a, float *c, int blocks)
{
__shared__ float cache[threads];
int tid=threadIdx.x+blockIdx.x*blockDim.x;
int cacheIndex=threadIdx.x;
float temp=0.0;
while (tid<blocks)
{
temp +=a[tid];
tid+=blockDim.x*gridDim.x;
}
cache[cacheIndex]=temp;
__syncthreads();
int i=blockDim.x/2;
while (i!=0)
{
if (cacheIndex<i)
cache[cacheIndex]+=cache[cacheIndex+i];
__syncthreads();
i/=2;
}
if (cacheIndex==0)
c[blockIdx.x]=cache[0];
}
//===========================================================================
// device: scale coordinates by a factor sf
//===========================================================================
__global__ void scalcoord(float sf,float3 *v_d)
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
v_d[i].x=v_d[i].x*sf;
v_d[i].y=v_d[i].y*sf;
v_d[i].z=v_d[i].z*sf;
}
//===========================================================================
// device: define values 0 or 1
//===========================================================================
__global__ void setval(float a, float b, float3 *r_d, float *cr_d)
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
cr_d[i]=1.0;
if ((r_d[i].z>a)&&(r_d[i].z<b))
cr_d[i]=0.0;
}
//===========================================================================
// cuda function: forces (version b)
//===========================================================================
__global__ void calculate_forcesb
(float3 l2, float3 l, int* mapd, int* celld,
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
float4 acc={0.0, 0.0, 0.0, 0.0};
float3 myPosition = r_d[i];
int offseti=celld[i]*27;
for (int t=0;t<27;t++)
{ int celj=mapd[offseti+t];
int offsetj=celj*nmaxcd;
for (int k=0;k<npcd[celj];k++)
{ int j=lstcd[offsetj+k];
if (j!=i)
acc = bodyBodyInteraction(l2 ,l ,myPosition, r_d[j], acc);
}
}
a_d[i].x = acc.x;
a_d[i].y = acc.y;
a_d[i].z = acc.z;
w_d[i] = acc.w;
}
//===========================================================================
// cuda function: forces
//===========================================================================
__global__ void forces_shrange
(float3 l2, float3 l, int* mapd, int* celld,
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{
int celi=blockIdx.x;
int offseti=celi*27;
int tid=threadIdx.x;
int np_celi=npcd[celi];
float3 myPosition;
int myip;
__shared__ float3 shposjp[tpb]; // nmaxcd must be defined at beginning
float4 acc={0.0, 0.0, 0.0, 0.0};
if(tid<np_celi)
{
myip=lstcd[celi*nmaxcd+tid];
myPosition = r_d[myip];
}
for (int t=0;t<27;t++)
{
int celj=mapd[offseti+t];
int offsetj=celj*nmaxcd;
if(tid<npcd[celj])
{
shposjp[tid]=r_d[lstcd[offsetj+tid]];
}
__syncthreads();
/*if(npcd[celi]>32 || npcd[celj]>32)
{printf("\n Shhhiiiit \n");}
*/
for(int j=0;j<npcd[celj];j++)
{
if((tid<np_celi) && (celi!=celj|| tid!=j)) //to avoid selfinteraction
{
acc = bodyBodyInteraction(l2 ,l ,myPosition, shposjp[j], acc);
}
}
__syncthreads();
}
if(tid<np_celi)
{
a_d[myip].x = acc.x;
a_d[myip].y = acc.y;
a_d[myip].z = acc.z;
w_d[myip] = acc.w;
}
}
//===========================================================================
// cuda function: forces, alternate
//===========================================================================
__global__
//__launch_bounds__(newtpb, 2 )
void alt_forces_shrange
(float3 l2, float3 l, int* mapd, int* celld,
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{
// __shared__ float3 shposip[tpb]; // tpb=nmaxcd must be defined at beginning
// __shared__ int shin_celli[tpb];
__shared__ float4 shacc[newtpb];
int celi=blockIdx.x;
int tid=threadIdx.x;
int ni_celli=tid/32; //this number is between 0 and 26, the warp id
int np_celli=npcd[celi]; //number of particles in celli
int celj=mapd[celi*27+ni_celli]; //index of the local neighboring cell
int np_cellj=npcd[celj]; //number of particles in cellj
int myjp=-100; // some of the threads will do nothing becasue tthere are less than 32 particles
int s=tid%32; // each thread gets a number between 0 and 31
//if(s==0 && celi==33) {printf(" %d %d %d %d %d tid= %d\n", celi,celj,ni_celli,np_celli,np_cellj,tid);}
float3 myPosjp;
if(s<np_cellj)
{
myjp=lstcd[celj*nmaxc+s];
myPosjp=r_d[myjp];
if (isnan(myPosjp.x)) {printf("nan20: %d,%d,%d\n", celj,nmaxc,s); my_assert(0);}
if (isnan(myPosjp.y)) {printf("nan21: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(myPosjp.z)) {printf("nan22: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
}
__syncthreads();
// we load the particles of the celi on the shared memory in each cell at all time
/*
if(npcd[celi]>32 || npcd[celj]>32)
{printf("\n Shhhiiiit \n");}
*/
/*
if(tid<np_celli)
{
int ip=lstcd[celi*nmaxcd+tid];
shin_celli[tid]=ip;
shposip[tid]=r_d[ip];
}
*/
__syncthreads(); //at this point all data should be on in the block variables
for(int iii=0;iii<np_celli;iii++)
{
int myip=lstcd[celi*nmaxc+iii]; // int myip=shin_celli[iii];
float3 myPosip=r_d[myip]; //float3 myPosip=shposip[iii];
float4 acc={0.0,0.0,0.0,0.0};
shacc[tid]=acc;
__syncthreads();
if( myjp>=0 && myjp!=myip && s<np_cellj) //to avoid selfinteraction
{
if (isnan(myPosip.x)) {printf("nan8: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(myPosip.y)) {printf("nan9: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(myPosip.z)) {printf("nan10: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(myPosjp.x)) {printf("nan11: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(myPosjp.y)) {printf("nan12: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(myPosjp.z)) {printf("nan13: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(l2.x)) {printf("nan14: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(l2.y)) {printf("nan15: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(l2.z)) {printf("nan16: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(l.x)) {printf("nan17: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(l.y)) {printf("nan18: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(l.z)) {printf("nan19: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
acc= newbodyBodyInteraction(l2 ,l ,myPosip, myPosjp);
if (isnan(acc.x)) {printf("nan4: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(acc.y)) {printf("nan5: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(acc.z)) {printf("nan6: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
if (isnan(acc.w)) {printf("nan7: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
}
shacc[tid]=acc;
__syncthreads();
// reduction on the shared memory
if(tid>=512)
{
shacc[tid-512].x+=shacc[tid].x;
shacc[tid-512].y+=shacc[tid].y;
shacc[tid-512].z+=shacc[tid].z;
shacc[tid-512].w+=shacc[tid].w;
}
__syncthreads();
for(int l=256;l>=1;l=l/2)
{
if(tid<l)
{
shacc[tid].x+=shacc[tid+l].x;
shacc[tid].y+=shacc[tid+l].y;
shacc[tid].z+=shacc[tid+l].z;
shacc[tid].w+=shacc[tid+l].w;
}
__syncthreads();
}
__syncthreads();
if(tid==0)
{
if (isnan(shacc[0].x)) {printf("nan0: %d,%d\n", myip,blockIdx.x); my_assert(0);}
if (isnan(shacc[0].y)) {printf("nan1: %d,%d\n", myip,blockIdx.x); my_assert(0);}
if (isnan(shacc[0].z)) {printf("nan2: %d,%d\n", myip,blockIdx.x); my_assert(0);}
if (isnan(shacc[0].w)) {printf("nan3: %d,%d\n", myip,blockIdx.x); my_assert(0);}
a_d[myip].x = shacc[0].x;
a_d[myip].y = shacc[0].y;
a_d[myip].z = shacc[0].z;
w_d[myip] = shacc[0].w;
}
__syncthreads();
}
}
__global__
//__launch_bounds__(newtpb, 2 )
void newalt_forces_shrange
(float3 l2, float3 l, int* mapd, int* celld,
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{
// __shared__ float3 shposip[tpb]; // tpb=nmaxcd must be defined at beginning
// __shared__ int shin_celli[tpb];
__shared__ float4 shacc[newtpb];
int celi=blockIdx.x;
int tid=threadIdx.x;
int ni_celli=tid/32; //this number is between 0 and 26, the warp id
int np_celli=npcd[celi]; //number of particles in celli
int celj=mapd[celi*27+ni_celli]; //index of the local neighboring cell
int np_cellj=npcd[celj]; //number of particles in cellj
int myjp=-100; // some of the threads will do nothing becasue tthere are less than 32 particles
int s=tid%32; // each thread gets a number between 0 and 31
//if(s==0 && celi==33) {printf(" %d %d %d %d %d tid= %d\n", celi,celj,ni_celli,np_celli,np_cellj,tid);}
float3 myPosjp;
if(s<np_cellj)
{
myjp=lstcd[celj*nmaxc+s];
myPosjp=r_d[myjp];
}
__syncthreads();
// we load the particles of the celi on the shared memory in each cell at all time
/*
if(npcd[celi]>32 || npcd[celj]>32)
{printf("\n Shhhiiiit \n");}
*/
/*
if(tid<np_celli)
{
int ip=lstcd[celi*nmaxcd+tid];
shin_celli[tid]=ip;
shposip[tid]=r_d[ip];
}
*/
__syncthreads(); //at this point all data should be on in the block variables
for(int iii=0;iii<np_celli;iii++)
{
int myip=lstcd[celi*nmaxc+iii]; // int myip=shin_celli[iii];
float3 myPosip=r_d[myip]; //float3 myPosip=shposip[iii];
float4 acc={0.0,0.0,0.0,0.0};
shacc[tid]=acc;
__syncthreads();
if( myjp>=0 && myjp!=myip && s<np_cellj) //to avoid selfinteraction
{
acc= newbodyBodyInteraction(l2 ,l ,myPosip, myPosjp);
}
shacc[tid]=acc;
__syncthreads();
// reduction on the shared memory
if(tid>=512)
{
shacc[tid-512].x+=shacc[tid].x;
shacc[tid-512].y+=shacc[tid].y;
shacc[tid-512].z+=shacc[tid].z;
shacc[tid-512].w+=shacc[tid].w;
}
__syncthreads();
for(int l=256;l>=1;l=l/2)
{
if(tid<l)
{
shacc[tid].x+=shacc[tid+l].x;
shacc[tid].y+=shacc[tid+l].y;
shacc[tid].z+=shacc[tid+l].z;
shacc[tid].w+=shacc[tid+l].w;
}
__syncthreads();
}
__syncthreads();
if(tid==0)
{
a_d[myip].x = shacc[0].x;
a_d[myip].y = shacc[0].y;
a_d[myip].z = shacc[0].z;
w_d[myip] = shacc[0].w;
}
__syncthreads();
}
}
//===========================================================================
// cuda: cell index of particles
//===========================================================================
__global__ void celli(float3 c,float3 l2,float3 *r_d,int *celld)
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
int mxi,myi,mzi;
mxi=(int)((r_d[i].x+l2.x)/c.x);
myi=(int)((r_d[i].y+l2.y)/c.y);
mzi=(int)((r_d[i].z+l2.z)/c.z);
if (mxi>=mxd) mxi-=mxd;
if (myi>=myd) myi-=myd;
if (mzi>=mzd) mzi-=mzd;
mxi=(mxi+mxd)%mxd;
myi=(myi+myd)%myd;
mzi=(mzi+mzd)%mzd;
celld[i]=mxi+myi*mxd+mzi*mxd*myd;
}
//===========================================================================
// cuda: list of number and indexes of particles belonging to a given cell
//===========================================================================
__global__ void hist(int *npcd, int *lstcd,int *celld)
{ int j,temp;
int i=blockIdx.x*blockDim.x+threadIdx.x;
j=celld[i];
temp = atomicAdd(&npcd[j],1);
lstcd[j*nmaxcd+temp]=i;
}
//===========================================================================
// kernel function: map cell and neighbor cell indexes
//===========================================================================
void c_initlcl(int *map)
{ int i,j,k,u,v,w,s;
for (i=0;i<mx;i++)
{ for (j=0;j<my;j++)
{ for (k=0;k<mz;k++)
{ int imap=((i+mx)%mx+((j+my)%my)*mx+((k+mz)%mz)*mx*my)*27;
s=0;
for (u=-1;u<=1;u++)
{ for (v=-1;v<=1;v++)
{ for (w=-1;w<=1;w++)
{ map[imap+s]=((i+u+mx)%mx+((j+v+my)%my)*mx+((k+w+mz)%mz)*mx*my);
s=s+1;
} } } } } } }
__global__ void nan_kernel(float3 *d, int len, int i){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < len){
if (isnan(d[idx].x)) {printf("nan30: %d\n", i); my_assert(0);}
if (isnan(d[idx].y)) {printf("nan31: %d\n", i); my_assert(0);}
if (isnan(d[idx].z)) {printf("nan32: %d\n", i); my_assert(0);}
}
}
//===========================================================================
// MAIN PROGRAM
//===========================================================================
int main() {
cudaSetDevice(0);
int seed = 0;
float3 l = {lx, ly, lz};
float3 r[n], v[n];
int map[27*ncel];
float3 *r_d, *v_d, *a_d;
float *u_d, *k_d, *c1, *c2, *w_d, *cr_d;
int *mapd, *celld, *npcd, *lstcd;
float ekin, epot, temp, temp0, w, sfv, pres, vol, sfr, pres0;
temp0=0.50;
pres0=0.10;
float3 l2 = {l.x/2.0, l.y/2.0, l.z/2.0};
float3 c = {l.x/(float)(mx), l.y/(float)(my), l.z/(float)(mz)};
cudaMalloc((void**)&r_d, n*sizeof(float3));
cudaMalloc((void**)&v_d, n*sizeof(float3));
cudaMalloc((void**)&a_d, n*sizeof(float3));
cudaMalloc((void**)&u_d, n*sizeof(float));
cudaMalloc((void**)&k_d, n*sizeof(float));
cudaMalloc((void**)&w_d, n*sizeof(float));
cudaMalloc((void**)&c1, n*sizeof(float));
cudaMalloc((void**)&c2, n*sizeof(float));
cudaMalloc((void**)&cr_d, n*sizeof(float));
cudaMalloc((void**)&mapd,27*ncel*sizeof(int));
cudaMalloc((void**)&lstcd,ncel*nmaxc*sizeof(int));
cudaMalloc((void**)&npcd,ncel*sizeof(int));
cudaMalloc((void**)&celld,n*sizeof(int));
cudaMemcpyToSymbol(rcd,&rc,sizeof(float));
cudaMemcpyToSymbol(rc2d,&rc2,sizeof(float));
cudaMemcpyToSymbol(csrd,&csr,sizeof(float));
cudaMemcpyToSymbol(qpd,&qp,sizeof(float));
cudaMemcpyToSymbol(dtd,&dt,sizeof(float));
cudaMemcpyToSymbol(cvd,&cv,sizeof(float));
cudaMemcpyToSymbol(cfd,&cf,sizeof(float));
cudaMemcpyToSymbol(mxd,&mx,sizeof(int));
cudaMemcpyToSymbol(myd,&my,sizeof(int));
cudaMemcpyToSymbol(mzd,&mz,sizeof(int));
cudaMemcpyToSymbol(nmaxcd,&nmaxc,sizeof(int));
cudaMemcpyToSymbol(nceld,&ncel,sizeof(int));
c_initlcl(map);
c_fcc(lc,l,r);
c_vel(seed,temp0,v);
cudaMemcpy(r_d, r, n*sizeof(float3), cudaMemcpyHostToDevice);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 1);
cudaMemcpy(v_d, v, n*sizeof(float3), cudaMemcpyHostToDevice);
cudaMemcpy(mapd,map,27*ncel*sizeof(int),cudaMemcpyHostToDevice);
cudaMemsetAsync(npcd,0,ncel*sizeof(int));
celli<<<blocks,threads>>>(c,l2,r_d,celld);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 2);
hist<<<blocks,threads>>>(npcd,lstcd,celld);
calculate_forcesb<<<blocks,threads>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 3);
sumvec<<<blocks,threads>>>(w_d,c1,n);
sumvec<<<blocks,threads>>>(c1,c2,blocks);
cudaMemcpy(&w,c2,sizeof(float),cudaMemcpyDeviceToHost);
ekini<<<blocks,threads>>>(v_d, k_d);
sumvec<<<blocks,threads>>>(k_d,c1,n);
sumvec<<<blocks,threads>>>(c1,c2,blocks);
cudaMemcpy(&ekin,c2,sizeof(float),cudaMemcpyDeviceToHost);
temp=ekin/(3.0*float(n));
vol=l.x*l.y*l.z;
pres=(float(n)*temp+w/6.0)/vol;
printf("%f\t%f\t%f\n",l.x,l.y,l.z);
printf("%f\t%f\n",ekin,temp);
printf("%f\t%f\n",vol,pres);
setval<<<blocks,threads>>>(l.z+1.0,-l.z-1.0,r_d,cr_d);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 4);
//========================================================
// Main-Loop 1:
//========================================================
for (int i=0;i<10;i++)
{
// propagation of coordinates and velocities
sfr=1.0+qp*dt*(pres-pres0);
l.x=sfr*l.x;
l.y=sfr*l.y;
l.z=sfr*l.z;
l2.x = l.x/2.0;
l2.y = l.y/2.0;
l2.z = l.z/2.0;
c.x=l.x/(float)(mx);
c.y=l.y/(float)(my);
c.z=l.z/(float)(mz);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 5);
scalcoord<<<blocks,threads>>>(sfr,r_d);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 6);
intstep1<<<blocks,threads>>>(l,l2,cr_d,r_d,v_d,a_d);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 7);
cudaMemsetAsync(npcd,0,ncel*sizeof(int));
celli<<<blocks,threads>>>(c,l2,r_d,celld);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 8);
hist<<<blocks,threads>>>(npcd,lstcd,celld);
//calculate_forcesb<<<blocks,threads>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
//forces_shrange<<<ncel,tpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
alt_forces_shrange<<<ncel,newtpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 9);
intstep2<<<blocks,threads>>>(cr_d,v_d,a_d);
sumvec<<<blocks,threads>>>(w_d,c1,n);
sumvec<<<blocks,threads>>>(c1,c2,blocks);
cudaMemcpy(&w,c2,sizeof(float),cudaMemcpyDeviceToHost);
ekini<<<blocks,threads>>>(v_d, k_d);
sumvec<<<blocks,threads>>>(k_d,c1,n);
sumvec<<<blocks,threads>>>(c1,c2,blocks);
cudaMemcpy(&ekin,c2,sizeof(float),cudaMemcpyDeviceToHost);
temp=ekin/(3.0*float(n));
vol=l.x*l.y*l.z;
pres=(float(n)*temp+w/6.0)/vol;
printf("%i\t%f\t%f\t%f\t%f\n",i,n/vol,temp,pres,sfr);
if (i%300==0)
{
scalcoord<<<blocks,threads>>>(sfv,v_d);
}
if (i%1000==0)
{ cudaMemcpy(r,r_d,n*sizeof(float3),cudaMemcpyDeviceToHost);
cudaMemcpy(v,v_d,n*sizeof(float3),cudaMemcpyDeviceToHost);
c_confout(seed,i,l,r);
c_velout(seed,i,v);
}
}
exit(0);
cudaFree(r_d);
cudaFree(v_d);
cudaFree(a_d);
}
The nan58 printout occurs at line 361.