/* * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. * * NOTICE TO USER: * * This source code is subject to NVIDIA ownership rights under U.S. and * international Copyright laws. Users and possessors of this source code * are hereby granted a nonexclusive, royalty-free license to use this code * in individual and commercial software. * * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE * OR PERFORMANCE OF THIS SOURCE CODE. * * U.S. Government End Users. This source code is a "commercial item" as * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of * "commercial computer software" and "commercial computer software * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) * and is provided to the U.S. Government only as a commercial end item. * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the * source code with only those rights set forth herein. * * Any use of this source code in individual and commercial software must * include, in the user documentation and internal comments to the code, * the above Disclaimer and U.S. Government End Users Notice. */ /* Template project which demonstrates the basics on how to setup a project * example application. * Device code. */ #ifndef _TEMPLATE_KERNEL_H_ #define _TEMPLATE_KERNEL_H_ #include #include #define SDATA( index) CUT_BANK_CHECKER(sdata, index) //////////////////////////////////////////////////////////////////////////////// //! My functions for device called by kernel //////////////////////////////////////////////////////////////////////////////// __constant__ const int Start = 1, StartSingleStep = -1, NormalStep = 2, SingleStep = -2, SmallRelErrorBound = 3, TooManyIterations = 4, SmallAbsErrorBound = 5, MinimumStepReached = 6, TooManyCalls = 7, InvalidParameters = 8, UnsolvableProblem = 9; __constant__ const float // Runge-Kutta-Fehlberg formula of 4-th order a[6] = { 0.0, 1.0/4.0, 3.0/8.0, 12.0/13.0, 1.0, 1.0/2.0 }, b[6][5] = { { 0.0, 0.0, 0.0, 0.0, 0.0 }, { 1.0/4.0, 0.0, 0.0, 0.0, 0.0 }, { 3.0/32.0, 9.0/32.0, 0.0, 0.0, 0.0 }, { 1932.0/2197.0, -7200.0/2197.0, 7296.0/2197.0, 0.0, 0.0 }, { 439.0/216.0, -8.0, 3680.0/513.0, -845.0/4104.0, 0.0 }, { -8.0/27.0, 2.0, -3544.0/2565.0, 1859.0/4104.0, -11.0/40.0 } }, // uncomment only one of c[6] below: // for local error of 5-th order use formula below - RKF4(5) - best choice! c[6] = { 16.0/135.0, 0.0, 6656.0/12825.0, 28561.0/56430.0, -9.0/50.0, 2.0/55.0 }, // for local error of 4-th order use formula below - RKF4(4) - second best only! // c[6] = { 25.0/216.0, 0.0, 1408.0/2565.0, 2197.0/4104.0, -1.0/5.0, 0.0 }, // local error term e[6] = { -1.0/360.0, 0.0, 128.0/4275.0, 2197.0/75240.0, -1.0/50.0, -2.0/55.0 }; // initialize constants which depend on the order __constant__ const int order = 5; __constant__ float iorder = 0.2, crit = 59049, criti = 0.0001889568; texture tex; __device__ float maxv = 0, minv = 10000000; //-----------------------------------------------------------------------------// __device__ float CopySign(float x, float y) { if (y >= 0) return fabs(x); else return fabs(x) * -1; } //-----------------------------------------------------------------------------// __device__ float ErrorTerm (int k, float3 *Work) { float sum = 0; for (int i = 0; i <= 5; i++) { if (k == 0) sum += e[i] * Work[i].x; else if (k == 1) sum += e[i] * Work[i].y; else sum += e[i] * Work[i].z; } return sum; } //-----------------------------------------------------------------------------// __device__ void Model(float t, float3 &y, float3 &yp) { // get vector yp at point y // scale yp to be in reference to t instead of 1 sec float xc = (y.x / 0.939334); float yc = (y.y / 0.940438); float zc = (y.z / 0.469494); if ((xc < 0) || (yc < 0) || (zc < 0) || (xc > 255) || (yc > 159) || (zc > 106)) { yp.x = 0.0; yp.y = 0.0; yp.z = 0.0; } else { float4 tmp = make_float4(0.0,0.0,0.0,0.0); //float4 tmp1 = make_float4(0.0,0.0,0.0,0.0); tmp = tex3D(tex,xc,yc,zc); yp.x = tmp.x; yp.y = tmp.y; yp.z = tmp.z; } } __device__ void RungeKuttaStep (float3 &y, float t, float h, float3 &s, float3 Work[7]) { // Integrates a system of N first order // ordinary differential equations of the form // // dy(i) / dt = f(t, y(1), ... ,y(N)) // // where the initial values y(i) and the initial derivatives // yp(i) are specified at the starting point t. It advances // the solution over the fixed step h and returns // solution approximation at t+h in array s(i). // (In the case of Shampine's original rkf45 the fourth order solution, // fifth order accurate locally). // It should be called with an h not smaller than 13 units of // roundoff in t so that the various independent arguments can be // distinguished. for (int j = 1; j <= 5; j++) { float x = 0.0; for (int m = 0; m < j; m++) x += b[j][m] * Work[m].x; s.x = x * h + y.x; x = 0.0; for (int m = 0; m < j; m++) x += b[j][m] * Work[m].y; s.y = x * h + y.y; x = 0.0; for (int m = 0; m < j; m++) x += b[j][m] * Work[m].z; s.z = x * h + y.z; Model(t + a[j] * h, s, Work[j]); } float x = 0.0; for (int j = 0; j <= 5; j++) x += c[j] * Work[j].x; s.x = h * x + y.x; x = 0.0; for (int j = 0; j <= 5; j++) x += c[j] * Work[j].y; s.y = h * x + y.y; x = 0.0; for (int j = 0; j <= 5; j++) x += c[j] * Work[j].z; s.z = h * x + y.z; } //-----------------------------------------------------------------------------// __device__ void Integrate (float3 &y, float t, float tout, float relerr, float abserr, int &iflag, int maxnfe) { //definitions int nfe = 0,kop = 0,init = 0,kflag = 0,jflag = 0; float h = 0.005,savre = 0,savae = 0; // Get machine epsilon const float eps = DBL_EPSILON, u26 = 26*eps; // remin is the minimum acceptable value of relerr. Attempts // to obtain higher accuracy with this subroutine are usually // very expensive and often unsuccessful. const float remin = 1e-12; // The expense is controlled by restricting the number // of function evaluations to be approximately maxnfe. // The default value of maxnfe = 3000 corresponds to about 500 steps. int mflag = abs(iflag); // input check parameters if (relerr < 0.0 || abserr < 0.0 || mflag == 0 || mflag > InvalidParameters || ((t == tout) && (kflag != SmallRelErrorBound))) { iflag = InvalidParameters; return; } float dt = 0,rer = 0,scale = 0,hmin = 0,eeoet = 0,ee = 0,et = 0,esttol = 0,s = 0,ae = 0,tol = 0; int output = 0, hfaild = 0; // references for convenience float3 Work[7]; for (int yy = 0; yy < 7; yy++) Work[yy] = make_float3(0.0,0.0,0.0); float3 &yp = Work[0], &f1 = Work[1], &ss = Work[5+1]; int gflag = 0; if (iflag == SmallRelErrorBound || (mflag == NormalStep && (init == 0 || kflag == NormalStep))) { gflag = Start; goto next; } if (iflag == TooManyIterations || (kflag == TooManyIterations && mflag == NormalStep)) { nfe = 0; if (mflag != NormalStep) gflag = Start; goto next; } if ((kflag == SmallAbsErrorBound && abserr == 0.0) || (kflag == MinimumStepReached && relerr < savre && abserr < savae)) { iflag = UnsolvableProblem; return; } next: if (gflag) { iflag = jflag; if (kflag == SmallRelErrorBound) mflag = abs(iflag); } jflag = iflag; kflag = 0; savre = relerr; savae = abserr; // Restrict relative error tolerance to be at least as large as // 2*eps+remin to avoid limiting precision difficulties arising // from impossible accuracy requests rer = 2 * eps + remin; // Relative error tolerance is too small if (relerr < rer) { relerr = rer; iflag = kflag = SmallRelErrorBound; return; } gflag = 0; dt = tout - t; if (mflag == Start) { // Initialization // Set initialization completion indicator, init // Set indicator for too many output points,kop // Evaluate initial derivatives // Set counter for function evaluations,nfe // Estimate starting stepsize init = 0; kop = 0; gflag = Start; Model(t,y,yp); // call function nfe = 1; if (t == tout) { iflag = NormalStep; return; } } if (init == 0 || gflag) { init = 1; h = fabs(dt); float ypk; tol = relerr * fabs(y.x) + abserr; if (tol > 0.0) { ypk = fabs(yp.x); if (ypk * pow(h,order) > tol) h = pow(tol/ypk,iorder); } tol = relerr * fabs(y.y) + abserr; if (tol > 0.0) { ypk = fabs(yp.y); if (ypk * pow(h,order) > tol) h = pow(tol/ypk,iorder); } tol = relerr * fabs(y.z) + abserr; if (tol > 0.0) { ypk = fabs(yp.z); if (ypk * pow(h,order) > tol) h = pow(tol/ypk,iorder); } if (tol <= 0.0) h = 0.0; ypk = max(fabs(dt),fabs(t)); h = max(h, u26 * ypk); jflag = CopySign((int)NormalStep,iflag); } // Set stepsize for integration in the direction from t to tout h = CopySign(h,dt); // Test to see if this routine is being severely impacted by too many // output points if (fabs(h) >= 2*fabs(dt)) kop++; if (kop == 100) { kop = 0; iflag = TooManyCalls; return; } if (fabs(dt) <= u26 * fabs(t)) { // If too close to output point,extrapolate and return y.x += dt * yp.x; y.y += dt * yp.y; y.z += dt * yp.z; Model(tout,y,yp); nfe++; t = tout; iflag = NormalStep; return; } // Initialize output point indicator output = false; // To avoid premature underflow in the error tolerance function, // scale the error tolerances scale = 2.0 / relerr; ae = scale * abserr; // Step by step integration - as an endless loop over steps for (int qa =0 ;qa < 40000; qa++) { hfaild = 0; // Set smallest allowable stepsize hmin = u26 * fabs(t); // Adjust stepsize if necessary to hit the output point. // Look ahead two steps to avoid drastic changes in the stepsize and // thus lessen the impact of output points on the code. dt = tout - t; if (fabs(dt) < 2 * fabs(h)) { if (fabs(dt) <= fabs(h)) { // The next successful step will complete the // integration to the output point output = true; h = dt; } else h = 0.5 * dt; } // Core integrator for taking a single step // // The tolerances have been scaled to avoid premature underflow in // computing the error tolerance function et. // To avoid problems with zero crossings, relative error is measured // using the average of the magnitudes of the solution at the // beginning and end of a step. // The error estimate formula has been grouped to control loss of // significance. // To distinguish the various arguments, h is not permitted // to become smaller than 26 units of roundoff in t. // Practical limits on the change in the stepsize are enforced to // smooth the stepsize selection process and to avoid excessive // chattering on problems having discontinuities. // To prevent unnecessary failures, the code uses 9/10 the stepsize // it estimates will succeed. // After a step failure, the stepsize is not allowed to increase for // the next attempted step. This makes the code more efficient on // problems having discontinuities and more effective in general // since local extrapolation is being used and extra caution seems // warranted. // // Test number of derivative function evaluations. // If okay,try to advance the integration from t to t+h if (nfe > maxnfe) { // Too much work iflag = kflag = TooManyIterations; return; } step: // Advance an approximate solution over one step of length h RungeKuttaStep(y,t,h,ss,Work); f1.x = ss.x; f1.y = ss.y; f1.z = ss.z; nfe += 5; // Compute and test allowable tolerances versus local error estimates // and remove scaling of tolerances. note that relative error is // measured with respect to the average of the magnitudes of the // solution at the beginning and end of the step. eeoet = 0.0; et = fabs(y.x) + fabs(f1.x) + ae; // Inappropriate error tolerance if (et <= 0.0) { iflag = SmallAbsErrorBound; return; } ee = fabs( ErrorTerm(0, Work) ); eeoet = max(eeoet,ee/et); et = fabs(y.y) + fabs(f1.y) + ae; // Inappropriate error tolerance if (et <= 0.0) { iflag = SmallAbsErrorBound; return; } ee = fabs( ErrorTerm(1, Work) ); eeoet = max(eeoet,ee/et); et = fabs(y.z) + fabs(f1.z) + ae; // Inappropriate error tolerance if (et <= 0.0) { iflag = SmallAbsErrorBound; return; } ee = fabs( ErrorTerm(2, Work) ); eeoet = max(eeoet,ee/et); esttol = fabs(h) * eeoet * scale; if (esttol > 1.0) { // Unsuccessful step // Reduce the stepsize , try again // The decrease is limited to a factor of 1/10 hfaild = true; output = false; s = 0.1; if (esttol < crit) s = 0.9 / pow(esttol,iorder); h *= s; if (fabs(h) > hmin) goto step; // loop // Requested error unattainable at smallest allowable stepsize iflag = kflag = MinimumStepReached; return; } // Successful step // Store solution at t+h and evaluate derivatives there t += h; y.x = f1.x; y.y = f1.y; y.z = f1.z; Model(t,y,yp); if (yp.x == 0 && yp.y == 0 && yp.z == 0) return; nfe++; // Choose next stepsize // The increase is limited to a factor of 5 // if step failure has just occurred, next // stepsize is not allowed to increase s = 5.0; if (esttol > criti) s = 0.9 / pow(esttol,iorder); if (hfaild) s = min(1.0,s); h = CopySign(max(hmin,s*fabs(h)),h); h = min(h,0.01); // End of core integrator if (output) { t = tout; iflag = NormalStep; return; } if (iflag <= 0) { // one-step mode iflag = SingleStep; return; } } // for (;;) } //-----------------------------------------------------------------------------// //////////////////////////////////////////////////////////////////////////////// //! Simple test kernel for device functionality //! @param g_idata input data in global memory //! @param g_odata output data in global memory //////////////////////////////////////////////////////////////////////////////// __device__ float3 points0(int x,int y,int z) { return make_float3(x * 0.939334,y * 0.940438,z * 0.469494); } __device__ float3 points1(int x,int y,int z,int width, int height, int depth, float3* g_integrationodata) { return g_integrationodata[(x*height+y)*depth+z]; } __device__ int sign(float x,float y) { return (y>=0) ? x : -x; } __device__ void tred3(float* X,float* D,float* E,float* A) { float tol = FLT_MIN/FLT_EPSILON; int n = 3; A = X; float* ei = E + n; for (int i = n-1; i >= 0; i--) { float h = 0.0; float f = - FLT_MAX; float* d = D; float* a = A + (i*(i+1))/2; int k = i; while (k--) { f = *a++; *d++ = f; h += f * f; } if (h <= tol) { *(--ei) = 0.0; h = 0.0; } else { float g = sign(-sqrt(h), f); *(--ei) = g; h -= f*g; f -= g; *(d-1) = f; *(a-1) = f; f = 0.0; float* dj = D; float* ej = E; int j = 0; for (j = 0; j < i; j++) { float* dk = D; float* ak = A+(j*(j+1))/2; float g = 0.0; k = j; while (k--) g += *ak++ * *dk++; k = i-j; int l = j; if (k) for (;;) { g += *ak * *dk++; if (!(--k)) break; ak += ++l; } g /= h; *ej++ = g; f += g * *dj++; } float hh = f / (2 * h); float* ak = A; dj = D; ej = E; for (j = 0; j < i; j++) { f = *dj++; g = *ej - hh * f; *ej++ = g; float* dk = D; float* ek = E; k = j+1; while (k--) { *ak++ -= (f * *ek++ + g * *dk++); } } } *d = *a; *a = h; } } __device__ void tql1(float* D, float* E) { float eps = FLT_EPSILON; int n = 3; int l; for (l=1; l=l; i--) { float ei = E[i*4]; float di = D[i*4]; float& ei1 = E[(i+1)*4]; g = c * ei; h = c * p; if ( fabs(p) >= fabs(ei)) { c = ei / p; r = sqrt(c*c + 1.0); ei1 = s*p*r; s = c/r; c = 1.0/r; } else { c = p / ei; r = sqrt(c*c + 1.0); ei1 = s * ei * r; s = 1.0/r; c /= r; } p = c * di - s*g; D[(i+1)*4] = h + s * (c*g + s*di); } el = s*p; dl = c*p; if (fabs(el) <= b) { test = true; break; } } //if (!test) Throw ( ConvergenceException(D) ); float p = dl + f; test = false; for (i=l; i>0; i--) { if (p < D[(i-1)*4]) { D[i*4] = D[(i-1)*4]; } else { test = true; break; } } if (!test) i=0; D[i*4] = p; } } __device__ void EigenValues(float* X, float* D) { float E[9]; float A[9]; for(int i=0;i<9;i++) { E[i] = 0.0; A[i] = 0.0; } tred3(X,D,E,A); tql1(D,E); } __global__ void integrationKernel(float3* g_integrationodata, float* g_ftleodata, int width, int height, int depth, float T) { int i = 0,j = 0,k = 0; // access block id i = (blockIdx.x + threadIdx.x); j = (threadIdx.y); k = (blockIdx.y + threadIdx.z); // access block id float3 y = make_float3(0.0,0.0,0.0); y.x = i * 0.939334; y.y = j * 0.940438; y.z = k * 0.469494; int iflag = Start; // compute the integration based on point x,y,z //if (i < 1 && j < 160 && k < 1) Integrate (y, 0.0, T, 0.001, 0.001, iflag, 3000000); int index = (i * height + j) * depth + k; g_integrationodata[index].x = y.x; g_integrationodata[index].y = y.y; g_integrationodata[index].z = y.z; __syncthreads(); } __global__ void ftleKernel(float3* g_integrationodata, float* g_ftleodata, int width, int height, int depth, float T) { int i,j,k,l,m,n; // access block id i = (blockIdx.x + threadIdx.x); j = (threadIdx.y); k = (blockIdx.y + threadIdx.z); // perform some computations //coherence metric float m1[9]; float m1t[9]; float S[9]; float val = 0; float D[9]; g_ftleodata[((i*height)+j)*depth+k] = 0.0; if (i > 0 && i < width-1) { if (j > 0 && j < width-1) { if (k > 0 && k < width-1) { m1[0] = (points1(i+1,j,k,width,height,depth,g_integrationodata).x-points1(i-1,j,k,width,height,depth,g_integrationodata).x)/(points0(i+1,j,k).x-points0(i-1,j,k).x); m1[1] = (points1(i,j+1,k,width,height,depth,g_integrationodata).x-points1(i,j-1,k,width,height,depth,g_integrationodata).x)/(points0(i,j+1,k).y-points0(i,j-1,k).y); m1[2] = (points1(i,j,k+1,width,height,depth,g_integrationodata).x-points1(i,j,k-1,width,height,depth,g_integrationodata).x)/(points0(i,j,k+1).z-points0(i,j,k-1).z); m1[3] = (points1(i+1,j,k,width,height,depth,g_integrationodata).y-points1(i-1,j,k,width,height,depth,g_integrationodata).y)/(points0(i+1,j,k).x-points0(i-1,j,k).x); m1[4] = (points1(i,j+1,k,width,height,depth,g_integrationodata).y-points1(i,j-1,k,width,height,depth,g_integrationodata).y)/(points0(i,j+1,k).y-points0(i,j-1,k).y); m1[5] = (points1(i,j,k+1,width,height,depth,g_integrationodata).y-points1(i,j,k-1,width,height,depth,g_integrationodata).y)/(points0(i,j,k+1).z-points0(i,j,k-1).z); m1[6] = (points1(i+1,j,k,width,height,depth,g_integrationodata).z-points1(i-1,j,k,width,height,depth,g_integrationodata).z)/(points0(i+1,j,k).x-points0(i-1,j,k).x); m1[7] = (points1(i,j+1,k,width,height,depth,g_integrationodata).z-points1(i,j-1,k,width,height,depth,g_integrationodata).z)/(points0(i,j+1,k).y-points0(i,j-1,k).y); m1[8] = (points1(i,j,k+1,width,height,depth,g_integrationodata).z-points1(i,j,k-1,width,height,depth,g_integrationodata).z)/(points0(i,j,k+1).z-points0(i,j,k-1).z); for (l=0;l<9;l++) m1[l] = m1[l]*2550000.0; // S << (m1.t() * m1); for (l=0;l<3;l++) for (m=0;m<3;m++) m1t[m*3+l] = m1[l*3+m]; for (l=0;l<9;l++) { S[l] = 0.0; D[l] = 0.0; } for (l=0;l<3;l++) for (m=0;m<3;m++) for (n=0;n<3;n++) S[l*3+m] += m1t[l*3+n] * m1[n*3+m]; EigenValues(S,D); val = D[8]; val = (1.0/fabs(T))*log(sqrt(val)); minv = max (0.0, min (minv, val)); maxv = max (maxv, val); g_ftleodata[((i*height)+j)*depth+k] = val; } } } __syncthreads(); } __global__ void normalizedftleKernel(float* g_ftleodata, int width, int height, int depth) { int i,j,k; // access block id i = (blockIdx.x + threadIdx.x); j = (threadIdx.y); k = (blockIdx.y + threadIdx.z); int index = ((i*height)+j)*depth+k; g_ftleodata[index] = (255 - 255*(maxv-g_ftleodata[index])/(maxv-minv)); if (g_ftleodata[index] > 255) g_ftleodata[index] = 255; } #endif // #ifndef _TEMPLATE_KERNEL_H_