Hi AastaLLL,
You can reproduce something similar this way:
- Get arrayfire 3.6.0 master from github: https://github.com/arrayfire/arrayfire.
- Check/install required dependancies (https://github.com/arrayfire/arrayfire/wiki/Build-Instructions-for-Linux). I’ve also downloaded cub-1.7.4 and copied its cub directory into /usr/include.
- Configure with cmake-gui (all default should be ok, but you may have to click twice on configure to get everything configured, then generate).
- Build. No need to install for this test.
- In build directory, launch:
./test/jit_cuda --gtest_filter=JIT.ISSUE_1646
Setting AF_CUDA_MAX_JIT_LEN to 19 or more leads to failure, but it succeeds with 18 or less.
Here is the failing kernel with value 19:
typedef unsigned int uint;
typedef long long dim_t;
/*******************************************************
* Copyright (c) 2014, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
* The complete license agreement can be obtained at:
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
typedef float2 cuFloatComplex;
typedef cuFloatComplex cfloat;
typedef double2 cuDoubleComplex;
typedef cuDoubleComplex cdouble;
// ----------------------------------------------
// REAL NUMBER OPERATIONS
// ----------------------------------------------
#define sign(in) signbit((in))
#define __noop(a) (a)
#define __add(lhs, rhs) (lhs) + (rhs)
#define __sub(lhs, rhs) (lhs) - (rhs)
#define __mul(lhs, rhs) (lhs) * (rhs)
#define __div(lhs, rhs) (lhs) / (rhs)
#define __and(lhs, rhs) (lhs) && (rhs)
#define __or(lhs, rhs) (lhs) || (rhs)
#define __lt(lhs, rhs) (lhs) < (rhs)
#define __gt(lhs, rhs) (lhs) > (rhs)
#define __le(lhs, rhs) (lhs) <= (rhs)
#define __ge(lhs, rhs) (lhs) >= (rhs)
#define __eq(lhs, rhs) (lhs) == (rhs)
#define __neq(lhs, rhs) (lhs) != (rhs)
#define __conj(in) (in)
#define __real(in) (in)
#define __imag(in) (0)
#define __abs(in) abs(in)
#define __sigmoid(in) (1.0/(1 + exp(-(in))))
#define __bitor(lhs, rhs) ((lhs) | (rhs))
#define __bitand(lhs, rhs) ((lhs) & (rhs))
#define __bitxor(lhs, rhs) ((lhs) ^ (rhs))
#define __bitshiftl(lhs, rhs) ((lhs) << (rhs))
#define __bitshiftr(lhs, rhs) ((lhs) >> (rhs))
#define __min(lhs, rhs) ((lhs) < (rhs)) ? (lhs) : (rhs)
#define __max(lhs, rhs) ((lhs) > (rhs)) ? (lhs) : (rhs)
#define __rem(lhs, rhs) ((lhs) % (rhs))
#define __mod(lhs, rhs) ((lhs) % (rhs))
#define __pow(lhs, rhs) fpow((float)lhs, (float)rhs)
#define __convert_char(val) (char)((val) != 0)
#define fpow(lhs, rhs) pow((lhs), (rhs))
#define frem(lhs, rhs) remainder((lhs), (rhs))
#define iszero(a) ((a) == 0)
// ----------------------------------------------
// COMPLEX FLOAT OPERATIONS
// ----------------------------------------------
#define __crealf(in) ((in).x)
#define __cimagf(in) ((in).y)
#define __cabsf(in) hypotf(in.x, in.y)
__device__ cfloat __cplx2f(float x, float y)
{
cfloat res = {x, y};
return res;
}
__device__ cfloat __cconjf(cfloat in)
{
cfloat res = {in.x, -in.y};
return res;
}
__device__ cfloat __caddf(cfloat lhs, cfloat rhs)
{
cfloat res = {lhs.x + rhs.x, lhs.y + rhs.y};
return res;
}
__device__ cfloat __csubf(cfloat lhs, cfloat rhs)
{
cfloat res = {lhs.x - rhs.x, lhs.y - rhs.y};
return res;
}
__device__ cfloat __cmulf(cfloat lhs, cfloat rhs)
{
cfloat out;
out.x = lhs.x * rhs.x - lhs.y * rhs.y;
out.y = lhs.x * rhs.y + lhs.y * rhs.x;
return out;
}
__device__ cfloat __cdivf(cfloat lhs, cfloat rhs)
{
// Normalize by absolute value and multiply
float rhs_abs = __cabsf(rhs);
float inv_rhs_abs = 1.0f / rhs_abs;
float rhs_x = inv_rhs_abs * rhs.x;
float rhs_y = inv_rhs_abs * rhs.y;
cfloat out = {lhs.x * rhs_x + lhs.y * rhs_y,
lhs.y * rhs_x - lhs.x * rhs_y};
out.x *= inv_rhs_abs;
out.y *= inv_rhs_abs;
return out;
}
__device__ cfloat __cminf(cfloat lhs, cfloat rhs)
{
return __cabsf(lhs) < __cabsf(rhs) ? lhs : rhs;
}
__device__ cfloat __cmaxf(cfloat lhs, cfloat rhs)
{
return __cabsf(lhs) > __cabsf(rhs) ? lhs : rhs;
}
#define __candf(lhs, rhs) __cabsf(lhs) && __cabsf(rhs)
#define __corf(lhs, rhs) __cabsf(lhs) || __cabsf(rhs)
#define __ceqf(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y))
#define __cneqf(lhs, rhs) !__ceqf((lhs), (rhs))
#define __cltf(lhs, rhs) (__cabsf(lhs) < __cabsf(rhs))
#define __clef(lhs, rhs) (__cabsf(lhs) <= __cabsf(rhs))
#define __cgtf(lhs, rhs) (__cabsf(lhs) > __cabsf(rhs))
#define __cgef(lhs, rhs) (__cabsf(lhs) >= __cabsf(rhs))
#define __convert_cfloat(real) __cplx2f(real, 0)
#define __convert_c2c(in) (in)
#define __convert_z2c(in) __cplx2f((float)in.x, (float)in.y)
// ----------------------------------------------
// COMPLEX DOUBLE OPERATIONS
// ----------------------------------------------
#define __creal(in) ((in).x)
#define __cimag(in) ((in).y)
#define __cabs(in) hypot(in.x, in.y)
__device__ cdouble __cplx2(double x, double y)
{
cdouble res = {x, y};
return res;
}
__device__ cdouble __cconj(cdouble in)
{
cdouble res = {in.x, -in.y};
return res;
}
__device__ cdouble __cadd(cdouble lhs, cdouble rhs)
{
cdouble res = {lhs.x + rhs.x, lhs.y + rhs.y};
return res;
}
__device__ cdouble __csub(cdouble lhs, cdouble rhs)
{
cdouble res = {lhs.x - rhs.x, lhs.y - rhs.y};
return res;
}
__device__ cdouble __cmul(cdouble lhs, cdouble rhs)
{
cdouble out;
out.x = lhs.x * rhs.x - lhs.y * rhs.y;
out.y = lhs.x * rhs.y + lhs.y * rhs.x;
return out;
}
__device__ cdouble __cdiv(cdouble lhs, cdouble rhs)
{
// Normalize by absolute value and multiply
double rhs_abs = __cabs(rhs);
double inv_rhs_abs = 1.0 / rhs_abs;
double rhs_x = inv_rhs_abs * rhs.x;
double rhs_y = inv_rhs_abs * rhs.y;
cdouble out = {lhs.x * rhs_x + lhs.y * rhs_y,
lhs.y * rhs_x - lhs.x * rhs_y};
out.x *= inv_rhs_abs;
out.y *= inv_rhs_abs;
return out;
}
__device__ cdouble __cmin(cdouble lhs, cdouble rhs)
{
return __cabs(lhs) < __cabs(rhs) ? lhs : rhs;
}
__device__ cdouble __cmax(cdouble lhs, cdouble rhs)
{
return __cabs(lhs) > __cabs(rhs) ? lhs : rhs;
}
#define __cand(lhs, rhs) __cabs(lhs) && __cabs(rhs)
#define __cor(lhs, rhs) __cabs(lhs) || __cabs(rhs)
#define __ceq(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y))
#define __cneq(lhs, rhs) !__ceq((lhs), (rhs))
#define __clt(lhs, rhs) (__cabs(lhs) < __cabs(rhs))
#define __cle(lhs, rhs) (__cabs(lhs) <= __cabs(rhs))
#define __cgt(lhs, rhs) (__cabs(lhs) > __cabs(rhs))
#define __cge(lhs, rhs) (__cabs(lhs) >= __cabs(rhs))
#define __convert_cdouble(real) __cplx2(real, 0)
#define __convert_z2z(in) (in)
#define __convert_c2z(in) __cplx2((double)in.x, (double)in.y)
template<typename T>
struct Param
{
T *ptr;
dim_t dims[4];
dim_t strides[4];
};
extern "C" __global__ void
KER3024106651647577914(
float *in0_ptr,
float *in1_ptr,
float *in2_ptr,
float *in5_ptr,
float *in8_ptr,
float *in11_ptr,
float *in14_ptr,
float *in17_ptr,
float *in20_ptr,
float *in23_ptr,
float *in26_ptr,
float *in29_ptr,
float *in32_ptr,
float *in35_ptr,
float *in38_ptr,
float *in41_ptr,
float *in44_ptr,
float *in47_ptr,
float *in50_ptr,
float *in53_ptr,
Param<float> out55,
uint blocks_x, uint blocks_y, uint blocks_x_total, uint num_odims)
{
Param<float> outref = out55;
for (int blockIdx_x = blockIdx.x; blockIdx_x < blocks_x_total; blockIdx_x += gridDim.x) {
uint threadId = threadIdx.x;
int idx = blockIdx_x * blockDim.x * blockDim.y + threadId;
if (idx >= outref.dims[3] * outref.strides[3]) return;
int idx0 = idx;
int idx1 = idx;
int idx2 = idx;
int idx5 = idx;
int idx8 = idx;
int idx11 = idx;
int idx14 = idx;
int idx17 = idx;
int idx20 = idx;
int idx23 = idx;
int idx26 = idx;
int idx29 = idx;
int idx32 = idx;
int idx35 = idx;
int idx38 = idx;
int idx41 = idx;
int idx44 = idx;
int idx47 = idx;
int idx50 = idx;
int idx53 = idx;
float val0 = in0_ptr[idx0];
float val1 = in1_ptr[idx1];
float val2 = in2_ptr[idx2];
float val3 = __add(val1, val2);
float val4 = __add(val0, val3);
float val5 = in5_ptr[idx5];
float val6 = __add(val3, val5);
float val7 = __add(val4, val6);
float val8 = in8_ptr[idx8];
float val9 = __add(val6, val8);
float val10 = __add(val7, val9);
float val11 = in11_ptr[idx11];
float val12 = __add(val9, val11);
float val13 = __add(val10, val12);
float val14 = in14_ptr[idx14];
float val15 = __add(val12, val14);
float val16 = __add(val13, val15);
float val17 = in17_ptr[idx17];
float val18 = __add(val15, val17);
float val19 = __add(val16, val18);
float val20 = in20_ptr[idx20];
float val21 = __add(val18, val20);
float val22 = __add(val19, val21);
float val23 = in23_ptr[idx23];
float val24 = __add(val21, val23);
float val25 = __add(val22, val24);
float val26 = in26_ptr[idx26];
float val27 = __add(val24, val26);
float val28 = __add(val25, val27);
float val29 = in29_ptr[idx29];
float val30 = __add(val27, val29);
float val31 = __add(val28, val30);
float val32 = in32_ptr[idx32];
float val33 = __add(val30, val32);
float val34 = __add(val31, val33);
float val35 = in35_ptr[idx35];
float val36 = __add(val33, val35);
float val37 = __add(val34, val36);
float val38 = in38_ptr[idx38];
float val39 = __add(val36, val38);
float val40 = __add(val37, val39);
float val41 = in41_ptr[idx41];
float val42 = __add(val39, val41);
float val43 = __add(val40, val42);
float val44 = in44_ptr[idx44];
float val45 = __add(val42, val44);
float val46 = __add(val43, val45);
float val47 = in47_ptr[idx47];
float val48 = __add(val45, val47);
float val49 = __add(val46, val48);
float val50 = in50_ptr[idx50];
float val51 = __add(val48, val50);
float val52 = __add(val49, val51);
float val53 = in53_ptr[idx53];
float val54 = __add(val51, val53);
float val55 = __add(val52, val54);
out55.ptr[idx] = val55;
}
}
Unrelated, but you might also be interested by what happens with canny_cuda otsu threshold test:
./test/canny_cuda --gtest_filter=CannyEdgeDetector.OtsuThreshold