This load issue has existed for a while now; please provide a link where this bug can be reported.
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19122697
// Cuda compilation tools, release 7.0, V7.0.17
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_20
.address_size 32
// .globl KernelClear
// KernelClear
// Every thread stores a 32-bit zero into its own 4-byte slot of two global
// buffers: first the buffer whose address is passed in param_2, then the one
// passed in param_5. The slot number is the flattened global thread index
// (see below). param_0, param_1, param_3 and param_4 are not read.
.visible .entry KernelClear(
    .param .u32 KernelClear_param_0,
    .param .u32 KernelClear_param_1,
    .param .u32 KernelClear_param_2,
    .param .u32 KernelClear_param_3,
    .param .u32 KernelClear_param_4,
    .param .u32 KernelClear_param_5
)
{
    .reg .s32 %argA, %argB, %baseA, %baseB;
    .reg .s32 %tx, %ty, %tz, %ntx, %nty, %ntz;
    .reg .s32 %cx, %cy, %cz, %ncx, %ncy;
    .reg .s32 %plane, %ctaSize, %rowSize, %ctaRow;
    .reg .s32 %lin, %byteOff, %dstA, %dstB, %zero;

    // Thread / CTA coordinates and extents.
    mov.u32         %tx,  %tid.x;
    mov.u32         %ty,  %tid.y;
    mov.u32         %tz,  %tid.z;
    mov.u32         %ntx, %ntid.x;
    mov.u32         %nty, %ntid.y;
    mov.u32         %ntz, %ntid.z;
    mov.u32         %cx,  %ctaid.x;
    mov.u32         %cy,  %ctaid.y;
    mov.u32         %cz,  %ctaid.z;
    mov.u32         %ncx, %nctaid.x;
    mov.u32         %ncy, %nctaid.y;

    // Index of the thread inside its CTA: tx + ty*ntx + tz*ntx*nty.
    mul.lo.s32      %plane, %nty, %ntx;
    mad.lo.s32      %lin, %ty, %ntx, %tx;
    mad.lo.s32      %lin, %tz, %plane, %lin;

    // Add the CTA contribution:
    //   cx*ctaSize + (cz*ncy + cy) * (ncx*ctaSize)
    mul.lo.s32      %ctaSize, %ntz, %plane;
    mad.lo.s32      %lin, %cx, %ctaSize, %lin;
    mul.lo.s32      %rowSize, %ncx, %ctaSize;
    mad.lo.s32      %ctaRow, %cz, %ncy, %cy;
    mad.lo.s32      %lin, %rowSize, %ctaRow, %lin;

    // Slots are 4 bytes wide.
    shl.b32         %byteOff, %lin, 2;
    mov.u32         %zero, 0;

    // Clear the slot in the param_2 buffer.
    ld.param.u32    %argA, [KernelClear_param_2];
    cvta.to.global.u32 %baseA, %argA;
    add.s32         %dstA, %baseA, %byteOff;
    st.volatile.global.u32 [%dstA], %zero;

    // Clear the slot in the param_5 buffer.
    ld.param.u32    %argB, [KernelClear_param_5];
    cvta.to.global.u32 %baseB, %argB;
    add.s32         %dstB, %baseB, %byteOff;
    st.volatile.global.u32 [%dstB], %zero;

    ret;
}
// .globl KernelUpdate
// KernelUpdate
// Each thread works on the 40-byte record at param_4 + index*40, where index
// is the flattened global thread index, and:
//   1. computes (f32, fused multiply-add)
//        new28 = rec[+16] * rec[+24] + rec[+28]
//        new32 = rec[+20] * rec[+24] + rec[+32]
//      and stores them back to rec[+28] and rec[+32];
//      (presumably position += velocity * step -- confirm against the host
//      side record layout);
//   2. truncates new28 / new32 toward zero to integers (col, row) and
//      atomically adds 1 to the u32 at param_5 + (row*param_0 + col)*4.
// param_1, param_2 and param_3 are not read.
//
// The atomic used to be atom.inc with a wrap bound of 0. atom.inc stores
// (old >= bound) ? 0 : old + 1, so with bound 0 it always stored 0 and the
// cell count never advanced. It is now a plain atomic add of 1.
//
// NOTE(review): the cell index is not range-checked before the atomic, unlike
// the check KernelDraw performs before touching its per-cell buffers; records
// whose coordinates leave the grid will address memory outside the buffer.
// Confirm whether callers guarantee in-range coordinates.
.visible .entry KernelUpdate(
.param .u32 KernelUpdate_param_0,
.param .u32 KernelUpdate_param_1,
.param .u32 KernelUpdate_param_2,
.param .u32 KernelUpdate_param_3,
.param .u32 KernelUpdate_param_4,
.param .u32 KernelUpdate_param_5
)
{
    .reg .f32 %f<10>;
    .reg .s32 %r<32>;

    ld.param.u32 %r1, [KernelUpdate_param_0];   // row pitch (in cells) of the count buffer
    ld.param.u32 %r2, [KernelUpdate_param_4];   // record array
    ld.param.u32 %r3, [KernelUpdate_param_5];   // per-cell u32 count buffer
    cvta.to.global.u32 %r4, %r3;
    cvta.to.global.u32 %r5, %r2;

    // Flattened global thread index -> %r24:
    //   tid.x + tid.y*ntid.x + tid.z*ntid.x*ntid.y
    //   + ctaid.x*ctaSize + (ctaid.z*nctaid.y + ctaid.y) * nctaid.x*ctaSize
    mov.u32 %r6, %tid.x;
    mov.u32 %r7, %ntid.x;
    mov.u32 %r8, %tid.y;
    mov.u32 %r9, %ntid.y;
    mul.lo.s32 %r10, %r9, %r7;
    mov.u32 %r11, %tid.z;
    mov.u32 %r12, %ntid.z;
    mul.lo.s32 %r13, %r12, %r10;                // threads per CTA
    mov.u32 %r14, %ctaid.x;
    mov.u32 %r15, %nctaid.x;
    mul.lo.s32 %r16, %r15, %r13;
    mov.u32 %r17, %ctaid.y;
    mov.u32 %r18, %nctaid.y;
    mov.u32 %r19, %ctaid.z;
    mad.lo.s32 %r20, %r19, %r18, %r17;
    mad.lo.s32 %r21, %r8, %r7, %r6;
    mad.lo.s32 %r22, %r11, %r10, %r21;
    mad.lo.s32 %r23, %r14, %r13, %r22;
    mad.lo.s32 %r24, %r16, %r20, %r23;

    // Address of this thread's record (40-byte stride).
    mad.lo.s32 %r25, %r24, 40, %r5;

    // Volatile loads of the fields at +8 .. +32. %f1 and %f2 are not used
    // afterwards; the loads are kept because they are volatile accesses.
    ld.volatile.global.f32 %f1, [%r25+8];
    ld.volatile.global.f32 %f2, [%r25+12];
    ld.volatile.global.f32 %f3, [%r25+16];
    ld.volatile.global.f32 %f4, [%r25+20];
    ld.volatile.global.f32 %f5, [%r25+24];
    ld.volatile.global.f32 %f6, [%r25+28];
    ld.volatile.global.f32 %f7, [%r25+32];

    // Advance the two coordinates and write them back.
    fma.rn.f32 %f8, %f3, %f5, %f6;
    fma.rn.f32 %f9, %f4, %f5, %f7;
    st.volatile.global.f32 [%r25+28], %f8;
    st.volatile.global.f32 [%r25+32], %f9;

    // Cell index = trunc(new32) * pitch + trunc(new28).
    cvt.rzi.s32.f32 %r26, %f8;
    cvt.rzi.s32.f32 %r27, %f9;
    mad.lo.s32 %r28, %r27, %r1, %r26;
    shl.b32 %r29, %r28, 2;                      // 4-byte counters
    add.s32 %r30, %r4, %r29;

    // Count this record in its cell. The returned old value is unused.
    atom.global.add.u32 %r31, [%r30], 1;
    ret;
}
// .globl KernelDraw
// KernelDraw
// Each thread reads the 40-byte record at param_5 + index*40 (index is the
// flattened global thread index):
//   +28  f32  column coordinate (x)
//   +32  f32  row coordinate (y)
//   +36  u32  four packed 8-bit fields (presumably a packed colour -- confirm)
// If 0 <= x < param_0 and 0 <= y < param_1, the thread addresses cell
// trunc(y)*param_0 + trunc(x), reads that cell's u32 from the param_6 buffer
// and converts it to float as a signed integer. When the result is greater
// than 1.0, each 8-bit field is divided by it and rounded down. The four
// fields are then re-packed and atomically added to the same cell of the
// param_2 buffer. Threads whose coordinates fail the range test do nothing
// beyond the three record loads. param_3 and param_4 are not read.
.visible .entry KernelDraw(
    .param .u32 KernelDraw_param_0,
    .param .u32 KernelDraw_param_1,
    .param .u32 KernelDraw_param_2,
    .param .u32 KernelDraw_param_3,
    .param .u32 KernelDraw_param_4,
    .param .u32 KernelDraw_param_5,
    .param .u32 KernelDraw_param_6
)
{
    .reg .pred %okLo, %okHi, %ok, %noDiv;
    .reg .f32 %px, %py, %wf, %hf, %cntf, %qf;
    .reg .s32 %w, %h, %pDst, %pRec, %pCnt;
    .reg .s32 %tx, %ty, %tz, %ntx, %nty, %ntz;
    .reg .s32 %cx, %cy, %cz, %ncx, %ncy;
    .reg .s32 %plane, %ctaSize, %rowSize, %ctaRow, %lin;
    .reg .s32 %rec, %packed, %col, %row, %cell, %cellOff;
    .reg .s32 %cntBase, %cntAddr, %cnt, %dstBase, %dstAddr;
    .reg .s32 %ch0, %ch1, %ch2, %ch3, %acc, %old;

    ld.param.u32    %w,    [KernelDraw_param_0];
    ld.param.u32    %h,    [KernelDraw_param_1];
    ld.param.u32    %pDst, [KernelDraw_param_2];
    ld.param.u32    %pRec, [KernelDraw_param_5];
    ld.param.u32    %pCnt, [KernelDraw_param_6];

    // Thread / CTA coordinates and extents.
    mov.u32         %tx,  %tid.x;
    mov.u32         %ty,  %tid.y;
    mov.u32         %tz,  %tid.z;
    mov.u32         %ntx, %ntid.x;
    mov.u32         %nty, %ntid.y;
    mov.u32         %ntz, %ntid.z;
    mov.u32         %cx,  %ctaid.x;
    mov.u32         %cy,  %ctaid.y;
    mov.u32         %cz,  %ctaid.z;
    mov.u32         %ncx, %nctaid.x;
    mov.u32         %ncy, %nctaid.y;

    // Flattened global thread index:
    //   tx + ty*ntx + tz*ntx*nty + cx*ctaSize + (cz*ncy + cy) * (ncx*ctaSize)
    mul.lo.s32      %plane, %nty, %ntx;
    mul.lo.s32      %ctaSize, %ntz, %plane;
    mul.lo.s32      %rowSize, %ncx, %ctaSize;
    mad.lo.s32      %ctaRow, %cz, %ncy, %cy;
    mad.lo.s32      %lin, %ty, %ntx, %tx;
    mad.lo.s32      %lin, %tz, %plane, %lin;
    mad.lo.s32      %lin, %cx, %ctaSize, %lin;
    mad.lo.s32      %lin, %rowSize, %ctaRow, %lin;

    // This thread's record (40-byte stride).
    cvta.to.global.u32 %rec, %pRec;
    mad.lo.s32      %rec, %lin, 40, %rec;

    // Volatile loads, in the same order as before: +36, +28, +32.
    ld.volatile.global.u32 %packed, [%rec+36];
    ld.volatile.global.f32 %px, [%rec+28];
    ld.volatile.global.f32 %py, [%rec+32];

    // Cell index from the truncated coordinates.
    cvt.rzi.s32.f32 %col, %px;
    cvt.rzi.s32.f32 %row, %py;
    mad.lo.s32      %cell, %row, %w, %col;

    // Range test on the float coordinates: 0 <= x < w and 0 <= y < h.
    // Ordered compares, so a NaN coordinate fails the test.
    cvt.rn.f32.s32  %wf, %w;
    cvt.rn.f32.s32  %hf, %h;
    setp.ge.f32     %okLo, %px, 0f00000000;
    setp.lt.f32     %okHi, %px, %wf;
    and.pred        %ok, %okLo, %okHi;
    setp.ge.f32     %okLo, %py, 0f00000000;
    and.pred        %ok, %ok, %okLo;
    setp.lt.f32     %okHi, %py, %hf;
    and.pred        %ok, %ok, %okHi;
    @!%ok bra       Draw_exit;

    // Split the packed word into its four 8-bit fields.
    and.b32         %ch0, %packed, 255;
    bfe.u32         %ch1, %packed, 8, 8;
    bfe.u32         %ch2, %packed, 16, 8;
    shr.u32         %ch3, %packed, 24;

    // Read this cell's value from the param_6 buffer (4-byte cells).
    shl.b32         %cellOff, %cell, 2;
    cvta.to.global.u32 %cntBase, %pCnt;
    add.s32         %cntAddr, %cntBase, %cellOff;
    ld.volatile.global.u32 %cnt, [%cntAddr];
    cvt.rn.f32.s32  %cntf, %cnt;

    // Only divide when the value exceeds 1.0 (0f3F800000 == 1.0f).
    setp.leu.f32    %noDiv, %cntf, 0f3F800000;
    @%noDiv bra     Draw_pack;

    // field = (u32) floor(field / value), for each of the four fields.
    cvt.rn.f32.u32  %qf, %ch0;
    div.rn.f32      %qf, %qf, %cntf;
    cvt.rmi.f32.f32 %qf, %qf;
    cvt.rzi.u32.f32 %ch0, %qf;

    cvt.rn.f32.u32  %qf, %ch1;
    div.rn.f32      %qf, %qf, %cntf;
    cvt.rmi.f32.f32 %qf, %qf;
    cvt.rzi.u32.f32 %ch1, %qf;

    cvt.rn.f32.u32  %qf, %ch2;
    div.rn.f32      %qf, %qf, %cntf;
    cvt.rmi.f32.f32 %qf, %qf;
    cvt.rzi.u32.f32 %ch2, %qf;

    cvt.rn.f32.u32  %qf, %ch3;
    div.rn.f32      %qf, %qf, %cntf;
    cvt.rmi.f32.f32 %qf, %qf;
    cvt.rzi.u32.f32 %ch3, %qf;

Draw_pack:
    // Re-pack the fields into one word: ch0 | ch1<<8 | ch2<<16 | ch3<<24.
    shl.b32         %ch1, %ch1, 8;
    shl.b32         %ch2, %ch2, 16;
    shl.b32         %ch3, %ch3, 24;
    or.b32          %acc, %ch0, %ch1;
    or.b32          %acc, %acc, %ch2;
    or.b32          %acc, %acc, %ch3;

    // Atomically add the packed word to the same cell of the param_2 buffer.
    // The returned old value is unused.
    cvta.to.global.u32 %dstBase, %pDst;
    add.s32         %dstAddr, %dstBase, %cellOff;
    atom.global.add.u32 %old, [%dstAddr], %acc;

Draw_exit:
    ret;
}