Kernel fails to load with "invalid floating point operation"

This load issue has existed for a while now. Please provide a link to where this bug can be reported.

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19122697
// Cuda compilation tools, release 7.0, V7.0.17
// Based on LLVM 3.4svn
//

.version 4.2
.target sm_20
.address_size 32

	// .globl	KernelClear

// KernelClear: each thread writes a 32-bit zero to one element of two
// buffers. The buffers are the addresses passed in param_2 and param_5;
// the element index is the thread's linear index across the whole launch:
//
//   idx = tid.x + tid.y*ntid.x + tid.z*ntid.x*ntid.y
//       + ctaid.x*T + (ctaid.z*nctaid.y + ctaid.y)*nctaid.x*T
//   where T = ntid.x*ntid.y*ntid.z (threads per block).
//
// param_0, param_1, param_3 and param_4 are not read by this kernel.
.visible .entry KernelClear(
	.param .u32 KernelClear_param_0,
	.param .u32 KernelClear_param_1,
	.param .u32 KernelClear_param_2,
	.param .u32 KernelClear_param_3,
	.param .u32 KernelClear_param_4,
	.param .u32 KernelClear_param_5
)
{
	// Thread coordinates, block dimensions, block coordinates, grid dimensions.
	.reg .s32 	%lx, %ly, %lz;
	.reg .s32 	%bdx, %bdy, %bdz;
	.reg .s32 	%bx, %by, %bz;
	.reg .s32 	%gdx, %gdy;
	// Strides and partial sums of the linear index.
	.reg .s32 	%slab, %blk, %brow, %byz, %acc<4>;
	// Buffer addresses, byte offset, element addresses and the stored value.
	.reg .s32 	%ptr_a, %ptr_b, %glob_a, %glob_b, %off, %ea, %eb, %zero;

	// Buffer addresses from param_2 and param_5, converted to global space.
	ld.param.u32 	%ptr_a, [KernelClear_param_2];
	ld.param.u32 	%ptr_b, [KernelClear_param_5];
	cvta.to.global.u32 	%glob_a, %ptr_a;
	cvta.to.global.u32 	%glob_b, %ptr_b;

	// Read the launch geometry.
	mov.u32 	%lx, %tid.x;
	mov.u32 	%ly, %tid.y;
	mov.u32 	%lz, %tid.z;
	mov.u32 	%bdx, %ntid.x;
	mov.u32 	%bdy, %ntid.y;
	mov.u32 	%bdz, %ntid.z;
	mov.u32 	%bx, %ctaid.x;
	mov.u32 	%by, %ctaid.y;
	mov.u32 	%bz, %ctaid.z;
	mov.u32 	%gdx, %nctaid.x;
	mov.u32 	%gdy, %nctaid.y;

	// slab = ntid.y*ntid.x, blk = threads per block, brow = nctaid.x*blk.
	mul.lo.s32 	%slab, %bdy, %bdx;
	mul.lo.s32 	%blk, %bdz, %slab;
	mul.lo.s32 	%brow, %gdx, %blk;
	// byz = ctaid.z*nctaid.y + ctaid.y
	mad.lo.s32 	%byz, %bz, %gdy, %by;

	// Accumulate the linear index term by term (see header formula).
	mad.lo.s32 	%acc0, %ly, %bdx, %lx;
	mad.lo.s32 	%acc1, %lz, %slab, %acc0;
	mad.lo.s32 	%acc2, %bx, %blk, %acc1;
	mad.lo.s32 	%acc3, %brow, %byz, %acc2;

	// Byte offset of a 4-byte element.
	shl.b32 	%off, %acc3, 2;
	mov.u32 	%zero, 0;

	// Zero the element in the param_2 buffer first, then the param_5 buffer.
	add.s32 	%ea, %glob_a, %off;
	st.volatile.global.u32 	[%ea], %zero;
	add.s32 	%eb, %glob_b, %off;
	st.volatile.global.u32 	[%eb], %zero;
	ret;
}

	// .globl	KernelUpdate
// KernelUpdate: each thread updates one 40-byte record in the buffer passed
// in param_4 and then bumps a 32-bit counter in the buffer passed in param_5.
//
// The record index is the thread's linear index across the whole launch:
//   idx = tid.x + tid.y*ntid.x + tid.z*ntid.x*ntid.y
//       + ctaid.x*T + (ctaid.z*nctaid.y + ctaid.y)*nctaid.x*T
//   where T = ntid.x*ntid.y*ntid.z (threads per block).
//
// Record fields touched (byte offsets, all f32):
//   +16, +20 : multiplied by the scalar at +24
//   +28, +32 : accumulated into and written back
//   (presumably velocity, time step and position -- TODO confirm against the
//   host-side record layout)
//
// param_0 is used as the row stride of the counter buffer; param_1, param_2
// and param_3 are not read by this kernel.
.visible .entry KernelUpdate(
	.param .u32 KernelUpdate_param_0,
	.param .u32 KernelUpdate_param_1,
	.param .u32 KernelUpdate_param_2,
	.param .u32 KernelUpdate_param_3,
	.param .u32 KernelUpdate_param_4,
	.param .u32 KernelUpdate_param_5
)
{
	.reg .f32 	%f<10>;
	.reg .s32 	%r<32>;


	// %r1 = counter row stride, %r5 = record buffer, %r4 = counter buffer.
	ld.param.u32 	%r1, [KernelUpdate_param_0];
	ld.param.u32 	%r2, [KernelUpdate_param_4];
	ld.param.u32 	%r3, [KernelUpdate_param_5];
	cvta.to.global.u32 	%r4, %r3;
	cvta.to.global.u32 	%r5, %r2;
	// Linear thread index -> %r24 (see header formula).
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ntid.x;
	mov.u32 	%r8, %tid.y;
	mov.u32 	%r9, %ntid.y;
	mul.lo.s32 	%r10, %r9, %r7;
	mov.u32 	%r11, %tid.z;
	mov.u32 	%r12, %ntid.z;
	mul.lo.s32 	%r13, %r12, %r10;
	mov.u32 	%r14, %ctaid.x;
	mov.u32 	%r15, %nctaid.x;
	mul.lo.s32 	%r16, %r15, %r13;
	mov.u32 	%r17, %ctaid.y;
	mov.u32 	%r18, %nctaid.y;
	mov.u32 	%r19, %ctaid.z;
	mad.lo.s32 	%r20, %r19, %r18, %r17;
	mad.lo.s32 	%r21, %r8, %r7, %r6;
	mad.lo.s32 	%r22, %r11, %r10, %r21;
	mad.lo.s32 	%r23, %r14, %r13, %r22;
	mad.lo.s32 	%r24, %r16, %r20, %r23;
	// %r25 = address of this thread's record (40 bytes per record).
	mad.lo.s32 	%r25, %r24, 40, %r5;
	// Volatile loads of the fields at +8 .. +32. %f1 and %f2 are not used
	// afterwards; the loads are volatile and are kept as emitted.
	ld.volatile.global.f32 	%f1, [%r25+8];
	ld.volatile.global.f32 	%f2, [%r25+12];
	ld.volatile.global.f32 	%f3, [%r25+16];
	ld.volatile.global.f32 	%f4, [%r25+20];
	ld.volatile.global.f32 	%f5, [%r25+24];
	ld.volatile.global.f32 	%f6, [%r25+28];
	ld.volatile.global.f32 	%f7, [%r25+32];
	// new[+28] = [+16]*[+24] + [+28];  new[+32] = [+20]*[+24] + [+32]
	fma.rn.f32 	%f8, %f3, %f5, %f6;
	fma.rn.f32 	%f9, %f4, %f5, %f7;
	st.volatile.global.f32 	[%r25+28], %f8;
	st.volatile.global.f32 	[%r25+32], %f9;
	// Truncate the two updated values toward zero and form the counter cell
	// index: cell = trunc(new[+32]) * param_0 + trunc(new[+28]).
	cvt.rzi.s32.f32	%r26, %f8;
	cvt.rzi.s32.f32	%r27, %f9;
	mad.lo.s32 	%r28, %r27, %r1, %r26;
	shl.b32 	%r29, %r28, 2;
	add.s32 	%r30, %r4, %r29;
	// NOTE(review): the cell index is not range-checked before the atomic, so
	// values outside the counter buffer would address memory outside it.
	// Confirm the host guarantees the values stay in range.
	//
	// Count this record in its cell. The original used atom.inc with a wrap
	// threshold of 0; atom.inc stores (old >= threshold) ? 0 : old + 1, which
	// with threshold 0 always stores 0, so the cell never counted anything.
	// An atomic add of 1 performs the intended increment.
	atom.global.add.u32 	%r31, [%r30], 1;
	ret;
}

	// .globl	KernelDraw
// KernelDraw: each thread reads one 40-byte record from the buffer passed in
// param_5 and, if the record's two float coordinates fall inside
// [0, param_0) x [0, param_1), atomically adds the record's packed 32-bit
// value (field at +36) into the cell buffer passed in param_2.
//
// Before the add, each of the four bytes of the packed value is divided by
// the 32-bit value read from the matching cell of the param_6 buffer, when
// that value is greater than 1.
//
// The record index is the thread's linear index across the whole launch:
//   idx = tid.x + tid.y*ntid.x + tid.z*ntid.x*ntid.y
//       + ctaid.x*T + (ctaid.z*nctaid.y + ctaid.y)*nctaid.x*T
//   where T = ntid.x*ntid.y*ntid.z (threads per block).
//
// param_3 and param_4 are not read by this kernel.
.visible .entry KernelDraw(
	.param .u32 KernelDraw_param_0,
	.param .u32 KernelDraw_param_1,
	.param .u32 KernelDraw_param_2,
	.param .u32 KernelDraw_param_3,
	.param .u32 KernelDraw_param_4,
	.param .u32 KernelDraw_param_5,
	.param .u32 KernelDraw_param_6
)
{
	.reg .pred 	%p<9>;
	.reg .f32 	%f<18>;
	.reg .s32 	%r<61>;


	// %r17/%r18 = coordinate limits (param_0 is also the cell row stride),
	// %r15 = destination buffer, %r19 = record buffer, %r16 = divisor buffer.
	ld.param.u32 	%r17, [KernelDraw_param_0];
	ld.param.u32 	%r18, [KernelDraw_param_1];
	ld.param.u32 	%r15, [KernelDraw_param_2];
	ld.param.u32 	%r19, [KernelDraw_param_5];
	ld.param.u32 	%r16, [KernelDraw_param_6];
	// Linear thread index -> %r38 (see header formula).
	mov.u32 	%r20, %tid.y;
	mov.u32 	%r21, %ntid.x;
	mov.u32 	%r22, %ntid.y;
	mul.lo.s32 	%r23, %r22, %r21;
	mov.u32 	%r24, %tid.z;
	mov.u32 	%r25, %ntid.z;
	mul.lo.s32 	%r26, %r25, %r23;
	mov.u32 	%r27, %ctaid.x;
	mov.u32 	%r28, %nctaid.x;
	mul.lo.s32 	%r29, %r28, %r26;
	mov.u32 	%r30, %ctaid.z;
	mov.u32 	%r31, %nctaid.y;
	mov.u32 	%r32, %ctaid.y;
	mad.lo.s32 	%r33, %r30, %r31, %r32;
	mov.u32 	%r34, %tid.x;
	mad.lo.s32 	%r35, %r20, %r21, %r34;
	mad.lo.s32 	%r36, %r24, %r23, %r35;
	mad.lo.s32 	%r37, %r27, %r26, %r36;
	mad.lo.s32 	%r38, %r29, %r33, %r37;
	// %r40 = address of this thread's record (40 bytes per record).
	cvta.to.global.u32 	%r39, %r19;
	mad.lo.s32 	%r40, %r38, 40, %r39;
	// %r1 = packed 32-bit value at +36; %f2/%f3 = float coordinates at +28/+32.
	ld.volatile.global.u32 	%r1, [%r40+36];
	ld.volatile.global.f32 	%f2, [%r40+28];
	cvt.rzi.s32.f32	%r41, %f2;
	ld.volatile.global.f32 	%f3, [%r40+32];
	cvt.rzi.s32.f32	%r42, %f3;
	// %r2 = cell index = trunc(%f3) * param_0 + trunc(%f2). It is computed
	// before the range test but only used for memory access after it passes.
	mad.lo.s32 	%r2, %r42, %r17, %r41;
	// %p7 = (0 <= %f2 < (float)param_0) && (0 <= %f3 < (float)param_1)
	setp.ge.f32	%p1, %f2, 0f00000000;
	cvt.rn.f32.s32	%f4, %r17;
	setp.lt.f32	%p2, %f2, %f4;
	and.pred  	%p3, %p1, %p2;
	setp.ge.f32	%p4, %f3, 0f00000000;
	and.pred  	%p5, %p3, %p4;
	cvt.rn.f32.s32	%f5, %r18;
	setp.lt.f32	%p6, %f3, %f5;
	and.pred  	%p7, %p5, %p6;
	// Out-of-range records are skipped entirely.
	@!%p7 bra 	BB2_4;
	bra.uni 	BB2_1;

BB2_1:
	// Split the packed value into its four bytes:
	// %r59 = bits 0-7, %r58 = bits 8-15, %r57 = bits 16-23, %r60 = bits 24-31.
	cvta.to.global.u32 	%r43, %r16;
	and.b32  	%r59, %r1, 255;
	bfe.u32 	%r58, %r1, 8, 8;
	bfe.u32 	%r57, %r1, 16, 8;
	shr.u32 	%r60, %r1, 24;
	// %r44 = byte offset of the cell; it is reused in BB2_3 for the
	// destination buffer.
	shl.b32 	%r44, %r2, 2;
	add.s32 	%r45, %r43, %r44;
	// %f1 = divisor read from the param_6 buffer, converted as a signed int.
	ld.volatile.global.u32 	%r46, [%r45];
	cvt.rn.f32.s32	%f1, %r46;
	// 0f3F800000 is 1.0f: skip the division when the divisor is <= 1.
	setp.leu.f32	%p8, %f1, 0f3F800000;
	@%p8 bra 	BB2_3;

	// Replace each byte with floor(byte / divisor).
	cvt.rn.f32.u32	%f6, %r59;
	div.rn.f32 	%f7, %f6, %f1;
	cvt.rmi.f32.f32	%f8, %f7;
	cvt.rzi.u32.f32	%r59, %f8;
	cvt.rn.f32.u32	%f9, %r58;
	div.rn.f32 	%f10, %f9, %f1;
	cvt.rmi.f32.f32	%f11, %f10;
	cvt.rzi.u32.f32	%r58, %f11;
	cvt.rn.f32.u32	%f12, %r57;
	div.rn.f32 	%f13, %f12, %f1;
	cvt.rmi.f32.f32	%f14, %f13;
	cvt.rzi.u32.f32	%r57, %f14;
	cvt.rn.f32.u32	%f15, %r60;
	div.rn.f32 	%f16, %f15, %f1;
	cvt.rmi.f32.f32	%f17, %f16;
	cvt.rzi.u32.f32	%r60, %f17;

BB2_3:
	// Repack the four bytes into one 32-bit word and atomically add it to the
	// matching cell of the param_2 buffer.
	// NOTE(review): this is a plain 32-bit add, so a byte lane whose sum
	// exceeds 255 carries into the next lane -- confirm this is acceptable.
	cvta.to.global.u32 	%r47, %r15;
	shl.b32 	%r48, %r57, 16;
	shl.b32 	%r49, %r58, 8;
	or.b32  	%r50, %r49, %r48;
	or.b32  	%r51, %r50, %r59;
	shl.b32 	%r52, %r60, 24;
	or.b32  	%r53, %r51, %r52;
	add.s32 	%r55, %r47, %r44;
	atom.global.add.u32 	%r56, [%r55], %r53;

BB2_4:
	ret;
}

I seem to recall that “invalid floating-point operation” on some platforms may also be triggered by an integer division by zero, but my memory is hazy. You state that this is a “load issue” but I don’t see any code that attempts to load the kernel.

To file a bug report, go to https://developer.nvidia.com/, then click on the “Login” link in the upper right corner. Log in with your registered developer username and password. The link for the bug report is right on the page shown after you have logged in to the registered developer account.

Code to actually load the PTX shouldn’t really be necessary. One way to test the PTX is to download my bandwidth test and simply copy and paste this PTX into one of the PTX files (32-bit version) that come with the tool. Then start the 32-bit application. The application will attempt to load the PTX and will report an exception in the log.

Or any other tool that can load PTX files.

http://www.skybuck.org/CUDA/BandwidthTest/

I don’t think the problem is with my loading code, but somewhere in the driver. If you or anybody doubts this it’s best to write your own loading code or find some common driver api loading code and try it out and then see what the results are… I would be curious to see that.

My expectation would be that this kernel fails to load on any GT 520, and that it might load on other GPUs… that could indicate a GT 520 specific bug in the driver… or it could always fail… which could indicate a more serious issue ?!?