CUDA compiler will optimize code to use more registers than available by attempting to cache parameters

When kernel parameters are reused multiple times, the compiler stores them in registers and keeps reusing those registers, even if the resulting code exceeds the imposed register limit. The excess registers are implicitly spilled to local memory, causing unnecessary local-memory reads and writes that would otherwise have been single reads (with cache hits) from constant memory. This compiler behavior can therefore act as a de-optimization.

The following example can be compiled with --gpu-architecture=compute_52 --fmad=false --ftz=true -I --maxrregcount=32 to produce the described results.

#define BATCH_SIZE 16

typedef float data_t;
typedef unsigned int dimSize_t;
typedef struct {
	float x[BATCH_SIZE];
	float y[BATCH_SIZE];
} infoBlob_t;

extern "C" __global__ void
__launch_bounds__(1024, 2) //effectively limits registers per thread to 32
myKernel(data_t* out, infoBlob_t info, cudaTextureObject_t img, dimSize_t xSize, dimSize_t ySize)
{
	dimSize_t x = threadIdx.x + blockIdx.x * blockDim.x;
	
	for (dimSize_t y = 0; y < ySize; y++) {
		data_t tmp = 0;
#pragma unroll
		for (int i=0;i<BATCH_SIZE;i++) {
			tmp += tex2D<data_t>(img, info.x[i]+x, info.y[i]+y);
		}
		atomicAdd(out + x + xSize*y, tmp);
	}
}

This is translated to the following PTX:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21373419
// Cuda compilation tools, release 8.0, V8.0.55
// Based on LLVM 3.4svn
//

.version 5.0
.target sm_52
.address_size 64

	// .globl	myKernel

.visible .entry myKernel(
	.param .u64 myKernel_param_0,
	.param .align 4 .b8 myKernel_param_1[128],
	.param .u64 myKernel_param_2,
	.param .u32 myKernel_param_3,
	.param .u32 myKernel_param_4
)
.maxntid 1024, 1, 1
.minnctapersm 2
{
	.reg .pred 	%p<3>;
	.reg .f32 	%f<148>;
	.reg .b32 	%r<16>;
	.reg .b64 	%rd<10>;

	ld.param.u64 	%rd2, [myKernel_param_0];
	ld.param.u64 	%rd3, [myKernel_param_2];
	ld.param.u32 	%r3, [myKernel_param_3];
	ld.param.u32 	%r4, [myKernel_param_4];
	mov.u64 	%rd1, myKernel_param_1;
	ld.param.f32 	%f1, [%rd1+64];
	ld.param.f32 	%f2, [%rd1+68];
	ld.param.f32 	%f3, [%rd1+72];
	ld.param.f32 	%f4, [%rd1+76];
	ld.param.f32 	%f5, [%rd1+80];
	ld.param.f32 	%f6, [%rd1+84];
	ld.param.f32 	%f7, [%rd1+88];
	ld.param.f32 	%f8, [%rd1+92];
	ld.param.f32 	%f9, [%rd1+96];
	ld.param.f32 	%f10, [%rd1+100];
	setp.eq.s32	%p1, %r4, 0;
	@%p1 bra 	BB0_3;

	ld.param.f32 	%f11, [%rd1+104];
	ld.param.f32 	%f12, [%rd1+108];
	ld.param.f32 	%f13, [%rd1+112];
	ld.param.f32 	%f14, [%rd1+116];
	ld.param.f32 	%f15, [%rd1+120];
	ld.param.f32 	%f16, [%rd1+124];
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r9, %r6, %r7, %r8;
	ld.param.f32 	%f33, [%rd1+60];
	cvt.rn.f32.u32	%f34, %r9;
	ld.param.f32 	%f35, [%rd1];
	add.rn.ftz.f32 	%f17, %f34, %f35;
	ld.param.f32 	%f36, [%rd1+4];
	add.rn.ftz.f32 	%f18, %f34, %f36;
	ld.param.f32 	%f37, [%rd1+8];
	add.rn.ftz.f32 	%f19, %f34, %f37;
	ld.param.f32 	%f38, [%rd1+12];
	add.rn.ftz.f32 	%f20, %f34, %f38;
	ld.param.f32 	%f39, [%rd1+16];
	add.rn.ftz.f32 	%f21, %f34, %f39;
	ld.param.f32 	%f40, [%rd1+20];
	add.rn.ftz.f32 	%f22, %f34, %f40;
	ld.param.f32 	%f41, [%rd1+24];
	add.rn.ftz.f32 	%f23, %f34, %f41;
	ld.param.f32 	%f42, [%rd1+28];
	add.rn.ftz.f32 	%f24, %f34, %f42;
	ld.param.f32 	%f43, [%rd1+32];
	add.rn.ftz.f32 	%f25, %f34, %f43;
	ld.param.f32 	%f44, [%rd1+36];
	add.rn.ftz.f32 	%f26, %f34, %f44;
	ld.param.f32 	%f45, [%rd1+40];
	add.rn.ftz.f32 	%f27, %f34, %f45;
	ld.param.f32 	%f46, [%rd1+44];
	add.rn.ftz.f32 	%f28, %f34, %f46;
	ld.param.f32 	%f47, [%rd1+48];
	add.rn.ftz.f32 	%f29, %f34, %f47;
	ld.param.f32 	%f48, [%rd1+52];
	add.rn.ftz.f32 	%f30, %f34, %f48;
	ld.param.f32 	%f49, [%rd1+56];
	add.rn.ftz.f32 	%f31, %f34, %f49;
	add.rn.ftz.f32 	%f32, %f34, %f33;
	mov.u32 	%r15, 0;
	cvta.to.global.u64 	%rd7, %rd2;

BB0_2:
	cvt.rn.f32.u32	%f50, %r15;
	add.rn.ftz.f32 	%f51, %f50, %f1;
	tex.2d.v4.f32.f32	{%f52, %f53, %f54, %f55}, [%rd3, {%f17, %f51}];
	add.rn.ftz.f32 	%f56, %f52, 0f00000000;
	add.rn.ftz.f32 	%f57, %f50, %f2;
	tex.2d.v4.f32.f32	{%f58, %f59, %f60, %f61}, [%rd3, {%f18, %f57}];
	add.rn.ftz.f32 	%f62, %f56, %f58;
	add.rn.ftz.f32 	%f63, %f50, %f3;
	tex.2d.v4.f32.f32	{%f64, %f65, %f66, %f67}, [%rd3, {%f19, %f63}];
	add.rn.ftz.f32 	%f68, %f62, %f64;
	add.rn.ftz.f32 	%f69, %f50, %f4;
	tex.2d.v4.f32.f32	{%f70, %f71, %f72, %f73}, [%rd3, {%f20, %f69}];
	add.rn.ftz.f32 	%f74, %f68, %f70;
	add.rn.ftz.f32 	%f75, %f50, %f5;
	tex.2d.v4.f32.f32	{%f76, %f77, %f78, %f79}, [%rd3, {%f21, %f75}];
	add.rn.ftz.f32 	%f80, %f74, %f76;
	add.rn.ftz.f32 	%f81, %f50, %f6;
	tex.2d.v4.f32.f32	{%f82, %f83, %f84, %f85}, [%rd3, {%f22, %f81}];
	add.rn.ftz.f32 	%f86, %f80, %f82;
	add.rn.ftz.f32 	%f87, %f50, %f7;
	tex.2d.v4.f32.f32	{%f88, %f89, %f90, %f91}, [%rd3, {%f23, %f87}];
	add.rn.ftz.f32 	%f92, %f86, %f88;
	add.rn.ftz.f32 	%f93, %f50, %f8;
	tex.2d.v4.f32.f32	{%f94, %f95, %f96, %f97}, [%rd3, {%f24, %f93}];
	add.rn.ftz.f32 	%f98, %f92, %f94;
	add.rn.ftz.f32 	%f99, %f50, %f9;
	tex.2d.v4.f32.f32	{%f100, %f101, %f102, %f103}, [%rd3, {%f25, %f99}];
	add.rn.ftz.f32 	%f104, %f98, %f100;
	add.rn.ftz.f32 	%f105, %f50, %f10;
	tex.2d.v4.f32.f32	{%f106, %f107, %f108, %f109}, [%rd3, {%f26, %f105}];
	add.rn.ftz.f32 	%f110, %f104, %f106;
	add.rn.ftz.f32 	%f111, %f50, %f11;
	tex.2d.v4.f32.f32	{%f112, %f113, %f114, %f115}, [%rd3, {%f27, %f111}];
	add.rn.ftz.f32 	%f116, %f110, %f112;
	add.rn.ftz.f32 	%f117, %f50, %f12;
	tex.2d.v4.f32.f32	{%f118, %f119, %f120, %f121}, [%rd3, {%f28, %f117}];
	add.rn.ftz.f32 	%f122, %f116, %f118;
	add.rn.ftz.f32 	%f123, %f50, %f13;
	tex.2d.v4.f32.f32	{%f124, %f125, %f126, %f127}, [%rd3, {%f29, %f123}];
	add.rn.ftz.f32 	%f128, %f122, %f124;
	add.rn.ftz.f32 	%f129, %f50, %f14;
	tex.2d.v4.f32.f32	{%f130, %f131, %f132, %f133}, [%rd3, {%f30, %f129}];
	add.rn.ftz.f32 	%f134, %f128, %f130;
	add.rn.ftz.f32 	%f135, %f50, %f15;
	tex.2d.v4.f32.f32	{%f136, %f137, %f138, %f139}, [%rd3, {%f31, %f135}];
	add.rn.ftz.f32 	%f140, %f134, %f136;
	add.rn.ftz.f32 	%f141, %f50, %f16;
	tex.2d.v4.f32.f32	{%f142, %f143, %f144, %f145}, [%rd3, {%f32, %f141}];
	add.rn.ftz.f32 	%f146, %f140, %f142;
	mul.lo.s32 	%r10, %r15, %r3;
	cvt.u64.u32	%rd4, %r10;
	cvt.u64.u32	%rd5, %r9;
	add.s64 	%rd6, %rd4, %rd5;
	shl.b64 	%rd8, %rd6, 2;
	add.s64 	%rd9, %rd7, %rd8;
	atom.global.add.f32 	%f147, [%rd9], %f146;
	add.s32 	%r15, %r15, 1;
	setp.lt.u32	%p2, %r15, %r4;
	@%p2 bra 	BB0_2;

BB0_3:
	ret;
}

The PTX code of this example shows that the parameters are loaded into registers before the loop and kept there for the remainder of the program. This exceeds the register limit, so some of the registers will be spilled to local memory, introducing large latencies.
Without the loop, the parameters are loaded just in time before each texture access, saving valuable register file space, even without imposing any register limit on the compiler:

extern "C" __global__ void
myKernel(data_t* out, infoBlob_t info, cudaTextureObject_t img, dimSize_t xSize, dimSize_t ySize)
{
	dimSize_t x = threadIdx.x + blockIdx.x * blockDim.x;
	dimSize_t y = threadIdx.y + blockIdx.y * blockDim.y;
	
	data_t tmp = 0;
#pragma unroll
	for (int i=0;i<BATCH_SIZE;i++) {
		tmp += tex2D<data_t>(img, info.x[i]+x, info.y[i]+y);
	}
	atomicAdd(out + x + xSize*y, tmp);
}

This translates to the following PTX:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21373419
// Cuda compilation tools, release 8.0, V8.0.55
// Based on LLVM 3.4svn
//

.version 5.0
.target sm_52
.address_size 64

	// .globl	myKernel

.visible .entry myKernel(
	.param .u64 myKernel_param_0,
	.param .align 4 .b8 myKernel_param_1[128],
	.param .u64 myKernel_param_2,
	.param .u32 myKernel_param_3,
	.param .u32 myKernel_param_4
)
.maxntid 1024, 1, 1
.minnctapersm 2
{
	.reg .f32 	%f<148>;
	.reg .b32 	%r<11>;
	.reg .b64 	%rd<10>;

	ld.param.u64 	%rd1, [myKernel_param_0];
	ld.param.u64 	%rd2, [myKernel_param_2];
	ld.param.u32 	%r1, [myKernel_param_3];
	mov.u64 	%rd3, myKernel_param_1;
	mov.u32 	%r2, %tid.x;
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	mad.lo.s32 	%r5, %r4, %r3, %r2;
	mov.u32 	%r6, %tid.y;
	mov.u32 	%r7, %ctaid.y;
	mov.u32 	%r8, %ntid.y;
	mad.lo.s32 	%r9, %r8, %r7, %r6;
	cvt.rn.f32.u32	%f1, %r5;
	cvt.rn.f32.u32	%f2, %r9;
	ld.param.f32 	%f3, [%rd3+64];
	ld.param.f32 	%f4, [%rd3];
	add.rn.ftz.f32 	%f5, %f1, %f4;
	add.rn.ftz.f32 	%f6, %f2, %f3;
	tex.2d.v4.f32.f32	{%f7, %f8, %f9, %f10}, [%rd2, {%f5, %f6}];
	ld.param.f32 	%f11, [%rd3+4];
	ld.param.f32 	%f12, [%rd3+68];
	add.rn.ftz.f32 	%f13, %f7, 0f00000000;
	add.rn.ftz.f32 	%f14, %f1, %f11;
	add.rn.ftz.f32 	%f15, %f2, %f12;
	tex.2d.v4.f32.f32	{%f16, %f17, %f18, %f19}, [%rd2, {%f14, %f15}];
	ld.param.f32 	%f20, [%rd3+8];
	ld.param.f32 	%f21, [%rd3+72];
	add.rn.ftz.f32 	%f22, %f13, %f16;
	add.rn.ftz.f32 	%f23, %f1, %f20;
	add.rn.ftz.f32 	%f24, %f2, %f21;
	tex.2d.v4.f32.f32	{%f25, %f26, %f27, %f28}, [%rd2, {%f23, %f24}];
	ld.param.f32 	%f29, [%rd3+12];
	ld.param.f32 	%f30, [%rd3+76];
	add.rn.ftz.f32 	%f31, %f22, %f25;
	add.rn.ftz.f32 	%f32, %f1, %f29;
	add.rn.ftz.f32 	%f33, %f2, %f30;
	tex.2d.v4.f32.f32	{%f34, %f35, %f36, %f37}, [%rd2, {%f32, %f33}];
	ld.param.f32 	%f38, [%rd3+16];
	ld.param.f32 	%f39, [%rd3+80];
	add.rn.ftz.f32 	%f40, %f31, %f34;
	add.rn.ftz.f32 	%f41, %f1, %f38;
	add.rn.ftz.f32 	%f42, %f2, %f39;
	tex.2d.v4.f32.f32	{%f43, %f44, %f45, %f46}, [%rd2, {%f41, %f42}];
	ld.param.f32 	%f47, [%rd3+20];
	ld.param.f32 	%f48, [%rd3+84];
	add.rn.ftz.f32 	%f49, %f40, %f43;
	add.rn.ftz.f32 	%f50, %f1, %f47;
	add.rn.ftz.f32 	%f51, %f2, %f48;
	tex.2d.v4.f32.f32	{%f52, %f53, %f54, %f55}, [%rd2, {%f50, %f51}];
	ld.param.f32 	%f56, [%rd3+24];
	ld.param.f32 	%f57, [%rd3+88];
	add.rn.ftz.f32 	%f58, %f49, %f52;
	add.rn.ftz.f32 	%f59, %f1, %f56;
	add.rn.ftz.f32 	%f60, %f2, %f57;
	tex.2d.v4.f32.f32	{%f61, %f62, %f63, %f64}, [%rd2, {%f59, %f60}];
	ld.param.f32 	%f65, [%rd3+28];
	ld.param.f32 	%f66, [%rd3+92];
	add.rn.ftz.f32 	%f67, %f58, %f61;
	add.rn.ftz.f32 	%f68, %f1, %f65;
	add.rn.ftz.f32 	%f69, %f2, %f66;
	tex.2d.v4.f32.f32	{%f70, %f71, %f72, %f73}, [%rd2, {%f68, %f69}];
	ld.param.f32 	%f74, [%rd3+32];
	ld.param.f32 	%f75, [%rd3+96];
	add.rn.ftz.f32 	%f76, %f67, %f70;
	add.rn.ftz.f32 	%f77, %f1, %f74;
	add.rn.ftz.f32 	%f78, %f2, %f75;
	tex.2d.v4.f32.f32	{%f79, %f80, %f81, %f82}, [%rd2, {%f77, %f78}];
	ld.param.f32 	%f83, [%rd3+36];
	ld.param.f32 	%f84, [%rd3+100];
	add.rn.ftz.f32 	%f85, %f76, %f79;
	add.rn.ftz.f32 	%f86, %f1, %f83;
	add.rn.ftz.f32 	%f87, %f2, %f84;
	tex.2d.v4.f32.f32	{%f88, %f89, %f90, %f91}, [%rd2, {%f86, %f87}];
	ld.param.f32 	%f92, [%rd3+40];
	ld.param.f32 	%f93, [%rd3+104];
	add.rn.ftz.f32 	%f94, %f85, %f88;
	add.rn.ftz.f32 	%f95, %f1, %f92;
	add.rn.ftz.f32 	%f96, %f2, %f93;
	tex.2d.v4.f32.f32	{%f97, %f98, %f99, %f100}, [%rd2, {%f95, %f96}];
	ld.param.f32 	%f101, [%rd3+44];
	ld.param.f32 	%f102, [%rd3+108];
	add.rn.ftz.f32 	%f103, %f94, %f97;
	add.rn.ftz.f32 	%f104, %f1, %f101;
	add.rn.ftz.f32 	%f105, %f2, %f102;
	tex.2d.v4.f32.f32	{%f106, %f107, %f108, %f109}, [%rd2, {%f104, %f105}];
	ld.param.f32 	%f110, [%rd3+48];
	ld.param.f32 	%f111, [%rd3+112];
	add.rn.ftz.f32 	%f112, %f103, %f106;
	add.rn.ftz.f32 	%f113, %f1, %f110;
	add.rn.ftz.f32 	%f114, %f2, %f111;
	tex.2d.v4.f32.f32	{%f115, %f116, %f117, %f118}, [%rd2, {%f113, %f114}];
	ld.param.f32 	%f119, [%rd3+52];
	ld.param.f32 	%f120, [%rd3+116];
	add.rn.ftz.f32 	%f121, %f112, %f115;
	add.rn.ftz.f32 	%f122, %f1, %f119;
	add.rn.ftz.f32 	%f123, %f2, %f120;
	tex.2d.v4.f32.f32	{%f124, %f125, %f126, %f127}, [%rd2, {%f122, %f123}];
	ld.param.f32 	%f128, [%rd3+56];
	ld.param.f32 	%f129, [%rd3+120];
	add.rn.ftz.f32 	%f130, %f121, %f124;
	add.rn.ftz.f32 	%f131, %f1, %f128;
	add.rn.ftz.f32 	%f132, %f2, %f129;
	tex.2d.v4.f32.f32	{%f133, %f134, %f135, %f136}, [%rd2, {%f131, %f132}];
	ld.param.f32 	%f137, [%rd3+60];
	ld.param.f32 	%f138, [%rd3+124];
	add.rn.ftz.f32 	%f139, %f130, %f133;
	add.rn.ftz.f32 	%f140, %f1, %f137;
	add.rn.ftz.f32 	%f141, %f2, %f138;
	tex.2d.v4.f32.f32	{%f142, %f143, %f144, %f145}, [%rd2, {%f140, %f141}];
	add.rn.ftz.f32 	%f146, %f139, %f142;
	cvta.to.global.u64 	%rd4, %rd1;
	cvt.u64.u32	%rd5, %r5;
	mul.lo.s32 	%r10, %r9, %r1;
	cvt.u64.u32	%rd6, %r10;
	add.s64 	%rd7, %rd6, %rd5;
	shl.b64 	%rd8, %rd7, 2;
	add.s64 	%rd9, %rd4, %rd8;
	atom.global.add.f32 	%f147, [%rd9], %f146;
	ret;
}

This is a reduced example derived from actual production code. Introducing the loop increased the execution time of the kernel by a factor of 3. (Interestingly, in this example I had to add the addition of x and y in tex2D<data_t>(img, info.x[i]+x, info.y[i]+y);, because otherwise the compiler would preemptively store all parameters in registers even without the loop, which would, again, spill registers. Quite frankly, I don't see how the x and y influence that decision.)

All in all, the compiler behavior seems a bit strange here, and it seems quite apparent that loading the parameters just in time, as in the second case (without the loop), should yield better memory access performance than the code obtained from compilation. Is this a compiler bug? I can't see why it would keep the parameters in registers only to spill them again.

Providing a register limit that the compiler would not normally adhere to is likely to result in spilling.

Analysis of PTX is not a conclusive way to evaluate register usage; conclusions about register usage reached there are not guaranteed to be correct. PTX goes through an additional optimizing compilation step before becoming executable machine code (SASS). It's generally necessary to analyze the SASS to be sure.

In case you thought you were filing a bug here, you are not. If your intent was to file a bug, that should be done through the developer portal at developer.nvidia.com.

Since launch_bounds() is the more flexible mechanism for controlling register use, I compiled the first version of your code (which uses it) with CUDA 8. You should use either launch_bounds() or the older -maxrregcount mechanism, but not both.
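
To illustrate the two alternatives (just a minimal sketch; the kernel is a made-up placeholder and the arithmetic assumes sm_52 with 65536 registers per SM):

// Option 1: per-kernel launch bounds (the more flexible mechanism).
// 1024 threads/block with at least 2 resident blocks/SM caps ptxas at
// 65536 / (1024 * 2) = 32 registers per thread.
__global__ void __launch_bounds__(1024, 2) boundedKernel(float* out)
{
    out[threadIdx.x + blockIdx.x * blockDim.x] = 0.0f;
}

// Option 2: a per-compilation-unit cap on the command line instead:
//   nvcc -arch=sm_52 -maxrregcount=32 kernel.cu
// Pick one mechanism or the other; combining them only makes the
// effective limit harder to reason about.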

I don't see any spilling when compiling this code. As txbob says, the thing to look at is the generated machine code (SASS). Note that PTX is merely an intermediate compiler representation (doubling as a virtual ISA), and it uses virtual registers in SSA fashion (meaning each register is written exactly once). The actual register allocation is performed by the PTXAS compiler, which is an optimizing compiler that turns PTX into SASS.

C:\Users\Norbert\My Programs>nvcc -arch=sm_52 -Xptxas -v spills.cu
nvcc warning : nvcc support for Microsoft Visual Studio 2010 and earlier has been deprecated and is no longer being maintained
spills.cu
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function 'myKernel' for 'sm_52'
ptxas info    : Function properties for myKernel
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 31 registers, 472 bytes cmem[0]
support for Microsoft Visual Studio 2010 has been deprecated!
   Creating library a.lib and object a.exp

Here are the contents of spills.cu:

#define BATCH_SIZE 16

typedef float data_t;
typedef unsigned int dimSize_t;
typedef struct {
    float x[BATCH_SIZE];
    float y[BATCH_SIZE];
} infoBlob_t;

extern "C" __global__ void
__launch_bounds__(1024, 2) //effectively limits registers per thread to 32
myKernel (data_t* out, infoBlob_t info, cudaTextureObject_t img, 
          dimSize_t xSize, dimSize_t ySize)
{
    dimSize_t x = threadIdx.x + blockIdx.x * blockDim.x;
    
    for (dimSize_t y = 0; y < ySize; y++) {
        data_t tmp = 0;
#pragma unroll
        for (int i=0;i<BATCH_SIZE;i++) {
            tmp += tex2D<data_t>(img, info.x[i]+x, info.y[i]+y);
        }
        atomicAdd(out + x + xSize*y, tmp);
    }
}

int main (void) 
{
    infoBlob_t foo = {0};
    myKernel <<<1,1>>>(0,foo,0,0,0);
    return 0;
}

Here is the generated machine code:

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = windows
compile_size = 64bit

	code for sm_52
		Function : myKernel
	.headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                                              /* 0x001ff400fda007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                                      /* 0x4c98078000870001 */
        /*0010*/                   ISETP.EQ.AND P0, PT, RZ, c[0x0][0x1d4], PT;                /* 0x4b6503800757ff07 */
        /*0018*/               @P0 EXIT;                                                      /* 0xe30000000000000f */
                                                                                              /* 0x001c7c00e22007f0 */
        /*0028*/         {         MOV R20, RZ;                                               /* 0x5c9807800ff70014 */
        /*0030*/                   S2R R0, SR_CTAID.X;        }                               /* 0xf0c8000002570000 */
        /*0038*/                   S2R R2, SR_TID.X;                                          /* 0xf0c8000002170002 */
                                                                                              /* 0x001fc840fec20ff1 */
        /*0048*/                   XMAD.MRG R3, R0.reuse, c[0x0] [0x8].H1, RZ;                /* 0x4f107f8000270003 */
        /*0050*/                   XMAD R2, R0.reuse, c[0x0] [0x8], R2;                       /* 0x4e00010000270002 */
        /*0058*/                   XMAD.PSL.CBCC R0, R0.H1, R3.H1, R2;                        /* 0x5b30011800370000 */
                                                                                              /* 0x001f8442fe20003d */
        /*0068*/                   I2F.F32.U32 R2, R0;                                        /* 0x5cb8000000070a02 */
        /*0070*/                   FADD R4, R2.reuse, c[0x0][0x148];                          /* 0x4c58000005270204 */
        /*0078*/                   FADD R5, R2, c[0x0][0x14c];                                /* 0x4c58000005370205 */
                                                                                              /* 0x081fc440fe2207f1 */
        /*0088*/                   FADD R6, R2.reuse, c[0x0][0x150];                          /* 0x4c58000005470206 */
        /*0090*/                   FADD R7, R2.reuse, c[0x0][0x154];                          /* 0x4c58000005570207 */
        /*0098*/                   FADD R8, R2.reuse, c[0x0][0x158];                          /* 0x4c58000005670208 */
                                                                                              /* 0x001f8440fe2207f1 */
        /*00a8*/                   FADD R9, R2.reuse, c[0x0][0x15c];                          /* 0x4c58000005770209 */
        /*00b0*/                   FADD R10, R2.reuse, c[0x0][0x160];                         /* 0x4c5800000587020a */
        /*00b8*/                   FADD R11, R2, c[0x0][0x164];                               /* 0x4c5800000597020b */
                                                                                              /* 0x081fc440fe2207f1 */
        /*00c8*/                   FADD R12, R2.reuse, c[0x0][0x168];                         /* 0x4c58000005a7020c */
        /*00d0*/                   FADD R13, R2.reuse, c[0x0][0x16c];                         /* 0x4c58000005b7020d */
        /*00d8*/                   FADD R14, R2.reuse, c[0x0][0x170];                         /* 0x4c58000005c7020e */
                                                                                              /* 0x001f8440fe2207f1 */
        /*00e8*/                   FADD R15, R2.reuse, c[0x0][0x174];                         /* 0x4c58000005d7020f */
        /*00f0*/                   FADD R16, R2.reuse, c[0x0][0x178];                         /* 0x4c58000005e70210 */
        /*00f8*/                   FADD R17, R2, c[0x0][0x17c];                               /* 0x4c58000005f70211 */
                                                                                              /* 0x0000f400fec207f1 */
        /*0108*/                   FADD R18, R2.reuse, c[0x0][0x180];                         /* 0x4c58000006070212 */
        /*0110*/                   FADD R19, R2, c[0x0][0x184];                               /* 0x4c58000006170213 */
        /*0118*/                   I2F.F32.U32 R25, R20;                                      /* 0x5cb8000001470a19 */
                                                                                              /* 0x081fc8400e2217f2 */
        /*0128*/                   FADD R27, R25.reuse, c[0x0][0x188];                        /* 0x4c5800000627191b */
        /*0130*/                   TEXS.NODEP.T RZ, R27, R4, R27, 0x72, 2D, R;                /* 0xd822072ff1b7041b */
        /*0138*/                   FADD R30, R25.reuse, c[0x0][0x18c];                        /* 0x4c5800000637191e */
                                                                                              /* 0x080ac440fe420191 */
        /*0148*/                   TEXS.NODEP.T RZ, R21, R5, R30, 0x72, 2D, R;                /* 0xd822072ff1e70515 */
        /*0150*/                   FADD R23, R25.reuse, c[0x0][0x190];                        /* 0x4c58000006471917 */
        /*0158*/                   TEXS.NODEP.T RZ, R23, R6, R23, 0x72, 2D, R;                /* 0xd822072ff1770617 */
                                                                                              /* 0x081fc840162007e2 */
        /*0168*/                   FADD R24, R25, c[0x0][0x194];                              /* 0x4c58000006571918 */
        /*0170*/                   TEXS.NODEP.T RZ, R24, R7, R24, 0x72, 2D, R;                /* 0xd822072ff1870718 */
        /*0178*/                   FADD R29, R25.reuse, c[0x0][0x198];                        /* 0x4c5800000667191d */
                                                                                              /* 0x0802c440fe4200b1 */
        /*0188*/                   TEXS.NODEP.T RZ, R26, R8, R29, 0x72, 2D, R;                /* 0xd822072ff1d7081a */
        /*0190*/                   FADD R2, R25.reuse, c[0x0][0x19c];                         /* 0x4c58000006771902 */
        /*0198*/                   TEXS.NODEP.T RZ, R2, R9, R2, 0x72, 2D, R;                  /* 0xd822072ff0270902 */
                                                                                              /* 0x005ffc80162007e2 */
        /*01a8*/                   FADD R3, R25, c[0x0][0x1a0];                               /* 0x4c58000006871903 */
        /*01b0*/                   TEXS.NODEP.P RZ, R22, R10, R3, 0x72, 2D, R;                /* 0xd822072ff0370a16 */
        /*01b8*/                   FADD R30, R25, c[0x0][0x1a4];                              /* 0x4c5800000697191e */
                                                                                              /* 0x021fc008fec00ff1 */
        /*01c8*/                   FADD R3, R25, c[0x0][0x1bc];                               /* 0x4c58000006f71903 */
        /*01d0*/                   FADD R28, RZ, R27;                                         /* 0x5c58000001b7ff1c */
        /*01d8*/         {         FADD R21, R28, R21;                                        /* 0x5c58000001571c15 */
        /*01e8*/                   DEPBAR.LE SB5, 0x3;        }                               /* 0x001fc000fec007f6 */
                                                                                              /* 0xf0f0000034370000 */
        /*01f0*/                   FADD R21, R21, R23;                                        /* 0x5c58000001771515 */
        /*01f8*/         {         FADD R21, R21, R24;                                        /* 0x5c58000001871515 */
        /*0208*/                   DEPBAR.LE SB5, 0x1;        }                               /* 0x001fc400fcc007f6 */
                                                                                              /* 0xf0f0000034170000 */
        /*0210*/                   FADD R21, R21, R26;                                        /* 0x5c58000001a71515 */
        /*0218*/                   FADD R27, R21, R2;                                         /* 0x5c5800000027151b */
                                                                                              /* 0x0801c420fe0207f5 */
        /*0228*/                   FADD R26, R25.reuse, c[0x0][0x1a8];                        /* 0x4c58000006a7191a */
        /*0230*/         {         FADD R22, R27, R22;                                        /* 0x5c58000001671b16 */
        /*0238*/                   TEXS.NODEP.T RZ, R27, R11, R30, 0x72, 2D, R;        }      /* 0xd822072ff1e70b1b */
                                                                                              /* 0x0802c440fe420191 */
        /*0248*/                   TEXS.NODEP.T RZ, R26, R12, R26, 0x72, 2D, R;               /* 0xd822072ff1a70c1a */
        /*0250*/                   FADD R28, R25.reuse, c[0x0][0x1ac];                        /* 0x4c58000006b7191c */
        /*0258*/                   TEXS.NODEP.T RZ, R28, R13, R28, 0x72, 2D, R;               /* 0xd822072ff1c70d1c */
                                                                                              /* 0x081fc840162007e2 */
        /*0268*/                   FADD R24, R25, c[0x0][0x1b0];                              /* 0x4c58000006c71918 */
        /*0270*/                   TEXS.NODEP.T RZ, R24, R14, R24, 0x72, 2D, R;               /* 0xd822072ff1870e18 */
        /*0278*/                   FADD R21, R25.reuse, c[0x0][0x1b4];                        /* 0x4c58000006d71915 */
                                                                                              /* 0x0802c440fe4200b1 */
        /*0288*/                   TEXS.NODEP.T RZ, R21, R15, R21, 0x72, 2D, R;               /* 0xd822072ff1570f15 */
        /*0290*/                   FADD R2, R25.reuse, c[0x0][0x1b8];                         /* 0x4c58000006e71902 */
        /*0298*/                   TEXS.NODEP.T RZ, R2, R16, R2, 0x72, 2D, R;                 /* 0xd822072ff0271002 */
                                                                                              /* 0x080ac404fc4200b1 */
        /*02a8*/                   TEXS.NODEP.T RZ, R3, R17, R3, 0x72, 2D, R;                 /* 0xd822072ff0371103 */
        /*02b0*/                   FADD R23, R25, c[0x0][0x1c0];                              /* 0x4c58000007071917 */
        /*02b8*/                   TEXS.NODEP.T RZ, R23, R18, R23, 0x72, 2D, R;               /* 0xd822072ff1771217 */
                                                                                              /* 0x011fd880162007f2 */
        /*02c8*/                   FADD R25, R25, c[0x0][0x1c4];                              /* 0x4c58000007171919 */
        /*02d0*/                   TEXS.NODEP.P RZ, R25, R19, R25, 0x72, 2D, R;               /* 0xd822072ff1971319 */
        /*02d8*/                   FADD R27, R22, R27;                                        /* 0x5c58000001b7161b */
                                                                                              /* 0x001f8400fec087f0 */
        /*02e8*/         {         FADD R22, R27, R26;                                        /* 0x5c58000001a71b16 */
        /*02f0*/                   DEPBAR.LE SB5, 0x5;        }                               /* 0xf0f0000034570000 */
        /*02f8*/                   FADD R22, R22, R28;                                        /* 0x5c58000001c71616 */
                                                                                              /* 0x001fc840fe8207f1 */
        /*0308*/                   XMAD R26, R20.reuse, c[0x0] [0x1d0], RZ;                   /* 0x4e007f800747141a */
        /*0310*/                   XMAD.MRG R27, R20.reuse, c[0x0] [0x1d0].H1, RZ;            /* 0x4f107f800747141b */
        /*0318*/                   FADD R22, R22, R24;                                        /* 0x5c58000001871616 */
                                                                                              /* 0x001fc800fe8007f0 */
        /*0328*/         {         XMAD.PSL.CBCC R26, R20.H1, R27.H1, R26;                    /* 0x5b300d1801b7141a */
        /*0330*/                   DEPBAR.LE SB5, 0x3, {0};        }                          /* 0xf0f0000034370001 */
        /*0338*/                   FADD R21, R22, R21;                                        /* 0x5c58000001571615 */
                                                                                              /* 0x001fc000fe4007e4 */
        /*0348*/                   IADD R24.CC, R0, R26;                                      /* 0x5c10800001a70018 */
        /*0350*/                   FADD R2, R21, R2;                                          /* 0x5c58000000271502 */
        /*0358*/         {         IADD.X R26, RZ, RZ;                                        /* 0x5c1008000ff7ff1a */
        /*0368*/                   DEPBAR.LE SB5, 0x1;        }                               /* 0x001fc400fe2007f4 */
                                                                                              /* 0xf0f0000034170000 */
        /*0370*/                   FADD R2, R2, R3;                                           /* 0x5c58000000370202 */
        /*0378*/                   LEA R21.CC, R24, c[0x0][0x140], 0x2;                       /* 0x4bd7810005071815 */
                                                                                              /* 0x001fc400fc2007f4 */
        /*0388*/                   IADD32I R22, R20, 0x1;                                     /* 0x1c00000000171416 */
        /*0390*/                   FADD R20, R2, R23;                                         /* 0x5c58000001770214 */
        /*0398*/                   MOV R2, R21;                                               /* 0x5c98078001570002 */
                                                                                              /* 0x0003c420fe4007f4 */
        /*03a8*/                   LEA.HI.X R3, R24, c[0x0][0x144], R26, 0x2;                 /* 0x1a170d0005171803 */
        /*03b0*/                   FADD R21, R20, R25;                                        /* 0x5c58000001971415 */
        /*03b8*/                   RED.E.ADD.F32.FTZ.RN [R2], R21;                            /* 0xebf9000000370215 */
                                                                                              /* 0x00bff400fe0007ed */
        /*03c8*/                   ISETP.LT.U32.AND P0, PT, R22, c[0x0][0x1d4], PT;           /* 0x4b62038007571607 */
        /*03d0*/         {         MOV R20, R22;                                              /* 0x5c98078001670014 */
        /*03d8*/               @P0 BRA 0x118;        }                                        /* 0xe2400fffd380000f */
                                                                                              /* 0x001f8000ffe007ff */
        /*03e8*/                   EXIT;                                                      /* 0xe30000000007000f */
        /*03f0*/                   BRA 0x3f0;                                                 /* 0xe2400fffff87000f */
        /*03f8*/                   NOP;                                                       /* 0x50b0000000070f00 */
		.........................

First of all, thank you for your answers.

txbob, I did not intend to file a bug report, but thanks for the advice nonetheless.

I was under the impression that PTX code already resembled the sequence of instructions that the machine code would ultimately take, and that ptxas would therefore play a role similar to that of an assembler on the host, except that it would additionally need to map the virtual registers of the PTX code to physical registers or local memory locations in some way. I did not know that it would perform optimizations. Naturally, when reading the PTX output of the compiler, I noticed that at several points in the code there were more than 32 virtual registers that had been written to and were still waiting to be read. The conclusion that this meant spills was not a big leap.

I suppose that pointing out that PTX does not translate directly to machine code renders the rest of my post moot. I'm sorry to have incorrectly assumed that it did.

While I will now have to go all the way to understanding which instructions my code actually ends up with (thank you, njuffa, for doing that with my example code) and what they do, in order to find out what really goes wrong, I would also like to ask your expertise about potential reasons for the performance losses I encountered.

Unfortunately, I cannot post my actual code here. Suppose that info.x[i]+x, info.y[i]+y were replaced with a larger calculation that still only uses x, y, and info, with no other significant changes to the code. What potential reasons should I look out for that could explain why introducing the loop triples the execution time of the kernel?

I understand that this may be too vague a question to give advice on. Either way, I thank you for your input so far. Kind regards.

That impression may be based on the name ‘ptxas’, but it is not a correct impression. ptxas is an optimizing compiler that provides loop unrolling and common subexpression elimination (CSE), for example.

Without knowing anything about your actual code, one thing you can try is "#pragma unroll 1" to prevent unrolling, rather than "#pragma unroll" to encourage it. If your real code is anything like the code you posted, it should be dominated by memory access (use the CUDA profiler to check!), meaning dynamic instruction count shouldn't be the main performance concern. Texture loads tend to eat up a fair amount of registers, and if you unroll, the compiler may try to schedule those loads early to increase latency tolerance, which then requires additional temporary registers.
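
Applied to the inner loop of the kernel you posted, that change would look roughly like this (sketch only; everything else unchanged):

for (dimSize_t y = 0; y < ySize; y++) {
    data_t tmp = 0;
#pragma unroll 1   // keep the batch loop rolled: fewer texture loads are
                   // in flight at once, so fewer temporaries are live
    for (int i = 0; i < BATCH_SIZE; i++) {
        tmp += tex2D<data_t>(img, info.x[i] + x, info.y[i] + y);
    }
    atomicAdd(out + x + xSize * y, tmp);
}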

Note that placing variables in registers is an optimization. Thread-local variables live in local memory by default. If the compiler thinks it is profitable, it will pull some of them into registers, but with only 32 32-bit registers per thread, that may not be possible if you have lots of "live" variables at certain points in the code. Some minor spilling should not have a major performance impact, thanks to the caches on the GPU. Again, let the profiler guide you to the bottlenecks. Use -Xptxas -v to report spills: not every local memory access is due to a spill.

A classical trade-off for GPU code, especially on register-starved older architectures, is occupancy versus spilling. You don't need 100% occupancy (or even something close to it) for good performance, so I would suggest increasing the register target from 32 to a higher value, e.g. 64. As for sizing your thread blocks, it is usually better to make them around 256 threads to optimize the overlap of active thread blocks on a given SM (as an absolute minimum, two thread blocks should run concurrently per SM).
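
As a rough sketch of what I mean (the register arithmetic assumes sm_52 with 65536 registers per SM; the kernel body is just a placeholder):

// 256 threads/block with at least 4 resident blocks/SM gives ptxas a budget
// of 65536 / (256 * 4) = 64 registers per thread, twice the budget of the
// 1024-thread / 2-block configuration, at 1024 resident threads per SM
// (50% occupancy), which is usually plenty.
extern "C" __global__ void
__launch_bounds__(256, 4)
myKernel256(float* out, unsigned int n)
{
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) out[i] = 0.0f;
}

// host side: myKernel256<<<(n + 255) / 256, 256>>>(d_out, n);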

Study the Best Practices Guide for tips on how to make your code more efficient, and check all code changes with the CUDA profiler to avoid pessimizations. The "computation" you mention: does it involve standard math library functions? If so, which ones?

You are right that the bottleneck of the kernel is memory access. However, preventing unrolling only aggravates that problem, causing lots of local memory accesses (95%). Weirdly, when I look at the SASS disassembly (which I do not (yet?) have a good overview of, so forgive any fumbling), I see that it first loads all kinds of constant memory (the parameters, obviously) and stores them in local memory. It makes sense in a way: when you forbid unrolling, it needs indexable memory, which registers are not. But couldn't it do the same from constant memory? (Ironic, really. By playing around with the unrolling, the code ultimately does what I prematurely claimed it did to begin with.) Or is constant memory not indexable?
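
What I have in mind is something like the following (just a sketch: c_info, the kernel name, and the host-side copy are made up, and the typedefs are the ones from my posted code):

// Explicit __constant__ copy of the batch offsets, indexed directly inside
// the rolled loop; the index i is uniform across the warp, so the reads go
// through the constant cache without staging anything in local memory.
__constant__ infoBlob_t c_info;

extern "C" __global__ void
myKernelConst(data_t* out, cudaTextureObject_t img, dimSize_t xSize, dimSize_t ySize)
{
    dimSize_t x = threadIdx.x + blockIdx.x * blockDim.x;

    for (dimSize_t y = 0; y < ySize; y++) {
        data_t tmp = 0;
#pragma unroll 1
        for (int i = 0; i < BATCH_SIZE; i++) {
            tmp += tex2D<data_t>(img, c_info.x[i] + x, c_info.y[i] + y);
        }
        atomicAdd(out + x + xSize * y, tmp);
    }
}

// host side: cudaMemcpyToSymbol(c_info, &hostInfo, sizeof(infoBlob_t));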

The only library function used is rsqrt, and only as a means to approximate division.
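
That is, roughly the following construct (sketch, assuming the divisor is strictly positive):

// Approximate a / b for b > 0 via the reciprocal square root:
// 1/b == rsqrt(b) * rsqrt(b), turning the division into two multiplies.
__device__ __forceinline__ float approx_div(float a, float b)
{
    float r = rsqrtf(b);   // fast approximation of 1/sqrt(b)
    return a * r * r;      // a * (1/sqrt(b))^2 ~= a / b
}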


Without access to the code, I can only speculate, and some such speculation may be wide of the mark. Like all optimizing compilers, the CUDA compiler works with a large collection of heuristics that produce good code for 90% of source code but are rarely completely optimal. So far, though, I have seen no evidence that the compiler makes massively suboptimal choices here. Limiting it to only 32 registers may, however, be overly constraining.

Have you tried restructuring the code so that it uses thread blocks of 256 threads instead of 1024, allowing for a bigger register budget per thread? Very large thread blocks are rarely the correct approach. On modern GPUs, as a rule of thumb, one should design for a thread block size between 128 and 256 threads, and deviate from that only if there are very good reasons to do so.

Actually, that last result was without restricting register use at all. I can see, though, that this form of discussion without any code is not very productive. On Monday, I will try to build you a working example.

I have a hunch that we are looking at an XY problem in this discussion. Maybe it is time to step back and reconsider, from the 10,000-foot level, what the application should accomplish, and based on that pick an algorithm that is well matched to the massively parallel GPU architecture.

BTW, I noticed you are building with --fmad=false. Other than for one-off debugging reasons, using that is pretty much never the right choice. GPU computation is built around the fused multiply-add, and that is a good thing.

To be quite honest, X was already solved before this thread began. The version without the loop works quite fast. While it would be nice to see this improve X, I'm actually more interested in Y, for the sake of understanding rather than for the sake of X.

Yes, fmad is enabled in the production build of the program.

As an example, let's use the same code as posted before, but with a BATCH_SIZE of 32 rather than 16. Compiling this leads to spilling as described above. Interestingly, if one were to change info.x[i]+x, info.y[i]+y to info.x[i], info.y[i], there would be no local stores or loads. It seems that the compiler tries to hold on to the results of the addition if possible and will even brave local memory accesses to do so.
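
Concretely, that variant is the posted kernel with the batch size doubled; the second change mentioned above is indicated as a comment (sketch):

#define BATCH_SIZE 32   // doubled from 16; now spills under the 32-register cap

    for (dimSize_t y = 0; y < ySize; y++) {
        data_t tmp = 0;
#pragma unroll
        for (int i = 0; i < BATCH_SIZE; i++) {
            tmp += tex2D<data_t>(img, info.x[i] + x, info.y[i] + y);
            // with tex2D<data_t>(img, info.x[i], info.y[i]) instead,
            // no local stores or loads are generated
        }
        atomicAdd(out + x + xSize * y, tmp);
    }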

I guess this makes sense. When computation is the main issue, keeping results around for reuse sounds like a good idea. The compiler may not know that this kernel will be dominated by memory access and that recalculating results is less of an issue than storing them away. I don't suppose there is a way to hint to the compiler that it should recalculate rather than use local memory, in this particular case or in general?

How does one usually go about this? Surely I am not the first person faced with the occupancy vs. speed trade-off, and the compiler is probably faced with it all the time. In the production code, the version with the outer loop either uses 72 registers (the version without the loop uses 31), thus crippling occupancy, or it runs a lot slower if I force it to use at most 32. Either way, the looped version takes approximately 3 times longer than the loopless version. (I can't claim the example code above with BATCH_SIZE 32 will show the same slowdown factor, but it should be apparent that recalculating would be faster than spilling in this particular example. Is it? Correct me if I'm wrong.) In my particular case, I can simply switch to the unlooped version. But if that were not available, how would one usually deal with register use and spilling?

Define "crippling". What is the actual measured impact on performance? Code doesn't necessarily need very high occupancy for good performance. How does performance change if you switch to blocks of 256 threads instead of 1024, as I previously recommended? I would expect higher performance with the smaller thread blocks (finer granularity allows a higher percentage of GPU resources to be used). What overall bandwidth is this code consuming in its current state, relative to the available bandwidth?

I have no idea how register use balloons to 72 registers in your actual code, because I have no idea how your actual code differs from the reduced version you have shown, which doesn't suffer from high register pressure. When given register targets, e.g. via launch_bounds(), the compiler will typically increase re-computation to save registers instead of spilling. You can try different launch-bounds settings to find a sweet spot in the occupancy vs. spilling trade-off, e.g. try 48 or 64 registers as the target.

Impossible to tell from the information provided, but it's possible that you are hitting a case where the compiler heuristics fail to find a reasonable solution, in which case you might want to file an enhancement request with NVIDIA (use the bug reporting form and prefix the synopsis with "RFE:" to mark it as an enhancement request). Attach the smallest self-contained example that reproduces the issue.

As in, it goes from nearly 100% to less than 25%. Of course, as you mentioned, that does not necessarily cripple performance.

The measured impact on performance is that the unlooped version takes 7.5 s for 90 kernel calls, while the looped version takes around 20 s. Switching to smaller blocks, as you suggested, reduces this to around 16.5 s. I am unsure what you mean by overall consumed vs. available bandwidth. The L2 cache achieves a bandwidth of 97,000 GB/s, and nvvp claims an L2 cache utilization of ~55% (using 256-thread blocks). For comparison, the unlooped version achieves 280,000 GB/s and a utilization of ~75%. Again, I am unsure what the utilization refers to and what the available bandwidth is.

Neither 48 nor 64 registers per thread got me close to the unlooped execution time.

I just had the idea that there might be a texture locality problem, where the looped version would encounter more cache misses when accessing the textures due to the difference in its access pattern. However, after making sure that both versions are very local in their texture accesses, by using both coordinates modulo BATCH_SIZE, I can confirm that the looped version is still slower.
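
The locality test was along these lines (a sketch of my reading of "modulo BATCH_SIZE"; the rest of the kernel is unchanged):

// Wrap both coordinates into a BATCH_SIZE-wide window so that all texture
// accesses of a thread land close together, for both the looped and the
// unlooped version.
for (dimSize_t y = 0; y < ySize; y++) {
    data_t tmp = 0;
#pragma unroll
    for (int i = 0; i < BATCH_SIZE; i++) {
        tmp += tex2D<data_t>(img,
                             info.x[i] + (x % BATCH_SIZE),
                             info.y[i] + (y % BATCH_SIZE));
    }
    atomicAdd(out + x + xSize * y, tmp);
}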

Ultimately, I concede. I will simply have to not use a loop there.