When parameters are reused multiple times, the compiler optimizes the code to store those parameters in registers and reuse those registers, even if the resulting code exceeds the imposed register limit. This implicitly spills registers to local memory, causing unnecessary reads from and writes to local memory that would otherwise have been single reads (with cache hits) from constant memory. Therefore, this compiler behavior is, or at least can be, a de-optimization.
The following example can be compiled with --gpu-architecture=compute_52 --fmad=false --ftz=true --maxrregcount=32 to produce the described results.
// Number of (x, y) sample-offset pairs carried in each infoBlob_t.
#define BATCH_SIZE 16
typedef float data_t;
typedef unsigned int dimSize_t;
// Passed to the kernel by value: 2 * BATCH_SIZE * sizeof(float) = 128 bytes,
// which shows up as ".param .align 4 .b8 myKernel_param_1[128]" in the PTX.
typedef struct {
float x[BATCH_SIZE];
float y[BATCH_SIZE];
} infoBlob_t;
// Looped variant: 1D launch over x; each thread walks every y row and, per
// row, sums BATCH_SIZE texture samples at the offsets carried in `info`,
// then accumulates the row sum into out[x + xSize*y] with an atomic add.
// `info` is passed by value, so its 128 bytes live in kernel .param space
// (read via ld.param in the PTX below).
// NOTE(review): x is not bounds-checked against xSize — assumes the grid
// exactly covers the output width; confirm at the launch site.
extern "C" __global__ void
__launch_bounds__(1024, 2) //effectively limits registers per thread to 32
myKernel(data_t* out, infoBlob_t info, cudaTextureObject_t img, dimSize_t xSize, dimSize_t ySize)
{
dimSize_t x = threadIdx.x + blockIdx.x * blockDim.x;
for (dimSize_t y = 0; y < ySize; y++) {
data_t tmp = 0;
// fully unrolled: BATCH_SIZE is a compile-time constant
#pragma unroll
for (int i=0;i<BATCH_SIZE;i++) {
tmp += tex2D<data_t>(img, info.x[i]+x, info.y[i]+y);
}
atomicAdd(out + x + xSize*y, tmp);
}
}
This is translated to
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21373419
// Cuda compilation tools, release 8.0, V8.0.55
// Based on LLVM 3.4svn
//
.version 5.0
.target sm_52
.address_size 64
// .globl myKernel
.visible .entry myKernel(
.param .u64 myKernel_param_0,
// the by-value infoBlob_t: 128 bytes of kernel .param space
.param .align 4 .b8 myKernel_param_1[128],
.param .u64 myKernel_param_2,
.param .u32 myKernel_param_3,
.param .u32 myKernel_param_4
)
.maxntid 1024, 1, 1
.minnctapersm 2
{
.reg .pred %p<3>;
.reg .f32 %f<148>;
.reg .b32 %r<16>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [myKernel_param_0];
ld.param.u64 %rd3, [myKernel_param_2];
ld.param.u32 %r3, [myKernel_param_3];
ld.param.u32 %r4, [myKernel_param_4];
mov.u64 %rd1, myKernel_param_1;
// info.y[0..9] (byte offsets 64..100) hoisted into %f1-%f10 even before
// the ySize==0 check; these values stay live for the whole kernel
ld.param.f32 %f1, [%rd1+64];
ld.param.f32 %f2, [%rd1+68];
ld.param.f32 %f3, [%rd1+72];
ld.param.f32 %f4, [%rd1+76];
ld.param.f32 %f5, [%rd1+80];
ld.param.f32 %f6, [%rd1+84];
ld.param.f32 %f7, [%rd1+88];
ld.param.f32 %f8, [%rd1+92];
ld.param.f32 %f9, [%rd1+96];
ld.param.f32 %f10, [%rd1+100];
setp.eq.s32 %p1, %r4, 0;
@%p1 bra BB0_3;
// info.y[10..15] hoisted into %f11-%f16
ld.param.f32 %f11, [%rd1+104];
ld.param.f32 %f12, [%rd1+108];
ld.param.f32 %f13, [%rd1+112];
ld.param.f32 %f14, [%rd1+116];
ld.param.f32 %f15, [%rd1+120];
ld.param.f32 %f16, [%rd1+124];
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r9, %r6, %r7, %r8;
// all 16 info.x[i]+x sums are precomputed into %f17-%f32; together with
// %f1-%f16 that is 32 floats kept live across the entire y loop below --
// the register pressure that forces spills under the 32-register cap
ld.param.f32 %f33, [%rd1+60];
cvt.rn.f32.u32 %f34, %r9;
ld.param.f32 %f35, [%rd1];
add.rn.ftz.f32 %f17, %f34, %f35;
ld.param.f32 %f36, [%rd1+4];
add.rn.ftz.f32 %f18, %f34, %f36;
ld.param.f32 %f37, [%rd1+8];
add.rn.ftz.f32 %f19, %f34, %f37;
ld.param.f32 %f38, [%rd1+12];
add.rn.ftz.f32 %f20, %f34, %f38;
ld.param.f32 %f39, [%rd1+16];
add.rn.ftz.f32 %f21, %f34, %f39;
ld.param.f32 %f40, [%rd1+20];
add.rn.ftz.f32 %f22, %f34, %f40;
ld.param.f32 %f41, [%rd1+24];
add.rn.ftz.f32 %f23, %f34, %f41;
ld.param.f32 %f42, [%rd1+28];
add.rn.ftz.f32 %f24, %f34, %f42;
ld.param.f32 %f43, [%rd1+32];
add.rn.ftz.f32 %f25, %f34, %f43;
ld.param.f32 %f44, [%rd1+36];
add.rn.ftz.f32 %f26, %f34, %f44;
ld.param.f32 %f45, [%rd1+40];
add.rn.ftz.f32 %f27, %f34, %f45;
ld.param.f32 %f46, [%rd1+44];
add.rn.ftz.f32 %f28, %f34, %f46;
ld.param.f32 %f47, [%rd1+48];
add.rn.ftz.f32 %f29, %f34, %f47;
ld.param.f32 %f48, [%rd1+52];
add.rn.ftz.f32 %f30, %f34, %f48;
ld.param.f32 %f49, [%rd1+56];
add.rn.ftz.f32 %f31, %f34, %f49;
add.rn.ftz.f32 %f32, %f34, %f33;
mov.u32 %r15, 0;
cvta.to.global.u64 %rd7, %rd2;
// y loop: per iteration only %f50 (y converted to float) is new; every
// texture coordinate reuses the long-lived %f1-%f32 computed above
BB0_2:
cvt.rn.f32.u32 %f50, %r15;
add.rn.ftz.f32 %f51, %f50, %f1;
tex.2d.v4.f32.f32 {%f52, %f53, %f54, %f55}, [%rd3, {%f17, %f51}];
add.rn.ftz.f32 %f56, %f52, 0f00000000;
add.rn.ftz.f32 %f57, %f50, %f2;
tex.2d.v4.f32.f32 {%f58, %f59, %f60, %f61}, [%rd3, {%f18, %f57}];
add.rn.ftz.f32 %f62, %f56, %f58;
add.rn.ftz.f32 %f63, %f50, %f3;
tex.2d.v4.f32.f32 {%f64, %f65, %f66, %f67}, [%rd3, {%f19, %f63}];
add.rn.ftz.f32 %f68, %f62, %f64;
add.rn.ftz.f32 %f69, %f50, %f4;
tex.2d.v4.f32.f32 {%f70, %f71, %f72, %f73}, [%rd3, {%f20, %f69}];
add.rn.ftz.f32 %f74, %f68, %f70;
add.rn.ftz.f32 %f75, %f50, %f5;
tex.2d.v4.f32.f32 {%f76, %f77, %f78, %f79}, [%rd3, {%f21, %f75}];
add.rn.ftz.f32 %f80, %f74, %f76;
add.rn.ftz.f32 %f81, %f50, %f6;
tex.2d.v4.f32.f32 {%f82, %f83, %f84, %f85}, [%rd3, {%f22, %f81}];
add.rn.ftz.f32 %f86, %f80, %f82;
add.rn.ftz.f32 %f87, %f50, %f7;
tex.2d.v4.f32.f32 {%f88, %f89, %f90, %f91}, [%rd3, {%f23, %f87}];
add.rn.ftz.f32 %f92, %f86, %f88;
add.rn.ftz.f32 %f93, %f50, %f8;
tex.2d.v4.f32.f32 {%f94, %f95, %f96, %f97}, [%rd3, {%f24, %f93}];
add.rn.ftz.f32 %f98, %f92, %f94;
add.rn.ftz.f32 %f99, %f50, %f9;
tex.2d.v4.f32.f32 {%f100, %f101, %f102, %f103}, [%rd3, {%f25, %f99}];
add.rn.ftz.f32 %f104, %f98, %f100;
add.rn.ftz.f32 %f105, %f50, %f10;
tex.2d.v4.f32.f32 {%f106, %f107, %f108, %f109}, [%rd3, {%f26, %f105}];
add.rn.ftz.f32 %f110, %f104, %f106;
add.rn.ftz.f32 %f111, %f50, %f11;
tex.2d.v4.f32.f32 {%f112, %f113, %f114, %f115}, [%rd3, {%f27, %f111}];
add.rn.ftz.f32 %f116, %f110, %f112;
add.rn.ftz.f32 %f117, %f50, %f12;
tex.2d.v4.f32.f32 {%f118, %f119, %f120, %f121}, [%rd3, {%f28, %f117}];
add.rn.ftz.f32 %f122, %f116, %f118;
add.rn.ftz.f32 %f123, %f50, %f13;
tex.2d.v4.f32.f32 {%f124, %f125, %f126, %f127}, [%rd3, {%f29, %f123}];
add.rn.ftz.f32 %f128, %f122, %f124;
add.rn.ftz.f32 %f129, %f50, %f14;
tex.2d.v4.f32.f32 {%f130, %f131, %f132, %f133}, [%rd3, {%f30, %f129}];
add.rn.ftz.f32 %f134, %f128, %f130;
add.rn.ftz.f32 %f135, %f50, %f15;
tex.2d.v4.f32.f32 {%f136, %f137, %f138, %f139}, [%rd3, {%f31, %f135}];
add.rn.ftz.f32 %f140, %f134, %f136;
add.rn.ftz.f32 %f141, %f50, %f16;
tex.2d.v4.f32.f32 {%f142, %f143, %f144, %f145}, [%rd3, {%f32, %f141}];
add.rn.ftz.f32 %f146, %f140, %f142;
// address of out[x + xSize*y] and atomic accumulate
mul.lo.s32 %r10, %r15, %r3;
cvt.u64.u32 %rd4, %r10;
cvt.u64.u32 %rd5, %r9;
add.s64 %rd6, %rd4, %rd5;
shl.b64 %rd8, %rd6, 2;
add.s64 %rd9, %rd7, %rd8;
atom.global.add.f32 %f147, [%rd9], %f146;
add.s32 %r15, %r15, 1;
setp.lt.u32 %p2, %r15, %r4;
@%p2 bra BB0_2;
BB0_3:
ret;
}
The ptx code of this example shows that the parameters are loaded into registers before the loop and kept there for the remainder of the program. However, this exceeds the register count limit. Therefore, some of the registers will be spilled to local memory, introducing large latencies.
Without the loop, the parameters are loaded just in time before the texture access, saving valuable register file space, even without imposing any register limit on the compiler:
// Loop-free variant: a 2D launch replaces the y loop, so each thread
// handles a single (x, y) output cell. In the PTX below, each info member
// is loaded (ld.param.f32) right before the tex instruction that uses it,
// keeping live ranges short instead of pinning all 32 offsets in registers.
// NOTE(review): no __launch_bounds__ here, and x/y are not bounds-checked
// against xSize/ySize — assumes the grid exactly covers the domain.
extern "C" __global__ void
myKernel(data_t* out, infoBlob_t info, cudaTextureObject_t img, dimSize_t xSize, dimSize_t ySize)
{
dimSize_t x = threadIdx.x + blockIdx.x * blockDim.x;
dimSize_t y = threadIdx.y + blockIdx.y * blockDim.y;
data_t tmp = 0;
// fully unrolled: BATCH_SIZE is a compile-time constant
#pragma unroll
for (int i=0;i<BATCH_SIZE;i++) {
tmp += tex2D<data_t>(img, info.x[i]+x, info.y[i]+y);
}
atomicAdd(out + x + xSize*y, tmp);
}
This translates to
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21373419
// Cuda compilation tools, release 8.0, V8.0.55
// Based on LLVM 3.4svn
//
.version 5.0
.target sm_52
.address_size 64
// .globl myKernel
.visible .entry myKernel(
.param .u64 myKernel_param_0,
.param .align 4 .b8 myKernel_param_1[128],
.param .u64 myKernel_param_2,
.param .u32 myKernel_param_3,
.param .u32 myKernel_param_4
)
.maxntid 1024, 1, 1
.minnctapersm 2
{
// same number of virtual f32 registers as the looped version, but the
// live ranges below are short, so far fewer physical registers are needed
.reg .f32 %f<148>;
.reg .b32 %r<11>;
.reg .b64 %rd<10>;
ld.param.u64 %rd1, [myKernel_param_0];
ld.param.u64 %rd2, [myKernel_param_2];
ld.param.u32 %r1, [myKernel_param_3];
mov.u64 %rd3, myKernel_param_1;
mov.u32 %r2, %tid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mad.lo.s32 %r5, %r4, %r3, %r2;
mov.u32 %r6, %tid.y;
mov.u32 %r7, %ctaid.y;
mov.u32 %r8, %ntid.y;
mad.lo.s32 %r9, %r8, %r7, %r6;
cvt.rn.f32.u32 %f1, %r5;
cvt.rn.f32.u32 %f2, %r9;
// just-in-time pattern: each info.x[i]/info.y[i] pair is loaded from
// .param immediately before the tex instruction that consumes it, so
// its live range spans only a handful of instructions
ld.param.f32 %f3, [%rd3+64];
ld.param.f32 %f4, [%rd3];
add.rn.ftz.f32 %f5, %f1, %f4;
add.rn.ftz.f32 %f6, %f2, %f3;
tex.2d.v4.f32.f32 {%f7, %f8, %f9, %f10}, [%rd2, {%f5, %f6}];
ld.param.f32 %f11, [%rd3+4];
ld.param.f32 %f12, [%rd3+68];
add.rn.ftz.f32 %f13, %f7, 0f00000000;
add.rn.ftz.f32 %f14, %f1, %f11;
add.rn.ftz.f32 %f15, %f2, %f12;
tex.2d.v4.f32.f32 {%f16, %f17, %f18, %f19}, [%rd2, {%f14, %f15}];
ld.param.f32 %f20, [%rd3+8];
ld.param.f32 %f21, [%rd3+72];
add.rn.ftz.f32 %f22, %f13, %f16;
add.rn.ftz.f32 %f23, %f1, %f20;
add.rn.ftz.f32 %f24, %f2, %f21;
tex.2d.v4.f32.f32 {%f25, %f26, %f27, %f28}, [%rd2, {%f23, %f24}];
ld.param.f32 %f29, [%rd3+12];
ld.param.f32 %f30, [%rd3+76];
add.rn.ftz.f32 %f31, %f22, %f25;
add.rn.ftz.f32 %f32, %f1, %f29;
add.rn.ftz.f32 %f33, %f2, %f30;
tex.2d.v4.f32.f32 {%f34, %f35, %f36, %f37}, [%rd2, {%f32, %f33}];
ld.param.f32 %f38, [%rd3+16];
ld.param.f32 %f39, [%rd3+80];
add.rn.ftz.f32 %f40, %f31, %f34;
add.rn.ftz.f32 %f41, %f1, %f38;
add.rn.ftz.f32 %f42, %f2, %f39;
tex.2d.v4.f32.f32 {%f43, %f44, %f45, %f46}, [%rd2, {%f41, %f42}];
ld.param.f32 %f47, [%rd3+20];
ld.param.f32 %f48, [%rd3+84];
add.rn.ftz.f32 %f49, %f40, %f43;
add.rn.ftz.f32 %f50, %f1, %f47;
add.rn.ftz.f32 %f51, %f2, %f48;
tex.2d.v4.f32.f32 {%f52, %f53, %f54, %f55}, [%rd2, {%f50, %f51}];
ld.param.f32 %f56, [%rd3+24];
ld.param.f32 %f57, [%rd3+88];
add.rn.ftz.f32 %f58, %f49, %f52;
add.rn.ftz.f32 %f59, %f1, %f56;
add.rn.ftz.f32 %f60, %f2, %f57;
tex.2d.v4.f32.f32 {%f61, %f62, %f63, %f64}, [%rd2, {%f59, %f60}];
ld.param.f32 %f65, [%rd3+28];
ld.param.f32 %f66, [%rd3+92];
add.rn.ftz.f32 %f67, %f58, %f61;
add.rn.ftz.f32 %f68, %f1, %f65;
add.rn.ftz.f32 %f69, %f2, %f66;
tex.2d.v4.f32.f32 {%f70, %f71, %f72, %f73}, [%rd2, {%f68, %f69}];
ld.param.f32 %f74, [%rd3+32];
ld.param.f32 %f75, [%rd3+96];
add.rn.ftz.f32 %f76, %f67, %f70;
add.rn.ftz.f32 %f77, %f1, %f74;
add.rn.ftz.f32 %f78, %f2, %f75;
tex.2d.v4.f32.f32 {%f79, %f80, %f81, %f82}, [%rd2, {%f77, %f78}];
ld.param.f32 %f83, [%rd3+36];
ld.param.f32 %f84, [%rd3+100];
add.rn.ftz.f32 %f85, %f76, %f79;
add.rn.ftz.f32 %f86, %f1, %f83;
add.rn.ftz.f32 %f87, %f2, %f84;
tex.2d.v4.f32.f32 {%f88, %f89, %f90, %f91}, [%rd2, {%f86, %f87}];
ld.param.f32 %f92, [%rd3+40];
ld.param.f32 %f93, [%rd3+104];
add.rn.ftz.f32 %f94, %f85, %f88;
add.rn.ftz.f32 %f95, %f1, %f92;
add.rn.ftz.f32 %f96, %f2, %f93;
tex.2d.v4.f32.f32 {%f97, %f98, %f99, %f100}, [%rd2, {%f95, %f96}];
ld.param.f32 %f101, [%rd3+44];
ld.param.f32 %f102, [%rd3+108];
add.rn.ftz.f32 %f103, %f94, %f97;
add.rn.ftz.f32 %f104, %f1, %f101;
add.rn.ftz.f32 %f105, %f2, %f102;
tex.2d.v4.f32.f32 {%f106, %f107, %f108, %f109}, [%rd2, {%f104, %f105}];
ld.param.f32 %f110, [%rd3+48];
ld.param.f32 %f111, [%rd3+112];
add.rn.ftz.f32 %f112, %f103, %f106;
add.rn.ftz.f32 %f113, %f1, %f110;
add.rn.ftz.f32 %f114, %f2, %f111;
tex.2d.v4.f32.f32 {%f115, %f116, %f117, %f118}, [%rd2, {%f113, %f114}];
ld.param.f32 %f119, [%rd3+52];
ld.param.f32 %f120, [%rd3+116];
add.rn.ftz.f32 %f121, %f112, %f115;
add.rn.ftz.f32 %f122, %f1, %f119;
add.rn.ftz.f32 %f123, %f2, %f120;
tex.2d.v4.f32.f32 {%f124, %f125, %f126, %f127}, [%rd2, {%f122, %f123}];
ld.param.f32 %f128, [%rd3+56];
ld.param.f32 %f129, [%rd3+120];
add.rn.ftz.f32 %f130, %f121, %f124;
add.rn.ftz.f32 %f131, %f1, %f128;
add.rn.ftz.f32 %f132, %f2, %f129;
tex.2d.v4.f32.f32 {%f133, %f134, %f135, %f136}, [%rd2, {%f131, %f132}];
ld.param.f32 %f137, [%rd3+60];
ld.param.f32 %f138, [%rd3+124];
add.rn.ftz.f32 %f139, %f130, %f133;
add.rn.ftz.f32 %f140, %f1, %f137;
add.rn.ftz.f32 %f141, %f2, %f138;
tex.2d.v4.f32.f32 {%f142, %f143, %f144, %f145}, [%rd2, {%f140, %f141}];
add.rn.ftz.f32 %f146, %f139, %f142;
// address of out[x + xSize*y] and atomic accumulate
cvta.to.global.u64 %rd4, %rd1;
cvt.u64.u32 %rd5, %r5;
mul.lo.s32 %r10, %r9, %r1;
cvt.u64.u32 %rd6, %r10;
add.s64 %rd7, %rd6, %rd5;
shl.b64 %rd8, %rd7, 2;
add.s64 %rd9, %rd4, %rd8;
atom.global.add.f32 %f147, [%rd9], %f146;
ret;
}
This is a reduced example derived from actual production code. Introducing the loop increased the execution time of the kernel by a factor of 3. (Interestingly, in this example, I had to add x and y to the texture coordinates in tex2D<data_t>(img, info.x[i]+x, info.y[i]+y);, because otherwise the compiler would preemptively store all parameters in registers, even without the loop. This would, again, spill registers. Quite frankly, I don’t see how the x and y influence that decision.)
All in all, the compiler behavior seems a bit strange here, and it is quite apparent that loading the parameters just in time like in the second case (without the loop) should yield better memory access speed than the solution obtained from compilation. Is this a bug in the compiler? Because I can’t see why it would preserve the parameters in registers only to spill them again.