Hi all.
Below I have attached the SASS assembly for an sm_10 GPU architecture and the output of ptxas for the same architecture.
As you can see, the assembly code uses three kinds of registers:
R-type: R0 to R10, R124
C-type: C0
A-type: A1 and A2
However, the ptxas output on register usage reports 11 registers.
ptxas info : Compiling entry function ‘_Z9dwtHaar1DPfS_S_jji’ for ‘sm_10’
ptxas info : Used 11 registers, 48+16 bytes smem, 4 bytes cmem[1]
The question is: why doesn’t ptxas take into account the C-type registers, the A-type registers, and R124?
When defining the register usage of a thread, shouldn’t these types of registers (C-type and A-type) be taken into account?
Thanks a lot.
Function : _Z9dwtHaar1DPfS_S_jji
/* Disassembly of one kernel; each line is address, raw encoding, decoded instruction.
   The symbol is an Itanium-ABI mangled name that demangles to
   dwtHaar1D(float*, float*, float*, unsigned int, unsigned int, int).
   Register names that appear below: R0-R10 and R124 (general), A1/A2 (written by R2A and
   used as the base in g [A?+off] operands), C0 (written by ISET.C0, tested by RET/BRA C0.NE).
   NOTE(review): g [...] operands presumably address shared memory, which on sm_1x also
   holds kernel parameters and built-in indices - confirm against the toolchain docs.
   NOTE(review): R124 is only read (at 0x270) and never written in this listing; it
   presumably acts as a constant-zero source rather than an allocated register - confirm. */
/* 0x0000-0x0028: R3 = zero-extended 16-bit g [0x6].U16, R1 = g [0xc]; R4 = R1 * R3 built from
   16-bit half-register multiplies (cross terms accumulated in R2 and shifted left by 16). */
/*0000*/ /*0xa0004c0d04200780*/ I2I.U32.U16 R3, g [0x6].U16;
/*0008*/ /*0x1000d8050423c780*/ MOV R1, g [0xc];
/*0010*/ /*0x4007040900000780*/ IMUL.U16.U16 R2, R1L, R3H;
/*0018*/ /*0x6006060900008780*/ IMAD.U16 R2, R1H, R3L, R2;
/*0020*/ /*0x30100409c4100780*/ SHL R2, R2, 0x10;
/*0028*/ /*0x6006041100008780*/ IMAD.U16 R4, R1L, R3L, R2;
/* 0x0030-0x0078: R5 = zero-extended low half of the entry value of R0
   (presumably the thread index - TODO confirm). Two element indices are formed,
   R5 + 2*R4 and that value + g [0xc]; each is scaled by 4 (SHL 2), added to g [0x4],
   and used as the address of a 32-bit global load (results in R1 and R0). */
/*0030*/ /*0xa000001504000780*/ I2I.U32.U16 R5, R0L;
/*0038*/ /*0x30010801c4100780*/ SHL R0, R4, 0x1;
/*0040*/ /*0x20008a00 */ IADD32 R0, R5, R0;
/*0044*/ /*0x2100f804 */ IADD32 R1, g [0xc], R0;
/*0048*/ /*0x30020001c4100780*/ SHL R0, R0, 0x2;
/*0050*/ /*0x30020209c4100780*/ SHL R2, R1, 0x2;
/*0058*/ /*0x2000c80104200780*/ IADD R0, g [0x4], R0;
/*0060*/ /*0xd00e000580c00780*/ GLD.U32 R1, global14 [R0];
/* R2A loads address register A1 from R5 with a shift operand of 2
   (presumably A1 = R5 << 2, i.e. a byte offset for 4-byte elements - confirm). */
/*0068*/ /*0x00020a05c0000780*/ R2A A1, R5, 0x2;
/*0070*/ /*0x2000c80104208780*/ IADD R0, g [0x4], R2;
/*0078*/ /*0xd00e000180c00780*/ GLD.U32 R0, global14 [R0];
/* 0x0080-0x0098: the two loaded values are stored to g [A1+0x10], first with A1 derived
   from R5, then with A1 derived from R5 + g [0xc]. */
/*0080*/ /*0x2000d80904214780*/ IADD R2, g [0xc], R5;
/*0088*/ /*0x04002001e4204780*/ R2G.U32.U32 g [A1+0x10], R1;
/*0090*/ /*0x00020405c0000780*/ R2A A1, R2, 0x2;
/*0098*/ /*0x04002001e4200780*/ R2G.U32.U32 g [A1+0x10], R0;
/* Barrier instruction (presumably the compiled form of __syncthreads() - confirm). */
/*00a0*/ /*0x861ffe0300000000*/ BAR.ARV.WAIT b0, 0xfff;
/* 0x00a8-0x00c0: A1 is derived from R5 with shift 3; two adjacent entries
   g [A1+0x10] and g [A1+0x11] are read into R2 and R1. R0 = R5 << 1 is kept for the loop below. */
/*00a8*/ /*0x00030a05c0000780*/ R2A A1, R5, 0x3;
/*00b0*/ /*0x30010a01c4100780*/ SHL R0, R5, 0x1;
/*00b8*/ /*0x1400e0090423c780*/ MOV R2, g [A1+0x10];
/*00c0*/ /*0x1400e2050423c780*/ MOV R1, g [A1+0x11];
/*00c8*/ /*0x861ffe0300000000*/ BAR.ARV.WAIT b0, 0xfff;
/* 0x00d0-0x0128: R7 = R2 - R1 and R8 = R2 + R1 (float), each multiplied by the immediate
   0x3f3504f3, which is the IEEE-754 single 0.70710677 (about 1/sqrt(2)).
   The scaled difference is stored to global memory at g [0x6] + 4*(R4 + R5 + g [0xb]);
   the scaled sum is stored to g [A1+0x10] with A1 derived from R5 + (R5 >> 4), shift 2. */
/*00d0*/ /*0x2000081904014780*/ IADD R6, R4, R5;
/*00d8*/ /*0x30040a11ec100780*/ SHR.S32 R4, R5, 0x4;
/*00e0*/ /*0x2106f618 */ IADD32 R6, g [0xb], R6;
/*00e4*/ /*0x20048a10 */ IADD32 R4, R5, R4;
/*00e8*/ /*0xb000041d08004780*/ FADD R7, R2, -R1;
/*00f0*/ /*0x30020c19c4100780*/ SHL R6, R6, 0x2;
/*00f8*/ /*0xb000042100004780*/ FADD R8, R2, R1;
/*0100*/ /*0x00020805c0000780*/ R2A A1, R4, 0x2;
/*0108*/ /*0xc0330e0903f3504f*/ FMUL32I R2, R7, 0x3f3504f3;
/*0110*/ /*0x2000cc0504218780*/ IADD R1, g [0x6], R6;
/*0118*/ /*0xc033101103f3504f*/ FMUL32I R4, R8, 0x3f3504f3;
/*0120*/ /*0xd00e0209a0c00780*/ GST.U32 global14 [R1], R2;
/*0128*/ /*0x04002001e4210780*/ R2G.U32.U32 g [A1+0x10], R4;
/*0130*/ /*0x861ffe0300000000*/ BAR.ARV.WAIT b0, 0xfff;
/* Early exit: C0 is set from an LE compare of g [0xa] with c [0x1] [0x0], then RET C0.NE
   (presumably returns when g [0xa] <= c [0x1] [0x0] - confirm the compare sense). */
/*0138*/ /*0x3080d5fd6460c7c8*/ ISET.C0 o [0x7f], g [0xa], c [0x1] [0x0], LE;
/*0140*/ /*0x3000000300000280*/ RET C0.NE;
/* Loop setup: R6 = 1 (counter compared against g [0xa] at 0x260), R7 = 1,
   R4 = g [0xc] >> 1 (arithmetic shift). */
/*0148*/ /*0x1001801900000003*/ MVI R6, 0x1;
/*0150*/ /*0x1001801d00000003*/ MVI R7, 0x1;
/*0158*/ /*0x3001d811ec300780*/ SHR.S32 R4, g [0xc], 0x1;
/* Loop head (0x160, target of the BRA at 0x268). A GE compare of R5 with R4 guards the body:
   the BRA at 0x170 jumps to 0x248, past the body (presumably when R5 >= R4 - confirm).
   SSY names the same address, 0x248. */
/*0160*/ /*0x30040bfd640187c8*/ ISET.C0 o [0x7f], R5, R4, GE;
/*0168*/ /*0xa004900300000000*/ SSY 0x248;
/*0170*/ /*0x1004900300000280*/ BRA C0.NE, 0x248;
/* Loop body, 0x178-0x240. Several independent computations are interleaved:
   - R1 = zero-extended g [0x4].U16 + R3, then R2 = R4 * R1 via 16-bit half multiplies
     (0x18c, 0x198, 0x1b0, 0x1c8), and R8 = R2 + R5 is the global store index;
   - A1 is derived from t + (t >> 4) with t = R0 + R7, A2 from R0 + (R0 >> 4), both with shift 2;
   - g [A2+0x10] - g [A1+0x10] and g [A2+0x10] + g [A1+0x10] are each scaled by 0x3f3504f3;
     the difference goes to global memory at g [0x6] + 4*R8, the sum back to g [A2+0x10]. */
/*0178*/ /*0xa000480504200780*/ I2I.U32.U16 R1, g [0x4].U16;
/*0180*/ /*0x200002050400c780*/ IADD R1, R1, R3;
/*0188*/ /*0x20078008 */ IADD32 R2, R0, R7;
/*018c*/ /*0x40031020 */ IMUL32.U16.U16 R8, R4L, R1H;
/*0190*/ /*0x30040429e4100780*/ SHR R10, R2, 0x4;
/*0198*/ /*0x6002122500020780*/ IMAD.U16 R9, R4H, R1L, R8;
/*01a0*/ /*0x30040021e4100780*/ SHR R8, R0, 0x4;
/*01a8*/ /*0x2000042904028780*/ IADD R10, R2, R10;
/*01b0*/ /*0x30101209c4100780*/ SHL R2, R9, 0x10;
/*01b8*/ /*0x2000102104000780*/ IADD R8, R8, R0;
/*01c0*/ /*0x00021405c0000780*/ R2A A1, R10, 0x2;
/*01c8*/ /*0x6002100900008780*/ IMAD.U16 R2, R4L, R1L, R2;
/*01d0*/ /*0x00021009c0000780*/ R2A A2, R8, 0x2;
/*01d8*/ /*0x1400e0050423c780*/ MOV R1, g [A1+0x10];
/*01e0*/ /*0x2000042104014780*/ IADD R8, R2, R5;
/*01e8*/ /*0x1400e0090423c780*/ MOV R2, g [A1+0x10];
/*01f0*/ /*0xb800e02508204780*/ FADD R9, g [A2+0x10], -R1;
/*01f8*/ /*0x30021005c4100780*/ SHL R1, R8, 0x2;
/*0200*/ /*0xb800e02100208780*/ FADD R8, g [A2+0x10], R2;
/*0208*/ /*0xc033120903f3504f*/ FMUL32I R2, R9, 0x3f3504f3;
/*0210*/ /*0x2000cc0504204780*/ IADD R1, g [0x6], R1;
/*0218*/ /*0xc033102103f3504f*/ FMUL32I R8, R8, 0x3f3504f3;
/*0220*/ /*0xd00e0209a0c00780*/ GST.U32 global14 [R1], R2;
/*0228*/ /*0x08002001e4220780*/ R2G.U32.U32 g [A2+0x10], R8;
/* Per-iteration updates: R7 and R0 are doubled, R4 is halved. */
/*0230*/ /*0x30010e1dc4100780*/ SHL R7, R7, 0x1;
/*0238*/ /*0x30010001c4100780*/ SHL R0, R0, 0x1;
/*0240*/ /*0x30010811e4100780*/ SHR R4, R4, 0x1;
/* 0x248 is the address named by the SSY and BRA above; execution that skipped the body
   resumes here (presumably the reconvergence point for divergent threads - confirm). */
/*0248*/ /*0xf0000001e0000002*/ NOP.S;
/*0250*/ /*0x861ffe0300000000*/ BAR.ARV.WAIT b0, 0xfff;
/* Loop latch: R6 += 1, then branch back to 0x160 based on an NE compare of g [0xa] with R6. */
/*0258*/ /*0x20018c1900000003*/ IADD32I R6, R6, 0x1;
/*0260*/ /*0x3006d5fd642147c8*/ ISET.C0 o [0x7f], g [0xa], R6, NE;
/*0268*/ /*0x1002c00300000280*/ BRA C0.NE, 0x160;
/* Epilogue: NE compare of R5 with R124 followed by RET C0.NE, so the final store is
   presumably executed only when R5 == R124 (see the hedged R124 note in the header comment). */
/*0270*/ /*0x307c0bfd6c0147c8*/ ISET.S32.C0 o [0x7f], R5, R124, NE;
/*0278*/ /*0x3000000300000280*/ RET C0.NE;
/* Final store: g [0x10] is written to global memory at g [0x8] + 4*R3. */
/*0280*/ /*0x30020605c4100780*/ SHL R1, R3, 0x2;
/*0288*/ /*0x1000e0010423c780*/ MOV R0, g [0x10];
/*0290*/ /*0x2000d00504204780*/ IADD R1, g [0x8], R1;
/*0298*/ /*0xd00e0201a0c00781*/ GST.U32 global14 [R1], R0;