Hello,
I am new to CUDA. After reading the PTX ISA doc I found there is a separate MAD instruction for integer values. But when I use something like this:
a = b * c + d in my C for CUDA kernel, it is translated to a sequence of multiply and add instructions in the PTX output. Is there a way to tell the compiler to translate this to a MAD instruction?
Hello,
I am new to CUDA. After reading the PTX ISA doc I found there is a separate MAD instruction for integer values. But when I use something like this:
a = b * c + d in my C for CUDA kernel, it is translated to a sequence of multiply and add instructions in the PTX output. Is there a way to tell the compiler to translate this to a MAD instruction?
Check the actual cubin for the real instructions the GPU executes. The ptxas intermediate output is further compiled to assembly, and the MADD optimization might be done there instead. You’ll need to use decuda or cuobjdump.
Check the actual cubin for the real instructions the GPU executes. The ptxas intermediate output is further compiled to assembly, and the MADD optimization might be done there instead. You’ll need to use decuda or cuobjdump.
You can use the inline asm directive, but it is not documented, not supported, and not recommended by the Nvidia folks.
[codebox]# cat hw.cu
global void fun(int * mem)
{
int a = 3;
int b = 5;
int d;
asm("\
mad.lo.s32 %0, %1, %2, %3;"
: "=r"(d) : "r"(*mem), "r"(a), "r"(b) : );
*mem = d;
}
int main()
{
int h = 1;
int * d;
cudaMalloc(&d, sizeof(int));
cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);
fun<<<1,1>>>(d);
cudaThreadSynchronize();
int rv = cudaGetLastError();
cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Result = " << h << "\n";
return 0;
}
# COMPILE AND BUILD WITH -KEEP OPTION …
cat hw.compute_20.ptx
.version 2.2
.target sm_20
// compiled with C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\\bin/../open64/lib//be.exe
// nvopencc 3.2 built on 2010-09-09
//-----------------------------------------------------------
// Compiling hw.compute_20.cpp3.i (C:/Users/Ken/AppData/Local/Temp/ccBI#.a03380)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:32
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
/// (REMOVED .FILE LINES)
.entry _Z3funPi (
.param .u32 __cudaparm__Z3funPi_mem)
{
.reg .u32 %r<11>;
.loc 28 3 0
$LDWbegin__Z3funPi:
.loc 28 8 0
ld.param.u32 %r1, [__cudaparm__Z3funPi_mem];
ldu.global.s32 %r2, [%r1+0];
mov.u32 %r3, %r2;
mov.s32 %r4, 3;
mov.u32 %r5, %r4;
mov.s32 %r6, 5;
mov.u32 %r7, %r6;
mad.lo.s32 %r8, %r3, %r5, %r7;
mov.s32 %r9, %r8;
.loc 28 11 0
st.global.s32 [%r1+0], %r9;
.loc 28 12 0
exit;
$LDWend__Z3funPi:
} // _Z3funPi
ptxas -arch sm_20 hw.compute_20.ptx
nvdis elf.o
ELF File…
00000000: 2800440400005de4 mov b32 $r1 c1[0x100]
00000008: 2800400080009de4 mov b32 $r2 c0[0x20]
00000010: 180000001400dde2 mov b32 $r3 0x5
00000018: 8800000000201c85 ldu b32 $r0 g[$r2+0]
00000020: 2006c0000c001ca3 add $r0 mul s32 $r0 0x3 $r3 <<<<
00000028: 9000000000201c85 st b32 wb g[$r2+0] $r0
00000030: 8000000000001de7 exit
#[/codebox]
You can use the inline asm directive, but it is not documented, not supported, and not recommended by the Nvidia folks.
[codebox]# cat hw.cu
global void fun(int * mem)
{
int a = 3;
int b = 5;
int d;
asm("\
mad.lo.s32 %0, %1, %2, %3;"
: "=r"(d) : "r"(*mem), "r"(a), "r"(b) : );
*mem = d;
}
int main()
{
int h = 1;
int * d;
cudaMalloc(&d, sizeof(int));
cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);
fun<<<1,1>>>(d);
cudaThreadSynchronize();
int rv = cudaGetLastError();
cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Result = " << h << "\n";
return 0;
}
# COMPILE AND BUILD WITH -KEEP OPTION …
cat hw.compute_20.ptx
.version 2.2
.target sm_20
// compiled with C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\\bin/../open64/lib//be.exe
// nvopencc 3.2 built on 2010-09-09
//-----------------------------------------------------------
// Compiling hw.compute_20.cpp3.i (C:/Users/Ken/AppData/Local/Temp/ccBI#.a03380)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:32
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
/// (REMOVED .FILE LINES)
.entry _Z3funPi (
.param .u32 __cudaparm__Z3funPi_mem)
{
.reg .u32 %r<11>;
.loc 28 3 0
$LDWbegin__Z3funPi:
.loc 28 8 0
ld.param.u32 %r1, [__cudaparm__Z3funPi_mem];
ldu.global.s32 %r2, [%r1+0];
mov.u32 %r3, %r2;
mov.s32 %r4, 3;
mov.u32 %r5, %r4;
mov.s32 %r6, 5;
mov.u32 %r7, %r6;
mad.lo.s32 %r8, %r3, %r5, %r7;
mov.s32 %r9, %r8;
.loc 28 11 0
st.global.s32 [%r1+0], %r9;
.loc 28 12 0
exit;
$LDWend__Z3funPi:
} // _Z3funPi
ptxas -arch sm_20 hw.compute_20.ptx
nvdis elf.o
ELF File…
00000000: 2800440400005de4 mov b32 $r1 c1[0x100]
00000008: 2800400080009de4 mov b32 $r2 c0[0x20]
00000010: 180000001400dde2 mov b32 $r3 0x5
00000018: 8800000000201c85 ldu b32 $r0 g[$r2+0]
00000020: 2006c0000c001ca3 add $r0 mul s32 $r0 0x3 $r3 <<<<
00000028: 9000000000201c85 st b32 wb g[$r2+0] $r0
00000030: 8000000000001de7 exit
#[/codebox]
There is no integer MAD in hardware, so it looks like the compiler does not even bother to use imad in PTX, as it will anyway be converted to a sequence of multiplies (even 24-bit ones for pre-Fermi GPUs) and additions.
There is no integer MAD in hardware, so it looks like the compiler does not even bother to use imad in PTX, as it will anyway be converted to a sequence of multiplies (even 24-bit ones for pre-Fermi GPUs) and additions.
thanks for the explanation. After reading more stuff about CUDA HW and experimenting with simple kernels I came to the same conclusion.
to kaberdude: the asm directive is a nifty trick :)
thanks for the explanation. After reading more stuff about CUDA HW and experimenting with simple kernels I came to the same conclusion.
to kaberdude: the asm directive is a nifty trick :)
Hi,
The translation really depends on your target architecture. PTX MAD.LO.S32 turns into multiple machine code instructions for sm_10 to sm_13. Kind of confusing, because NVIDIA’s CUOBJDUMP shows a couple of IMAD machine code instructions. But it isn’t used the way I initially thought: they are just used to perform a 32-bit integer multiplication. For sm_20, however, it translates to a single integer MAD machine code instruction, or so NVDIS says.
Ken
[codebox]# cat hw.cu
cat hw.cu
global void fun(int * mem)
{
int a = *mem;
int b = *mem;
int d = *mem;
asm("\
mad.lo.s32 %0, %1, %2, %3;"
: "=r"(d) : "r"(d), "r"(a), "r"(b) : );
*mem = d;
}
int main()
{
int h = 1;
int * d;
cudaMalloc(&d, sizeof(int));
cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);
fun<<<1,1>>>(d);
cudaThreadSynchronize();
int rv = cudaGetLastError();
cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Result = " << h << "\n";
return 0;
}
cat hw.compute_10.ptx
.version 1.4
.target sm_10, map_f64_to_f32
// LINES REMOVED
.entry _Z3funPi (
.param .u32 __cudaparm__Z3funPi_mem)
{
.reg .u32 %r<9>;
.loc 28 3 0
$LDWbegin__Z3funPi:
.loc 28 8 0
ld.param.u32 %r1, [__cudaparm__Z3funPi_mem];
ld.global.s32 %r2, [%r1+0];
mov.u32 %r3, %r2;
mov.u32 %r4, %r2;
mov.u32 %r5, %r2;
mad.lo.s32 %r6, %r3, %r4, %r5;
mov.s32 %r7, %r6;
ld.param.u32 %r1, [__cudaparm__Z3funPi_mem];
.loc 28 11 0
st.global.s32 [%r1+0], %r7;
.loc 28 12 0
exit;
$LDWend__Z3funPi:
} // _Z3funPi
cat hw.compute_20.ptx
.version 2.2
.target sm_20
// LINES REMOVED
.entry _Z3funPi (
.param .u32 __cudaparm__Z3funPi_mem)
{
.reg .u32 %r<9>;
.loc 28 3 0
$LDWbegin__Z3funPi:
.loc 28 8 0
ld.param.u32 %r1, [__cudaparm__Z3funPi_mem];
ldu.global.s32 %r2, [%r1+0];
mov.u32 %r3, %r2;
mov.u32 %r4, %r2;
mov.u32 %r5, %r2;
mad.lo.s32 %r6, %r3, %r4, %r5;
mov.s32 %r7, %r6;
.loc 28 11 0
st.global.s32 [%r1+0], %r7;
.loc 28 12 0
exit;
$LDWend__Z3funPi:
} // _Z3funPi
ptxas -arch sm_10 hw.compute_10.ptx
nvdis elf.o
ELF File…
00000000: 0423c7801000c801 mov b32 $r0 b32 s[0x10]
00000008: 80c00780d00e0001 ld b32 $r0 g14[$r0]
00000010: 0000078040010005 mul $r1 u16 $r0l u16 $r0h
00000018: 0000478060000205 add $r1 mul u16 $r0h $r0l $r1 <<<<<< IMAD
00000020: c410078030100205 shl b32 $r1 $r1 0x10
00000028: 0000478060000005 add $r1 mul u16 $r0l $r0l $r1 <<< IMAD
00000030: 20008204 add b32 $r1 $r1 $r0
00000034: 1100e800 mov b32 $r0 b32 s[0x10]
00000038: a0c00781d00e0005 st b32 g14[$r0] $r1
exit
ptxas -arch sm_20 hw.compute_20.ptx
nvdis elf.o
ELF File…
00000000: 2800440400005de4 mov b32 $r1 c1[0x100]
00000008: 2800400080009de4 mov b32 $r2 c0[0x20]
00000010: 8800000000201c85 ldu b32 $r0 g[$r2+0]
00000018: 2000000000001ca3 add $r0 mul s32 $r0 $r0 $r0 <<<<<<<<<<<<<<<<< INTEGER MAD
00000020: 9000000000201c85 st b32 wb g[$r2+0] $r0
00000028: 8000000000001de7 exit
# (RECOMPILE FOR ONLY SM_10. CUOBJDUMP DOES NOT WORK WITH MULTIPLE TARGETS.)
cuobjdump -sass Debug/hw.exe
c:/Personal/tem/cuda-waste/test/hw/hw.cu:
========================================
Version = 0x00000004
gpuInfoVersion = 0xa14f518d
key = 5585e27e8bf2d4b8
usageMode = -maxrregcount=32
debuggable = no
ptx = compute_10
code for sm_10
--------------
Function : _Z3funPi
/*0000*/ MOV R0, g [0x4];
/*0008*/ GLD.U32 R0, global14 [R0];
/*0010*/ IMUL.U16.U16 R1, R0L, R0H;
/*0018*/ IMAD.U16 R1, R0H, R0L, R1;
/*0020*/ SHL R1, R1, 0x10;
/*0028*/ IMAD.U16 R1, R0L, R0L, R1;
/*0030*/ IADD32 R1, R1, R0;
/*0034*/ MOV32 R0, g [0x4];
/*0038*/ GST.U32 global14 [R0], R1;
...................
[/codebox]
Hi,
The translation really depends on your target architecture. PTX MAD.LO.S32 turns into multiple machine code instructions for sm_10 to sm_13. Kind of confusing, because NVIDIA’s CUOBJDUMP shows a couple of IMAD machine code instructions. But it isn’t used the way I initially thought: they are just used to perform a 32-bit integer multiplication. For sm_20, however, it translates to a single integer MAD machine code instruction, or so NVDIS says.
Ken
[codebox]# cat hw.cu
cat hw.cu
global void fun(int * mem)
{
int a = *mem;
int b = *mem;
int d = *mem;
asm("\
mad.lo.s32 %0, %1, %2, %3;"
: "=r"(d) : "r"(d), "r"(a), "r"(b) : );
*mem = d;
}
int main()
{
int h = 1;
int * d;
cudaMalloc(&d, sizeof(int));
cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);
fun<<<1,1>>>(d);
cudaThreadSynchronize();
int rv = cudaGetLastError();
cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Result = " << h << "\n";
return 0;
}
cat hw.compute_10.ptx
.version 1.4
.target sm_10, map_f64_to_f32
// LINES REMOVED
.entry _Z3funPi (
.param .u32 __cudaparm__Z3funPi_mem)
{
.reg .u32 %r<9>;
.loc 28 3 0
$LDWbegin__Z3funPi:
.loc 28 8 0
ld.param.u32 %r1, [__cudaparm__Z3funPi_mem];
ld.global.s32 %r2, [%r1+0];
mov.u32 %r3, %r2;
mov.u32 %r4, %r2;
mov.u32 %r5, %r2;
mad.lo.s32 %r6, %r3, %r4, %r5;
mov.s32 %r7, %r6;
ld.param.u32 %r1, [__cudaparm__Z3funPi_mem];
.loc 28 11 0
st.global.s32 [%r1+0], %r7;
.loc 28 12 0
exit;
$LDWend__Z3funPi:
} // _Z3funPi
cat hw.compute_20.ptx
.version 2.2
.target sm_20
// LINES REMOVED
.entry _Z3funPi (
.param .u32 __cudaparm__Z3funPi_mem)
{
.reg .u32 %r<9>;
.loc 28 3 0
$LDWbegin__Z3funPi:
.loc 28 8 0
ld.param.u32 %r1, [__cudaparm__Z3funPi_mem];
ldu.global.s32 %r2, [%r1+0];
mov.u32 %r3, %r2;
mov.u32 %r4, %r2;
mov.u32 %r5, %r2;
mad.lo.s32 %r6, %r3, %r4, %r5;
mov.s32 %r7, %r6;
.loc 28 11 0
st.global.s32 [%r1+0], %r7;
.loc 28 12 0
exit;
$LDWend__Z3funPi:
} // _Z3funPi
ptxas -arch sm_10 hw.compute_10.ptx
nvdis elf.o
ELF File…
00000000: 0423c7801000c801 mov b32 $r0 b32 s[0x10]
00000008: 80c00780d00e0001 ld b32 $r0 g14[$r0]
00000010: 0000078040010005 mul $r1 u16 $r0l u16 $r0h
00000018: 0000478060000205 add $r1 mul u16 $r0h $r0l $r1 <<<<<< IMAD
00000020: c410078030100205 shl b32 $r1 $r1 0x10
00000028: 0000478060000005 add $r1 mul u16 $r0l $r0l $r1 <<< IMAD
00000030: 20008204 add b32 $r1 $r1 $r0
00000034: 1100e800 mov b32 $r0 b32 s[0x10]
00000038: a0c00781d00e0005 st b32 g14[$r0] $r1
exit
ptxas -arch sm_20 hw.compute_20.ptx
nvdis elf.o
ELF File…
00000000: 2800440400005de4 mov b32 $r1 c1[0x100]
00000008: 2800400080009de4 mov b32 $r2 c0[0x20]
00000010: 8800000000201c85 ldu b32 $r0 g[$r2+0]
00000018: 2000000000001ca3 add $r0 mul s32 $r0 $r0 $r0 <<<<<<<<<<<<<<<<< INTEGER MAD
00000020: 9000000000201c85 st b32 wb g[$r2+0] $r0
00000028: 8000000000001de7 exit
# (RECOMPILE FOR ONLY SM_10. CUOBJDUMP DOES NOT WORK WITH MULTIPLE TARGETS.)
cuobjdump -sass Debug/hw.exe
c:/Personal/tem/cuda-waste/test/hw/hw.cu:
========================================
Version = 0x00000004
gpuInfoVersion = 0xa14f518d
key = 5585e27e8bf2d4b8
usageMode = -maxrregcount=32
debuggable = no
ptx = compute_10
code for sm_10
--------------
Function : _Z3funPi
/*0000*/ MOV R0, g [0x4];
/*0008*/ GLD.U32 R0, global14 [R0];
/*0010*/ IMUL.U16.U16 R1, R0L, R0H;
/*0018*/ IMAD.U16 R1, R0H, R0L, R1;
/*0020*/ SHL R1, R1, 0x10;
/*0028*/ IMAD.U16 R1, R0L, R0L, R1;
/*0030*/ IADD32 R1, R1, R0;
/*0034*/ MOV32 R0, g [0x4];
/*0038*/ GST.U32 global14 [R0], R1;
...................
[/codebox]