I took the mul.wide.u64 code for sm_50 and above from a recent thread (https://devtalk.nvidia.com/default/topic/1017754/cuda-programming-and-performance/long-integer-multiplication-mul-wide-u64-and-mul-wide-u128/) and reworked it into a mul.hi.u64 emulation, which is a trivial modification.
I then used this code in the implementation of division of 64-bit unsigned integers, and found speedup of up to 16% when comparing against CUDA 8’s built-in division operator on a Quadro K2200 (sm_50). From looking at the SASS, I noted with interest that CUDA’s 64-bit unsigned division code uses a fastpath / slowpath design, with 64-bit division mapped to 32-bit division where possible, and using an inlined fastpath but a called slowpath (as this is very large, a sensible choice). This can give rise to divergent control flows, so for a fair comparison I duplicated that design choice in the code below.
With all operations going through the fastpath, I measured a 3% speedup. With all operations going through the slowpath I measured a 16% speedup. For a “realistic” mix of operands that can cause either path to be taken I measured 11% speedup. The divergence causes execution time to increase ~25% compared to the exclusive use of the slowpath.
I do not want to spend the time on a detailed analysis of the SASS for the division slowpath in CUDA 8; my current hypothesis is that CUDA 8 employs an insufficiently optimized emulation sequence for mul.hi.u64 on sm_50 and later architectures.
I should note that ensuring the correctness of non-trivial integer division routines is notoriously tricky. While I have tested the code below with billions of test vectors constructed according to multiple different patterns, you would want to double-check before deploying this in any mission-critical software.
[code updated 2017-07-15]
/*
Copyright (c) 2017, Norbert Juffa
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// umul64lo: return the least significant 64 bits of the 128-bit product
// a * b (the same value as the plain C expression a * b on uint64
// operands, but written in PTX so instruction selection is controlled).
// Two code paths:
//  - sm_2x/sm_3x: these GPUs have fast native 32-bit multipliers, so the
//    low half is built from 32x32-bit partial products.
//  - sm_5x+: 32-bit multiplies are emulated there, so the product is
//    built from 16x16->32-bit widening multiplies (mul.wide.u16) of the
//    eight 16-bit input chunks instead.
__device__ __forceinline__
unsigned long long int umul64lo (unsigned long long int a,
unsigned long long int b)
{
unsigned long long int res;
#if (__CUDA_ARCH__ >= 200) && (__CUDA_ARCH__ < 500)
asm ("{\n\t"
".reg .u32 alo, ahi, blo, bhi, rlo, rhi;\n\t"
// unpack source operands into 32-bit halves
"mov.b64 {alo,ahi}, %1;\n\t"
"mov.b64 {blo,bhi}, %2;\n\t"
// generate and sum four partial products; only the terms that touch
// the low 64 bits are formed, and carries beyond bit 63 are discarded
"mul.lo.u32 rlo, alo, blo;\n\t"
"mul.hi.u32 rhi, alo, blo;\n\t"
"mad.lo.u32 rhi, alo, bhi, rhi;\n\t"
"mad.lo.u32 rhi, ahi, blo, rhi;\n\t"
// pack output
"mov.b64 %0, {rlo,rhi};\n\t"
"}"
: "=l"(res)
: "l"(a), "l"(b));
#elif __CUDA_ARCH__ >= 500
// sm_5x+ path: operands are split into 16-bit chunks a0..a3 / b0..b3.
// Chunk products whose index sum is even accumulate into r{0:1}; those
// with an odd index sum accumulate into t{0:1}, which carries weight
// 2^16 and is therefore funnel-shifted left by 16 bits before being
// added to r{0:1}. (r2 and r3 are declared but unused in this
// low-half computation.)
asm ("{\n\t"
".reg .u32 alo, ahi, blo, bhi, r0, r1, r2, r3;\n\t"
".reg .u16 a0, a1, a2, a3, b0, b1, b2, b3;\n\t"
".reg .u32 s0, s1, t0, t1;\n\t"
// unpack source operands
"mov.b64 {alo,ahi}, %1;\n\t"
"mov.b64 {blo,bhi}, %2;\n\t"
"mov.b32 {a0,a1}, alo;\n\t"
"mov.b32 {a2,a3}, ahi;\n\t"
"mov.b32 {b0,b1}, blo;\n\t"
"mov.b32 {b2,b3}, bhi;\n\t"
// first partial product r{0:1} = a0b0 + ((a0b2 + a1b1 + a2b0) << 32)
"mul.wide.u16 r0, a0, b0;\n\t"
"mul.wide.u16 r1, a0, b2;\n\t"
"mul.wide.u16 s1, a1, b1;\n\t"
"add.u32 r1, r1, s1;\n\t"
"mul.wide.u16 s1, a2, b0;\n\t"
"add.u32 r1, r1, s1;\n\t"
// second partial product t{0:1}; carries lost from the plain adds
// into t1 would only affect result bits >= 64 (after the 16-bit
// shift below) and can therefore be dropped safely
"mul.wide.u16 t0, a0, b1;\n\t"
"mul.wide.u16 t1, a0, b3;\n\t"
"mul.wide.u16 s0, a1, b0;\n\t"
"mul.wide.u16 s1, a1, b2;\n\t"
"add.cc.u32 t0, t0, s0;\n\t"
"addc.u32 t1, t1, s1;\n\t"
"mul.wide.u16 s1, a2, b1;\n\t"
"add.u32 t1, t1, s1;\n\t"
"mul.wide.u16 s1, a3, b0;\n\t"
"add.u32 t1, t1, s1;\n\t"
// offset second partial product: {s0,s1} = {t0,t1} << 16 via funnel
// shifts (shf.l.clamp: dest = (hi << 16) | (lo >> 16))
"shf.l.clamp.b32 s1, t0, t1, 16;\n\t"
"shf.l.clamp.b32 s0, 0, t0, 16;\n\t"
// add partial sums
"add.cc.u32 r0, r0, s0;\n\t"
"addc.u32 r1, r1, s1;\n\t"
// pack output
"mov.b64 %0, {r0,r1};\n\t"
"}"
: "=l"(res)
: "l"(a), "l"(b));
#elif __CUDA_ARCH__
#error unsupported __CUDA_ARCH__
#else // avoid warning about uninitialized 'res'
res = 0;
#endif
return res;
}
/* umul64hi: return the most significant 64 bits of the full 128-bit
   product a * b (the same value as CUDA's __umul64hi() intrinsic).

   Two PTX code paths:
   - sm_2x/sm_3x: native fast 32-bit multipliers; the upper product half
     is assembled from 32x32->64-bit partial products via a mad/madc
     carry chain.
   - sm_5x+: 32-bit multiplies are emulated on these parts, so the
     product is built directly from 16x16->32-bit widening multiplies
     (mul.wide.u16) of the eight 16-bit input chunks.

   Changes vs. the previous revision (no behavioral difference):
   - sm_5x path: the chunk products a2*b0 and a3*b1 were computed twice,
     first into t3/t4 (t4 was never read; t3 was overwritten by the
     funnel shift before any read) and then again into t1/t2 (the copies
     actually summed). The redundant first pair and the now-unused t4
     register have been removed.
   - sm_2x path: the low result word r0 (mul.lo.u32 r0, alo, blo) was
     written but never read and set no flags; it has been removed. */
__device__ __forceinline__
unsigned long long int umul64hi (unsigned long long int a,
                                 unsigned long long int b)
{
    unsigned long long int res;
#if (__CUDA_ARCH__ >= 200) && (__CUDA_ARCH__ < 500)
    asm ("{\n\t"
         ".reg .u32 r1, r2, r3, alo, ahi, blo, bhi;\n\t"
         // unpack source operands into 32-bit halves
         "mov.b64 {alo,ahi}, %1;\n\t"
         "mov.b64 {blo,bhi}, %2;\n\t"
         // sum the 32x32->64-bit partial products; words 2 and 3 of the
         // 128-bit product accumulate in r2 and r3 via the carry chain
         "mul.hi.u32 r1, alo, blo;\n\t"
         "mad.lo.cc.u32 r1, alo, bhi, r1;\n\t"
         "madc.hi.u32 r2, alo, bhi, 0;\n\t"
         "mad.lo.cc.u32 r1, ahi, blo, r1;\n\t"
         "madc.hi.cc.u32 r2, ahi, blo, r2;\n\t"
         "madc.hi.u32 r3, ahi, bhi, 0;\n\t"
         "mad.lo.cc.u32 r2, ahi, bhi, r2;\n\t"
         "addc.u32 r3, r3, 0;\n\t"
         // pack words 2 and 3 of the 128-bit product as the result
         "mov.b64 %0, {r2,r3};\n\t"
         "}"
         : "=l"(res)
         : "l"(a), "l"(b));
#elif __CUDA_ARCH__ >= 500
    asm ("{\n\t"
         ".reg .u32 alo, ahi, blo, bhi, r0, r1, r2, r3;\n\t"
         ".reg .u32 s0, s1, s2, s3, t0, t1, t2, t3;\n\t"
         ".reg .u16 a0, a1, a2, a3, b0, b1, b2, b3;\n\t"
         // split inputs into 16-bit chunks
         "mov.b64 {alo,ahi}, %1;\n\t"
         "mov.b64 {blo,bhi}, %2;\n\t"
         "mov.b32 {a0,a1}, alo;\n\t"
         "mov.b32 {a2,a3}, ahi;\n\t"
         "mov.b32 {b0,b1}, blo;\n\t"
         "mov.b32 {b2,b3}, bhi;\n\t"
         // first partial sum (chunk products with even index sum):
         // a3b3.wide a1b3.wide a0b2.wide a0b0.wide
         // 0 a2b2.wide a1b1.wide
         // 0 a3b1.wide a2b0.wide
         "mul.wide.u16 r0, a0, b0;\n\t"
         "mul.wide.u16 r1, a0, b2;\n\t"
         "mul.wide.u16 r2, a1, b3;\n\t"
         "mul.wide.u16 r3, a3, b3;\n\t"
         "mul.wide.u16 t1, a1, b1;\n\t"
         "mul.wide.u16 t2, a2, b2;\n\t"
         "add.cc.u32 r1, r1, t1;\n\t"
         "addc.cc.u32 r2, r2, t2;\n\t"
         "addc.u32 r3, r3, 0;\n\t"
         "mul.wide.u16 t1, a2, b0;\n\t"
         "mul.wide.u16 t2, a3, b1;\n\t"
         "add.cc.u32 r1, r1, t1;\n\t"
         "addc.cc.u32 r2, r2, t2;\n\t"
         "addc.u32 r3, r3, 0;\n\t"
         // second partial sum (chunk products with odd index sum; this
         // sum carries weight 2^16 relative to the first one):
         // a2b3.wide a0b3.wide a0b1.wide
         // a3b2.wide a1b2.wide a1b0.wide
         // 0 a2b1.wide
         // 0 a3b0.wide
         "mul.wide.u16 s0, a0, b1;\n\t"
         "mul.wide.u16 s1, a0, b3;\n\t"
         "mul.wide.u16 s2, a2, b3;\n\t"
         "mul.wide.u16 t1, a2, b1;\n\t"
         "add.cc.u32 s1, s1, t1;\n\t"
         "addc.u32 s2, s2, 0;\n\t"
         "mul.wide.u16 t1, a3, b0;\n\t"
         "add.cc.u32 s1, s1, t1;\n\t"
         "addc.u32 s2, s2, 0;\n\t"
         "mul.wide.u16 t0, a1, b0;\n\t"
         "mul.wide.u16 t1, a1, b2;\n\t"
         "mul.wide.u16 t2, a3, b2;\n\t"
         "add.cc.u32 s0, s0, t0;\n\t"
         "addc.cc.u32 s1, s1, t1;\n\t"
         "addc.cc.u32 s2, s2, t2;\n\t"
         "addc.u32 s3, 0, 0;\n\t"
         // offset second partial sum by 16 bits to the left:
         // t{0:3} = s{0:3} << 16, assembled with funnel shifts
         "shf.l.clamp.b32 t3, s2, s3, 16;\n\t"
         "shf.l.clamp.b32 t2, s1, s2, 16;\n\t"
         "shf.l.clamp.b32 t1, s0, s1, 16;\n\t"
         "shf.l.clamp.b32 t0, 0, s0, 16;\n\t"
         // add first sum in r{0,1,2,3} to second sum in t{0,1,2,3}
         "add.cc.u32 r0, r0, t0;\n\t"
         "addc.cc.u32 r1, r1, t1;\n\t"
         "addc.cc.u32 r2, r2, t2;\n\t"
         "addc.u32 r3, r3, t3;\n\t"
         // pack the upper 64 bits {r2, r3} as the result
         "mov.b64 %0, {r2,r3};\n\t"
         "}"
         : "=l"(res)
         : "l"(a), "l"(b));
#elif __CUDA_ARCH__
#error unsupported __CUDA_ARCH__
#else // avoid warning about uninitialized 'res'
    res = 0;
#endif
    return res;
}
// udiv64_core: slowpath for 64-bit unsigned division, computing the full
// 64/64 -> 64-bit quotient dividend / divisor via a refined reciprocal.
// Marked __noinline__ so this sizeable slowpath becomes a called
// subroutine, mirroring the fastpath/slowpath structure of CUDA's own
// built-in 64-bit division. Division by zero returns ~0ULL (all ones).
__device__ __noinline__
unsigned long long int udiv64_core (unsigned long long int dividend,
unsigned long long int divisor)
{
unsigned long long int temp, quot, rem, recip, neg_divisor = 0ULL - divisor;
float r, s, t;
/* compute initial approximation for reciprocal; must be underestimate! */
t = __ull2float_ru (divisor); /* round up, so the reciprocal is biased low */
asm ("rcp.approx.f32.ftz %0,%1;" : "=f"(r) : "f"(t));
/* subtract 2 ulps to guarantee an underestimate of 1/divisor, then add
   64 to the exponent field (0x20000000 == 64 << 23) to scale by 2**64 */
s = __int_as_float (__float_as_int (r) - 2 + 0x20000000);
recip = (unsigned long long int)s; /* underestimate of 2**64 / divisor */
/* perform Halley iteration with cubic convergence to refine reciprocal */
temp = umul64lo (neg_divisor, recip); /* error term: 2**64 - divisor*recip */
temp = umul64hi (temp, temp) + temp;
recip = umul64hi (recip, temp) + recip;
/* compute preliminary quotient and remainder */
quot = umul64hi (dividend, recip);
rem = dividend - umul64lo (divisor, quot);
/* adjust quotient if too small; quotient off by 2 at most */
if (rem >= divisor) quot += ((rem - divisor) >= divisor) ? 2 : 1;
/* handle division by zero */
if (divisor == 0ULL) quot = ~0ULL;
return quot;
}
// udiv64: 64-bit unsigned division (dividend / divisor) with a
// fastpath / slowpath split — the same design CUDA's built-in 64-bit
// division uses: when both operands fit in 32 bits, divide inline with
// 32-bit division; otherwise call the non-inlined 64-bit slowpath
// udiv64_core. NOTE(review): within a warp the two paths can diverge,
// which increases execution time for mixed operand distributions (see
// the measurements in the accompanying notes).
__device__ __forceinline__
unsigned long long int udiv64 (unsigned long long int dividend,
unsigned long long int divisor)
{
unsigned int combined_hi;
// OR together the high 32-bit halves of both operands; the result is
// zero exactly when both operands are representable in 32 bits
asm ("{\n\t"
".reg .u32 dividend_lo, dividend_hi, divisor_lo, divisor_hi;\n\t"
"mov.b64 {dividend_lo, dividend_hi}, %1;\n\t"
"mov.b64 {divisor_lo, divisor_hi}, %2;\n\t"
"or.b32 %0, dividend_hi, divisor_hi;\n\t"
"}"
: "=r"(combined_hi)
: "l"(dividend), "l"(divisor));
if (combined_hi == 0) {
// fastpath: plain 32-bit division. NOTE(review): a zero divisor
// takes this path, so the result then depends on the behavior of
// 32-bit division by zero rather than the slowpath's ~0ULL —
// confirm this is acceptable for the intended use.
return (unsigned long long int)((unsigned int)(dividend) /
(unsigned int)(divisor));
} else {
// slowpath: full 64-bit division in a called subroutine
return udiv64_core (dividend, divisor);
}
}