Skip to content

Commit 4bc4532

Browse files
committed
int-util: faster and more portable mul128
1 parent 09bb370 commit 4bc4532

File tree

5 files changed

+111
-207
lines changed

5 files changed

+111
-207
lines changed

contrib/epee/include/int-util.h

Lines changed: 83 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
#pragma once
3232

3333
#include <assert.h>
34-
#include <stdbool.h>
3534
#include <stdint.h>
3635
#include <string.h>
3736

@@ -47,86 +46,95 @@
4746
#include <endian.h>
4847
#endif
4948

50-
#if defined(_MSC_VER)
51-
#include <stdlib.h>
52-
53-
static inline uint32_t rol32(uint32_t x, int r) {
54-
static_assert(sizeof(uint32_t) == sizeof(unsigned int), "this code assumes 32-bit integers");
55-
return _rotl(x, r);
56-
}
57-
58-
static inline uint64_t rol64(uint64_t x, int r) {
59-
return _rotl64(x, r);
60-
}
61-
62-
#else
63-
6449
// Rotates the 32-bit value x left by r bit positions, with r taken modulo 32.
// A negative r therefore rotates right by -r positions.
static inline uint32_t rol32(uint32_t x, int r) {
  // Reduce the count in unsigned arithmetic: negating a signed int overflows
  // (undefined behaviour) for r == INT_MIN, while unsigned negation is well
  // defined and yields the same count for every other r.
  const unsigned int left = (unsigned int)r & 31u;
  const unsigned int right = (0u - left) & 31u;
  return (x << left) | (x >> right);
}
6752

68-
static inline uint64_t rol64(uint64_t x, int r) {
69-
return (x << (r & 63)) | (x >> (-r & 63));
70-
}
53+
// Full 64x64 -> 128-bit unsigned multiplication.
//
// Adapted from xxHash's function XXH_mult64to128 without the MSVC code
// https://github.com/Cyan4973/xxHash/blob/8e5fdcbe70687573265b7154515567ee7ca0645c/xxh3.h#L294
//
// lhs, rhs: the two 64-bit factors.
// prod_hi:  receives the upper 64 bits of the product. It must be a valid
//           pointer: NULL trips the assert below, and when NDEBUG is defined
//           the store through it is undefined behaviour.
// Returns the lower 64 bits of the product.
static inline uint64_t mul128(uint64_t lhs, uint64_t rhs, uint64_t* prod_hi) {
  assert(NULL != prod_hi);

#if (defined(__GNUC__) && !defined(__wasm__) && defined(__SIZEOF_INT128__)) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  /*
   * GCC/Clang __uint128_t method.
   *
   * On most 64-bit targets, GCC and Clang define a __uint128_t type.
   * This is usually the best way as it usually uses a native long 64-bit
   * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
   *
   * Usually.
   *
   * WebAssembly is the exception excluded by the __wasm__ check above:
   * despite it being a 32-bit platform, Clang (and emscripten) define this
   * type there without having the arithmetic for it. This results in a
   * laggy compiler builtin call which calculates a full 128-bit multiply.
   * In that case it is best to use the portable one.
   * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
   */

  __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
  /* The narrowing conversions are intentional: each keeps one 64-bit half. */
  *prod_hi = (uint64_t)(product >> 64);
  return (uint64_t)product;

#else
  /*
   * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
   *
   * This is a fast and simple grade school multiply, which is shown
   * below with base 10 arithmetic instead of base 0x100000000
   * (D2/D4 mark two- and four-digit values).
   *
   *         9 3 // D2 lhs = 93
   *       x 7 5 // D2 rhs = 75
   *   ----------
   *         1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
   *       4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
   *       2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
   *   + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
   *   ---------
   *       2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
   *   + 6 9 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 69
   *   ---------
   *     6 9 7 5 // D4 res = (69 * 100) + ((27 % 10) * 10) + (15 % 10)
   *
   * The reasons for adding the products like this are:
   *  1. It avoids manual carry tracking. Just like how
   *     (9 * 9) + 9 + 9 = 99, the same applies with this for
   *     UINT64_MAX. This avoids a lot of complexity.
   *
   *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
   *     instruction available in ARMv6+ A32/T32, which is shown below:
   *
   *         void UMAAL(uint32_t *RdLo, uint32_t *RdHi, uint32_t Rn, uint32_t Rm)
   *         {
   *             uint64_t product = (uint64_t)Rn * (uint64_t)Rm + *RdLo + *RdHi;
   *             *RdLo = (uint32_t)(product & 0xFFFFFFFF);
   *             *RdHi = (uint32_t)(product >> 32);
   *         }
   *
   *     This instruction was designed for efficient long multiplication,
   *     and allows this to be calculated in only 4 instructions which
   *     is comparable to some 64-bit ALUs.
   *
   *  3. It isn't terrible on other platforms. Usually this will be
   *     a couple of 32-bit ADD/ADCs.
   */

  /* First calculate all of the cross products. */
  uint64_t const lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
  uint64_t const hi_lo = (lhs >> 32)        * (rhs & 0xFFFFFFFF);
  uint64_t const lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
  uint64_t const hi_hi = (lhs >> 32)        * (rhs >> 32);

  /* Now add the products together. These will never overflow. */
  uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
  uint64_t const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
  uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

  *prod_hi = upper;
  return lower;
#endif
}
131139

132140
// Long divisor with 2^64 base

src/cryptonote_basic/difficulty.cpp

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -48,52 +48,10 @@ namespace cryptonote {
4848
using std::uint64_t;
4949
using std::vector;
5050

51-
#if defined(__x86_64__)
5251
static inline void mul(uint64_t a, uint64_t b, uint64_t &low, uint64_t &high) {
5352
low = mul128(a, b, &high);
5453
}
5554

56-
#else
57-
58-
static inline void mul(uint64_t a, uint64_t b, uint64_t &low, uint64_t &high) {
59-
// __int128 isn't part of the standard, so the previous function wasn't portable. mul128() in Windows is fine,
60-
// but this portable function should be used elsewhere. Credit for this function goes to latexi95.
61-
62-
uint64_t aLow = a & 0xFFFFFFFF;
63-
uint64_t aHigh = a >> 32;
64-
uint64_t bLow = b & 0xFFFFFFFF;
65-
uint64_t bHigh = b >> 32;
66-
67-
uint64_t res = aLow * bLow;
68-
uint64_t lowRes1 = res & 0xFFFFFFFF;
69-
uint64_t carry = res >> 32;
70-
71-
res = aHigh * bLow + carry;
72-
uint64_t highResHigh1 = res >> 32;
73-
uint64_t highResLow1 = res & 0xFFFFFFFF;
74-
75-
res = aLow * bHigh;
76-
uint64_t lowRes2 = res & 0xFFFFFFFF;
77-
carry = res >> 32;
78-
79-
res = aHigh * bHigh + carry;
80-
uint64_t highResHigh2 = res >> 32;
81-
uint64_t highResLow2 = res & 0xFFFFFFFF;
82-
83-
//Addition
84-
85-
uint64_t r = highResLow1 + lowRes2;
86-
carry = r >> 32;
87-
low = (r << 32) | lowRes1;
88-
r = highResHigh1 + highResLow2 + carry;
89-
uint64_t d3 = r & 0xFFFFFFFF;
90-
carry = r >> 32;
91-
r = highResHigh2 + carry;
92-
high = d3 | (r << 32);
93-
}
94-
95-
#endif
96-
9755
// Carry check for unsigned 64-bit addition: returns true when a + b wraps
// around, i.e. the wrapped sum is smaller than the first operand.
static inline bool cadd(uint64_t a, uint64_t b) {
  const uint64_t wrapped_sum = a + b;
  return wrapped_sum < a;
}

tests/performance_tests/check_hash.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,12 @@
3232
#include "int-util.h"
3333
#include "cryptonote_basic/difficulty.h"
3434

35-
template<uint64_t hash_target_high, uint64_t hash_target_low, uint64_t difficulty_high, uint64_t difficulty_low>
35+
template<uint64_t hash_target_high, uint64_t hash_target_low, uint64_t difficulty_high, uint64_t difficulty_low, bool FORCE_128>
3636
class test_check_hash
3737
{
3838
public:
39-
static const size_t loop_count = 100000;
39+
static const size_t loop_count = 2;
40+
static const size_t inner_loop_count = 300000;
4041

4142
bool init()
4243
{
@@ -62,7 +63,17 @@ class test_check_hash
6263

6364
bool test()
6465
{
65-
cryptonote::check_hash_128(hash, difficulty);
66+
for (size_t j = 0; j < inner_loop_count; ++j)
67+
{
68+
if constexpr (FORCE_128)
69+
{
70+
cryptonote::check_hash_128(hash, difficulty);
71+
}
72+
else
73+
{
74+
cryptonote::check_hash(hash, difficulty);
75+
}
76+
}
6677
return true;
6778
}
6879

tests/performance_tests/main.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,20 @@ int main(int argc, char** argv)
187187
TEST_PERFORMANCE4(filter, p, test_check_hash, 1, 0, 0, 1);
188188
TEST_PERFORMANCE4(filter, p, test_check_hash, 0xffffffffffffffff, 0xffffffffffffffff, 0, 1);
189189
TEST_PERFORMANCE4(filter, p, test_check_hash, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff);
190+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0, 1, 0, 1, true);
191+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0, 0xffffffffffffffff, 0, 0xffffffffffffffff, true);
192+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0, 0xffffffffffffffff, 0, 1, true);
193+
TEST_PERFORMANCE5(filter, p, test_check_hash, 1, 0, 1, 0, true);
194+
TEST_PERFORMANCE5(filter, p, test_check_hash, 1, 0, 0, 1, true);
195+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0xffffffffffffffff, 0xffffffffffffffff, 0, 1, true);
196+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, true);
197+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0, 1, 0, 1, false);
198+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0, 0xffffffffffffffff, 0, 0xffffffffffffffff, false);
199+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0, 0xffffffffffffffff, 0, 1, false);
200+
TEST_PERFORMANCE5(filter, p, test_check_hash, 1, 0, 1, 0, false);
201+
TEST_PERFORMANCE5(filter, p, test_check_hash, 1, 0, 0, 1, false);
202+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0xffffffffffffffff, 0xffffffffffffffff, 0, 1, false);
203+
TEST_PERFORMANCE5(filter, p, test_check_hash, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, false);
190204

191205
TEST_PERFORMANCE0(filter, p, test_is_out_to_acc);
192206
TEST_PERFORMANCE0(filter, p, test_is_out_to_acc_precomp);

0 commit comments

Comments
 (0)