|
31 | 31 | #pragma once
|
32 | 32 |
|
33 | 33 | #include <assert.h>
|
34 |
| -#include <stdbool.h> |
35 | 34 | #include <stdint.h>
|
36 | 35 | #include <string.h>
|
37 | 36 |
|
|
47 | 46 | #include <endian.h>
|
48 | 47 | #endif
|
49 | 48 |
|
50 |
| -#if defined(_MSC_VER) |
51 |
| -#include <stdlib.h> |
52 |
| - |
53 |
| -static inline uint32_t rol32(uint32_t x, int r) { |
54 |
| - static_assert(sizeof(uint32_t) == sizeof(unsigned int), "this code assumes 32-bit integers"); |
55 |
| - return _rotl(x, r); |
56 |
| -} |
57 |
| - |
58 |
| -static inline uint64_t rol64(uint64_t x, int r) { |
59 |
| - return _rotl64(x, r); |
60 |
| -} |
61 |
| - |
62 |
| -#else |
63 |
| - |
64 | 49 | static inline uint32_t rol32(uint32_t x, int r) {
|
65 | 50 | return (x << (r & 31)) | (x >> (-r & 31));
|
66 | 51 | }
|
67 | 52 |
|
68 |
| -static inline uint64_t rol64(uint64_t x, int r) { |
69 |
| - return (x << (r & 63)) | (x >> (-r & 63)); |
70 |
| -} |
| 53 | +// Adapted from xxHash's function XXH_mult64to128 without the MSVC code |
| 54 | +// https://github.com/Cyan4973/xxHash/blob/8e5fdcbe70687573265b7154515567ee7ca0645c/xxh3.h#L294 |
| 55 | +// Note: prod_hi must be a valid, non-NULL pointer: it is only checked by assert() and is then written through unconditionally |
| 56 | +static inline uint64_t mul128(uint64_t lhs, uint64_t rhs, uint64_t* prod_hi) { |
| 57 | + assert(NULL != prod_hi); |
| 58 | + |
| 59 | +#if defined(__GNUC__) && !defined(__wasm__) && defined(__SIZEOF_INT128__) \ |
| 60 | + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) |
| 61 | + /* |
| 62 | + * GCC/Clang __uint128_t method. |
| 63 | + * |
| 64 | + * On most 64-bit targets, GCC and Clang define a __uint128_t type. |
| 65 | + * This is usually the best way as it usually uses a native long 64-bit |
| 66 | + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. |
| 67 | + * |
| 68 | + * Usually. |
| 69 | + * |
| 70 | + * On wasm (excluded by the __wasm__ check above), a 32-bit platform, Clang |
| 71 | + * (and emscripten) still define this type even though the target has no |
| 72 | + * native arithmetic for it. This results in a slow compiler builtin call |
| 73 | + * which calculates a full 128-bit multiply. In that case it is best to use the portable method below. |
| 74 | + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 |
| 75 | + */ |
| 76 | + |
| 77 | + __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs; |
| 78 | + *prod_hi = product >> 64; |
| 79 | + return product; |
71 | 80 |
|
| 81 | +#else |
| 82 | + /* |
| 83 | + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. |
| 84 | + * |
| 85 | + * This is a fast and simple grade school multiply, which is shown |
| 86 | + * below with base 10 arithmetic instead of base 0x100000000. |
| 87 | + * |
| 88 | + * 9 3 // D2 lhs = 93 |
| 89 | + * x 7 5 // D2 rhs = 75 |
| 90 | + * ---------- |
| 91 | + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) |
| 92 | + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) |
| 93 | + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) |
| 94 | + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) |
| 95 | + * --------- |
| 96 | + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 |
| 97 | + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 |
| 98 | + * --------- |
| 99 | + * 6 9 7 5 |
| 100 | + * |
| 101 | + * The reasons for adding the products like this are: |
| 102 | + * 1. It avoids manual carry tracking. Just like how |
| 103 | + * (9 * 9) + 9 + 9 = 99, the same applies with this for |
| 104 | + * UINT64_MAX. This avoids a lot of complexity. |
| 105 | + * |
| 106 | + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL |
| 107 | + * instruction available in ARMv6+ A32/T32, which is shown below: |
| 108 | + * |
| 109 | + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) |
| 110 | + * { |
| 111 | + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; |
| 112 | + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); |
| 113 | + * *RdHi = (xxh_u32)(product >> 32); |
| 114 | + * } |
| 115 | + * |
| 116 | + * This instruction was designed for efficient long multiplication, |
| 117 | + * and allows this to be calculated in only 4 instructions which |
| 118 | + * is comparable to some 64-bit ALUs. |
| 119 | + * |
| 120 | + * 3. It isn't terrible on other platforms. Usually this will be |
| 121 | + * a couple of 32-bit ADD/ADCs. |
| 122 | + */ |
| 123 | + |
| 124 | + /* First calculate all of the cross products. */ |
| 125 | + uint64_t const lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF); |
| 126 | + uint64_t const hi_lo = (lhs >> 32 ) * (rhs & 0xFFFFFFFF); |
| 127 | + uint64_t const lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32); |
| 128 | + uint64_t const hi_hi = (lhs >> 32 ) * (rhs >> 32); |
| 129 | + |
| 130 | + /* Now add the products together. These will never overflow. */ |
| 131 | + uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; |
| 132 | + uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; |
| 133 | + uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); |
| 134 | + |
| 135 | + *prod_hi = upper; |
| 136 | + return lower; |
72 | 137 | #endif
|
73 |
| - |
74 |
| -static inline uint64_t hi_dword(uint64_t val) { |
75 |
| - return val >> 32; |
76 |
| -} |
77 |
| - |
78 |
| -static inline uint64_t lo_dword(uint64_t val) { |
79 |
| - return val & 0xFFFFFFFF; |
80 |
| -} |
81 |
| - |
82 |
| -static inline uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) { |
83 |
| - // multiplier = ab = a * 2^32 + b |
84 |
| - // multiplicand = cd = c * 2^32 + d |
85 |
| - // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d |
86 |
| - uint64_t a = hi_dword(multiplier); |
87 |
| - uint64_t b = lo_dword(multiplier); |
88 |
| - uint64_t c = hi_dword(multiplicand); |
89 |
| - uint64_t d = lo_dword(multiplicand); |
90 |
| - |
91 |
| - uint64_t ac = a * c; |
92 |
| - uint64_t ad = a * d; |
93 |
| - uint64_t bc = b * c; |
94 |
| - uint64_t bd = b * d; |
95 |
| - |
96 |
| - uint64_t adbc = ad + bc; |
97 |
| - uint64_t adbc_carry = adbc < ad ? 1 : 0; |
98 |
| - |
99 |
| - // multiplier * multiplicand = product_hi * 2^64 + product_lo |
100 |
| - uint64_t product_lo = bd + (adbc << 32); |
101 |
| - uint64_t product_lo_carry = product_lo < bd ? 1 : 0; |
102 |
| - *product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry; |
103 |
| - assert(ac <= *product_hi); |
104 |
| - |
105 |
| - return product_lo; |
106 |
| -} |
107 |
| - |
108 |
| -static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder) { |
109 |
| - dividend |= ((uint64_t)*remainder) << 32; |
110 |
| - *remainder = dividend % divisor; |
111 |
| - return dividend / divisor; |
112 |
| -} |
113 |
| - |
114 |
| -// Long division with 2^32 base |
115 |
| -static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo) { |
116 |
| - uint64_t dividend_dwords[4]; |
117 |
| - uint32_t remainder = 0; |
118 |
| - |
119 |
| - dividend_dwords[3] = hi_dword(dividend_hi); |
120 |
| - dividend_dwords[2] = lo_dword(dividend_hi); |
121 |
| - dividend_dwords[1] = hi_dword(dividend_lo); |
122 |
| - dividend_dwords[0] = lo_dword(dividend_lo); |
123 |
| - |
124 |
| - *quotient_hi = div_with_reminder(dividend_dwords[3], divisor, &remainder) << 32; |
125 |
| - *quotient_hi |= div_with_reminder(dividend_dwords[2], divisor, &remainder); |
126 |
| - *quotient_lo = div_with_reminder(dividend_dwords[1], divisor, &remainder) << 32; |
127 |
| - *quotient_lo |= div_with_reminder(dividend_dwords[0], divisor, &remainder); |
128 |
| - |
129 |
| - return remainder; |
130 | 138 | }
|
131 | 139 |
|
132 | 140 | // Long division with 2^64 base
|
|
0 commit comments