@@ -38,24 +38,49 @@ namespace cp_algo {
38
38
};
39
39
}
40
40
41
- [[gnu::always_inline]] inline u64x4 montgomery_reduce (u64x4 x, u64x4 mod, u64x4 imod) {
42
- auto x_ninv = u64x4 (u32x8 (x) * u32x8 (imod));
41
+ [[gnu::always_inline]] inline u64x4 montgomery_reduce (u64x4 x, uint32_t mod, uint32_t imod) {
42
+ auto x_ninv = u64x4 (u32x8 (x) * ( u32x8 () + imod));
43
43
#ifdef __AVX2__
44
- x += u64x4 (_mm256_mul_epu32 (__m256i (x_ninv), __m256i (mod) ));
44
+ x += u64x4 (_mm256_mul_epu32 (__m256i (x_ninv), __m256i () + mod ));
45
45
#else
46
46
x += x_ninv * mod;
47
47
#endif
48
48
return x >> 32 ;
49
49
}
50
50
51
- [[gnu::always_inline]] inline u64x4 montgomery_mul (u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {
51
+ [[gnu::always_inline]] inline u64x4 montgomery_mul (u64x4 x, u64x4 y, uint32_t mod, uint32_t imod) {
52
52
#ifdef __AVX2__
53
53
return montgomery_reduce (u64x4 (_mm256_mul_epu32 (__m256i (x), __m256i (y))), mod, imod);
54
54
#else
55
55
return montgomery_reduce (x * y, mod, imod);
56
56
#endif
57
57
}
58
58
59
+ u32x8 montgomery_mul (u32x8 x, u32x8 y, uint32_t mod, uint32_t imod) {
60
+ auto x0246 = u64x4 (x) & uint32_t (-1 );
61
+ auto y0246 = u64x4 (y) & uint32_t (-1 );
62
+ auto x1357 = u64x4 (x) >> 32 ;
63
+ auto y1357 = u64x4 (y) >> 32 ;
64
+ #ifdef __AVX2__
65
+ auto xy0246 = u64x4 (_mm256_mul_epu32 (__m256i (x0246), __m256i (y0246)));
66
+ auto xy1357 = u64x4 (_mm256_mul_epu32 (__m256i (x1357), __m256i (y1357)));
67
+ #else
68
+ u64x4 xy0246 = x0246 * y0246;
69
+ u64x4 xy1357 = x1357 * y1357;
70
+ #endif
71
+ auto xy_inv = u64x4 (u32x8 (xy0246 | (xy1357 << 32 )) * (u32x8 () + imod));
72
+ auto xy_inv0246 = xy_inv & uint32_t (-1 );
73
+ auto xy_inv1357 = xy_inv >> 32 ;
74
+ #ifdef __AVX2__
75
+ xy0246 += u64x4 (_mm256_mul_epu32 (__m256i (xy_inv0246), __m256i () + mod));
76
+ xy1357 += u64x4 (_mm256_mul_epu32 (__m256i (xy_inv1357), __m256i () + mod));
77
+ #else
78
+ xy0246 += xy_inv0246 * mod;
79
+ xy1357 += xy_inv1357 * mod;
80
+ #endif
81
+ return u32x8 ((xy0246 >> 32 ) | (xy1357 & -1ULL << 32 ));
82
+ }
83
+
59
84
[[gnu::always_inline]] inline dx4 rotate_right (dx4 x) {
60
85
static constexpr u64x4 shuffler = {3 , 0 , 1 , 2 };
61
86
return __builtin_shuffle (x, shuffler);
0 commit comments