@@ -38,36 +38,65 @@ FFT_Processor_Spqlios::FFT_Processor_Spqlios(const int32_t N) : _2N(2 * N), N(N)
38
38
}
39
39
40
40
void FFT_Processor_Spqlios::execute_reverse_uint (double *res, const uint32_t *a) {
41
- // for (int32_t i=0; i<N; i++) real_inout_rev[i]=(double)a[i];
41
+ # ifdef USE_AVX512
42
42
{
43
43
double *dst = res;
44
- // double *dst = real_inout_rev;
45
44
const uint32_t *ait = a;
46
45
const uint32_t *aend = a + N;
46
+ // __asm__ __volatile__ (
47
+ // "0:\n"
48
+ // "vmovupd (%1),%%xmm0\n"
49
+ // "vcvtudq2pd %%xmm0,%%ymm1\n"
50
+ // "vmovapd %%ymm1,(%0)\n"
51
+ // "addq $16,%1\n"
52
+ // "addq $32,%0\n"
53
+ // "cmpq %2,%1\n"
54
+ // "jb 0b\n"
55
+ // : "=r"(dst), "=r"(ait), "=r"(aend)
56
+ // : "0"(dst), "1"(ait), "2"(aend)
57
+ // : "%xmm0", "%ymm1", "memory"
58
+ // );
47
59
__asm__ __volatile__ (
48
60
" 0:\n "
49
- " vmovupd (%1),%%xmm0 \n "
50
- " vcvtudq2pd %%xmm0 ,%%ymm1 \n "
61
+ " vmovupd (%1),%%ymm0 \n "
62
+ " vcvtudq2pd %%ymm0 ,%%zmm1 \n "
51
63
" vmovapd %%ymm1,(%0)\n "
52
- " addq $16 ,%1\n "
53
- " addq $32 ,%0\n "
64
+ " addq $32 ,%1\n "
65
+ " addq $64 ,%0\n "
54
66
" cmpq %2,%1\n "
55
67
" jb 0b\n "
56
68
: " =r" (dst), " =r" (ait), " =r" (aend)
57
69
: " 0" (dst), " 1" (ait), " 2" (aend)
58
- : " %xmm0 " , " %ymm1 " , " memory"
70
+ : " %ymm0 " , " %zmm1 " , " memory"
59
71
);
60
72
}
73
+ #else
74
+ for (int32_t i=0 ; i<N; i++) res[i]=(double )a[i];
75
+ #endif
61
76
ifft (tables_reverse, res);
62
77
}
63
78
64
79
void FFT_Processor_Spqlios::execute_reverse_int (double *res, const int32_t *a) {
65
80
// for (int32_t i=0; i<N; i++) real_inout_rev[i]=(double)a[i];
66
81
{
67
82
double *dst = res;
68
- // double *dst = real_inout_rev;
69
83
const int32_t *ait = a;
70
84
const int32_t *aend = a + N;
85
+ #ifdef USE_AVX512
86
+ __asm__ __volatile__ (
87
+ " 0:\n "
88
+ " vmovdqu32 (%1),%%zmm0\n " // Load 16 int32_t values from `ait` into zmm0
89
+ " vcvtdq2pd %%zmm0,%%zmm1\n " // Convert 16 int32_t values to 8 double-precision values
90
+ " vmovapd %%zmm1,(%0)\n " // Store the result (8 doubles) in `dst`
91
+ " addq $64,%1\n " // Increment `ait` by 64 bytes (16 int32_t values)
92
+ " addq $64,%0\n " // Increment `dst` by 64 bytes (8 double-precision values)
93
+ " cmpq %2,%1\n " // Compare `ait` with `aend`
94
+ " jb 0b\n " // Jump back if `ait < aend`
95
+ : " =r" (dst), " =r" (ait), " =r" (aend)
96
+ : " 0" (dst), " 1" (ait), " 2" (aend)
97
+ : " %zmm0" , " %zmm1" , " memory"
98
+ );
99
+ #else
71
100
__asm__ __volatile__ (
72
101
" 0:\n "
73
102
" vmovupd (%1),%%xmm0\n "
@@ -81,6 +110,7 @@ void FFT_Processor_Spqlios::execute_reverse_int(double *res, const int32_t *a) {
81
110
: " 0" (dst), " 1" (ait), " 2" (aend)
82
111
: " %xmm0" , " %ymm1" , " memory"
83
112
);
113
+ #endif
84
114
}
85
115
ifft (tables_reverse, res);
86
116
}
@@ -110,8 +140,23 @@ void FFT_Processor_Spqlios::execute_direct_torus32(uint32_t *res, const double *
110
140
double *dst = real_inout_direct;
111
141
const double *sit = a;
112
142
const double *send = a + N;
113
- // double __2sN = 2./N;
114
143
const double *bla = &_2sN;
144
+ #ifdef AVX512
145
+ __asm__ __volatile__ (
146
+ " vbroadcastsd (%3),%%zmm2\n " // Broadcast _2sN to zmm2
147
+ " 1:\n "
148
+ " vmovupd (%1),%%zmm0\n " // Load 8 double-precision values from `sit` into zmm0
149
+ " vmulpd %%zmm2,%%zmm0,%%zmm0\n " // Multiply zmm0 by zmm2
150
+ " vmovupd %%zmm0,(%0)\n " // Store the result in `dst`
151
+ " addq $64,%1\n " // Increment `sit` by 64 bytes (8 doubles)
152
+ " addq $64,%0\n " // Increment `dst` by 64 bytes (8 doubles)
153
+ " cmpq %2,%1\n " // Compare `sit` with `send`
154
+ " jb 1b\n " // Jump if `sit` < `send`
155
+ : " =r" (dst), " =r" (sit), " =r" (send), " =r" (bla)
156
+ : " 0" (dst), " 1" (sit), " 2" (send), " 3" (bla)
157
+ : " %zmm0" , " %zmm2" , " memory"
158
+ );
159
+ #else
115
160
__asm__ __volatile__ (
116
161
" vbroadcastsd (%3),%%ymm2\n "
117
162
" 1:\n "
@@ -126,6 +171,7 @@ void FFT_Processor_Spqlios::execute_direct_torus32(uint32_t *res, const double *
126
171
: " 0" (dst), " 1" (sit), " 2" (send), " 3" (bla)
127
172
: " %ymm0" , " %ymm2" , " memory"
128
173
);
174
+ #endif
129
175
}
130
176
fft (tables_direct, real_inout_direct);
131
177
// for (int32_t i = 0; i < N; i++) res[i] = uint32_t(int64_t(real_inout_direct[i]));
@@ -142,6 +188,22 @@ void FFT_Processor_Spqlios::execute_direct_torus32_q(uint32_t *res, const double
142
188
const double *send = a + N;
143
189
// double __2sN = 2./N;
144
190
const double *bla = &_2sN;
191
+ #ifdef USE_AVX512
192
+ __asm__ __volatile__ (
193
+ " vbroadcastsd (%3),%%zmm2\n " // Broadcast _2sN to zmm2
194
+ " 1:\n "
195
+ " vmovupd (%1),%%zmm0\n " // Load 8 double-precision values from `sit` into zmm0
196
+ " vmulpd %%zmm2,%%zmm0,%%zmm0\n " // Multiply zmm0 by zmm2
197
+ " vmovupd %%zmm0,(%0)\n " // Store the result in `dst`
198
+ " addq $64,%1\n " // Increment `sit` by 64 bytes (8 doubles)
199
+ " addq $64,%0\n " // Increment `dst` by 64 bytes (8 doubles)
200
+ " cmpq %2,%1\n " // Compare `sit` with `send`
201
+ " jb 1b\n " // Jump if `sit` < `send`
202
+ : " =r" (dst), " =r" (sit), " =r" (send), " =r" (bla)
203
+ : " 0" (dst), " 1" (sit), " 2" (send), " 3" (bla)
204
+ : " %zmm0" , " %zmm2" , " memory"
205
+ );
206
+ #else
145
207
__asm__ __volatile__ (
146
208
" vbroadcastsd (%3),%%ymm2\n "
147
209
" 1:\n "
@@ -156,6 +218,7 @@ void FFT_Processor_Spqlios::execute_direct_torus32_q(uint32_t *res, const double
156
218
: " 0" (dst), " 1" (sit), " 2" (send), " 3" (bla)
157
219
: " %ymm0" , " %ymm2" , " memory"
158
220
);
221
+ #endif
159
222
}
160
223
fft (tables_direct, real_inout_direct);
161
224
for (int32_t i = 0 ; i < N; i++) res[i] = uint32_t ((int64_t (real_inout_direct[i])%q+q)%q);
@@ -169,8 +232,23 @@ void FFT_Processor_Spqlios::execute_direct_torus32_rescale(uint32_t *res, const
169
232
double *dst = real_inout_direct;
170
233
const double *sit = a;
171
234
const double *send = a + N;
172
- // double __2sN = 2./N;
173
235
const double *bla = &_2sN;
236
+ #ifdef USE_AVX512
237
+ __asm__ __volatile__ (
238
+ " vbroadcastsd (%3),%%zmm2\n " // Broadcast _2sN to zmm2
239
+ " 1:\n "
240
+ " vmovupd (%1),%%zmm0\n " // Load 8 double-precision values from `sit` into zmm0
241
+ " vmulpd %%zmm2,%%zmm0,%%zmm0\n " // Multiply zmm0 by zmm2
242
+ " vmovupd %%zmm0,(%0)\n " // Store the result in `dst`
243
+ " addq $64,%1\n " // Increment `sit` by 64 bytes (8 doubles)
244
+ " addq $64,%0\n " // Increment `dst` by 64 bytes (8 doubles)
245
+ " cmpq %2,%1\n " // Compare `sit` with `send`
246
+ " jb 1b\n " // Jump if `sit` < `send`
247
+ : " =r" (dst), " =r" (sit), " =r" (send), " =r" (bla)
248
+ : " 0" (dst), " 1" (sit), " 2" (send), " 3" (bla)
249
+ : " %zmm0" , " %zmm2" , " memory"
250
+ );
251
+ #else
174
252
__asm__ __volatile__ (
175
253
" vbroadcastsd (%3),%%ymm2\n "
176
254
" 1:\n "
@@ -185,6 +263,7 @@ void FFT_Processor_Spqlios::execute_direct_torus32_rescale(uint32_t *res, const
185
263
: " 0" (dst), " 1" (sit), " 2" (send), " 3" (bla)
186
264
: " %ymm0" , " %ymm2" , " memory"
187
265
);
266
+ #endif
188
267
}
189
268
fft (tables_direct, real_inout_direct);
190
269
for (int32_t i = 0 ; i < N; i++) res[i] = static_cast <uint32_t >(int64_t (real_inout_direct[i]/Δ));
@@ -200,6 +279,22 @@ void FFT_Processor_Spqlios::execute_direct_torus64(uint64_t* res, const double*
200
279
const double * send = a+N;
201
280
// double __2sN = 2./N;
202
281
const double * bla = &_2sN;
282
+ #ifdef USE_AVX512
283
+ __asm__ __volatile__ (
284
+ " vbroadcastsd (%3),%%zmm2\n " // Broadcast 2sN to zmm2
285
+ " 1:\n "
286
+ " vmovupd (%1),%%zmm0\n " // Load 8 double-precision floats from `sit` into zmm0
287
+ " vmulpd %%zmm2,%%zmm0,%%zmm0\n " // Multiply the vector by zmm2
288
+ " vmovapd %%zmm0,(%0)\n " // Store the result into `dst`
289
+ " addq $64,%1\n " // Increment `sit` by 64 (8 doubles * 8 bytes per double)
290
+ " addq $64,%0\n " // Increment `dst` by 64 (8 doubles * 8 bytes per double)
291
+ " cmpq %2,%1\n " // Compare `sit` with `send`
292
+ " jb 1b\n " // Jump back if not done
293
+ : " =r" (dst), " =r" (sit), " =r" (send), " =r" (bla)
294
+ : " 0" (dst), " 1" (sit), " 2" (send), " 3" (bla)
295
+ : " %zmm0" , " %zmm2" , " memory"
296
+ );
297
+ #else
203
298
__asm__ __volatile__ (
204
299
" vbroadcastsd (%3),%%ymm2\n "
205
300
" 1:\n "
@@ -214,6 +309,7 @@ void FFT_Processor_Spqlios::execute_direct_torus64(uint64_t* res, const double*
214
309
: " 0" (dst)," 1" (sit)," 2" (send)," 3" (bla)
215
310
: " %ymm0" ," %ymm2" ," memory"
216
311
);
312
+ #endif
217
313
}
218
314
fft (tables_direct,real_inout_direct);
219
315
#ifdef USE_AVX512
0 commit comments