@@ -120,15 +120,31 @@ float DistanceL2Int8::compare(const int8_t *a, const int8_t *b, uint32_t size) const
 {
 #ifdef _WINDOWS
 #ifdef USE_AVX2
+    // Prefetch the start of both vectors
+    _mm_prefetch((const char *)a, _MM_HINT_T0);
+    _mm_prefetch((const char *)b, _MM_HINT_T0);
+
     __m256 r = _mm256_setzero_ps();
     char *pX = (char *)a, *pY = (char *)b;
+    const char *original_pX = pX;
+    const char *original_pY = pY;
+    uint32_t prefetch_offset = 64; // Prefetch 64 bytes ahead
+
     while (size >= 32)
     {
+        // Prefetch ahead for better cache performance
+        if (size > prefetch_offset)
+        {
+            _mm_prefetch(original_pX + prefetch_offset, _MM_HINT_T0);
+            _mm_prefetch(original_pY + prefetch_offset, _MM_HINT_T0);
+        }
+
         __m256i r1 = _mm256_subs_epi8(_mm256_loadu_si256((__m256i *)pX), _mm256_loadu_si256((__m256i *)pY));
         r = _mm256_add_ps(r, _mm256_mul_epi8(r1, r1));
         pX += 32;
         pY += 32;
         size -= 32;
+        prefetch_offset += 32;
     }
     while (size > 0)
     {
@@ -141,19 +157,39 @@ float DistanceL2Int8::compare(const int8_t *a, const int8_t *b, uint32_t size) const
     r = _mm256_hadd_ps(_mm256_hadd_ps(r, r), r);
     return r.m256_f32[0] + r.m256_f32[4];
 #else
+    // Prefetch the start of both vectors for non-AVX2 fallback
+    _mm_prefetch((const char *)a, _MM_HINT_T0);
+    _mm_prefetch((const char *)b, _MM_HINT_T0);
+
     int32_t result = 0;
 #pragma omp simd reduction(+ : result) aligned(a, b : 8)
     for (int32_t i = 0; i < (int32_t)size; i++)
     {
+        // Prefetch ahead every 64 bytes (64 int8_t values)
+        if (i % 64 == 0 && i + 64 < (int32_t)size)
+        {
+            _mm_prefetch((const char *)(a + i + 64), _MM_HINT_T0);
+            _mm_prefetch((const char *)(b + i + 64), _MM_HINT_T0);
+        }
         result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i]));
     }
     return (float)result;
 #endif
 #else
+    // Prefetch the start of both vectors for Linux version
+    _mm_prefetch((const char *)a, _MM_HINT_T0);
+    _mm_prefetch((const char *)b, _MM_HINT_T0);
+
     int32_t result = 0;
 #pragma omp simd reduction(+ : result) aligned(a, b : 8)
     for (int32_t i = 0; i < (int32_t)size; i++)
     {
+        // Prefetch ahead every 64 bytes (64 int8_t values)
+        if (i % 64 == 0 && i + 64 < (int32_t)size)
+        {
+            _mm_prefetch((const char *)(a + i + 64), _MM_HINT_T0);
+            _mm_prefetch((const char *)(b + i + 64), _MM_HINT_T0);
+        }
         result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i]));
     }
     return (float)result;
@@ -162,12 +198,22 @@ float DistanceL2Int8::compare(const int8_t *a, const int8_t *b, uint32_t size) const
 
 float DistanceL2UInt8::compare(const uint8_t *a, const uint8_t *b, uint32_t size) const
 {
+    // Prefetch the start of both vectors
+    _mm_prefetch((const char *)a, _MM_HINT_T0);
+    _mm_prefetch((const char *)b, _MM_HINT_T0);
+
     uint32_t result = 0;
 #ifndef _WINDOWS
 #pragma omp simd reduction(+ : result) aligned(a, b : 8)
 #endif
     for (int32_t i = 0; i < (int32_t)size; i++)
     {
+        // Prefetch ahead every 64 bytes (64 uint8_t values)
+        if (i % 64 == 0 && i + 64 < (int32_t)size)
+        {
+            _mm_prefetch((const char *)(a + i + 64), _MM_HINT_T0);
+            _mm_prefetch((const char *)(b + i + 64), _MM_HINT_T0);
+        }
         result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i]));
     }
     return (float)result;
@@ -209,11 +255,21 @@ float DistanceL2Float::compare(const float *a, const float *b, uint32_t size) const
     // horizontal add sum
     result = _mm256_reduce_add_ps(sum);
 #else
+    // Prefetch the start of both vectors for non-AVX2 fallback
+    _mm_prefetch((const char *)a, _MM_HINT_T0);
+    _mm_prefetch((const char *)b, _MM_HINT_T0);
+
 #ifndef _WINDOWS
 #pragma omp simd reduction(+ : result) aligned(a, b : 32)
 #endif
     for (int32_t i = 0; i < (int32_t)size; i++)
     {
+        // Prefetch ahead every 16 floats (64 bytes)
+        if (i % 16 == 0 && i + 16 < (int32_t)size)
+        {
+            _mm_prefetch((const char *)(a + i + 16), _MM_HINT_T0);
+            _mm_prefetch((const char *)(b + i + 16), _MM_HINT_T0);
+        }
         result += (a[i] - b[i]) * (a[i] - b[i]);
     }
 #endif
@@ -271,18 +327,34 @@ float AVXDistanceL2Int8::compare(const int8_t *a, const int8_t *b, uint32_t length) const
 
 float AVXDistanceL2Float::compare(const float *a, const float *b, uint32_t length) const
 {
+    // Prefetch the start of both vectors
+    _mm_prefetch((const char *)a, _MM_HINT_T0);
+    _mm_prefetch((const char *)b, _MM_HINT_T0);
+
     __m128 diff, v1, v2;
     __m128 sum = _mm_set1_ps(0);
+
+    const float *original_a = a;
+    const float *original_b = b;
+    uint32_t prefetch_offset = 64; // Prefetch 64 floats (256 bytes) ahead
 
     while (length >= 4)
     {
+        // Prefetch ahead for better cache performance
+        if (length > prefetch_offset)
+        {
+            _mm_prefetch((const char *)(original_a + prefetch_offset), _MM_HINT_T0);
+            _mm_prefetch((const char *)(original_b + prefetch_offset), _MM_HINT_T0);
+        }
+
         v1 = _mm_loadu_ps(a);
         a += 4;
         v2 = _mm_loadu_ps(b);
         b += 4;
         diff = _mm_sub_ps(v1, v2);
         sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
         length -= 4;
+        prefetch_offset += 4;
     }
 
     return sum.m128_f32[0] + sum.m128_f32[1] + sum.m128_f32[2] + sum.m128_f32[3];
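
Taken together, the change applies one pattern to each distance kernel: prefetch the first cache line of both input vectors with _mm_prefetch before the loop, then keep issuing hints ahead of the current read position inside the loop (typically one 64-byte cache line ahead). The minimal sketch below shows that pattern in isolation, mirroring the scalar (non-AVX2) branches of this diff; the function name l2_sq_int8_prefetch_sketch and the constant kPrefetchAhead are illustrative stand-ins, not identifiers from this commit.

#include <immintrin.h> // _mm_prefetch, _MM_HINT_T0
#include <cstdint>

// Illustrative sketch of the prefetch-ahead pattern used in the diff above.
// Not part of the committed code.
static float l2_sq_int8_prefetch_sketch(const int8_t *a, const int8_t *b, uint32_t size)
{
    const uint32_t kPrefetchAhead = 64; // one 64-byte cache line ahead
    int32_t result = 0;
    for (uint32_t i = 0; i < size; i++)
    {
        // Once per cache line, request the next line of both inputs into L1
        // before the loop reaches it.
        if (i % 64 == 0 && i + kPrefetchAhead < size)
        {
            _mm_prefetch((const char *)(a + i + kPrefetchAhead), _MM_HINT_T0);
            _mm_prefetch((const char *)(b + i + kPrefetchAhead), _MM_HINT_T0);
        }
        int32_t d = (int32_t)a[i] - (int32_t)b[i];
        result += d * d;
    }
    return (float)result;
}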