Skip to content

Commit a3e1f68

Browse files
committed
chore: enable simd optimizations for aarch64 (#5150)
chore: enable SIMD optimizations for aarch64 for ascii pack and unpack. Also optimize scalar unpack for both x86 and aarch64.

We had to fix the bug in #5140, and now we load chunks of 7 bytes during unpacking. This greatly degraded the performance of scalar unpack, so we now use the "naive" byte-by-byte implementation, which is actually faster than using 7-byte loads on both x86 and aarch64.

On c4a (aarch64):

Benchmarks before:
------------------------------------------------------------
Benchmark              Time        CPU      Iterations
------------------------------------------------------------
BM_PackNaive         222 ns      222 ns      18936335
BM_Pack              222 ns      222 ns      18956309
BM_Pack2             222 ns      222 ns      18951694
BM_PackSimd          220 ns      220 ns      19103906
BM_PackSimd2         223 ns      223 ns      18861252
BM_UnpackNaive       229 ns      229 ns      18228081
BM_Unpack            743 ns      743 ns       5643824
BM_UnpackSimd        744 ns      744 ns       5648469

Benchmarks after:
------------------------------------------------------------
Benchmark              Time        CPU      Iterations
------------------------------------------------------------
BM_PackNaive         221 ns      221 ns      18971332
BM_Pack              222 ns      221 ns      18963948
BM_PackSimd         97.2 ns     97.2 ns      43226095
BM_PackSimd2        96.6 ns     96.6 ns      43491371
BM_Unpack            228 ns      228 ns      18397585
BM_UnpackSimd        101 ns      101 ns      41733901

We improved scalar unpack by x3, from 743ns to 228ns, and improved vectorized unpack by x7. We improved vectorized pack by x2.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
1 parent 09ab218 commit a3e1f68

File tree

2 files changed

+18
-64
lines changed

2 files changed

+18
-64
lines changed

src/core/compact_object_test.cc

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -674,29 +674,6 @@ static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
674674
}
675675
}
676676

677-
static void ascii_unpack_naive(const uint8_t* bin, size_t ascii_len, char* ascii) {
678-
constexpr uint8_t kM = 0x7F;
679-
uint8_t p = 0;
680-
unsigned i = 0;
681-
682-
while (ascii_len >= 8) {
683-
for (i = 0; i < 7; ++i) {
684-
uint8_t src = *bin; // keep on stack in case we unpack inplace.
685-
*ascii++ = (p >> (8 - i)) | ((src << i) & kM);
686-
p = src;
687-
++bin;
688-
}
689-
690-
ascii_len -= 8;
691-
*ascii++ = p >> 1;
692-
}
693-
694-
DCHECK_LT(ascii_len, 8u);
695-
for (i = 0; i < ascii_len; ++i) {
696-
*ascii++ = *bin++;
697-
}
698-
}
699-
700677
static void BM_PackNaive(benchmark::State& state) {
701678
string val(1024, 'a');
702679
uint8_t buf[1024];
@@ -717,16 +694,6 @@ static void BM_Pack(benchmark::State& state) {
717694
}
718695
BENCHMARK(BM_Pack);
719696

720-
static void BM_Pack2(benchmark::State& state) {
721-
string val(1024, 'a');
722-
uint8_t buf[1024];
723-
724-
while (state.KeepRunning()) {
725-
detail::ascii_pack(val.data(), val.size(), buf);
726-
}
727-
}
728-
BENCHMARK(BM_Pack2);
729-
730697
static void BM_PackSimd(benchmark::State& state) {
731698
string val(1024, 'a');
732699
uint8_t buf[1024];
@@ -747,18 +714,6 @@ static void BM_PackSimd2(benchmark::State& state) {
747714
}
748715
BENCHMARK(BM_PackSimd2);
749716

750-
static void BM_UnpackNaive(benchmark::State& state) {
751-
string val(1024, 'a');
752-
uint8_t buf[1024];
753-
754-
detail::ascii_pack(val.data(), val.size(), buf);
755-
756-
while (state.KeepRunning()) {
757-
ascii_unpack_naive(buf, val.size(), val.data());
758-
}
759-
}
760-
BENCHMARK(BM_UnpackNaive);
761-
762717
static void BM_Unpack(benchmark::State& state) {
763718
string val(1024, 'a');
764719
uint8_t buf[1024];

src/core/detail/bitpacking.cc

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ static inline uint64_t Compress8x7bit(uint64_t x) {
2828
return x;
2929
}
3030

31-
#ifdef __SSE3__
31+
#if defined(__SSE3__) || defined(__aarch64__)
3232
static inline pair<const char*, uint8_t*> simd_variant1_pack(const char* ascii, const char* end,
3333
uint8_t* bin) {
3434
__m128i val, rpart, lpart;
@@ -204,7 +204,7 @@ void ascii_pack2(const char* ascii, size_t len, uint8_t* bin) {
204204

205205
// The algo - do in parallel what ascii_pack does on two uint64_t integers
206206
void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
207-
#ifdef __SSE3__
207+
#if defined(__SSE3__) || defined(__aarch64__)
208208
// I leave out 16 bytes in addition to 16 that we load in the loop
209209
// because we store into bin full 16 bytes instead of 14. To prevent data
210210
// overwrite we finish loop one iteration earlier.
@@ -221,7 +221,7 @@ void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
221221
}
222222

223223
void ascii_pack_simd2(const char* ascii, size_t len, uint8_t* bin) {
224-
#ifdef __SSE3__
224+
#if defined(__SSE3__) || defined(__aarch64__)
225225
// I leave out 16 bytes in addition to 16 that we load in the loop
226226
// because we store into bin full 16 bytes instead of 14. To prevent data
227227
// overwrite we finish loop one iteration earlier.
@@ -248,32 +248,31 @@ void ascii_pack_simd2(const char* ascii, size_t len, uint8_t* bin) {
248248
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
249249
// left, then we can unpack in place.
250250
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
251-
uint64_t val;
252-
253-
// we read 7 bytes from bin for every 8 bytes in ascii len.
254-
size_t loop_len = (ascii_len / 8) * 8;
255-
const char* end = ascii + loop_len;
256-
while (ascii < end) {
257-
// note: we assume here little endian architectures which is the case for x86 and arm.
258-
memcpy(&val, bin, 7); // read 7 bytes from bin and one byte is ignored.
251+
constexpr uint8_t kM = 0x7F;
252+
uint8_t p = 0;
253+
unsigned i = 0;
259254

260-
val = ((val & 0x00FFFFFFF0000000) << 4) | (val & 0x000000000FFFFFFF);
261-
val = ((val & 0xFFFFC000FFFFC000) << 2) | (val & 0x00003FFF00003FFF);
262-
val = ((val & 0x7F807F807F807F80) << 1) | (val & 0x007F007F007F007F);
263-
memcpy(ascii, &val, 8);
255+
while (ascii_len >= 8) {
256+
for (i = 0; i < 7; ++i) {
257+
uint8_t src = *bin; // keep on stack in case we unpack inplace.
258+
*ascii++ = (p >> (8 - i)) | ((src << i) & kM);
259+
p = src;
260+
++bin;
261+
}
264262

265-
ascii += 8;
266-
bin += 7;
263+
ascii_len -= 8;
264+
*ascii++ = p >> 1;
267265
}
268266

269-
for (; loop_len < ascii_len; ++loop_len) {
267+
DCHECK_LT(ascii_len, 8u);
268+
for (i = 0; i < ascii_len; ++i) {
270269
*ascii++ = *bin++;
271270
}
272271
}
273272

274273
// See CompactObjectTest.AsanTriggerReadOverflow for more details.
275274
void ascii_unpack_simd(const uint8_t* bin, size_t ascii_len, char* ascii) {
276-
#ifdef __SSSE3__
275+
#if defined(__SSE3__) || defined(__aarch64__)
277276

278277
if (ascii_len < 18) { // ascii_len >=18 means bin length >=16.
279278
ascii_unpack(bin, ascii_len, ascii);

0 commit comments

Comments
 (0)