chore: enable simd optimizations for aarch64 (#5150)

romange · romange · commit a3e1f6889622 · 2025-05-25T09:36:22.000+03:00
chore: enable SIMD optimizations for aarch64 for ascii pack and unpack. Also optimize scalar unpack for both x86 and aarch64.

We had to fix the bug in #5140 and now we load chunks of 7 bytes during unpacking. This greatly degraded the performance of scalar unpack, so we now use the "naive" byte-by-byte implementation, which is actually faster than using 7-byte loads on both x86 and aarch64.

On c4a (aarch64):

Benchmarks before:
------------------------------------------------------------
Benchmark                  Time             CPU   Iterations
------------------------------------------------------------
BM_PackNaive             222 ns          222 ns     18936335
BM_Pack                  222 ns          222 ns     18956309
BM_Pack2                 222 ns          222 ns     18951694
BM_PackSimd              220 ns          220 ns     19103906
BM_PackSimd2             223 ns          223 ns     18861252
BM_UnpackNaive           229 ns          229 ns     18228081
BM_Unpack                743 ns          743 ns      5643824
BM_UnpackSimd            744 ns          744 ns      5648469

Benchmarks after:
------------------------------------------------------------
Benchmark                  Time             CPU   Iterations
------------------------------------------------------------
BM_PackNaive             221 ns          221 ns     18971332
BM_Pack                  222 ns          221 ns     18963948
BM_PackSimd             97.2 ns         97.2 ns     43226095
BM_PackSimd2            96.6 ns         96.6 ns     43491371
BM_Unpack                228 ns          228 ns     18397585
BM_UnpackSimd            101 ns          101 ns     41733901

We improved scalar unpack by x3 from 743ns to 228ns, and improved vectorized unpack by x7. We improved vectorized pack by x2.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
diff --git a/src/core/compact_object_test.cc b/src/core/compact_object_test.cc
@@ -674,29 +674,6 @@ static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
   }
 }
 
-static void ascii_unpack_naive(const uint8_t* bin, size_t ascii_len, char* ascii) {
-  constexpr uint8_t kM = 0x7F;
-  uint8_t p = 0;
-  unsigned i = 0;
-
-  while (ascii_len >= 8) {
-    for (i = 0; i < 7; ++i) {
-      uint8_t src = *bin;  // keep on stack in case we unpack inplace.
-      *ascii++ = (p >> (8 - i)) | ((src << i) & kM);
-      p = src;
-      ++bin;
-    }
-
-    ascii_len -= 8;
-    *ascii++ = p >> 1;
-  }
-
-  DCHECK_LT(ascii_len, 8u);
-  for (i = 0; i < ascii_len; ++i) {
-    *ascii++ = *bin++;
-  }
-}
-
 static void BM_PackNaive(benchmark::State& state) {
   string val(1024, 'a');
   uint8_t buf[1024];
@@ -717,16 +694,6 @@ static void BM_Pack(benchmark::State& state) {
 }
 BENCHMARK(BM_Pack);
 
-static void BM_Pack2(benchmark::State& state) {
-  string val(1024, 'a');
-  uint8_t buf[1024];
-
-  while (state.KeepRunning()) {
-    detail::ascii_pack(val.data(), val.size(), buf);
-  }
-}
-BENCHMARK(BM_Pack2);
-
 static void BM_PackSimd(benchmark::State& state) {
   string val(1024, 'a');
   uint8_t buf[1024];
@@ -747,18 +714,6 @@ static void BM_PackSimd2(benchmark::State& state) {
 }
 BENCHMARK(BM_PackSimd2);
 
-static void BM_UnpackNaive(benchmark::State& state) {
-  string val(1024, 'a');
-  uint8_t buf[1024];
-
-  detail::ascii_pack(val.data(), val.size(), buf);
-
-  while (state.KeepRunning()) {
-    ascii_unpack_naive(buf, val.size(), val.data());
-  }
-}
-BENCHMARK(BM_UnpackNaive);
-
 static void BM_Unpack(benchmark::State& state) {
   string val(1024, 'a');
   uint8_t buf[1024];
diff --git a/src/core/detail/bitpacking.cc b/src/core/detail/bitpacking.cc
@@ -28,7 +28,7 @@ static inline uint64_t Compress8x7bit(uint64_t x) {
   return x;
 }
 
-#ifdef __SSE3__
+#if defined(__SSE3__) || defined(__aarch64__)
 static inline pair<const char*, uint8_t*> simd_variant1_pack(const char* ascii, const char* end,
                                                              uint8_t* bin) {
   __m128i val, rpart, lpart;
@@ -204,7 +204,7 @@ void ascii_pack2(const char* ascii, size_t len, uint8_t* bin) {
 
 // The algo - do in parallel what ascii_pack does on two uint64_t integers
 void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
-#ifdef __SSE3__
+#if defined(__SSE3__) || defined(__aarch64__)
   // I leave out 16 bytes in addition to 16 that we load in the loop
   // because we store into bin full 16 bytes instead of 14. To prevent data
   // overwrite we finish loop one iteration earlier.
@@ -221,7 +221,7 @@ void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
 }
 
 void ascii_pack_simd2(const char* ascii, size_t len, uint8_t* bin) {
-#ifdef __SSE3__
+#if defined(__SSE3__) || defined(__aarch64__)
   // I leave out 16 bytes in addition to 16 that we load in the loop
   // because we store into bin full 16 bytes instead of 14. To prevent data
   // overwrite we finish loop one iteration earlier.
@@ -248,32 +248,31 @@ void ascii_pack_simd2(const char* ascii, size_t len, uint8_t* bin) {
 // however, if binary data is positioned on the right of the ascii buffer with empty space on the
 // left than we can unpack inplace.
 void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
-  uint64_t val;
-
-  // we read 7 bytes from bin for every 8 bytes in ascii len.
-  size_t loop_len = (ascii_len / 8) * 8;
-  const char* end = ascii + loop_len;
-  while (ascii < end) {
-    // note: we assume here little endian architectures which is the case for x86 and arm.
-    memcpy(&val, bin, 7);  // read 7 bytes from bin and one byte is ignored.
+  constexpr uint8_t kM = 0x7F;
+  uint8_t p = 0;
+  unsigned i = 0;
 
-    val = ((val & 0x00FFFFFFF0000000) << 4) | (val & 0x000000000FFFFFFF);
-    val = ((val & 0xFFFFC000FFFFC000) << 2) | (val & 0x00003FFF00003FFF);
-    val = ((val & 0x7F807F807F807F80) << 1) | (val & 0x007F007F007F007F);
-    memcpy(ascii, &val, 8);
+  while (ascii_len >= 8) {
+    for (i = 0; i < 7; ++i) {
+      uint8_t src = *bin;  // keep on stack in case we unpack inplace.
+      *ascii++ = (p >> (8 - i)) | ((src << i) & kM);
+      p = src;
+      ++bin;
+    }
 
-    ascii += 8;
-    bin += 7;
+    ascii_len -= 8;
+    *ascii++ = p >> 1;
   }
 
-  for (; loop_len < ascii_len; ++loop_len) {
+  DCHECK_LT(ascii_len, 8u);
+  for (i = 0; i < ascii_len; ++i) {
     *ascii++ = *bin++;
   }
 }
 
 // See CompactObjectTest.AsanTriggerReadOverflow for more details.
 void ascii_unpack_simd(const uint8_t* bin, size_t ascii_len, char* ascii) {
-#ifdef __SSSE3__
+#if defined(__SSE3__) || defined(__aarch64__)
 
   if (ascii_len < 18) {  // ascii_len >=18 means bin length >=16.
     ascii_unpack(bin, ascii_len, ascii);