avx experiments

mcalancea · mcalancea · commit 8d549549efff · 2025-10-07T12:24:12.000+03:00
diff --git a/iris-mpc-common/src/iris_db/iris.rs b/iris-mpc-common/src/iris_db/iris.rs
@@ -1,3 +1,17 @@
+use std::arch::x86_64::__m256i;
+use std::arch::x86_64::_mm256_add_epi64;
+use std::arch::x86_64::_mm256_add_epi8;
+use std::arch::x86_64::_mm256_and_si256;
+use std::arch::x86_64::_mm256_loadu_si256;
+use std::arch::x86_64::_mm256_sad_epu8;
+use std::arch::x86_64::_mm256_set1_epi8;
+use std::arch::x86_64::_mm256_setr_epi8;
+use std::arch::x86_64::_mm256_setzero_si256;
+use std::arch::x86_64::_mm256_shuffle_epi8;
+use std::arch::x86_64::_mm256_srli_epi16;
+use std::arch::x86_64::_mm256_storeu_si256;
+use std::arch::x86_64::_mm256_xor_si256;
+
 use crate::galois_engine::degree4::GaloisRingIrisCodeShare;
 use crate::IRIS_CODE_LENGTH;
 use crate::ROTATIONS;
@@ -400,6 +414,57 @@ impl IrisCode {
         (code_distance as u16, combined_mask_len as u16)
     }
 
+    /// Computes the masked Hamming distance between two iris codes using AVX2.
+    ///
+    /// Returns `(code_distance, combined_mask_len)`: the number of differing code
+    /// bits inside the common unmasked region, and the size of that region in bits.
+    /// Words are processed four `u64`s (256 bits) at a time; a scalar tail covers
+    /// any words left over when the word count is not a multiple of four.
+    ///
+    /// # Safety
+    ///
+    /// The caller must first confirm at runtime that the CPU supports AVX2,
+    /// e.g. with `is_x86_feature_detected!("avx2")`.
+    #[target_feature(enable = "avx2")]
+    pub unsafe fn get_distance_fraction_avx2(&self, other: &Self) -> (u16, u16) {
+        const LANES: usize = 4;
+        let (code_a, code_b) = (&self.code.0[..], &other.code.0[..]);
+        let (mask_a, mask_b) = (&self.mask.0[..], &other.mask.0[..]);
+        // Bound every access by the real lengths rather than a hard-coded trip count.
+        let words = code_a.len().min(code_b.len()).min(mask_a.len()).min(mask_b.len());
+        let vector_words = words - words % LANES;
+
+        let mut total_code_distance: u32 = 0;
+        let mut total_mask_len: u32 = 0;
+        // Scratch buffer the vectors are spilled into for scalar popcounting.
+        let mut lanes = [0u64; LANES];
+
+        for base in (0..vector_words).step_by(LANES) {
+            // SAFETY: `base + LANES <= words`, so every unaligned 32-byte load is in bounds.
+            let ca = _mm256_loadu_si256(code_a.as_ptr().add(base) as *const __m256i);
+            let cb = _mm256_loadu_si256(code_b.as_ptr().add(base) as *const __m256i);
+            let ma = _mm256_loadu_si256(mask_a.as_ptr().add(base) as *const __m256i);
+            let mb = _mm256_loadu_si256(mask_b.as_ptr().add(base) as *const __m256i);
+
+            // Common unmasked bits, and the code bits that differ within them.
+            let mask = _mm256_and_si256(ma, mb);
+            let diff = _mm256_and_si256(_mm256_xor_si256(ca, cb), mask);
+            // Spill each vector and count its set bits with scalar `count_ones`.
+            _mm256_storeu_si256(lanes.as_mut_ptr() as *mut __m256i, mask);
+            total_mask_len += lanes.iter().map(|w| w.count_ones()).sum::<u32>();
+            _mm256_storeu_si256(lanes.as_mut_ptr() as *mut __m256i, diff);
+            total_code_distance += lanes.iter().map(|w| w.count_ones()).sum::<u32>();
+        }
+
+        // Scalar tail; empty when the word count is a multiple of four.
+        for i in vector_words..words {
+            let mask = mask_a[i] & mask_b[i];
+            total_mask_len += mask.count_ones();
+            total_code_distance += ((code_a[i] ^ code_b[i]) & mask).count_ones();
+        }
+        // NOTE(review): narrowing assumes at most `u16::MAX` bits per array; confirm.
+        (total_code_distance as u16, total_mask_len as u16)
+    }
     /// Return the fractional Hamming distance between two iris codes, represented
     /// as the `i16` dot product of associated masked-bit vectors and the `u16` size
     /// of the common unmasked region.