diff --git a/.vscode/launch.json b/.vscode/launch.json index 0a5246fc..6ac02dab 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -74,7 +74,7 @@ "-k", "inner_bf16" ], - "justMyCode": true + "justMyCode": false } ] } \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 7613205f..c9cae64a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,6 +41,12 @@ version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cast" version = "0.3.0" @@ -199,9 +205,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -261,9 +267,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.152" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "linux-raw-sys" @@ -334,9 +340,12 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro2" @@ -408,9 +417,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.3" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -420,9 +429,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.5" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -431,9 +440,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rustix" @@ -502,6 +511,7 @@ dependencies = [ "criterion", "half", "rand", + "regex", ] [[package]] @@ -707,3 +717,24 @@ name = "windows_x86_64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index 32656a84..8b2552d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,19 +23,27 @@ include = ["/rust/**", "/c/**", "/include/**", "/build.rs"] name = "simsimd" path = "rust/lib.rs" +[[bin]] +name = "spdot" +path = "scripts/spdot.rs" + [build-dependencies] cc = "1.0.83" +[[bench]] +name = "sqeuclidean" +harness = false +path = "scripts/bench_sqeuclidean.rs" [[bench]] -name = "cosine" +name = "tfidf" harness = false -path = "scripts/bench_cosine.rs" +path = "scripts/bench_tfidf.rs" [[bench]] -name = "sqeuclidean" +name = "spdot" harness = false -path = "scripts/bench_sqeuclidean.rs" +path = "scripts/bench_spdot.rs" [profile.bench] opt-level = 3 # Corresponds to -O3 @@ -45,5 +53,9 @@ rpath = false # On some systems, setting this to false can help with optimiz [dev-dependencies] criterion = { version = "0.5.1" } -rand = { version = "0.8.5" } half = { version = "2.4.0" } + +[dependencies] +regex = "1.11.1" +rand = { version = "0.8.5" } + diff --git a/build.rs b/build.rs index 0a65c28c..be763957 100644 --- a/build.rs +++ b/build.rs @@ -59,6 +59,7 @@ fn main() { println!("cargo:rerun-if-changed=include/simsimd/dot.h"); println!("cargo:rerun-if-changed=include/simsimd/spatial.h"); + println!("cargo:rerun-if-changed=include/simsimd/sparse.h"); println!("cargo:rerun-if-changed=include/simsimd/probability.h"); println!("cargo:rerun-if-changed=include/simsimd/binary.h"); println!("cargo:rerun-if-changed=include/simsimd/types.h"); diff --git a/c/lib.c b/c/lib.c index 42872632..cadfe460 100644 --- a/c/lib.c +++ b/c/lib.c @@ -96,6 +96,45 @@ extern "C" { metric(a, b, a_length, b_length, result); \ } +#define SIMSIMD_DECLARATION_SPARSE(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ + simsimd_size_t a_length, simsimd_size_t b_length, \ + simsimd_distance_t *result) { \ + static simsimd_metric_sparse_punned_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \ + simsimd_capabilities(), simsimd_cap_any_k, \ + (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + if (!metric) { \ + *(simsimd_u64_t *)result = 0x7FF0000000000001ull; \ + return; \ + } \ + } \ + metric(a, b, a_length, b_length, result); \ + } + +#define SIMSIMD_DECLARATION_SPARSE_DOT(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ + simsimd_##extension##_t const *a_weights, \ + simsimd_##extension##_t const *b_weights, \ + simsimd_size_t a_length, simsimd_size_t b_length, \ + simsimd_distance_t *result) { \ + static simsimd_metric_sparse_weight_punned_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \ + simsimd_capabilities(), simsimd_cap_any_k, \ + (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + if (!metric) { \ + *(simsimd_u64_t *)result = 0x7FF0000000000001ull; \ + return; \ + } \ + } \ + metric(a, b, a_weights, b_weights, a_length, b_length, result); \ + } + + #define SIMSIMD_DECLARATION_CURVED(name, extension, type) \ SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ simsimd_##type##_t const *c, simsimd_size_t n, \ @@ -195,6 +234,8 @@ SIMSIMD_DECLARATION_DENSE(js, f64, f64) // Sparse sets SIMSIMD_DECLARATION_SPARSE(intersect, u16, u16) SIMSIMD_DECLARATION_SPARSE(intersect, u32, u32) +// Sparse dot +SIMSIMD_DECLARATION_SPARSE_DOT(spdot_weights, f32, u16) // Curved spaces SIMSIMD_DECLARATION_CURVED(bilinear, f64, f64) diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index 2eb4f805..cc4c746c 100644 --- a/include/simsimd/simsimd.h +++ b/include/simsimd/simsimd.h @@ -259,6 +259,12 @@ typedef void (*simsimd_metric_sparse_punned_t)(void const *a, void const *b, simsimd_size_t a_length, simsimd_size_t b_length, // simsimd_distance_t *d); + +typedef void (*simsimd_metric_sparse_weight_punned_t)(void const *a, void const *b, // + void const *a_weights, void const *b_weights, // + simsimd_size_t a_length, simsimd_size_t b_length, // + simsimd_distance_t *d); + /** * @brief Type-punned function pointer for curved vector spaces and similarity measures. * @@ -616,6 +622,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32(simsimd_capability_t v, si case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f32_neon, *c = simsimd_cap_neon_k; return; case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f32_neon, *c = simsimd_cap_neon_k; return; case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f32_neon, *c = simsimd_cap_neon_k; return; + case simsimd_metric_spdot_weights_k: *m = (m_t)&simsimd_spdot_weights_u16_f32_neon, *c = simsimd_cap_serial_k; return; + default: break; } #endif @@ -658,6 +666,7 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32(simsimd_capability_t v, si case simsimd_metric_mahalanobis_k: *m = (m_t)&simsimd_mahalanobis_f32_serial, *c = simsimd_cap_serial_k; return; case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f32_serial, *c = simsimd_cap_serial_k; return; case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f32_serial, *c = simsimd_cap_serial_k; return; + case simsimd_metric_spdot_weights_k: *m = (m_t)&simsimd_spdot_weights_u16_serial, *c = simsimd_cap_serial_k; return; default: break; } } @@ -1122,6 +1131,7 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u16(simsimd_capability_t v, si #endif if (v & simsimd_cap_serial_k) switch (k) { case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_serial, *c = simsimd_cap_serial_k; return; + case simsimd_metric_spdot_weights_k: *m = (m_t)&simsimd_spdot_weights_u16_serial, *c = simsimd_cap_serial_k; return; default: break; } } @@ -2087,6 +2097,8 @@ SIMSIMD_PUBLIC void simsimd_spdot_weights_u16(simsimd_u16_t const *a, simsimd_u1 simsimd_spdot_weights_u16_sve2(a, b, a_weights, b_weights, a_length, b_length, d); #elif SIMSIMD_TARGET_TURIN simsimd_spdot_weights_u16_turin(a, b, a_weights, b_weights, a_length, b_length, d); +// #elif SIMSIMD_TARGET_NEON +// simsimd_spdot_weights_u16_neon(a, b, a_weights, b_weights, a_length, b_length, d); #else simsimd_spdot_weights_u16_serial(a, b, a_weights, b_weights, a_length, b_length, d); #endif diff --git a/include/simsimd/sparse.h b/include/simsimd/sparse.h index 493828bb..457e27fe 100644 --- a/include/simsimd/sparse.h +++ b/include/simsimd/sparse.h @@ -49,6 +49,8 @@ #define SIMSIMD_SPARSE_H #include "types.h" +#include +#include #ifdef __cplusplus extern "C" { @@ -72,7 +74,7 @@ SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_serial( // simsimd_distance_t *results); SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_serial( // simsimd_u16_t const *a, simsimd_u16_t const *b, // - simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights, // + simsimd_f32_t const *a_weights, simsimd_f32_t const *b_weights, // simsimd_size_t a_length, simsimd_size_t b_length, // simsimd_distance_t *results); @@ -182,18 +184,18 @@ SIMSIMD_MAKE_INTERSECT_LINEAR(accurate, u32, size) // simsimd_intersect_u32_accu simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t *results) { \ simsimd_##counter_type##_t intersection_size = 0; \ simsimd_##accumulator_type##_t weights_product = 0; \ - simsimd_size_t i = 0, j = 0; \ + simsimd_size_t i = 0, j = 0; \ while (i != a_length && j != b_length) { \ simsimd_##input_type##_t ai = a[i]; \ simsimd_##input_type##_t bj = b[j]; \ int matches = ai == bj; \ simsimd_##counter_type##_t awi = load_and_convert(a_weights + i); \ - simsimd_##counter_type##_t bwi = load_and_convert(b_weights + i); \ - weights_product += matches * awi * bwi; \ + simsimd_##counter_type##_t bwi = load_and_convert(b_weights + j); \ + weights_product += matches * awi * bwi; \ intersection_size += matches; \ i += ai < bj; \ j += ai >= bj; \ - } \ + } \ results[0] = intersection_size; \ results[1] = weights_product; \ } @@ -255,8 +257,9 @@ SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u16, size) // simsimd_intersect_u16_ser SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u32, size) // simsimd_intersect_u32_serial SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_counts, u16, size, i16, i32, SIMSIMD_DEREFERENCE) // simsimd_spdot_counts_u16_serial -SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_weights, u16, size, bf16, f32, - SIMSIMD_BF16_TO_F32) // simsimd_spdot_weights_u16_serial +SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_weights, u16, f64, f32, f32, + SIMSIMD_DEREFERENCE) // simsimd_spdot_weights_u16_serial + /* The AVX-512 implementations are inspired by the "Faster-Than-Native Alternatives * for x86 VP2INTERSECT Instructions" paper by Guille Diez-Canas, 2022. @@ -1059,6 +1062,151 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_neon( // *results += vaddvq_u32(c_counts_vec.u32x4); } +inline uint8_t skips(uint16x8_t indexes, uint16_t max) { + uint16x8_t smaller_elements = vcleq_u16(indexes, vdupq_n_u16(max)); + uint8x8_t high_bits = vshrn_n_u16(smaller_elements, 8); + uint64_t results = vreinterpret_u64_u8(high_bits); + int leading_zeroes = __builtin_clzl(results); + return leading_zeroes / 8; +} + +void process_remaining( + simsimd_u16_t const *a, simsimd_u16_t const *b, + simsimd_f32_t const *a_weights, simsimd_f32_t const *b_weights, + simsimd_u16_t const *a_end, simsimd_u16_t const *b_end, + simsimd_distance_t *results) { + + simsimd_distance_t intersection_size = results[0]; + simsimd_distance_t total_product = results[1]; + while(a < a_end && b < b_end) { + if(*a == *b) { + total_product += (*a_weights * *b_weights); + a += 1; b += 1; a_weights += 1; b_weights += 1; + intersection_size += 1; + } + if(*a < *b) { a += 1; a_weights += 1;} + if(*b < *a) { b += 1; b_weights += 1;} + } + results[0] = intersection_size; + results[1] = total_product; +} + +#define EXTRACT_LANE_U16(vec, lane) \ + ((lane) == 0 ? vgetq_lane_u16(vec, 0) : \ + (lane) == 1 ? vgetq_lane_u16(vec, 1) : \ + (lane) == 2 ? vgetq_lane_u16(vec, 2) : \ + (lane) == 3 ? vgetq_lane_u16(vec, 3) : \ + (lane) == 4 ? vgetq_lane_u16(vec, 4) : \ + (lane) == 5 ? vgetq_lane_u16(vec, 5) : \ + (lane) == 6 ? vgetq_lane_u16(vec, 6) : \ + vgetq_lane_u16(vec, 7)) + + + +#define EXTRACT_LANE_F32(vec, lane) \ + ((lane) == 0 ? vgetq_lane_f32(vec, 0) : \ + (lane) == 1 ? vgetq_lane_f32(vec, 1) : \ + (lane) == 2 ? vgetq_lane_f32(vec, 2) : \ + vgetq_lane_f32(vec, 3)) + +#define EXT_LANE_U16(vec, lane) \ + ((lane) == 0 ? vgetq_lane_u16(vec, 0) : \ + (lane) == 1 ? vgetq_lane_u16(vec, 1) : \ + (lane) == 2 ? vgetq_lane_u16(vec, 2) : \ + (lane) == 3 ? vgetq_lane_u16(vec, 3) : \ + (lane) == 4 ? vgetq_lane_u16(vec, 4) : \ + (lane) == 5 ? vgetq_lane_u16(vec, 5) : \ + (lane) == 6 ? vgetq_lane_u16(vec, 6) : \ + vgetq_lane_u16(vec, 7)) + +SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_f32_neon( + simsimd_u16_t const *a, simsimd_u16_t const *b, + simsimd_f32_t const *a_weights, simsimd_f32_t const *b_weights, + simsimd_size_t a_length, simsimd_size_t b_length, + simsimd_distance_t *results) { + + const simsimd_size_t register_size = 8; + simsimd_size_t intersection_size = 0; + float total_product = 0.0f; + + #ifndef EXT_LANE + #define EXT_LANE(vec, lane) vget_lane_u16(vec, lane) + #endif + #ifndef EXT_F_LANE + #define EXT_F_LANE(vec, lane) vgetq_lane_f32(vec, lane) + #endif + simsimd_u16_t const * a_start = a; + simsimd_u16_t const * b_start = b; + simsimd_u16_t const * a_end = a + a_length; + simsimd_u16_t const * b_end = b + b_length; + while((a + 8) < a_end && ( b + 8) < b_end) { + + uint16x8_t a_vec, b_vec; + a_vec = vld1q_u16(a) ; + b_vec = vld1q_u16(b) ; + + uint16_t a_min = vgetq_lane_u16(a_vec, 0); + uint16_t a_max = vgetq_lane_u16(a_vec, 7); + uint16_t b_min = vgetq_lane_u16(b_vec, 0); + uint16_t b_max = vgetq_lane_u16(b_vec, 7); + while(a_max < b_min && a + 16 < a_end) { + a += 8; + a_weights += 8; + a_vec = vld1q_u16(a); + a_max = vgetq_lane_u16(a_vec, 7); + } + a_min = vgetq_lane_u16(a_vec, 0); + while(b_max < a_min && b + 16 < b_end) { + b += 8; + b_weights += 8; + b_vec = vld1q_u16(b); + b_max = vgetq_lane_u16(b_vec, 7); + } + b_min = vgetq_lane_u16(b_vec, 0); + + float32x4_t a_weights_vec_low = vld1q_f32(a_weights); + float32x4_t a_weights_vec_high = vld1q_f32(a_weights + 4); + + float32x4_t b_weights_vec_low = vld1q_f32(b_weights); + float32x4_t b_weights_vec_high = vld1q_f32(b_weights + 4); + float32x4_t zeroes = vdupq_n_f32(0.0); + for (int lane = 0; lane < register_size; lane++) { + uint16_t elem = EXT_LANE_U16(b_vec, lane); + uint16x8_t compare_vec = vceqq_u16(a_vec, vdupq_n_u16(elem)); + uint16_t has_matches = vaddvq_u16(compare_vec); + if(has_matches == 0) { + continue; + } + uint16x4_t matches_low = vget_low_u16(compare_vec); + uint16x4_t matches_high = vget_high_u16(compare_vec); + uint32x4_t masks_low = vmovl_s16(matches_low); + uint32x4_t masks_high = vmovl_s16(matches_high); + float b_weight = lane < 4 ? + EXTRACT_LANE_F32(b_weights_vec_low, lane) : + EXTRACT_LANE_F32(b_weights_vec_high, lane - 4); + float32x4_t b_weights = vdupq_n_f32(b_weight); + float32x4_t bl = vbslq_f32(masks_low, b_weights, zeroes); + float32x4_t bh = vbslq_f32(masks_high, b_weights, zeroes); + float32x4_t weights_product_low = vmulq_f32(bl, a_weights_vec_low); + float32x4_t weights_product_high = vmulq_f32(bh, a_weights_vec_high); + total_product += vaddvq_f32(weights_product_low); + total_product += vaddvq_f32(weights_product_high); + uint16_t upper_intersections = vaddv_u16(vshr_n_u16(matches_low, 15)); + uint16_t lower_intersections = vaddv_u16(vshr_n_u16(matches_high, 15)); + intersection_size += upper_intersections; + intersection_size += lower_intersections; + } + uint8_t askips = skips(a_vec, b_max); + a += 8 - askips; + uint8_t bskips = skips(b_vec, a_max); + b += 8 - bskips; + } + results[0] += intersection_size; + results[1] += total_product; + process_remaining(a, b,a_weights,b_weights, a_start + a_length, b_start + b_length, results); + +} + #pragma clang attribute pop #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON diff --git a/rust/lib.rs b/rust/lib.rs index 1445167d..f0695dc4 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -82,7 +82,7 @@ extern "C" { fn simsimd_l2_i8(a: *const i8, b: *const i8, c: usize, d: *mut Distance); fn simsimd_l2_f16(a: *const u16, b: *const u16, c: usize, d: *mut Distance); - fn simsimd_l2_bf16(a: *const u16, b: *const u16, c: usize, d: *mut Distance); + // fn simsimd_l2_bf16(a: *const u16, b: *const u16, c: usize, d: *mut Distance); fn simsimd_l2_f32(a: *const f32, b: *const f32, c: usize, d: *mut Distance); fn simsimd_l2_f64(a: *const f64, b: *const f64, c: usize, d: *mut Distance); @@ -99,9 +99,31 @@ extern "C" { fn simsimd_kl_f32(a: *const f32, b: *const f32, c: usize, d: *mut Distance); fn simsimd_kl_f64(a: *const f64, b: *const f64, c: usize, d: *mut Distance); - fn simsimd_intersect_u16(a: *const u16, b: *const u16, a_length: usize, b_length: usize, d: *mut Distance); - fn simsimd_intersect_u32(a: *const u32, b: *const u32, a_length: usize, b_length: usize, d: *mut Distance); - + fn simsimd_intersect_u16( + a: *const u16, + b: *const u16, + a_length: usize, + b_length: usize, + d: *mut Distance, + ); + fn simsimd_intersect_u32( + a: *const u32, + b: *const u32, + a_length: usize, + b_length: usize, + d: *mut Distance, + ); + + fn simsimd_spdot_weights_f32( + a: *const u16, + b: *const u16, + a_weights: *const f32, + b_weights: *const f32, + a_length: usize, + b_length: usize, + intersection: *mut f64, + ); + fn simsimd_uses_neon() -> i32; fn simsimd_uses_neon_f16() -> i32; fn simsimd_uses_neon_bf16() -> i32; @@ -127,7 +149,8 @@ impl f16 {} /// A half-precision floating point number, called brain float. #[repr(transparent)] -pub struct bf16(u16); +#[derive(Debug, Clone, Copy)] +pub struct bf16(pub u16); impl bf16 {} @@ -225,8 +248,8 @@ where fn l2sq(a: &[Self], b: &[Self]) -> Option; /// Computes the Euclidean distance between two slices. - /// The Euclidean distance is the square root of - // sum of the squared differences between corresponding + /// The Euclidean distance is the square root of + // sum of the squared differences between corresponding /// elements of the two slices. fn l2(a: &[Self], b: &[Self]) -> Option; @@ -238,8 +261,8 @@ where } /// Computes the Euclidean distance between two slices. - /// The Euclidean distance is the square root of the - /// sum of the squared differences between corresponding + /// The Euclidean distance is the square root of the + /// sum of the squared differences between corresponding /// elements of the two slices. fn euclidean(a: &[Self], b: &[Self]) -> Option { SpatialSimilarity::l2(a, b) @@ -328,6 +351,30 @@ where fn intersect(a: &[Self], b: &[Self]) -> Option; } +pub fn sparse_dot_product( + a: &[u16], + b: &[u16], + a_weights: &[f32], + b_weights: &[f32], +) -> (Distance, Distance) { + let mut intersection = [0.0; 2]; + let af32_ptr = a_weights.as_ptr() as *const f32; + let bf32_ptr = b_weights.as_ptr() as *const f32; + unsafe { + crate::simsimd_spdot_weights_f32( + a.as_ptr(), + b.as_ptr(), + af32_ptr, + bf32_ptr, + a.len(), + b.len(), + intersection.as_mut_ptr() + ); + } + (intersection[0], intersection[1]) + +} + impl BinarySimilarity for u8 { fn hamming(a: &[Self], b: &[Self]) -> Option { if a.len() != b.len() { @@ -393,25 +440,21 @@ impl SpatialSimilarity for i8 { } impl Sparse for u16 { - fn intersect(a: &[Self], b: &[Self]) -> Option { let mut distance_value: Distance = 0.0; let distance_ptr: *mut Distance = &mut distance_value as *mut Distance; unsafe { simsimd_intersect_u16(a.as_ptr(), b.as_ptr(), a.len(), b.len(), distance_ptr) }; Some(distance_value) } - } impl Sparse for u32 { - fn intersect(a: &[Self], b: &[Self]) -> Option { let mut distance_value: Distance = 0.0; let distance_ptr: *mut Distance = &mut distance_value as *mut Distance; unsafe { simsimd_intersect_u32(a.as_ptr(), b.as_ptr(), a.len(), b.len(), distance_ptr) }; Some(distance_value) } - } impl SpatialSimilarity for f16 { @@ -458,7 +501,6 @@ impl SpatialSimilarity for f16 { } fn l2(a: &[Self], b: &[Self]) -> Option { - if a.len() != b.len() { return None; } @@ -993,7 +1035,6 @@ mod tests { .map(|&x| HalfF16::from_f32(x)) .collect(); - let a_simsimd: &[f16] = unsafe { std::slice::from_raw_parts(a_half.as_ptr() as *const f16, a_half.len()) }; let b_simsimd: &[f16] = @@ -1003,7 +1044,6 @@ mod tests { println!("The result of l2_f16 is {:.8}", result); assert_almost_equal(5.2, result, 0.01); } - } #[test] @@ -1160,8 +1200,8 @@ mod tests { #[test] fn test_intersect_u16() { { - let a_u16: &[u16] = &[153, 16384, 17408]; - let b_u16: &[u16] = &[15360, 16384, 7408]; + let a_u16: &[u16] = &[153, 16384, 17408]; + let b_u16: &[u16] = &[15360, 16384, 7408]; if let Some(result) = Sparse::intersect(a_u16, b_u16) { println!("The result of intersect_u16 is {:.8}", result); @@ -1170,37 +1210,36 @@ mod tests { } { - let a_u16: &[u16] = &[153, 11638, 08]; - let b_u16: &[u16] = &[15360, 16384, 7408]; + let a_u16: &[u16] = &[153, 11638, 08]; + let b_u16: &[u16] = &[15360, 16384, 7408]; if let Some(result) = Sparse::intersect(a_u16, b_u16) { println!("The result of intersect_u16 is {:.8}", result); assert_almost_equal(0.0, result, 0.0001); - } + } } } #[test] fn test_intersect_u32() { { - let a_u32: &[u32] = &[11, 153]; - let b_u32: &[u32] = &[11, 153, 7408, 16384]; + let a_u32: &[u32] = &[11, 153]; + let b_u32: &[u32] = &[11, 153, 7408, 16384]; if let Some(result) = Sparse::intersect(a_u32, b_u32) { println!("The result of intersect_u32 is {:.8}", result); assert_almost_equal(2.0, result, 0.0001); } } - + { - let a_u32: &[u32] = &[153, 7408, 11638]; - let b_u32: &[u32] = &[153, 7408, 11638]; + let a_u32: &[u32] = &[153, 7408, 11638]; + let b_u32: &[u32] = &[153, 7408, 11638]; if let Some(result) = Sparse::intersect(a_u32, b_u32) { println!("The result of intersect_u32 is {:.8}", result); assert_almost_equal(3.0, result, 0.0001); - } + } } - } } diff --git a/scripts/bench_spdot.rs b/scripts/bench_spdot.rs new file mode 100644 index 00000000..25be69e5 --- /dev/null +++ b/scripts/bench_spdot.rs @@ -0,0 +1,162 @@ +use criterion::BenchmarkId; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::Rng; +use std::fmt::Display; +use std::fmt::Formatter; +use rand::seq::index::sample; +use simsimd::sparse_dot_product; +//use half::bf16 as hbf16; +#[derive(Clone, Debug)] +struct SparseVector { + indices: Vec, + values: Vec, +} +impl SparseVector { + fn from_dense(dense_vec: &[f32]) -> Self { + if dense_vec.len() >= u16::MAX as usize { + panic!("Dense vector is too large to convert to sparse vector"); + } + let mut indices: Vec = Vec::new(); + let mut values = Vec::new(); + + for (idx, &value) in dense_vec.iter().enumerate() { + if value != 0.0 { + indices.push(idx.try_into().unwrap()); + values.push(value); + } + } + + SparseVector { indices, values } + } + + fn sparse_dot_product(&self, other: &SparseVector) -> (u16, f64) { + let mut result = 0.0; + let mut i = 0; + let mut j = 0; + let mut matches: u16 = 0; + while i < self.indices.len() && j < other.indices.len() { + if self.indices[i] == other.indices[j] { + matches += 1; + result += f64::from( self.values[i] * other.values[j]); + i += 1; + j += 1; + } else if self.indices[i] < other.indices[j] { + i += 1; + } else { + j += 1; + } + } + + (matches, result) + } +} + + +impl Display for SparseVector { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "SparseVector {{ indices: {:?}, values: {:?} }}", self.indices, self.values) + } +} + +fn generate_intersecting_vectors(first_size: usize, second_size: usize, intersection_size: usize) -> (SparseVector, SparseVector) +{ + let mut rng = rand::thread_rng(); + let mut first_vector_indices: Vec = Vec::with_capacity(first_size); + let mut second_vector_indices: Vec = Vec::with_capacity(second_size); + let mut first_vector: Vec = Vec::with_capacity(first_size); + let mut second_vector: Vec = Vec::with_capacity(second_size); + let unique_first = first_size - intersection_size; + let unique_second = second_size - intersection_size; + assert!(intersection_size + unique_first + unique_second <= 65535, "Too many elements in the vectors"); + let total = intersection_size + (first_size - intersection_size) + (second_size - intersection_size); + + let unique_indices: Vec = sample(&mut rng, 65535, total).into_iter().map(|x| x as u16).collect(); + // assert!( unique_indices.len() == total, "unique_indices length is not correct: {}, expected {}", unique_indices.len(), total); + + first_vector_indices.extend(unique_indices.iter().take(intersection_size)); + second_vector_indices.extend(unique_indices.iter().take(intersection_size)); + first_vector_indices.extend(unique_indices.iter().skip(intersection_size).take(first_size - intersection_size)); + second_vector_indices.extend(unique_indices.iter().skip(intersection_size).skip(first_size - intersection_size).take(second_size - intersection_size)); + first_vector_indices.sort(); + second_vector_indices.sort(); + + for _i in 0..first_size { + let value: f32 = rng.gen(); + first_vector.push(value); + } + for _i in 0..second_size { + let value: f32 = rng.gen(); + second_vector.push(value); + } + + (SparseVector{indices: first_vector_indices, values: first_vector}, SparseVector{indices: second_vector_indices, values: second_vector}) + +} + + +fn bench_dot_products(c: &mut Criterion) { + // Define test parameters + let first_lens = [66, 129, 513, 1025, 2049]; + let second_lens = [9, 17, 33]; + let intersection_ratios = [0.1, 0.5, 0.9]; + + // Create benchmark group for plain implementation + let mut plain_group = c.benchmark_group("plain_dot_product"); + for &first_len in first_lens.iter() { + for &second_len in second_lens.iter() { + for &ratio in intersection_ratios.iter() { + let intersection_size = (ratio * second_len as f32).ceil() as usize; + let params = format!("{}x{}@{}", first_len, second_len, ratio); + + plain_group.bench_with_input( + BenchmarkId::new("plain", params), + &(first_len, second_len, intersection_size), + |b, &(f_len, s_len, i_size)| { + b.iter_with_setup( + || generate_intersecting_vectors(f_len, s_len, i_size), + |(first_vector, second_vector)| { + let (similar_items, _dot_product) = first_vector.sparse_dot_product(&second_vector); + black_box(similar_items); + } + ); + } + ); + } + } + } + plain_group.finish(); + + // Create benchmark group for NEON implementation + let mut neon_group = c.benchmark_group("neon_dot_product"); + for &first_len in first_lens.iter() { + for &second_len in second_lens.iter() { + for &ratio in intersection_ratios.iter() { + let intersection_size = (ratio * second_len as f32).ceil() as usize; + let params = format!("{}x{}@{}", first_len, second_len, ratio); + + neon_group.bench_with_input( + BenchmarkId::new("neon", params), + &(first_len, second_len, intersection_size), + |b, &(f_len, s_len, i_size)| { + b.iter_with_setup( + || generate_intersecting_vectors(f_len, s_len, i_size), + |(first_vector, second_vector)| { + let (similar_items, _dot_product) = sparse_dot_product( + first_vector.indices.as_slice(), + second_vector.indices.as_slice(), + first_vector.values.as_slice(), + second_vector.values.as_slice(), + ); + black_box(similar_items) + } + ); + } + ); + } + } + } + neon_group.finish(); +} + +criterion_group!(benches, bench_dot_products); +criterion_main!(benches); \ No newline at end of file diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs new file mode 100644 index 00000000..7cdbe03c --- /dev/null +++ b/scripts/bench_tfidf.rs @@ -0,0 +1,220 @@ +use criterion::BenchmarkId; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use std::collections::HashMap; +use std::collections::HashSet; +use std::fs::File; +use std::io::BufRead; +use std::io::BufReader; +// use std::io::{self}; +use regex::Regex; +use std::path::Path; + +use simsimd::SpatialSimilarity; + +fn l2_normalize(v: &mut Vec) { + let mut sum = 0.0; + for i in v.iter() { + sum += i.powi(2); + } + sum = sum.sqrt(); + for element in v.iter_mut() { + *element = *element / sum; + } +} + +pub fn plain_cosine_similarity(vec1: &[f64], vec2: &[f64]) -> Option { + // Check if vectors have the same length + if vec1.len() != vec2.len() { + return None; + } + + // Calculate dot product + let dot_product: f64 = vec1.iter().zip(vec2.iter()).map(|(a, b)| a * b).sum(); + + // Calculate magnitudes + let magnitude1: f64 = vec1.iter().map(|x| x.powi(2)).sum::().sqrt(); + + let magnitude2: f64 = vec2.iter().map(|x| x.powi(2)).sum::().sqrt(); + + // Prevent division by zero + if magnitude1 == 0.0 || magnitude2 == 0.0 { + return None; + } + + // Calculate cosine similarity + Some(dot_product / (magnitude1 * magnitude2)) +} + +#[derive(Debug)] +struct TfidfCalculator { + tokens: Vec, + tokens_index: HashMap, + row_norms: Vec>, + documents: Vec>, + idfs: HashMap, + re: Regex, +} + +impl TfidfCalculator { + fn new() -> Self { + TfidfCalculator { + tokens: Vec::new(), + tokens_index: HashMap::new(), + row_norms: Vec::new(), + documents: Vec::new(), + idfs: HashMap::new(), + re: Regex::new(r"(?U)\b\w{2,}\b").unwrap(), + } + } + + fn tokenize(&self, text: &str) -> Vec { + let matches: Vec = self + .re + .find_iter(text.to_lowercase().as_str()) + .map(|mat| mat.as_str().to_string()) + .collect(); + matches + } + + fn process_documents(&mut self, documents: Vec) { + let mut unique_tokens = HashSet::new(); + let mut document_tokens: Vec> = Vec::new(); + for document in &documents { + let tokens = self.tokenize(document); + for token in tokens.iter() { + unique_tokens.insert(token.clone()); + } + document_tokens.push(tokens); + } + self.tokens = unique_tokens.into_iter().collect(); + self.tokens.sort(); + + for (idx, term) in self.tokens.iter().enumerate() { + self.tokens_index.insert(term.clone(), idx); + } + + let mut matrix = vec![vec![0.0; self.tokens.len()]; documents.len()]; + // I have count of terms in each document + for (row_idx, document) in document_tokens.iter().enumerate() { + for token in document.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + matrix[row_idx][*term_id] += 1.0; + } + } + self.documents = std::mem::take(&mut matrix); + } + + fn calculate_idf(&mut self) { + for term_id in 0..self.tokens.len() { + let mut df = 0; + for document in self.documents.iter() { + if document[term_id] > 0.0 { + df += 1; + } + } + let n_documents = self.documents.len(); + let present = df as f64; + let idf = (((n_documents + 1) as f64) / (present + 1.0)).ln() + 1.0; + self.idfs.insert(term_id, idf); + } + } + + fn calculate_tfidf(&mut self) { + let mut tfidf_matrix = vec![vec![0.0; self.tokens.len()]; self.documents.len()]; + + for (row_idx, document) in self.documents.iter().enumerate() { + for i in 0..self.tokens.len() { + let term_id = i; + let term_frequency = document[term_id]; + let tf = term_frequency as f64; + let idf = self.idfs.get(&term_id).unwrap(); + if term_frequency == 0.0 { + tfidf_matrix[row_idx][term_id] = 0.0; + } else { + tfidf_matrix[row_idx][term_id] = tf * idf; + } + } + } + self.row_norms = std::mem::take(&mut tfidf_matrix); + } + + fn normalize(&mut self) { + for row in self.row_norms.iter_mut() { + l2_normalize(row); + } + } + + fn tfidf_representation(&self, row: &str) -> Vec { + let mut row_vector = vec![0.0; self.tokens.len()]; + let tokens = self.tokenize(row); + for token in tokens.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + let term_idf = self.idfs.get(term_id).unwrap(); + row_vector[*term_id] += *term_idf; + } + l2_normalize(&mut row_vector); + return row_vector; + } +} + +pub fn tfidfsimilarity_benchmark(c: &mut Criterion) { + // Prepare your data and calculator here + + let path = Path::new("leipzig10000.txt"); + let file = File::open(path).unwrap(); + let reader = BufReader::new(file); + + let mut calculator = TfidfCalculator::new(); + + let mut documents = Vec::new(); + for line in reader.lines() { + documents.push(line.unwrap()); + } + + calculator.process_documents(documents); + calculator.calculate_idf(); + calculator.calculate_tfidf(); + calculator.normalize(); + + let mut group = c.benchmark_group("TF-IDF Similarity"); + + let query = "we went over to the whole world"; + let query_vector = calculator.tfidf_representation(query); + + group.sample_size(10); + + group.bench_function("SimSIMD Cosine Similarity", |b| { + b.iter(|| { + let total_similarity: f64 = calculator + .row_norms + .iter() + .map(|row_vector| { + black_box( + SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()) + .unwrap_or(0.0), + ) + }) + .sum(); + black_box(total_similarity) + }) + }); + + group.bench_function("Rust plain Cosine similarity", |b| { + b.iter(|| { + let total_similarity: f64 = calculator + .row_norms + .iter() + .map(|row_vector| { + black_box( + plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()) + .unwrap_or(0.0), + ) + }) + .sum(); + black_box(total_similarity) + }) + }); +} + +criterion_group!(benches, tfidfsimilarity_benchmark); +criterion_main!(benches); diff --git a/scripts/spdot.rs b/scripts/spdot.rs new file mode 100644 index 00000000..948c1a00 --- /dev/null +++ b/scripts/spdot.rs @@ -0,0 +1,142 @@ +use rand::Rng; +use std::fmt::{Formatter, Display}; +use rand::seq::index::sample; +use simsimd::sparse_dot_product; +//use half::bf16 as hbf16; +#[derive(Clone, Debug)] +struct SparseVector { + indices: Vec, + values: Vec, +} +impl SparseVector { + fn from_dense(dense_vec: &[f32]) -> Self { + if dense_vec.len() >= u16::MAX as usize { + panic!("Dense vector is too large to convert to sparse vector"); + } + let mut indices: Vec = Vec::new(); + let mut values = Vec::new(); + + for (idx, &value) in dense_vec.iter().enumerate() { + if value != 0.0 { + indices.push(idx.try_into().unwrap()); + values.push(value); + } + } + + SparseVector { indices, values } + } + + fn sparse_dot_product(&self, other: &SparseVector) -> (u16, f64) { + let mut result = 0.0; + let mut i = 0; + let mut j = 0; + let mut matches: u16 = 0; + while i < self.indices.len() && j < other.indices.len() { + if self.indices[i] == other.indices[j] { + matches += 1; + result += f64::from( self.values[i] * other.values[j]); + i += 1; + j += 1; + } else if self.indices[i] < other.indices[j] { + i += 1; + } else { + j += 1; + } + } + + (matches, result) + } +} + + +impl Display for SparseVector { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "SparseVector {{ indices: {:?}, values: {:?} }}", self.indices, self.values) + } +} + +fn generate_intersecting_vectors(first_size: usize, second_size: usize, intersection_size: usize) -> (SparseVector, SparseVector) +{ + let mut rng = rand::thread_rng(); + let mut first_vector_indices: Vec = Vec::with_capacity(first_size); + let mut second_vector_indices: Vec = Vec::with_capacity(second_size); + let mut first_vector: Vec = Vec::with_capacity(first_size); + let mut second_vector: Vec = Vec::with_capacity(second_size); + let unique_first = first_size - intersection_size; + let unique_second = second_size - intersection_size; + assert!(intersection_size + unique_first + unique_second <= 65535, "Too many elements in the vectors"); + let total = intersection_size + (first_size - intersection_size) + (second_size - intersection_size); + + let unique_indices: Vec = sample(&mut rng, 65535, total).into_iter().map(|x| x as u16).collect(); + // assert!( unique_indices.len() == total, "unique_indices length is not correct: {}, expected {}", unique_indices.len(), total); + + first_vector_indices.extend(unique_indices.iter().take(intersection_size)); + second_vector_indices.extend(unique_indices.iter().take(intersection_size)); + first_vector_indices.extend(unique_indices.iter().skip(intersection_size).take(first_size - intersection_size)); + second_vector_indices.extend(unique_indices.iter().skip(intersection_size).skip(first_size - intersection_size).take(second_size - intersection_size)); + first_vector_indices.sort(); + second_vector_indices.sort(); + + for _i in 0..first_size { + let value: f32 = rng.gen(); + first_vector.push(value); + } + for _i in 0..second_size { + let value: f32 = rng.gen(); + second_vector.push(value); + } + + (SparseVector{indices: first_vector_indices, values: first_vector}, SparseVector{indices: second_vector_indices, values: second_vector}) + +} + +pub fn main() { + for first_len in [64, 128, 512, 1024, 2048] { + for second_len in [8, 16, 32] { + for intersection_ratio in [0.1, 0.5, 0.9] { + let intersection_size = (intersection_ratio * second_len as f32).ceil() as usize; + let (first_vector, second_vector) = generate_intersecting_vectors(first_len, second_len, intersection_size); + let mut total_ns: u128 = 0; + for _j in 0..10 { + let start = std::time::Instant::now(); + let (_similar_items, _dot_product) = first_vector.sparse_dot_product(&second_vector); + let elapsed = start.elapsed(); + assert!(_similar_items == intersection_size as u16, "similar items: {}, intersection_size: {}", _similar_items, intersection_size); + total_ns += elapsed.as_nanos(); + } + println!("plain dot product: {} against {}, avg elapsed_time ns: {}", first_len, second_len, total_ns/10); + + + + } + } + } + + for first_len in [64, 128, 512, 1024, 2048] { + for second_len in [8, 16, 32] { + for intersection_ratio in [0.1, 0.5, 0.9] { + let intersection_size = (intersection_ratio * second_len as f32).ceil() as usize; + let (first_vector, second_vector) = generate_intersecting_vectors(first_len, second_len, intersection_size); + let mut total_ns: u128 = 0; + for _j in 0..10 { + let start = std::time::Instant::now(); + let (neon_similar_items, _dot_product) = sparse_dot_product( + first_vector.indices.as_slice(), + second_vector.indices.as_slice(), + first_vector.values.as_slice(), + second_vector.values.as_slice(), + ); + let elapsed = start.elapsed(); + total_ns += elapsed.as_nanos(); + assert!(neon_similar_items == intersection_size as f64, "similar items: {}, intersection_size: {}\n {}, \n second {}", neon_similar_items, intersection_size, first_vector, second_vector); + } + println!("NEON: {} vs {} avg elapsed_time ns: {}",first_len, second_len, total_ns/10); + + + } + } + } + + +} + \ No newline at end of file