modified jaccard similarity

t-sutradhara_microsoft · sdananya · commit 389e40449fad · 2025-07-27T23:16:19.000-07:00
diff --git a/apps/search_memory_index.cpp b/apps/search_memory_index.cpp
@@ -142,7 +142,10 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
           << std::setw(20) << "Brute Recall" 
           << std::setw(22) << "Graph avg cmps"
           << std::setw(22) << "Graph Latency(mus)" 
-          << std::setw(20) << "Graph Recall" 
+          << std::setw(20) << "Graph Recall"
+          << std::setw(18) << "Filter Eval(mus)"
+          << std::setw(18) << "Penalty Det(mus)"
+          << std::setw(18) << "Core Algo(mus)"
           << std::endl;
 
     table_width += 4 + 4 + 8 + 18 + 20 + 20 + 20 + 20 + 10 + 22 + 20 + 22 + 20 + 22 + 22;
@@ -408,26 +411,49 @@ int search_memory_index(diskann::Metric &metric, const std::string &index_path,
         }
         else
         {
-            std::cout << std::setw(4) << L << std::setw(4) << recall_at << std::setw(8) << displayed_qps << std::setw(15) << avg_cmps
-                      << std::setw(20) << (float)mean_latency << std::setw(15) 
-                      << std::setw(20) << (float)latency_999 << std::setw(15)
-                      << std::setw(20) << (float)latency_99 << std::setw(15)
-                      << std::setw(20) << (float)latency_95 << std::setw(15)
-                      << (float)recalls[0] << std::setw(20)
-                      << (float)(brute_dist_cmp[test_id] * 1.0) / (num_brutes * 1.0) << std::setw(22)
-                      << (float)(brute_lat[test_id] * 1.0) / (num_brutes * 1.0) << std::setw(20)
-                      << (float)(brute_recalls[test_id] * 100.0) / (num_brutes * recall_at * 1.0) << std::setw(20)
-                      << (float)(graph_lat[test_id] * 1.0) / (num_graphs * 1.0) << std::setw(20)
-                      << (float)(graph_recalls[test_id] * 100.0) / (num_graphs * recall_at * 1.0) << " " << (1000000*time_to_detect_penalty) / query_num << "\t" << (1000000*time_to_get_valid) / query_num 
-                      //                      << std::setw(20) << (float)(brute_lat[test_id]*1.0) << std::setw(20) <<
-                      //                      (float)(brute_recalls[test_id]*100.0)
-                      //                     << std::setw(20) << (float)(graph_lat[test_id]*1.0) << std::setw(20) <<
-                      //                     (float)(graph_recalls[test_id]*100.0)
+            // Calculate timing breakdowns (convert to microseconds)
+            float filter_eval_time_us = (float)(time_to_get_valid * 1000000.0) / (float)query_num;
+            float penalty_detection_time_us = (float)(time_to_detect_penalty * 1000000.0) / (float)query_num;
+            float core_algo_time_us = mean_latency - filter_eval_time_us - penalty_detection_time_us;
+            
+            std::cout << std::setw(4) << L << std::setw(4) << recall_at << std::setw(8) << displayed_qps << std::setw(18) << avg_cmps
+                      << std::setw(20) << (float)mean_latency 
+                      << std::setw(20) << (float)latency_999
+                      << std::setw(20) << (float)latency_99
+                      << std::setw(20) << (float)latency_95
+                      << std::setw(10) << (float)recalls[0] 
+                      << std::setw(22) << (float)(brute_dist_cmp[test_id] * 1.0) / (num_brutes * 1.0) 
+                      << std::setw(22) << (float)(brute_lat[test_id] * 1.0) / (num_brutes * 1.0) 
+                      << std::setw(20) << (float)(brute_recalls[test_id] * 100.0) / (num_brutes * recall_at * 1.0) 
+                      << std::setw(22) << (float)(graph_dist_cmp[test_id] * 1.0) / (num_graphs * 1.0)
+                      << std::setw(22) << (float)(graph_lat[test_id] * 1.0) / (num_graphs * 1.0) 
+                      << std::setw(20) << (float)(graph_recalls[test_id] * 100.0) / (num_graphs * recall_at * 1.0)
+                      << std::setw(18) << filter_eval_time_us
+                      << std::setw(18) << penalty_detection_time_us
+                      << std::setw(18) << core_algo_time_us
                       << std::endl;
         }
     }
     std::cout << "num_graphs " << num_graphs << std::endl;
     std::cout << "num_brutes " << num_brutes << std::endl;
+    
+    // Print detailed timing breakdown summary
+    if (filtered_search) {
+        std::cout << "\n=== TIMING BREAKDOWN ANALYSIS ===" << std::endl;
+        std::cout << "Total queries: " << query_num << std::endl;
+        std::cout << "Filter evaluation time: " << (time_to_get_valid * 1000000.0) / query_num << " μs/query" << std::endl;
+        std::cout << "Penalty detection time: " << (time_to_detect_penalty * 1000000.0) / query_num << " μs/query" << std::endl;
+        std::cout << "Filter intersection time: " << (time_to_intersect * 1000000.0) / query_num << " μs/query" << std::endl;
+        std::cout << "Filter check & compare time: " << (time_to_filter_check_and_compare * 1000000.0) / query_num << " μs/query" << std::endl;
+        
+        double total_filter_overhead = (time_to_get_valid + time_to_detect_penalty + time_to_intersect + time_to_filter_check_and_compare) * 1000000.0 / query_num;
+        std::cout << "Total filter overhead: " << total_filter_overhead << " μs/query" << std::endl;
+        
+        std::cout << "Breakdown percentage:" << std::endl;
+        std::cout << "  Graph searches: " << (100.0 * num_graphs) / query_num << "%" << std::endl;
+        std::cout << "  Brute force searches: " << (100.0 * num_brutes) / query_num << "%" << std::endl;
+        std::cout << "=================================" << std::endl;
+    }
 
     std::cout << "Done searching. Now saving results " << std::endl;
     uint64_t test_id = 0;
diff --git a/include/index.h b/include/index.h
@@ -121,6 +121,9 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
                                                         const std::vector<std::vector<LabelT>> &incoming_labels);
 
     DISKANN_DLLEXPORT inline float calculate_jaccard_similarity(const std::vector<LabelT> &set1, const std::vector<LabelT> &set2); 
+    
+    // Overload for multi-clause query filters (vector<vector<LabelT>>): returns the fraction of clauses in filter_sets that share a label with vector_labels
+    DISKANN_DLLEXPORT inline float calculate_jaccard_similarity(const std::vector<std::vector<LabelT>> &filter_sets, const std::vector<LabelT> &vector_labels);
 
     // Batch build from a file. Optionally pass tags vector.
     DISKANN_DLLEXPORT void build(const char *filename, const size_t num_points_to_load,
diff --git a/scripts/ml_ilp/ilp.py b/scripts/ml_ilp/ilp.py
@@ -130,6 +130,7 @@ def main():
     parser.add_argument('--method', choices=['ratio', 'lp', 'pulp'], default='ratio')
     parser.add_argument('--eps', type=float, default=1e-4)
     parser.add_argument('--plot', action='store_true')
+    parser.add_argument('--norm_factors', help='Normalization factors file (scale, shift)', default=None)
     args = parser.parse_args()
 
     # Read the ground truth file
@@ -164,9 +165,23 @@ def main():
     print(f"Distances shape: {distances.shape}")
     print(f"Matches shape: {matches.shape}")
     
-    print(f"Distances: {distances[0][:5]}")
-    # distances_scaled = distances / distances.max()
-    # print(f"Scaled distances: {distances_scaled[0][:5]}")
+    print(f"Original distances: {distances[0][:5]}")
+    
+    # Apply normalization if provided
+    if args.norm_factors:
+        with open(args.norm_factors, 'r') as f:
+            line = f.readline().strip()
+            scale, shift = map(float, line.split())
+        print(f"Applying normalization: scale={scale}, shift={shift}")
+        distances_normalized = (distances + shift) * scale
+        print(f"Normalized distances: {distances_normalized[0][:5]}")
+        distances = distances_normalized
+    else:
+        # Fallback: divide by the global max distance (NOTE(review): no guard if that max is 0)
+        distances_max = distances.max()
+        print(f"Max distance: {distances_max}")
+        distances = distances / distances_max
+        print(f"Max-normalized distances: {distances[0][:5]}")
 
     if args.method == 'ratio':
         w_d, w_m, total_pairs, _ = direct_ratio_method(distances, filter_matches, args.eps)
diff --git a/src/index.cpp b/src/index.cpp
@@ -951,25 +951,63 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::brute_force_filters(const
 
 template <typename T, typename TagT, typename LabelT>
 inline float Index<T, TagT, LabelT>:: calculate_jaccard_similarity(const std::vector<LabelT> &set1, const std::vector<LabelT> &set2) {
-    // std::cout << "calculate_jaccard_similarity called" << std::endl;
-    std::unordered_set<LabelT> intersection, union_set;
-
-    for (const auto &label : set1) {
-        union_set.insert(label);
-    }
-    for (const auto &label : set2) {
-        if (union_set.find(label) != union_set.end()) {
-            intersection.insert(label);
+    if (set1.empty()) return 0.0f;
+    
+    size_t intersection_count = 0;
+    
+    // Result: (size of the intersection) / set1.size(), i.e. containment of set1 in set2 rather than the symmetric Jaccard ratio.
+    // Small inputs use a linear scan (cache friendly); NOTE(review): the cutoff of 100 is a heuristic, no benchmark is visible here.
+    constexpr size_t LINEAR_SCAN_THRESHOLD = 100;
+    
+    if (set1.size() <= LINEAR_SCAN_THRESHOLD && set2.size() <= LINEAR_SCAN_THRESHOLD) {
+        // Linear scan, O(|set1| * |set2|): every element of set1 found in set2 is counted, duplicates included
+        for (const auto &label : set1) {
+            if (std::find(set2.begin(), set2.end(), label) != set2.end()) {
+                intersection_count++;
+            }
+        }
+    } else {
+        // Larger inputs: hash the longer vector, probe with the shorter; NOTE(review): with duplicate labels this may count differently from the scan
+        const auto &smaller = set1.size() <= set2.size() ? set1 : set2;
+        const auto &larger = set1.size() <= set2.size() ? set2 : set1;
+        
+        std::unordered_set<LabelT> lookup_set(larger.begin(), larger.end());
+        
+        for (const auto &label : smaller) {
+            if (lookup_set.count(label)) {
+                intersection_count++;
+            }
         }
-        union_set.insert(label);
     }
+    
+    return static_cast<float>(intersection_count) / static_cast<float>(set1.size());
+}
 
-    if (union_set.empty()) {
-        return 0.0f; // Avoid division by zero
+// Overload for multi-clause query filters (vector<vector<LabelT>>). Despite the name, the result is not a Jaccard ratio.
+// Returns the fraction in [0, 1] of filter sets (clauses) sharing at least one label with vector_labels; 0 if filter_sets is empty.
+template <typename T, typename TagT, typename LabelT>
+inline float Index<T, TagT, LabelT>:: calculate_jaccard_similarity(const std::vector<std::vector<LabelT>> &filter_sets, const std::vector<LabelT> &vector_labels) {
+    if (filter_sets.empty()) return 0.0f;
+    
+    size_t matching_clauses = 0;
+    
+    // Each clause contributes at most 1 to the count, however many of its labels match
+    for (const auto& filter_set : filter_sets) {
+        // A clause matches if any one of its labels appears in vector_labels; an empty clause never matches
+        bool clause_satisfied = false;
+        for (const auto& filter : filter_set) {
+            if (std::find(vector_labels.begin(), vector_labels.end(), filter) != vector_labels.end()) {
+                clause_satisfied = true;
+                break; // Early exit - this clause is satisfied
+            }
+        }
+        if (clause_satisfied) {
+            matching_clauses++;
+        }
     }
-
-    // return static_cast<float>(intersection.size()) / static_cast<float>(set1.size());
-    return static_cast<float>(intersection.size()) / static_cast<float>(set1.size());
+    
+    // Fraction of clauses matched; the callers visible in this diff use it as 1 - result
+    return static_cast<float>(matching_clauses) / static_cast<float>(filter_sets.size());
 }
 
 
@@ -1067,7 +1105,7 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
                 // if (res > 0) {
                 //     res = 1;
                 // }
-                res = 1 - calculate_jaccard_similarity(filter_labels[0], _location_to_labels[id]);
+                res = 1 - calculate_jaccard_similarity(filter_labels, _location_to_labels[id]);
                 if (print_qstats)
                 {
                     std::ofstream out("query_stats.txt", std::ios_base::app);
@@ -1095,7 +1133,7 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
                     continue;
                 }
                 else {
-                    res = 1 - calculate_jaccard_similarity(filter_labels[0], _location_to_labels[id]);
+                    res = 1 - calculate_jaccard_similarity(filter_labels, _location_to_labels[id]);
                     if (print_qstats)
                     {
                         std::ofstream out("query_stats.txt", std::ios_base::app);
@@ -1298,7 +1336,7 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
                         // penalty = res * penalty_scale;
                         // i
                         
-                        res = 1 - calculate_jaccard_similarity(filter_labels[0], _location_to_labels[id]);
+                        res = 1 - calculate_jaccard_similarity(filter_labels, _location_to_labels[id]);
 
 
                         if (print_qstats)
@@ -1319,7 +1357,7 @@ std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::iterate_to_fixed_point(
                         if (detect_common_filters(id, search_invocation, filter_labels) < min_inter_size)
                             continue;
                         else {
-                           res = 1 - calculate_jaccard_similarity(filter_labels[0], _location_to_labels[id]);
+                           res = 1 - calculate_jaccard_similarity(filter_labels, _location_to_labels[id]);
                         }
                     }
                 }