updated ground truth and ILP to accept relational filters

Ananya Sutradhar · Ananya Sutradhar · commit 7e6cc7302c18 · 2025-06-12T10:58:28.000Z
diff --git a/apps/utils/compute_filtered_groundtruth.cpp b/apps/utils/compute_filtered_groundtruth.cpp
@@ -460,6 +460,26 @@ void print_query_stats(std::vector<std::pair<uint32_t, uint32_t>> &v)
     return;
 }
 
+// Parsed form of a relational label such as "year<2020". NOTE(review): no visible hunk uses this struct yet.
+struct RelationalFilter {
+    std::string field; // attribute name to the left of the operator, e.g. "year"
+    std::string op;    // comparison operator: "<", "<=", ">" or ">="
+    std::string value; // right-hand operand, kept as text
+};
+
+inline bool is_relational(const std::string& label) { // true when the label contains '<' or '>' anywhere, e.g. "year<2020"
+    return label.find('<') != std::string::npos || label.find('>') != std::string::npos;
+}
+
+inline bool eval_relational(const std::string& base_val, const std::string& op, const std::string& query_val) { // tests "base_val op query_val"
+    float b = std::stof(base_val), q = std::stof(query_val); // compared as floats; std::stof throws if either string is not numeric
+    if (op == "<") return b < q;
+    if (op == "<=") return b <= q;
+    if (op == ">") return b > q;
+    if (op == ">=") return b >= q;
+    return false; // any other operator string never matches
+}
+
 // template<typename A, typename B>
 // add UNIVERSAL LABEL SUPPORT
 int identify_matching_points(const std::string &base, const size_t start_id, const std::string &query,
@@ -493,19 +513,52 @@ int identify_matching_points(const std::string &base, const size_t start_id, con
                 for (uint32_t k = 0; k < query_labels[i].size(); k++)
                 {
                     bool or_pass = false;
-                for (uint32_t l = 0; l < query_labels[i][k].size(); l++)
-                {
-                    if (base_labels[j].find(query_labels[i][k][l]) != base_labels[j].end())
+                    for (uint32_t l = 0; l < query_labels[i][k].size(); l++)
                     {
-                        or_pass = true;
+                        const std::string& qlabel = query_labels[i][k][l];
+                        if (!is_relational(qlabel)) {
+                            // Old flow: treat as set
+                            if (base_labels[j].find(qlabel) != base_labels[j].end()) {
+                                or_pass = true;
+                                break;
+                            }
+                        } else {
+                            // New flow: relational filter
+                            // Parse field, op, value from qlabel, e.g. "year<2020"
+                            size_t pos = qlabel.find_first_of("<>");
+                            std::string field = qlabel.substr(0, pos);
+                            std::string op = qlabel.substr(pos, (qlabel[pos+1] == '=') ? 2 : 1);
+                            std::string value = qlabel.substr(pos + op.size());
+                            // // Find base value for this field
+                            auto it = std::find_if(base_labels[j].begin(), base_labels[j].end(),
+                                [&](const std::string& s) { return s.find(field + "=") == 0; });
+                            // if (it != base_labels[j].end()) {
+                            //     std::string base_val = it->substr(field.size() + 1);
+                            //     if (eval_relational(base_val, op, value)) {
+                            //         or_pass = true;
+                            //         break;
+                            //     }
+                            // }
+                            if (it != base_labels[j].end()) {
+                                std::string base_val = it->substr(field.size() + 1);
+                                bool match = eval_relational(base_val, op, value);
+                                // #pragma omp critical
+                                // {
+                                //     std::cout << "Query: " << qlabel << ", Base: " << *it << ", Parsed: " << base_val
+                                //             << ", Match: " << match << std::endl;
+                                // }
+                                if (match) {
+                                    or_pass = true;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                    if (!or_pass) {
+                        pass = false;
                         break;
                     }
                 }
-                if (or_pass == false) {
-                    pass = false;
-                    break;
-                }
-                }
             }
             if (pass)
             {
diff --git a/apps/utils/compute_groundtruth.cpp b/apps/utils/compute_groundtruth.cpp
@@ -494,11 +494,14 @@ int aux_main(const std::string &base_file, const std::string &query_file, const
 
     int *closest_points = new int[nqueries * k];
     float *dist_closest_points = new float[nqueries * k];
-    std::vector<std::vector<float>> match_scores(nqueries, std::vector<float>(k, 0));
 
     std::vector<std::vector<std::pair<uint32_t, float>>> results =
         processUnfilteredParts<T>(base_file, nqueries, npoints, dim, k, query_data, metric, location_to_tag);
 
+    std::vector<std::vector<float>> jaccard_scores(nqueries, std::vector<float>(k, 0));
+    std::vector<std::vector<float>> relational_scores(nqueries, std::vector<float>(k, 0));
+
+
     for (size_t i = 0; i < nqueries; i++)
     {
         std::vector<std::pair<uint32_t, float>> &cur_res = results[i];
@@ -524,29 +527,61 @@ int aux_main(const std::string &base_file, const std::string &query_file, const
                 dist_closest_points[i * k + j] = iter.second;
 
             // Calculate match score for this vector
+            // Jaccard score (normal filters)
+            float jaccard_similarity = 0.0f;
+            // Relational score (relational filters)
+            float rel_score = 0.0f;
+
             if (!base_labels.empty() && !query_labels.empty() && iter.first < base_labels.size())
             {
                 const auto &query_label_predicates = query_labels[i];
                 const auto &base_label_set = base_labels[iter.first];
 
-
-                // calculate jaccard distance between query and base labels
+                // Jaccard
                 std::set<std::string> intersection;
+                int normal_total = 0;
+                for (const auto &clause : query_label_predicates)
+                {
+                    for (const auto &label : clause)
+                    {
+                        size_t pos = label.find_first_of("<>");
+                        if (pos == std::string::npos) { // normal filter
+                            normal_total++;
+                            if (base_label_set.find(label) != base_label_set.end())
+                                intersection.insert(label);
+                        }
+                    }
+                }
+                jaccard_similarity = (normal_total > 0) ? (float)intersection.size() / normal_total : 0.0f;
+
+                // Relational
                 for (const auto &clause : query_label_predicates)
                 {
                     for (const auto &label : clause)
                     {
-                        if (base_label_set.find(label) != base_label_set.end())
-                        {
-                            intersection.insert(label);
+                        size_t pos = label.find_first_of("<>");
+                        if (pos != std::string::npos) { // relational filter
+                            std::string field = label.substr(0, pos);
+                            std::string op = label.substr(pos, (label[pos+1] == '=') ? 2 : 1);
+                            std::string value = label.substr(pos + op.size());
+                            for (const auto &base_label : base_label_set) // first label found with the prefix "<field>=" supplies the base value
+                            {
+                                if (base_label.find(field + "=") == 0)
+                                {
+                                    const float query_val = std::stof(value);
+                                    const float gap = std::abs(query_val - std::stof(base_label.substr(field.size() + 1)));
+                                    rel_score = (query_val != 0.0f) ? gap / std::abs(query_val) : gap; // |query| denominator: never negative, no inf/NaN when query is 0
+                                    break;
+                                }
+                            }
                         }
                     }
                 }
-                   
-                float jaccard_distance = (float)intersection.size() / (float)query_label_predicates.size();
-                match_scores[i][j] = jaccard_distance;            
             }
 
+            jaccard_scores[i][j] = jaccard_similarity;
+            relational_scores[i][j] = rel_score;
+
             ++j;
         }
         if (j < k)
@@ -564,11 +599,23 @@ int aux_main(const std::string &base_file, const std::string &query_file, const
             std::cerr << "Failed to open match score file: " << match_score_file << std::endl;
             return -1;
         }
+        // First part: Jaccard scores
+        for (size_t i = 0; i < nqueries; i++)
+        {
+            for (size_t j = 0; j < k; j++)
+            {
+                match_score_writer << jaccard_scores[i][j];
+                if (j < k - 1)
+                    match_score_writer << " ";
+            }
+            match_score_writer << "\n";
+        }
+        // Second part: Relational scores
         for (size_t i = 0; i < nqueries; i++)
         {
             for (size_t j = 0; j < k; j++)
             {
-                match_score_writer << match_scores[i][j];
+                match_score_writer << relational_scores[i][j];
                 if (j < k - 1)
                     match_score_writer << " ";
             }
diff --git a/scripts/ml_ilp/ilp.py b/scripts/ml_ilp/ilp.py
@@ -36,6 +36,15 @@ def read_ground_truth(file_path):
 
     return indices, distances
 
+def read_match_scores(match_score_file, Q, N):
+    scores = np.loadtxt(match_score_file)
+    if scores.shape[0] == 2 * Q:
+        jaccard_scores = scores[:Q, :]
+        relational_scores = scores[Q:, :]
+    else:
+        raise ValueError("Unexpected match score file shape")
+    return jaccard_scores, relational_scores
+
 def direct_ratio_method(distances, matches, eps=1e-4):
     Q, N = distances.shape
     max_diff = 0.0
@@ -158,15 +167,63 @@ def lp_soft_method_without_slack(distances, matches, eps=1e-4, method ='lp_wo_sl
     print("eps:", eps)
     prob.solve(pulp.PULP_CBC_CMD(msg=False))
     return w_m.value(), num_equations
-    
+
+
+def lp_soft_method_pulp_with_relational(distances, jaccard_scores, relational_scores, eps=1e-4):  # learns w_m, w_r >= 0 (w_d fixed to 1) from soft pairwise ranking constraints
+    print(f"Distances shape: {distances.shape}")
+    Q, N = distances.shape  # N is not used below
+    print("using PuLP with relational filter weight")
+    prob = pulp.LpProblem('VectorRanking', pulp.LpMinimize)
+    w_d = 1  # distance weight is a constant, not an LP variable
+    w_m = pulp.LpVariable('w_m', lowBound=0)  # weight on (1 - jaccard)
+    w_r = pulp.LpVariable('w_r', lowBound=0)  # weight on the relational score
+    slacks = []
+    # Per query: split candidates into positives/negatives, then add one slack-relaxed constraint per sampled pair.
+    for q in tqdm(range(Q), desc="Building PuLP constraints"):
+        d = distances[q]
+        jac = jaccard_scores[q]
+        rel = relational_scores[q]
+        # Positive: jaccard==1 and (relational==0 if relational filter exists); an all-zero rel row is treated as "no relational filter"
+        has_relational = np.any(rel != 0)
+        if has_relational:
+            pos = np.where((jac == 1) & (rel == 0))[0]
+            neg = np.where(~((jac == 1) & (rel == 0)))[0]
+        else:
+            pos = np.where(jac == 1)[0]
+            neg = np.where(jac < 1)[0]
+        neg_sample_size = min(1, len(neg))  # at most one negative per positive; 0 when there are no negatives
+
+        for i in pos:
+            neg_sample = np.random.choice(neg, size=neg_sample_size, replace=False)  # uses NumPy's global RNG, so pairs vary between runs unless it is seeded
+            for j in neg_sample:
+                if d[i] < d[j]:  # positive is already strictly closer than the negative: no constraint added
+                    continue
+                s = pulp.LpVariable(f's_{q}_{i}_{j}', lowBound=0)
+                slacks.append(s)
+                prob += (  # positive's weighted score + eps must not exceed the negative's, up to slack s
+                    w_d * d[i] + w_m * (1 - jac[i]) + w_r * rel[i] + eps
+                    <= w_d * d[j] + w_m * (1 - jac[j]) + w_r * rel[j] + s
+                )
+    print(f"Total equations: {len(slacks)}")
+    alpha = 500  # weight of the mean slack relative to w_m + w_r in the objective
+    if len(slacks) > 0:
+        avg_slack = pulp.lpSum(slacks) / len(slacks)
+    else:
+        avg_slack = 0
+    prob += w_m + w_r + alpha * avg_slack  # objective (problem was created with LpMinimize)
+    print("Solving LP...")
+    prob.solve(pulp.PULP_CBC_CMD(msg=False))
+    slack_vals = [v.value() for v in slacks]
+    violations = sum(1 for v in slack_vals if v > 1e-6)  # slacks above 1e-6 count as violated pairs
+    return w_d, w_m.value(), w_r.value(), len(slacks), violations  # (w_d, w_m, w_r, number of constraints, number of violated pairs)
 
 
 def main():
     parser = argparse.ArgumentParser(description='Learn weights for vector ranking')
     parser.add_argument('unfiltered_ground_truth', help='Unfiltered Ground truth file (binary format)')
     parser.add_argument('filtered_ground_truth', help='Filtered Ground truth file (binary format)')
     parser.add_argument('unfiltered_match_scores', help='Filter match file (match scores)')
-    parser.add_argument('--method', choices=['ratio', 'gekko', 'pulp', 'pulp_wo_slack'], default='ratio')
+    parser.add_argument('--method', choices=['ratio', 'gekko', 'pulp', 'pulp_wo_slack', 'pulp_w_relational'], default='ratio')
     parser.add_argument('--eps', type=float, default=1e-4)
     parser.add_argument('--plot', action='store_true')
     args = parser.parse_args()
@@ -180,10 +237,10 @@ def main():
     print(f"Filter matches shape: {unfiltered_match_scores.shape}")
     print("Done reading filter match file")
     
-    # Validate shapes
-    if unfiltered_gt_indices.shape != unfiltered_match_scores.shape:
-        print(f"Shape mismatch: {unfiltered_gt_indices.shape} vs {unfiltered_match_scores.shape}")
-        sys.exit(1)
+    # # Validate shapes
+    # if unfiltered_gt_indices.shape != unfiltered_match_scores.shape:
+    #     print(f"Shape mismatch: {unfiltered_gt_indices.shape} vs {unfiltered_match_scores.shape}")
+    #     sys.exit(1)
         
     
     # Concatenate filtered and unfiltered ground truth distances and match scores
@@ -196,10 +253,10 @@ def main():
     print(f"Number of filtered queries: {num_filtered}")
     
     distances = np.concatenate([filtered_gt_distances, unfiltered_gt_distances], axis=1)
-    matches = np.concatenate([filtered_match_score, unfiltered_match_scores], axis=1)
+    # matches = np.concatenate([filtered_match_score, unfiltered_match_scores], axis=1)
     
     print(f"Distances shape: {distances.shape}")
-    print(f"Matches shape: {matches.shape}")
+    # print(f"Matches shape: {matches.shape}")
     
     print(f"Distances: {distances[0][:5]}")
     # distances_scaled = distances / distances.max()
@@ -215,6 +272,41 @@ def main():
         w_d, w_m, total_pairs, violations = lp_soft_method_gekko(distances, unfiltered_match_scores, args.eps)
     if args.method == 'pulp':
         w_d, w_m, total_pairs, violations = lp_soft_method_pulp(distances, unfiltered_match_scores, args.eps)
+    if args.method == 'pulp_w_relational':
+        unfiltered_jaccard_scores, unfiltered_relational_scores = read_match_scores(args.unfiltered_match_scores, *distances.shape)
+        print(f"Unfiltered Jaccard scores shape: {unfiltered_jaccard_scores.shape}")
+        print(f"Unfiltered Relational scores shape: {unfiltered_relational_scores.shape}")
+        max_rel = np.max(unfiltered_relational_scores, axis=1, keepdims=True)
+        max_rel[max_rel == 0] = 1.0
+        unfiltered_relational_scores = unfiltered_relational_scores / max_rel
+        
+        print(f"Relational scores: {unfiltered_relational_scores[0][:5]}")
+        
+        filtered_jaccard_scores = np.ones_like(filtered_gt_distances, dtype=np.float32)
+        filtered_relational_scores = np.zeros_like(filtered_gt_distances, dtype=np.float32)
+        
+        jaccard_scores = np.concatenate([filtered_jaccard_scores, unfiltered_jaccard_scores], axis=1)
+        relational_scores = np.concatenate([filtered_relational_scores, unfiltered_relational_scores], axis=1)
+        relational_scores = relational_scores.astype(np.float32)
+        
+        print(f"Jaccard scores shape: {jaccard_scores.shape}")
+        print(f"Relational scores shape: {relational_scores.shape}")
+        
+        print(f"Relational scores: {relational_scores[0]}")
+        
+        # # take the first 1000 queries
+        # jaccard_scores = jaccard_scores[:1000]
+        # relational_scores = relational_scores[:1000]
+        # distances = distances[:1000]
+        print(f"Filtered Jaccard scores shape: {jaccard_scores.shape}")
+        print(f"Filtered Relational scores shape: {relational_scores.shape}")
+        
+
+        
+        w_d, w_m, w_r, total_pairs, violations = lp_soft_method_pulp_with_relational(
+            distances, jaccard_scores, relational_scores, args.eps
+        )
+        print(f"Relational weight: {w_r:.6f}")
 
     print(f"Method: {args.method}")
     print(f"Total pairs evaluated: {total_pairs}")