Skip to content

Commit 8c2d73c

Browse files
mihow and Copilot authored
Require Algorithms & CategoryMaps to be registered before processing (#942)
* feat: use enums for valid algorithm task types * feat: create labels hash automatically on create * feat: require that category maps & algorithms are created before processing * fix: logic for checking existing algorithms, introduce named exception * feat: allow passing existing pipeline configs when registering, renames * feat: update tests for new behavior (algos & maps must be pre-registered) * feat: deprecate the inclusion of algorithm details in result responses * feat: migration for merging duplicate category maps * feat: update category map relationships in old classifications * feat: ensure labels hash is always generated with save method. * feat: docs and convenience methods to explain category map schema * fix: return category maps from all models in example processing service * fix: slightly improve the top_n taxa response * Update ami/ml/tests.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update ami/ml/models/pipeline.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 3866aff commit 8c2d73c

File tree

15 files changed

+648
-262
lines changed

15 files changed

+648
-262
lines changed

ami/main/models.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2219,14 +2219,15 @@ def top_n(self, n: int = 3) -> list[dict[str, "Taxon | float | None"]]:
22192219
"""Return top N taxa and scores for this classification."""
22202220
if not self.category_map:
22212221
logger.warning(
2222-
f"Classification {self.pk}'s algrorithm ({self.algorithm_id} has no catgory map, "
2222+
f"Classification {self.pk}'s algorithm ({self.algorithm_id}) has no category map, "
22232223
"can't get top N predictions."
22242224
)
22252225
return []
22262226

22272227
top_scored = self.top_scores_with_index(n) # (index, score) pairs
22282228
indexes = [idx for idx, _ in top_scored]
2229-
category_data = self.category_map.with_taxa(only_indexes=indexes)
2229+
category_data: list[dict] = self.category_map.with_taxa(only_indexes=indexes)
2230+
assert category_data is not None
22302231
index_to_taxon = {cat["index"]: cat["taxon"] for cat in category_data}
22312232

22322233
return [

ami/ml/exceptions.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class PipelineNotConfigured(ValueError):
    """Raised when a pipeline is used before its required configuration has
    been registered (per this commit, algorithms and category maps must be
    created before processing).
    """

    pass
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Generated on 2025-09-09 for merging duplicate AlgorithmCategoryMaps
2+
3+
from django.db import migrations
4+
from django.db.models import Count
5+
import logging
6+
import json
7+
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
def merge_duplicate_category_maps(apps, schema_editor):
    """
    Find duplicate AlgorithmCategoryMaps based on their `data` field,
    compare them (description, version, associated Algorithms, associated Classifications),
    choose a keeper, then reassign Classifications and Algorithms to use the keeper.
    Then delete the duplicates.

    :param apps: historical app registry supplied by RunPython.
    :param schema_editor: unused, required by the RunPython signature.
    """
    AlgorithmCategoryMap = apps.get_model("ml", "AlgorithmCategoryMap")
    Algorithm = apps.get_model("ml", "Algorithm")
    Classification = apps.get_model("main", "Classification")

    def score_category_map(cm):
        """Rank a candidate keeper.

        Priority: has description > has version > most linked algorithms >
        most linked classifications > earliest created (tie-breaker).
        Defined once here (loop-invariant) rather than per group.
        """
        score = 0

        # Has description
        if cm.description:
            score += 1000

        # Has version
        if cm.version:
            score += 500

        # Count associated algorithms
        score += Algorithm.objects.filter(category_map=cm).count() * 100

        # Count associated classifications
        score += Classification.objects.filter(category_map=cm).count() * 10

        # Prefer older records: subtract a tiny fraction of the creation
        # timestamp so it only ever acts as a tie-breaker.
        score -= cm.created_at.timestamp() / 1000000

        return score

    # Group category maps by their data content (JSON field), keyed by a
    # normalized (sorted-keys) JSON serialization so equal data compares equal.
    data_groups = {}
    for category_map in AlgorithmCategoryMap.objects.all():
        normalized_data = json.dumps(category_map.data, sort_keys=True)
        data_groups.setdefault(normalized_data, []).append(category_map)

    # Process each group that has duplicates
    duplicates_found = 0
    maps_merged = 0

    for category_maps in data_groups.values():
        if len(category_maps) <= 1:
            continue  # Skip groups with only one category map

        duplicates_found += len(category_maps) - 1
        logger.info(f"Found {len(category_maps)} duplicate category maps with identical data")

        # Sort by score (highest first) and pick the keeper
        sorted_maps = sorted(category_maps, key=score_category_map, reverse=True)
        keeper = sorted_maps[0]
        duplicates = sorted_maps[1:]

        logger.info(f"Keeping category map #{keeper.pk}, merging {len(duplicates)} duplicates")

        # Merge data from duplicates to keeper
        for duplicate in duplicates:
            # Repoint algorithms that reference the duplicate
            algorithms_updated = Algorithm.objects.filter(category_map=duplicate).update(category_map=keeper)
            logger.info(f"Updated {algorithms_updated} algorithms from category map #{duplicate.pk} to #{keeper.pk}")

            # Repoint classifications that reference the duplicate
            classifications_updated = Classification.objects.filter(category_map=duplicate).update(category_map=keeper)
            logger.info(
                f"Updated {classifications_updated} classifications from category map #{duplicate.pk} to #{keeper.pk}"
            )

            # Backfill metadata on the keeper from the duplicate when missing
            if not keeper.description and duplicate.description:
                keeper.description = duplicate.description
                logger.info(f"Updated keeper description from duplicate #{duplicate.pk}")

            if not keeper.version and duplicate.version:
                keeper.version = duplicate.version
                logger.info(f"Updated keeper version from duplicate #{duplicate.pk}")

            if not keeper.uri and duplicate.uri:
                keeper.uri = duplicate.uri
                logger.info(f"Updated keeper URI from duplicate #{duplicate.pk}")

        # Save keeper with any merged data
        keeper.save()

        # Delete the duplicates (all relations were repointed above)
        for duplicate in duplicates:
            logger.info(f"Deleting duplicate category map #{duplicate.pk}")
            duplicate.delete()

        maps_merged += len(duplicates)

    logger.info(
        f"Migration completed: {duplicates_found} duplicates found, {maps_merged} category maps merged and deleted"
    )
122+
123+
124+
def reverse_merge_duplicate_category_maps(apps, schema_editor):
    """Refuse to reverse the merge.

    The forward migration permanently deletes the duplicate category maps,
    so the original rows and their relationships cannot be reconstructed
    without a database backup.
    """
    message = (
        "This migration cannot be reversed as it permanently deletes duplicate "
        "AlgorithmCategoryMap instances. If you need to reverse this, restore from a backup."
    )
    raise NotImplementedError(message)
134+
135+
136+
class Migration(migrations.Migration):
    # Data migration: merges AlgorithmCategoryMaps whose `data` JSON is
    # identical, repointing Algorithms and Classifications to a single keeper.
    # Irreversible in practice (the reverse function raises).

    dependencies = [
        ("ml", "0022_alter_pipeline_default_config"),
        ("main", "0053_alter_classification_algorithm"),  # Ensure Classification model is available
    ]

    operations = [
        migrations.RunPython(
            merge_duplicate_category_maps,
            reverse_merge_duplicate_category_maps,
        ),
    ]
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Generated on 2025-09-09 for fixing unlinked Classifications
2+
3+
from django.db import migrations
4+
import logging
5+
6+
7+
logger = logging.getLogger(__name__)
8+
9+
10+
def fix_unlinked_classifications(apps, schema_editor):
    """Backfill `category_map` on classifications that lack one.

    Targets classifications whose `category_map` is null while their algorithm
    has one — legacy rows created before automatic category_map assignment.

    :param apps: historical app registry supplied by RunPython.
    :param schema_editor: unused, required by the RunPython signature.
    """
    Classification = apps.get_model("main", "Classification")

    # Rows to repair: no category_map of their own, but the algorithm has one.
    missing_map_qs = Classification.objects.filter(
        category_map__isnull=True, algorithm__category_map__isnull=False
    )

    total_unlinked = missing_map_qs.count()
    logger.info(f"Found {total_unlinked:,} classifications missing category_map but with algorithm that has one")

    if not total_unlinked:
        logger.info("No unlinked classifications found - migration complete")
        return

    total_fixed = 0

    # One bulk update per distinct algorithm keeps the query count small.
    for algorithm_id in missing_map_qs.values_list("algorithm_id", flat=True).distinct():
        per_algorithm_qs = missing_map_qs.filter(algorithm_id=algorithm_id)

        # Use any one row to resolve the algorithm's category_map.
        sample = per_algorithm_qs.first()
        if not sample or not sample.algorithm:
            continue

        category_map = sample.algorithm.category_map
        if not category_map:
            continue

        updated_count = per_algorithm_qs.update(category_map=category_map)
        total_fixed += updated_count

        logger.info(
            f"Updated {updated_count:,} classifications for algorithm #{algorithm_id} to use category_map #{category_map.pk}"
        )

    logger.info(f"Migration completed: Fixed {total_fixed:,} unlinked classifications")
56+
57+
58+
def reverse_fix_unlinked_classifications(apps, schema_editor):
    """Intentional no-op reversal (logs a warning only).

    The forward migration repairs a data-consistency problem; undoing it
    would reintroduce the inconsistency. A true reversal (nulling out the
    category_map on the rows that were updated) is deliberately not
    implemented.
    """
    logger.warning("Reversing this migration would create data inconsistency - not recommended")
    # A reversal could be implemented if absolutely necessary, but it is not
    # recommended since the forward migration fixes legitimate data issues.
67+
68+
69+
class Migration(migrations.Migration):
    # Data migration: assigns the algorithm's category_map to legacy
    # classifications that were created without one. The reverse operation
    # is a logged no-op (reversal would reintroduce the inconsistency).

    dependencies = [
        ("ml", "0023_merge_duplicate_category_maps"),
        ("main", "0053_alter_classification_algorithm"),  # Ensure Classification model is available
    ]

    operations = [
        migrations.RunPython(
            fix_unlinked_classifications,
            reverse_fix_unlinked_classifications,
        ),
    ]

0 commit comments

Comments
 (0)