Skip to content

Commit 8c2d73c

Browse files
mihow and Copilot authored
Require Algorithms & CategoryMaps to be registered before processing (#942)
* feat: use enums for valid algorithm task types * feat: create labels hash automatically on create * feat: require that category maps & algorithms are created before processing * fix: logic for checking existing algorithms, introduce named exception * feat: allow passing existing pipeline configs when registering, renames * feat: update tests for new behavior (algos & maps must be pre-registered) * feat: deprecate the inclusion of algorithm details in result responses * feat: migration for merging duplicate category maps * feat: update category map relationships in old classifications * feat: ensure labels hash is always generated with save method. * feat: docs and convenience methods to explain category map schema * fix: return category maps from all models in example processing service * fix: slightly improve the top_n taxa response * Update ami/ml/tests.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update ami/ml/models/pipeline.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 3866aff commit 8c2d73c

File tree

15 files changed

+648
-262
lines changed

15 files changed

+648
-262
lines changed

ami/main/models.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2219,14 +2219,15 @@ def top_n(self, n: int = 3) -> list[dict[str, "Taxon | float | None"]]:
22192219
"""Return top N taxa and scores for this classification."""
22202220
if not self.category_map:
22212221
logger.warning(
2222-
f"Classification {self.pk}'s algrorithm ({self.algorithm_id} has no catgory map, "
2222+
f"Classification {self.pk}'s algorithm ({self.algorithm_id}) has no category map, "
22232223
"can't get top N predictions."
22242224
)
22252225
return []
22262226

22272227
top_scored = self.top_scores_with_index(n) # (index, score) pairs
22282228
indexes = [idx for idx, _ in top_scored]
2229-
category_data = self.category_map.with_taxa(only_indexes=indexes)
2229+
category_data: list[dict] = self.category_map.with_taxa(only_indexes=indexes)
2230+
assert category_data is not None
22302231
index_to_taxon = {cat["index"]: cat["taxon"] for cat in category_data}
22312232

22322233
return [

ami/ml/exceptions.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class PipelineNotConfigured(ValueError):
    """Raised when a pipeline is used before its required configuration has
    been registered (per this commit, algorithms and category maps must be
    created before processing).
    """

    pass
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Generated on 2025-09-09 for merging duplicate AlgorithmCategoryMaps
2+
3+
from django.db import migrations
4+
from django.db.models import Count
5+
import logging
6+
import json
7+
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
def merge_duplicate_category_maps(apps, schema_editor):
    """
    Find duplicate AlgorithmCategoryMaps based on their `data` field,
    compare them (description, version, associated Algorithms, associated Classifications),
    choose a keeper, then reassign Classifications and Algorithms to use the keeper.
    Then delete the duplicates.

    :param apps: historical app registry supplied by RunPython.
    :param schema_editor: unused, required by the RunPython signature.
    """
    AlgorithmCategoryMap = apps.get_model("ml", "AlgorithmCategoryMap")
    Algorithm = apps.get_model("ml", "Algorithm")
    Classification = apps.get_model("main", "Classification")

    def score_category_map(cm):
        """Rank a candidate keeper.

        Priority: has description > has version > most linked algorithms >
        most linked classifications > earliest created (tie-breaker).
        Defined once here (loop-invariant) rather than per group.
        """
        score = 0

        # Has description
        if cm.description:
            score += 1000

        # Has version
        if cm.version:
            score += 500

        # Count associated algorithms
        score += Algorithm.objects.filter(category_map=cm).count() * 100

        # Count associated classifications
        score += Classification.objects.filter(category_map=cm).count() * 10

        # Prefer older records: subtract a tiny fraction of the creation
        # timestamp so it only ever acts as a tie-breaker.
        score -= cm.created_at.timestamp() / 1000000

        return score

    # Group category maps by their data content (JSON field), keyed by a
    # normalized (sorted-keys) JSON serialization so equal data compares equal.
    data_groups = {}
    for category_map in AlgorithmCategoryMap.objects.all():
        normalized_data = json.dumps(category_map.data, sort_keys=True)
        data_groups.setdefault(normalized_data, []).append(category_map)

    # Process each group that has duplicates
    duplicates_found = 0
    maps_merged = 0

    for category_maps in data_groups.values():
        if len(category_maps) <= 1:
            continue  # Skip groups with only one category map

        duplicates_found += len(category_maps) - 1
        logger.info(f"Found {len(category_maps)} duplicate category maps with identical data")

        # Sort by score (highest first) and pick the keeper
        sorted_maps = sorted(category_maps, key=score_category_map, reverse=True)
        keeper = sorted_maps[0]
        duplicates = sorted_maps[1:]

        logger.info(f"Keeping category map #{keeper.pk}, merging {len(duplicates)} duplicates")

        # Merge data from duplicates to keeper
        for duplicate in duplicates:
            # Repoint algorithms that reference the duplicate
            algorithms_updated = Algorithm.objects.filter(category_map=duplicate).update(category_map=keeper)
            logger.info(f"Updated {algorithms_updated} algorithms from category map #{duplicate.pk} to #{keeper.pk}")

            # Repoint classifications that reference the duplicate
            classifications_updated = Classification.objects.filter(category_map=duplicate).update(category_map=keeper)
            logger.info(
                f"Updated {classifications_updated} classifications from category map #{duplicate.pk} to #{keeper.pk}"
            )

            # Backfill metadata on the keeper from the duplicate when missing
            if not keeper.description and duplicate.description:
                keeper.description = duplicate.description
                logger.info(f"Updated keeper description from duplicate #{duplicate.pk}")

            if not keeper.version and duplicate.version:
                keeper.version = duplicate.version
                logger.info(f"Updated keeper version from duplicate #{duplicate.pk}")

            if not keeper.uri and duplicate.uri:
                keeper.uri = duplicate.uri
                logger.info(f"Updated keeper URI from duplicate #{duplicate.pk}")

        # Save keeper with any merged data
        keeper.save()

        # Delete the duplicates (all relations were repointed above)
        for duplicate in duplicates:
            logger.info(f"Deleting duplicate category map #{duplicate.pk}")
            duplicate.delete()

        maps_merged += len(duplicates)

    logger.info(
        f"Migration completed: {duplicates_found} duplicates found, {maps_merged} category maps merged and deleted"
    )
122+
123+
124+
def reverse_merge_duplicate_category_maps(apps, schema_editor):
    """Refuse to reverse the merge.

    The forward migration permanently deletes the duplicate category maps,
    so the original rows and their relationships cannot be reconstructed
    without a database backup.
    """
    message = (
        "This migration cannot be reversed as it permanently deletes duplicate "
        "AlgorithmCategoryMap instances. If you need to reverse this, restore from a backup."
    )
    raise NotImplementedError(message)
134+
135+
136+
class Migration(migrations.Migration):
    # Data migration: merges AlgorithmCategoryMaps whose `data` JSON is
    # identical, repointing Algorithms and Classifications to a single keeper.
    # Irreversible in practice (the reverse function raises).

    dependencies = [
        ("ml", "0022_alter_pipeline_default_config"),
        ("main", "0053_alter_classification_algorithm"),  # Ensure Classification model is available
    ]

    operations = [
        migrations.RunPython(
            merge_duplicate_category_maps,
            reverse_merge_duplicate_category_maps,
        ),
    ]
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Generated on 2025-09-09 for fixing unlinked Classifications
2+
3+
from django.db import migrations
4+
import logging
5+
6+
7+
logger = logging.getLogger(__name__)
8+
9+
10+
def fix_unlinked_classifications(apps, schema_editor):
    """Backfill `category_map` on classifications that lack one.

    Targets classifications whose `category_map` is null while their algorithm
    has one — legacy rows created before automatic category_map assignment.

    :param apps: historical app registry supplied by RunPython.
    :param schema_editor: unused, required by the RunPython signature.
    """
    Classification = apps.get_model("main", "Classification")

    # Rows to repair: no category_map of their own, but the algorithm has one.
    missing_map_qs = Classification.objects.filter(
        category_map__isnull=True, algorithm__category_map__isnull=False
    )

    total_unlinked = missing_map_qs.count()
    logger.info(f"Found {total_unlinked:,} classifications missing category_map but with algorithm that has one")

    if not total_unlinked:
        logger.info("No unlinked classifications found - migration complete")
        return

    total_fixed = 0

    # One bulk update per distinct algorithm keeps the query count small.
    for algorithm_id in missing_map_qs.values_list("algorithm_id", flat=True).distinct():
        per_algorithm_qs = missing_map_qs.filter(algorithm_id=algorithm_id)

        # Use any one row to resolve the algorithm's category_map.
        sample = per_algorithm_qs.first()
        if not sample or not sample.algorithm:
            continue

        category_map = sample.algorithm.category_map
        if not category_map:
            continue

        updated_count = per_algorithm_qs.update(category_map=category_map)
        total_fixed += updated_count

        logger.info(
            f"Updated {updated_count:,} classifications for algorithm #{algorithm_id} to use category_map #{category_map.pk}"
        )

    logger.info(f"Migration completed: Fixed {total_fixed:,} unlinked classifications")
56+
57+
58+
def reverse_fix_unlinked_classifications(apps, schema_editor):
    """Intentional no-op reversal (logs a warning only).

    The forward migration repairs a data-consistency problem; undoing it
    would reintroduce the inconsistency. A true reversal (nulling out the
    category_map on the rows that were updated) is deliberately not
    implemented.
    """
    logger.warning("Reversing this migration would create data inconsistency - not recommended")
    # A reversal could be implemented if absolutely necessary, but it is not
    # recommended since the forward migration fixes legitimate data issues.
67+
68+
69+
class Migration(migrations.Migration):
    # Data migration: assigns the algorithm's category_map to legacy
    # classifications that were created without one. The reverse operation
    # is a logged no-op (reversal would reintroduce the inconsistency).

    dependencies = [
        ("ml", "0023_merge_duplicate_category_maps"),
        ("main", "0053_alter_classification_algorithm"),  # Ensure Classification model is available
    ]

    operations = [
        migrations.RunPython(
            fix_unlinked_classifications,
            reverse_fix_unlinked_classifications,
        ),
    ]

0 commit comments

Comments
 (0)