Skip to content

Commit 88ffba8

Browse files
chore: rebase feat/postprocessing-class-masking onto feat/postprocessing-framework
1 parent 5e85b75 commit 88ffba8

File tree

4 files changed

+299
-21
lines changed

4 files changed

+299
-21
lines changed

ami/ml/post_processing/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from . import rank_rollup, small_size_filter # noqa: F401
1+
from . import class_masking, rank_rollup, small_size_filter # noqa: F401

ami/ml/post_processing/base.py

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import abc
44
import logging
5-
from typing import Any, Optional
5+
from typing import Any
66

77
from ami.jobs.models import Job
88
from ami.ml.models import Algorithm
@@ -39,34 +39,54 @@ class BasePostProcessingTask(abc.ABC):
3939
Abstract base class for all post-processing tasks.
4040
"""
4141

42+
# Each task must override these
4243
key: str = ""
4344
name: str = ""
4445

4546
def __init__(
    self,
    job: Job | None = None,
    logger: logging.Logger | None = None,
    **config: Any,
):
    """
    Store job/config context, resolve a logger, and ensure a backing
    Algorithm record exists for this task class.
    """
    self.job = job
    self.config = config

    # Logger precedence: explicit logger > the job's logger > a module-keyed default.
    if logger is None:
        if job is None:
            logger = logging.getLogger(f"ami.post_processing.{self.key}")
        else:
            logger = job.logger
    self.logger = logger

    # One Algorithm row per concrete task class, created on first use.
    algo, _created = Algorithm.objects.get_or_create(
        name=self.__class__.__name__,
        defaults={
            "description": f"Post-processing task: {self.key}",
            "task_type": AlgorithmTaskType.POST_PROCESSING.value,
        },
    )
    self.algorithm: Algorithm = algo

    self.logger.info(f"Initialized {self.__class__.__name__} with config={self.config}, job={job}")
72+
73+
def update_progress(self, progress: float):
    """
    Record task progress on the attached job, or just log it when no job exists.
    """
    job = self.job
    if not job:
        # No job object — fallback to plain logging
        self.logger.info(f"[{self.name}] Progress {progress:.0%}")
        return

    job.progress.update_stage(job.job_type_key, progress=progress)
    job.save(update_fields=["progress"])
6485

6586
@abc.abstractmethod
def run(self) -> None:
    """Execute the task's logic; every concrete subclass must override this."""
    raise NotImplementedError("BasePostProcessingTask subclasses must implement run()")
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
import logging
2+
3+
from django.db.models import QuerySet
4+
from django.utils import timezone
5+
6+
from ami.main.models import Classification, Occurrence, SourceImageCollection, TaxaList
7+
from ami.ml.models import Algorithm, AlgorithmCategoryMap
8+
from ami.ml.post_processing.base import BasePostProcessingTask, register_postprocessing_task
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
def update_single_occurrence(
    occurrence: Occurrence,
    algorithm: Algorithm,
    taxa_list: TaxaList,
    task_logger: logging.Logger = logger,
):
    """
    Re-apply taxa-list class masking to one occurrence's classifications.
    """
    task_logger.info(f"Recalculating classifications for occurrence {occurrence.pk}.")

    # Terminal, scored classifications for this occurrence made by the algorithm.
    qs = Classification.objects.filter(
        detection__occurrence=occurrence,
        terminal=True,
        algorithm=algorithm,
        scores__isnull=False,
    ).distinct()

    make_classifications_filtered_by_taxa_list(
        classifications=qs,
        taxa_list=taxa_list,
        algorithm=algorithm,
    )
34+
35+
36+
def update_occurrences_in_collection(
    collection: SourceImageCollection,
    taxa_list: TaxaList,
    algorithm: Algorithm,
    params: dict,
    task_logger: logging.Logger = logger,
    job=None,
):
    """
    Re-apply taxa-list class masking to every occurrence in a collection.
    """
    task_logger.info(f"Recalculating classifications based on a taxa list. Params: {params}")

    # Make new AlgorithmCategoryMap with the taxa in the list
    # @TODO

    # Terminal, scored classifications produced by the algorithm on any
    # source image belonging to this collection.
    qs = Classification.objects.filter(
        detection__source_image__collections=collection,
        terminal=True,
        algorithm=algorithm,
        scores__isnull=False,
    ).distinct()

    make_classifications_filtered_by_taxa_list(
        classifications=qs,
        taxa_list=taxa_list,
        algorithm=algorithm,
    )
62+
63+
64+
def make_classifications_filtered_by_taxa_list(
    classifications: QuerySet[Classification],
    taxa_list: TaxaList,
    algorithm: Algorithm,
):
    """
    Mask out categories not present in ``taxa_list`` and re-derive predictions.

    For each classification (which must have both scores and logits), the
    logits of excluded categories are suppressed, softmax scores are
    recomputed, the old classification is demoted to non-terminal, and a new
    terminal classification is created for the resulting top taxon. Finally,
    the determinations of all affected occurrences are refreshed.

    Raises:
        ValueError: if no classifications are given, the algorithm has no
            category map, or a classification's scores/logits are malformed.
    """
    # Hoisted out of the per-classification loop (was imported on every iteration).
    import numpy as np

    taxa_in_list = taxa_list.taxa.all()

    occurrences_to_update: set[Occurrence] = set()
    logger.info(f"Found {len(classifications)} terminal classifications with scores to update.")

    if not classifications:
        raise ValueError("No terminal classifications with scores found to update.")

    if not algorithm.category_map:
        raise ValueError(f"Algorithm {algorithm} does not have a category map.")
    category_map: AlgorithmCategoryMap = algorithm.category_map

    # @TODO find a more efficient way to get the category map with taxa. This is slow!
    logger.info(f"Retrieving category map with Taxa instances for algorithm {algorithm}")
    category_map_with_taxa = category_map.with_taxa()

    # Categories whose taxon is NOT in the taxa list get masked out below.
    excluded_category_map_with_taxa = [
        category for category in category_map_with_taxa if category["taxon"] not in taxa_in_list
    ]
    excluded_category_indices = [
        int(category["index"]) for category in excluded_category_map_with_taxa  # type: ignore
    ]

    logger.info(
        f"Category map has {len(category_map_with_taxa)} categories, "
        f"{len(excluded_category_map_with_taxa)} categories excluded, "
        f"{len(classifications)} classifications to check"
    )

    classifications_to_add = []
    classifications_to_update = []

    timestamp = timezone.now()
    for classification in classifications:
        scores, logits = classification.scores, classification.logits

        # Validate stored values before doing any math on them.
        if not isinstance(scores, list) or not all(isinstance(score, (int, float)) for score in scores):
            raise ValueError(f"Scores for classification {classification.pk} are not a list of numbers: {scores}")
        if not isinstance(logits, list) or not all(isinstance(logit, (int, float)) for logit in logits):
            raise ValueError(f"Logits for classification {classification.pk} are not a list of numbers: {logits}")

        logger.debug(f"Processing classification {classification.pk} with {len(scores)} scores")

        # Suppress excluded categories: a large negative logit drives the
        # softmax probability toward zero without introducing NaNs (which a
        # 0.0 or np.nan mask would — zero does NOT softmax correctly).
        # @TODO consider deleting excluded categories instead of masking them.
        logits_np = np.array(logits)
        logits_np[excluded_category_indices] = -100
        logits = logits_np.tolist()

        # Recompute softmax over the masked logits; subtract the max for
        # numerical stability. @TODO add a test confirming this is correct.
        scores_np = np.exp(logits_np - np.max(logits_np))
        scores_np /= np.sum(scores_np)
        scores = scores_np.tolist()

        # Top prediction after masking.
        top_index = scores.index(max(scores))
        # @TODO this lookup doesn't work if the taxon has never been classified
        top_taxon = category_map_with_taxa[top_index]["taxon"]
        logger.debug(f"Top category after masking: index={top_index}, taxon={top_taxon}")

        # Skip classifications the masking did not change.
        if classification.scores == scores and classification.logits == logits:
            logger.debug(f"Classification {classification.pk} does not need updating")
            continue

        # Demote the existing classification to an intermediate one.
        classification.terminal = False
        classification.updated_at = timestamp

        # New terminal classification for the recalculated top taxon.
        new_classification = Classification(
            taxon=top_taxon,
            algorithm=classification.algorithm,
            score=max(scores),
            scores=scores,
            logits=logits,
            detection=classification.detection,
            timestamp=classification.timestamp,
            terminal=True,
            category_map=None,  # @TODO need a new category map with the filtered taxa
            created_at=timestamp,
            updated_at=timestamp,
        )
        if new_classification.taxon is None:
            raise ValueError("Classification isn't registered yet. Aborting")  # @TODO remove or fail gracefully

        classifications_to_update.append(classification)
        classifications_to_add.append(new_classification)

        assert new_classification.detection is not None
        assert new_classification.detection.occurrence is not None
        occurrences_to_update.add(new_classification.detection.occurrence)

        # NOTE: was ``logging.info`` (root logger) — use the module logger.
        logger.info(
            f"Adding new classification for Taxon {top_taxon} to occurrence {new_classification.detection.occurrence}"
        )

    # Bulk update the existing (demoted) classifications.
    if classifications_to_update:
        logger.info(f"Bulk updating {len(classifications_to_update)} existing classifications")
        Classification.objects.bulk_update(classifications_to_update, ["terminal", "updated_at"])
        logger.info(f"Updated {len(classifications_to_update)} existing classifications")

    # Bulk create the new terminal classifications.
    if classifications_to_add:
        logger.info(f"Bulk creating {len(classifications_to_add)} new classifications")
        Classification.objects.bulk_create(classifications_to_add)
        logger.info(f"Added {len(classifications_to_add)} new classifications")

    # Refresh the determination of every affected occurrence.
    logger.info(f"Updating the determinations for {len(occurrences_to_update)} occurrences")
    for occurrence in occurrences_to_update:
        occurrence.save(update_determination=True)
    logger.info(f"Updated determinations for {len(occurrences_to_update)} occurrences")
212+
213+
214+
@register_postprocessing_task
class ClassMaskingTask(BasePostProcessingTask):
    """
    Post-processing task that masks classification scores to a taxa list.

    Expected config keys:
        collection_id: pk of the SourceImageCollection to process.
        taxa_list_id: pk of the TaxaList used as the class mask.
        algorithm_id: pk of the Algorithm whose classifications are masked.
    """

    key = "class_masking"
    name = "Class masking"

    def run(self) -> None:
        """Apply class masking on a source image collection using a taxa list."""
        self.logger.info(f"=== Starting {self.name} ===")

        collection_id = self.config.get("collection_id")
        taxa_list_id = self.config.get("taxa_list_id")
        algorithm_id = self.config.get("algorithm_id")

        # Validate config parameters
        if not all([collection_id, taxa_list_id, algorithm_id]):
            self.logger.error("Missing required configuration: collection_id, taxa_list_id, algorithm_id")
            return

        # Narrowed from a bare ``except Exception`` so that unrelated errors
        # (e.g. database outages) propagate instead of being swallowed.
        try:
            collection = SourceImageCollection.objects.get(pk=collection_id)
            taxa_list = TaxaList.objects.get(pk=taxa_list_id)
            algorithm = Algorithm.objects.get(pk=algorithm_id)
        except (
            SourceImageCollection.DoesNotExist,
            TaxaList.DoesNotExist,
            Algorithm.DoesNotExist,
        ) as e:
            self.logger.exception(f"Failed to load objects: {e}")
            return

        self.logger.info(f"Applying class masking on collection {collection_id} using taxa list {taxa_list_id}")

        update_occurrences_in_collection(
            collection=collection,
            taxa_list=taxa_list,
            algorithm=algorithm,
            params=self.config,
            task_logger=self.logger,
            job=self.job,
        )

        self.logger.info("Class masking completed successfully.")
        self.logger.info(f"=== Completed {self.name} ===")

ami/ml/post_processing/small_size_filter.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def run(self) -> None:
3131

3232
try:
3333
collection = SourceImageCollection.objects.get(pk=collection_id)
34-
self.logger.info(f"Loaded SourceImageCollection {collection_id} " f"(Project={collection.project})")
34+
self.logger.info(f"Loaded SourceImageCollection {collection_id} (Project={collection.project})")
3535
except SourceImageCollection.DoesNotExist:
3636
msg = f"SourceImageCollection {collection_id} not found"
3737
self.logger.error(msg)
@@ -85,6 +85,11 @@ def run(self) -> None:
8585
comment=f"Auto-set by {self.name} post-processing task",
8686
)
8787
modified += 1
88-
self.logger.debug(f"Detection {det.pk}: marked as 'Not identifiable'")
88+
self.logger.info(f"Detection {det.pk}: marked as 'Not identifiable'")
89+
90+
# Update progress every 10 detections
91+
if i % 10 == 0 or i == total:
92+
progress = i / total if total > 0 else 1.0
93+
self.update_progress(progress)
8994

9095
self.logger.info(f"=== Completed {self.name}: {modified}/{total} detections modified ===")

0 commit comments

Comments
 (0)