Skip to content

Commit fe35b33

Browse files
authored
Merge branch 'main' into fix/fix-duplicate-check
2 parents 7b1fa41 + 0954c38 commit fe35b33

File tree

5 files changed

+85
-7
lines changed

5 files changed

+85
-7
lines changed

ami/main/admin.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import ami.utils
1111
from ami import tasks
12+
from ami.ml.tasks import remove_duplicate_classifications
1213

1314
from .models import (
1415
BlogPost,
@@ -57,6 +58,16 @@ class ProjectAdmin(admin.ModelAdmin[Project]):
5758

5859
list_display = ("name", "priority", "active", "created_at", "updated_at")
5960

61+
@admin.action(description="Remove duplicate classifications from all detections")
def _remove_duplicate_classifications(self, request: HttpRequest, queryset: QuerySet[Project]) -> None:
    """Queue one background task per selected project to remove its duplicate classifications.

    Each task runs asynchronously via Celery; the user is shown the task IDs so
    progress can be tracked in the worker logs.
    """
    # Bug fix: the original message said "tasks to delete classification", which is
    # both ungrammatical and inaccurate — the tasks remove duplicate classifications.
    task_ids = [remove_duplicate_classifications.delay(project_id=project.pk).id for project in queryset]
    self.message_user(request, f"Started {len(task_ids)} tasks to remove duplicate classifications: {task_ids}")

actions = [_remove_duplicate_classifications]
70+
6071

6172
@admin.register(Deployment)
6273
class DeploymentAdmin(admin.ModelAdmin[Deployment]):

ami/main/api/views.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,9 @@ def zero_occurrences(self, queryset: QuerySet) -> QuerySet:
931931
def get_queryset(self) -> QuerySet:
932932
qs = super().get_queryset()
933933

934+
# First filter out taxa that have no occurrences
935+
# qs = qs.filter(occurrences__isnull=False).distinct()
936+
934937
occurrences_filter, occurrences_count_filter = self.get_occurrences_filters(qs)
935938

936939
qs = qs.select_related("parent")
@@ -944,6 +947,8 @@ def get_queryset(self) -> QuerySet:
944947
if filter_active:
945948
qs = self.filter_by_classification_threshold(qs)
946949
qs = self.add_occurrence_counts(qs, occurrences_count_filter)
950+
# Filter out taxa that have no occurrences or occurrences count is null
951+
qs = qs.filter(occurrences_count__gt=0).filter(occurrences_count__isnull=False)
947952
else:
948953
# If no filter don't return anything related to occurrences
949954
# in a list view.

ami/main/models.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -560,13 +560,14 @@ def update_children(self):
560560
]
561561
for model_name in child_models:
562562
model = apps.get_model("main", model_name)
563-
project_values = set(model.objects.filter(deployment=self).values_list("project", flat=True).distinct())
564-
if len(project_values) > 1:
563+
qs = model.objects.filter(deployment=self).exclude(project=self.project)
564+
project_values = set(qs.values_list("project", flat=True).distinct())
565+
if len(project_values):
565566
logger.warning(
566567
f"Deployment {self} has alternate projects set on {model_name} "
567568
f"objects: {project_values}. Updating them!"
568569
)
569-
model.objects.filter(deployment=self).exclude(project=self.project).update(project=self.project)
570+
qs.update(project=self.project)
570571

571572
def update_calculated_fields(self, save=False):
572573
"""Update calculated fields on the deployment."""
@@ -601,15 +602,19 @@ def update_calculated_fields(self, save=False):
601602
self.save(update_calculated_fields=False)
602603

603604
def save(self, update_calculated_fields=True, *args, **kwargs):
    """Save the deployment, then refresh calculated fields and children when requested.

    When captures have been added, updated, or deleted since the last save,
    an asynchronous event-regrouping task is queued for this deployment.
    """
    # Snapshot the previous update time BEFORE saving so we can detect captures
    # changed since the last save; falls back to "now" for unsaved instances.
    last_updated = self.updated_at or timezone.now()
    super().save(*args, **kwargs)
    if self.pk and update_calculated_fields:
        # @TODO Use "dirty" flag strategy to only update when needed
        new_or_updated_captures = self.captures.filter(updated_at__gte=last_updated).count()
        # A count below the cached captures_count implies captures were deleted.
        # (Idiom fix: direct boolean expression instead of `True if ... else False`.)
        deleted_captures = self.captures.count() < (self.captures_count or 0)
        if new_or_updated_captures or deleted_captures:
            # Only regroup events when the set of captures actually changed.
            ami.tasks.regroup_events.delay(self.pk)
        self.update_calculated_fields(save=True)
        if self.project:
            self.update_children()
            # @TODO this isn't working as a background task
            # ami.tasks.model_task.delay("Project", self.project.pk, "update_children_project")
613618

614619

615620
@final
@@ -848,8 +853,7 @@ def group_images_into_events(
848853
[f'{d.strftime("%Y-%m-%d %H:%M:%S")} x{c}' for d, c in dupes.values_list("timestamp", "count")]
849854
)
850855
logger.warning(
851-
f"Found multiple images with the same timestamp in deployment '{deployment}':\n "
852-
f"{values}\n"
856+
f"Found {len(values)} images with the same timestamp in deployment '{deployment}'. "
853857
f"Only one image will be used for each timestamp for each event."
854858
)
855859

@@ -1696,6 +1700,8 @@ class Classification(BaseModel):
16961700
)
16971701
# job = models.CharField(max_length=255, null=True)
16981702

1703+
objects = ClassificationManager()
1704+
16991705
# Type hints for auto-generated fields
17001706
taxon_id: int
17011707
algorithm_id: int
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from django.core.management.base import BaseCommand
from tqdm import tqdm

from ami.main.models import Classification


class Command(BaseCommand):
    """Find and remove duplicate classifications on detections.

    With ``--dry-run`` the duplicates are only listed; otherwise they are
    deleted in a single bulk query and the deleted count is reported.
    """

    help = "Find and remove duplicate classifications on detections"

    def add_arguments(self, parser):
        # Optional scope: when --project is omitted, all projects are searched.
        parser.add_argument("--project", type=int, help="Project ID to process")
        parser.add_argument("--dry-run", action="store_true", help="List duplicates without deleting them")

    def handle(self, *args, **options):
        project_id = options["project"]
        dry_run = options["dry_run"]

        duplicates = Classification.objects.find_duplicates(project_id=project_id)  # type: ignore
        total = duplicates.count()
        self.stdout.write(f"Found {total} duplicate classifications")
        if dry_run:
            with tqdm(total=total, desc="Listing duplicates", unit="classification") as pbar:
                for duplicate in duplicates:
                    self.stdout.write(f"Duplicate classification: {duplicate}")
                    pbar.update(1)
        else:
            # Bug fix: QuerySet.delete() is one bulk operation, so the original
            # tqdm(total=None) bar could never advance and the result was discarded.
            # delete() returns (total_rows_deleted, per_model_counts).
            num_deleted, _ = duplicates.delete()
            self.stdout.write(f"Deleted {num_deleted} duplicate classifications")

ami/ml/tasks.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,30 @@ def create_detection_images(source_image_ids: list[int]):
4848
logger.debug(f"Created {len(processed_paths)} detection images for SourceImage #{source_image.pk}")
4949
except Exception as e:
5050
logger.error(f"Error creating detection images for SourceImage {source_image.pk}: {str(e)}")
51+
52+
53+
@celery_app.task(soft_time_limit=default_soft_time_limit, time_limit=default_time_limit)
def remove_duplicate_classifications(project_id: int | None = None, dry_run: bool = False) -> int:
    """
    Remove duplicate classifications from the database.

    A duplicate classification is one where the same detection, taxon, algorithm, score, softmax_output and raw_output
    have been classified more than once. This can happen if the same detection is classified multiple times by the same
    algorithm with the same result.

    This method will keep the oldest classification and delete the rest.

    :param project_id: restrict the search to a single project; search all projects when None.
    :param dry_run: when True, only log what would be deleted; nothing is removed.
    :return: the number of duplicates found (dry run) or actually deleted.
    """
    # Imported lazily to avoid a circular import between the task and model modules.
    from ami.main.models import Classification

    # Find duplicates, keeping the oldest classification for each unique combination.
    duplicates_to_delete = Classification.objects.find_duplicates(project_id=project_id)  # type: ignore
    # Evaluate the count once instead of issuing the COUNT query twice.
    num = duplicates_to_delete.count()
    logger.info(f"Found {num} duplicate classifications to delete")

    if dry_run:
        # Bug fix: the original fell through to `return num_deleted`, which raised
        # NameError on dry runs because `num_deleted` was never assigned.
        logger.info(f"Would delete {num} duplicate classifications")
        return num

    # QuerySet.delete() returns (total_rows_deleted, per_model_counts); log the int,
    # not the raw tuple as the original did.
    num_deleted, _ = duplicates_to_delete.delete()
    logger.info(f"Deleted {num_deleted} duplicate classifications")
    return num_deleted

0 commit comments

Comments
 (0)