fix: ML processing that is causing jobs to fail for medium-large jobs (issue #782) (#934)

carlosgjs · web-flow · commit 47fed9a3973e · 2025-09-02T14:39:53.000-07:00
* fix: JobLogHandlers added multiple times

* Fix linting

* Make celeryworker debuggable for local
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,20 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Remote Attach",
+            "type": "debugpy",
+            "request": "attach",
+            "connect": {
+                "host": "localhost",
+                "port": 5678
+            },
+            "pathMappings": [
+                {
+                    "localRoot": "${workspaceFolder}",
+                    "remoteRoot": "."
+                }
+            ]
+        }
+    ]
+}
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ Platform for processing and reviewing images from automated insect monitoring st
 
 Antenna uses [Docker](https://docs.docker.com/get-docker/) & [Docker Compose](https://docs.docker.com/compose/install/) to run all services locally for development.
 
-1) Install Docker for your host operating (Linux, macOS, Windows)
+1) Install Docker for your host operating system (Linux, macOS, Windows). Docker Compose `v2.38.2` or later is recommended.
 
 2) Add the following to your `/etc/hosts` file in order to see and process the demo source images. This makes the hostname `minio` and `django` alias for `localhost` so the same image URLs can be viewed in the host machine's web browser and be processed by the ML services. This can be skipped if you are using an external image storage service.
 
@@ -24,6 +24,7 @@ Antenna uses [Docker](https://docs.docker.com/get-docker/) & [Docker Compose](ht
     docker compose logs -f django celeryworker ui
     # Ctrl+c to close the logs
 ```
+NOTE: If you see Docker build errors such as `At least one invalid signature was encountered`, Docker may have run out of disk space. Commands like `docker image prune -f` and `docker system prune` can help free up space.
 
 3) Optionally, run additional ML processing services: `processing_services` defines ML backends which wrap detections in our FastAPI response schema. The `example` app demos how to add new pipelines, algorithms, and models. See the detailed instructions in `processing_services/README.md`.
 
@@ -32,12 +33,15 @@ docker compose -f processing_services/example/docker-compose.yml up -d
 # Once running, in Antenna register a new processing service called: http://ml_backend_example:2000
 ```
 
-4) Access the platform the following URLs:
+4) Access the platform with the following URLs:
 
 - Primary web interface: http://localhost:4000
 - API browser: http://localhost:8000/api/v2/
 - Django admin: http://localhost:8000/admin/
 - OpenAPI / Swagger documentation: http://localhost:8000/api/v2/docs/
+- Minio UI: http://minio:9001, Minio service: http://minio:9000
+
+NOTE: If one of these services is not working properly, it could be because another process is using the port. You can check for this with `lsof -i :<PORT_NUMBER>`.
 
 A default user will be created with the following credentials. Use these to log into the web UI or the Django admin.
 
diff --git a/ami/jobs/models.py b/ami/jobs/models.py
@@ -404,6 +404,7 @@ def run(cls, job: "Job"):
         chunk_size = config.get("request_source_image_batch_size", 1)
         chunks = [images[i : i + chunk_size] for i in range(0, image_count, chunk_size)]  # noqa
         request_failed_images = []
+        job.logger.info(f"Processing {image_count} images in {len(chunks)} batches of up to {chunk_size}")
 
         for i, chunk in enumerate(chunks):
             request_sent = time.time()
@@ -946,11 +947,15 @@ def default_progress(cls) -> JobProgress:
 
     @property
     def logger(self) -> logging.Logger:
-        logger = logging.getLogger(f"ami.jobs.{self.pk}")
-        # Also log output to a field on thie model instance
-        logger.addHandler(JobLogHandler(self))
-        logger.propagate = False
-        return logger
+        _logger = logging.getLogger(f"ami.jobs.{self.pk}")
+
+        # Only add JobLogHandler if not already present
+        if not any(isinstance(h, JobLogHandler) for h in _logger.handlers):
+            # Also log output to a field on this model instance
+            logger.info("Adding JobLogHandler to logger for job %s", self.pk)
+            _logger.addHandler(JobLogHandler(self))
+        _logger.propagate = False
+        return _logger
 
     class Meta:
         ordering = ["-created_at"]
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -89,8 +89,11 @@ services:
     <<: *django
     image: ami_local_celeryworker
     scale: 1
-    ports: []
-    command: /start-celeryworker
+    # For remote debugging with debugpy, should get overridden for production
+    # Also make sure to install debugpy in your requirements/local.txt
+    ports:
+      - "5678:5678"
+    command: python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO
 
   celerybeat:
     <<: *django
diff --git a/requirements/local.txt b/requirements/local.txt
@@ -1 +1,2 @@
 -r base.txt
+debugpy # For remote debugging with debugpy

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`-r base.txt`
	`2`	`+debugpy # For remote debugging with debugpy`