Skip to content

Commit 3335070

Browse files
committed
Issue #719 job manager WIP: start jobs in worker thread
1 parent 57c67fe commit 3335070

File tree

1 file changed

+76
-10
lines changed

openeo/extra/job_management/__init__.py

+76-10
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
import datetime
66
import json
77
import logging
8+
import queue
89
import re
10+
import threading
911
import time
1012
import warnings
1113
from pathlib import Path
@@ -31,13 +33,15 @@
3133
import shapely.wkt
3234
from requests.adapters import HTTPAdapter, Retry
3335

36+
import openeo
3437
from openeo import BatchJob, Connection
3538
from openeo.internal.processes.parse import (
3639
Parameter,
3740
Process,
3841
parse_remote_process_definition,
3942
)
4043
from openeo.rest import OpenEoApiError
44+
from openeo.rest.auth.auth import BearerAuth
4145
from openeo.util import LazyLoadCache, deep_get, repr_truncate, rfc3339
4246

4347
_log = logging.getLogger(__name__)
@@ -223,6 +227,9 @@ def __init__(
223227
)
224228
self._thread = None
225229

230+
self._work_queue = queue.Queue()
231+
self._result_queue = queue.Queue()
232+
226233
def add_backend(
227234
self,
228235
name: str,
@@ -493,6 +500,10 @@ def run_jobs(
493500
# TODO: support user-provided `stats`
494501
stats = collections.defaultdict(int)
495502

503+
# TODO: multiple workers instead of a single one? Work with thread pool?
504+
worker_thread = _JobManagerWorkerThread(work_queue=self._work_queue, result_queue=self._result_queue)
505+
worker_thread.start()
506+
496507
while sum(job_db.count_by_status(statuses=["not_started", "created", "queued", "running"]).values()) > 0:
497508
self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats)
498509
stats["run_jobs loop"] += 1
@@ -502,6 +513,9 @@ def run_jobs(
502513
time.sleep(self.poll_sleep)
503514
stats["sleep"] += 1
504515

516+
worker_thread.stop_event.set()
517+
worker_thread.join()
518+
505519
return stats
506520

507521
def _job_update_loop(
@@ -542,6 +556,7 @@ def _job_update_loop(
542556
total_added += 1
543557

544558
# Act on jobs
559+
# TODO: move this back closer to the `_track_statuses` call above, once job done/error handling is also handled in threads?
545560
for job, row in jobs_done:
546561
self.on_job_done(job, row)
547562

@@ -551,6 +566,11 @@ def _job_update_loop(
551566
for job, row in jobs_cancel:
552567
self.on_job_cancel(job, row)
553568

569+
# Check worker thread results
570+
while not self._result_queue.empty():
571+
# TODO
572+
...
573+
554574

555575
def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = None):
556576
"""Helper method for launching jobs
@@ -584,7 +604,7 @@ def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = No
584604
connection = self._get_connection(backend_name, resilient=True)
585605

586606
stats["start_job call"] += 1
587-
job = start_job(
607+
job: BatchJob = start_job(
588608
row=row,
589609
connection_provider=self._get_connection,
590610
connection=connection,
@@ -605,14 +625,24 @@ def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = No
605625
if status == "created":
606626
# start job if not yet done by callback
607627
try:
608-
job.start()
609-
stats["job start"] += 1
610-
df.loc[i, "status"] = job.status()
611-
stats["job get status"] += 1
628+
job_con = job.connection
629+
self._work_queue.put(
630+
(
631+
_JobManagerWorkerThread.WORK_TYPE_START_JOB,
632+
(
633+
job_con.root_url,
634+
job_con.auth.bearer if isinstance(job_con.auth, BearerAuth) else None,
635+
job.job_id,
636+
),
637+
)
638+
)
639+
job_status = "queued_for_start"
640+
stats[f"job {job_status}"] += 1
641+
df.loc[i, "status"] = job_status
612642
except OpenEoApiError as e:
613643
_log.error(e)
614-
df.loc[i, "status"] = "start_failed"
615-
stats["job start error"] += 1
644+
df.loc[i, "status"] = "queued_for_start_failed"
645+
stats["job queued_for_start error"] += 1
616646
else:
617647
# TODO: what is this "skipping" about actually?
618648
df.loc[i, "status"] = "skipped"
@@ -673,20 +703,20 @@ def _cancel_prolonged_job(self, job: BatchJob, row):
673703
try:
674704
# Ensure running start time is valid
675705
job_running_start_time = rfc3339.parse_datetime(row.get("running_start_time"), with_timezone=True)
676-
706+
677707
# Parse the current time into a datetime object with timezone info
678708
current_time = rfc3339.parse_datetime(rfc3339.utcnow(), with_timezone=True)
679709

680710
# Calculate the elapsed time between job start and now
681711
elapsed = current_time - job_running_start_time
682712

683713
if elapsed > self._cancel_running_job_after:
684-
714+
685715
_log.info(
686716
f"Cancelling long-running job {job.job_id} (after {elapsed}, running since {job_running_start_time})"
687717
)
688718
job.stop()
689-
719+
690720
except Exception as e:
691721
_log.error(f"Unexpected error while handling job {job.job_id}: {e}")
692722

@@ -783,6 +813,42 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] =
783813
return jobs_done, jobs_error, jobs_cancel
784814

785815

816+
class _JobManagerWorkerThread(threading.Thread):
817+
WORK_TYPE_START_JOB = "start_job"
818+
819+
def __init__(self, work_queue: queue.Queue, result_queue: queue.Queue):
820+
super().__init__()
821+
self.work_queue = work_queue
822+
self.result_queue = result_queue
823+
self.stop_event = threading.Event()
824+
# TODO: add customization options for timeout/sleep?
825+
826+
def run(self):
827+
while not self.stop_event.is_set():
828+
try:
829+
work_type, work_args = self.work_queue.get(timeout=5)
830+
if work_type == self.WORK_TYPE_START_JOB:
831+
self._start_job(work_args)
832+
else:
833+
raise ValueError(f"Unknown work item: {work_type!r}")
834+
except queue.Empty:
835+
time.sleep(10)
836+
837+
def _start_job(self, work_args: tuple):
838+
root_url, bearer, job_id = work_args
839+
try:
840+
connection = openeo.connect(url=root_url)
841+
if bearer:
842+
connection.authenticate_bearer_token(bearer_token=bearer)
843+
job = connection.job(job_id)
844+
job.start()
845+
status = job.status()
846+
except Exception as e:
847+
self.result_queue.put((self.WORK_TYPE_START_JOB, (job_id, "failed", repr(e))))
848+
else:
849+
self.result_queue.put((self.WORK_TYPE_START_JOB, (job_id, "started", status)))
850+
851+
786852
def _format_usage_stat(job_metadata: dict, field: str) -> str:
787853
value = deep_get(job_metadata, "usage", field, "value", default=0)
788854
unit = deep_get(job_metadata, "usage", field, "unit", default="")

0 commit comments

Comments
 (0)