Hv issue719 job manager threaded job start #736

Open · wants to merge 58 commits into base: master

Commits (58)
3335070
Issue #719 job manager WIP: start jobs in worker thread
soxofaan Feb 14, 2025
ff80f58
Issue #719/#730 Some quick-fixes to avoid hanging tests
soxofaan Feb 14, 2025
d5c65c1
post_process queue
HansVRP Feb 19, 2025
d6bf073
fix concurrency
HansVRP Feb 19, 2025
c5f47bc
fix threaded unit test
HansVRP Feb 19, 2025
3162a3f
fix final unit test
HansVRP Feb 19, 2025
ac57284
fix final unit test
HansVRP Feb 19, 2025
138ef85
propose additional unit tests
HansVRP Feb 19, 2025
d0ffda7
fix linting
HansVRP Feb 19, 2025
1e4db19
improve logging
HansVRP Feb 20, 2025
4461d03
update
HansVRP Feb 20, 2025
5f43281
fix oversight
HansVRP Feb 20, 2025
55d9def
fix oversight
HansVRP Feb 20, 2025
eeaec40
fix unit test
HansVRP Feb 20, 2025
a5d5967
simplify
HansVRP Feb 20, 2025
07adae9
test
HansVRP Feb 24, 2025
69ceb80
fix log test
HansVRP Feb 24, 2025
a6f94bc
homogeneous unit tests
HansVRP Feb 25, 2025
703d150
unit tests
HansVRP Feb 26, 2025
85ea738
split off worker thread logic
HansVRP Feb 26, 2025
8eafa54
latest updates
HansVRP Feb 28, 2025
0380b33
fix
HansVRP Feb 28, 2025
b589afb
introduce threadpool
HansVRP Feb 28, 2025
2dcb484
revise feedback
HansVRP Feb 28, 2025
d99996a
fix endless unit test
HansVRP Mar 14, 2025
eeb8ab0
fix tests
HansVRP Mar 14, 2025
da0e961
fix tests
HansVRP Mar 14, 2025
8529efc
clean up print statements
HansVRP Mar 14, 2025
c154706
clean up
HansVRP Mar 14, 2025
b3dbca4
clean up
HansVRP Mar 14, 2025
1157507
work on feedback
HansVRP Mar 17, 2025
768b2fc
fix unit tests
HansVRP Mar 17, 2025
af015ed
fix unit tests
HansVRP Mar 17, 2025
869d1c3
Merge remote-tracking branch 'origin/master' into hv_issue719-job-man…
soxofaan Mar 17, 2025
fce55c8
PR #736 Code style cleanup
soxofaan Mar 17, 2025
64a4ad9
split off job class
HansVRP Mar 20, 2025
3e8a2e6
split off job class
HansVRP Mar 20, 2025
fabc346
add job manager unit test
HansVRP Mar 20, 2025
69e9dee
fix
HansVRP Mar 20, 2025
b367062
test printing caplog
HansVRP Mar 20, 2025
7ea8202
fix logging
HansVRP Mar 20, 2025
da728d1
generalize
HansVRP Mar 25, 2025
9f02ae6
status update
HansVRP Mar 26, 2025
c49feb1
atomic updates to dataframe
HansVRP Apr 3, 2025
a9c3c8b
simplification
HansVRP Apr 3, 2025
ed70d0f
add flexibility
HansVRP Apr 3, 2025
a9b1fb9
improve naming
HansVRP Apr 3, 2025
ecd04f4
clean up
HansVRP Apr 4, 2025
9fa693d
small refactor and clean up
HansVRP Apr 16, 2025
24ccda3
fixing unit tests
HansVRP Apr 17, 2025
7b6efe7
fix
HansVRP Apr 17, 2025
69147eb
don't break stac database
HansVRP Apr 17, 2025
8b51f84
fix
HansVRP Apr 17, 2025
14f1971
additional testing for processingworkerupdates
HansVRP Apr 17, 2025
48c4565
added 'integration test'
HansVRP Apr 17, 2025
0860840
Merge remote-tracking branch 'origin/master' into hv_issue719-job-man…
HansVRP Apr 17, 2025
c28a2c5
always provide a task_result
HansVRP Apr 17, 2025
23ce981
docstrings
HansVRP Apr 18, 2025
184 changes: 164 additions & 20 deletions openeo/extra/job_management/__init__.py
@@ -32,12 +32,16 @@
from requests.adapters import HTTPAdapter, Retry

from openeo import BatchJob, Connection
from openeo.extra.job_management._thread_worker import (
    _JobManagerWorkerThreadPool,
    _JobStartTask,
)

from openeo.internal.processes.parse import (
Parameter,
Process,
parse_remote_process_definition,
)
from openeo.rest import OpenEoApiError
from openeo.rest.auth.auth import BearerAuth
from openeo.util import LazyLoadCache, deep_get, repr_truncate, rfc3339

_log = logging.getLogger(__name__)
@@ -105,6 +109,7 @@ def get_by_status(self, statuses: List[str], max=None) -> pd.DataFrame:
"""
...


def _start_job_default(row: pd.Series, connection: Connection, *args, **kwargs):
raise NotImplementedError("No 'start_job' callable provided")

@@ -186,6 +191,7 @@ def start_job(

# Expected columns in the job DB dataframes.
# TODO: make this part of public API when settled?
# TODO: move non-official statuses to a separate column (not_started, queued_for_start)
_COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = {
"id": _ColumnProperties(dtype="str"),
"backend_name": _ColumnProperties(dtype="str"),
@@ -222,6 +228,7 @@ def __init__(
datetime.timedelta(seconds=cancel_running_job_after) if cancel_running_job_after is not None else None
)
self._thread = None
self._worker_pool = None

def add_backend(
self,
@@ -358,21 +365,27 @@ def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabas
_log.info(f"Resuming `run_jobs` from existing {job_db}")

self._stop_thread = False
self._worker_pool = _JobManagerWorkerThreadPool()

def run_loop():

# TODO: support user-provided `stats`
stats = collections.defaultdict(int)

while (
sum(
job_db.count_by_status(
statuses=["not_started", "created", "queued", "queued_for_start", "running"]
).values()
)
> 0
and not self._stop_thread
):
self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats)
stats["run_jobs loop"] += 1

# Show current stats and sleep
_log.info(f"Job status histogram: {job_db.count_by_status()}. Run stats: {dict(stats)}")
# Do sequence of micro-sleeps to allow for quick thread exit
for _ in range(int(max(1, self.poll_sleep))):
time.sleep(1)
if self._stop_thread:
@@ -391,6 +404,8 @@ def stop_job_thread(self, timeout_seconds: Optional[float] = _UNSET):

.. versionadded:: 0.32.0
"""
self._worker_pool.shutdown()

if self._thread is not None:
self._stop_thread = True
if timeout_seconds is _UNSET:
@@ -493,7 +508,16 @@ def run_jobs(
# TODO: support user-provided `stats`
stats = collections.defaultdict(int)

self._worker_pool = _JobManagerWorkerThreadPool()

while (
sum(
job_db.count_by_status(
statuses=["not_started", "created", "queued_for_start", "queued", "running"]
).values()
)
> 0
):
self._job_update_loop(job_db=job_db, start_job=start_job, stats=stats)
stats["run_jobs loop"] += 1

@@ -502,6 +526,10 @@
time.sleep(self.poll_sleep)
stats["sleep"] += 1


        # TODO: run post-processing once more after shutdown to ensure completion?
self._worker_pool.shutdown()
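        # (One possible shape for the TODO above, as an assumption rather than
        # part of this change: calling
        #     self._process_threadworker_updates(self._worker_pool, job_db, stats)
        # once more after `shutdown()` would drain results that only completed
        # while the pool was shutting down.)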

return stats

def _job_update_loop(
@@ -524,7 +552,7 @@
not_started = job_db.get_by_status(statuses=["not_started"], max=200).copy()
if len(not_started) > 0:
# Check number of jobs running at each backend
running = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"])
stats["job_db get_by_status"] += 1
per_backend = running.groupby("backend_name").size().to_dict()
_log.info(f"Running per backend: {per_backend}")
@@ -541,7 +569,9 @@
stats["job_db persist"] += 1
total_added += 1

# Act on jobs
self._process_threadworker_updates(self._worker_pool, job_db, stats)

# TODO: move this back closer to the `_track_statuses` call above, once job done/error handling is also handled in threads?
for job, row in jobs_done:
self.on_job_done(job, row)

@@ -551,7 +581,6 @@
for job, row in jobs_cancel:
self.on_job_cancel(job, row)
def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = None):
"""Helper method for launching jobs

@@ -598,26 +627,92 @@ def _launch_job(self, start_job, df, i, backend_name, stats: Optional[dict] = No
df.loc[i, "start_time"] = rfc3339.utcnow()
if job:
df.loc[i, "id"] = job.job_id
_log.info(f"Job created: {job.job_id}")
with ignore_connection_errors(context="get status"):
status = job.status()
stats["job get status"] += 1
df.loc[i, "status"] = status
if status == "created":
# start job if not yet done by callback
try:
job_con = job.connection
task = _JobStartTask(
root_url=job_con.root_url,
bearer_token=job_con.auth.bearer if isinstance(job_con.auth, BearerAuth) else None,
job_id=job.job_id,
)
_log.info(f"Submitting task {task} to thread pool")
self._worker_pool.submit_task(task)

stats["job_queued_for_start"] += 1
df.loc[i, "status"] = "queued_for_start"
except OpenEoApiError as e:
_log.info(f"Failed submitting task {task} to thread pool with error: {e}")
df.loc[i, "status"] = "queued_for_start_failed"
stats["job queued for start failed"] += 1
else:
# TODO: what is this "skipping" about actually?
df.loc[i, "status"] = "skipped"
stats["start_job skipped"] += 1

def _process_threadworker_updates(
self,
worker_pool: _JobManagerWorkerThreadPool,
job_db: JobDatabaseInterface,
stats: dict
) -> None:
"""Processes asynchronous job updates from worker threads and applies them to the job database and statistics.

This wrapper function is responsible for:
1. Collecting completed results from the worker thread pool
2. applying database updates for each job result
3. applying statistics updates
4. Handles errors with comprehensive logging

:param worker_pool:
Thread pool instance managing the asynchronous job operations.
Should provide a `process_futures()` method returning completed job results.

:param job_db:
Job database implementing the :py:class:`JobDatabaseInterface` interface.
Used to persist job status updates and metadata.
Must support the `_update_row(job_id: str, updates: dict)` method.

:param stats:
Dictionary tracking operational statistics that will be updated in-place.
Expected to handle string keys with integer values.
Statistics will be updated with counts from completed job results.

:return:
None: All updates are applied in-place to the job_db and stats parameters.
.
"""
        results = worker_pool.process_futures()
        stats_updates = collections.defaultdict(int)

        for result in results:
            try:
                # Handle job database updates
                if result.db_update:
                    _log.debug(f"Processing update for job {result.job_id}")
                    job_db._update_row(job_id=result.job_id, updates=result.db_update)

                # Aggregate statistics updates
                if result.stats_update:
                    for key, count in result.stats_update.items():
                        stats_updates[key] += int(count)

            except Exception as e:
                _log.error(f"Failed to apply updates for job {result.job_id}: {e}")

        # Apply the aggregated statistics updates
        for key, count in stats_updates.items():
            stats[key] = stats.get(key, 0) + count

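    # Illustrative example (assumption): given a completed result like
    #     _TaskResult(job_id="job-123", db_update={"status": "queued"},
    #                 stats_update={"job start": 1})
    # this method calls `job_db._update_row(job_id="job-123",
    # updates={"status": "queued"})` and afterwards increments
    # `stats["job start"]` by 1. Failures are logged per result, so one bad
    # result does not block the remaining updates.
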
def on_job_done(self, job: BatchJob, row):
"""
Handles jobs that have finished. Can be overridden to provide custom behaviour.
@@ -673,20 +768,19 @@ def _cancel_prolonged_job(self, job: BatchJob, row):
try:
# Ensure running start time is valid
job_running_start_time = rfc3339.parse_datetime(row.get("running_start_time"), with_timezone=True)

# Parse the current time into a datetime object with timezone info
current_time = rfc3339.parse_datetime(rfc3339.utcnow(), with_timezone=True)

# Calculate the elapsed time between job start and now
elapsed = current_time - job_running_start_time

if elapsed > self._cancel_running_job_after:

_log.info(
f"Cancelling long-running job {job.job_id} (after {elapsed}, running since {job_running_start_time})"
)
job.stop()

except Exception as e:
_log.error(f"Unexpected error while handling job {job.job_id}: {e}")

@@ -715,7 +809,7 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] =
"""
stats = stats if stats is not None else collections.defaultdict(int)

active = job_db.get_by_status(statuses=["created", "queued", "running"]).copy()
active = job_db.get_by_status(statuses=["created", "queued", "queued_for_start", "running"]).copy()

jobs_done = []
jobs_error = []
Expand Down Expand Up @@ -749,7 +843,7 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] =
stats["job canceled"] += 1
jobs_cancel.append((the_job, active.loc[i]))

if previous_status in {"created", "queued"} and new_status == "running":
if previous_status in {"created", "queued", "queued_for_start"} and new_status == "running":
stats["job started running"] += 1
active.loc[i, "running_start_time"] = rfc3339.utcnow()

@@ -782,7 +876,6 @@ def _track_statuses(self, job_db: JobDatabaseInterface, stats: Optional[dict] =

return jobs_done, jobs_error, jobs_cancel


def _format_usage_stat(job_metadata: dict, field: str) -> str:
value = deep_get(job_metadata, "usage", field, "value", default=0)
unit = deep_get(job_metadata, "usage", field, "unit", default="")
@@ -877,6 +970,55 @@ def _merge_into_df(self, df: pd.DataFrame):
else:
self._df = df

def _update_row(self, job_id: str, updates: dict):
"""
        Apply the updates provided in the given dictionary to the dataframe row corresponding to the given job_id.

        :param job_id: id of the job whose row to update.
        :param updates: dictionary mapping column names to updated values.

        :return: None. The updates are applied to the dataframe in place and persisted.
        """
if self._df is None:
raise ValueError("Job database not initialized")

# Create boolean mask for target row
mask = self._df["id"] == job_id
match_count = mask.sum()

# Handle row identification issues
        # TODO: make this more robust, e.g. by falling back on the row index?
if match_count == 0:
_log.error(f"Job {job_id!r} not found in database")
return
if match_count > 1:
_log.error(f"Duplicate job ID {job_id!r} found in database")
return

# Get valid columns
valid_columns = set(self._df.columns)
filtered_updates = {}

        # Validate update keys
for key, value in updates.items():
if key in valid_columns:
filtered_updates[key] = value
else:
_log.warning(f"Ignoring invalid column {key!r} in update for job {job_id}")

# Bulk update
if not filtered_updates:
return
try:
# Update all columns in a single operation
self._df.loc[mask, list(filtered_updates.keys())] = list(filtered_updates.values())
self.persist(self._df)
        except Exception as e:
            _log.error(f"Failed to persist row update for job {job_id}: {e}")
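
# Illustrative usage sketch (assumption: `db` is a CsvJobDatabase or
# ParquetJobDatabase whose dataframe has a row with id "job-123"):
#
#     db._update_row(job_id="job-123", updates={"status": "queued_for_start"})
#
# Unknown columns in `updates` are logged and dropped; a missing or duplicated
# job id is logged as an error and the update is skipped.
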
class CsvJobDatabase(FullDataFrameJobDatabase):
"""
@@ -932,6 +1074,8 @@ def persist(self, df: pd.DataFrame):
self.path.parent.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.path, index=False)




class ParquetJobDatabase(FullDataFrameJobDatabase):
"""