@@ -1,13 +1,14 @@
 import collections
 import contextlib
+import dataclasses
 import datetime
 import logging
 import threading
-from typing import List, Optional
+from typing import Dict, List, Optional, Union

 import flask
 from openeo.api.logs import LogEntry
-from openeo.rest.job import ResultAsset
+from openeo.rest.job import BatchJob, ResultAsset
 from openeo.util import TimingLogger, rfc3339
 from openeo_driver.errors import JobNotFinishedException
 from openeo_driver.users import User
@@ -21,10 +22,15 @@
     STATUS_INSERTED,
     STATUS_RUNNING,
     PartitionedJob,
+    SubJob,
+)
+from openeo_aggregator.partitionedjobs.crossbackend import (
+    CrossBackendSplitter,
+    SubGraphId,
 )
 from openeo_aggregator.partitionedjobs.splitting import TileGridSplitter
 from openeo_aggregator.partitionedjobs.zookeeper import ZooKeeperPartitionedJobDB
-from openeo_aggregator.utils import _UNSET, timestamp_to_rfc3339
+from openeo_aggregator.utils import _UNSET, Clock, PGWithMetadata, timestamp_to_rfc3339

 _log = logging.getLogger(__name__)

@@ -57,6 +63,103 @@ def create(self, user_id: str, pjob: PartitionedJob, flask_request: flask.Request
         self.create_sjobs(user_id=user_id, pjob_id=pjob_id, flask_request=flask_request)
         return pjob_id

+    def create_crossbackend_pjob(
+        self,
+        *,
+        user_id: str,
+        process: PGWithMetadata,
+        metadata: dict,
+        job_options: Optional[dict] = None,
+        splitter: CrossBackendSplitter,
+    ) -> str:
+        """
+        Create a cross-backend partitioned job.
+
+        This differs from regular partitioned job creation because of the dependencies
+        between sub-jobs: the batch jobs have to be created in the right order on their
+        respective back-ends before the sub-process graphs can be finalized, and only
+        then can the metadata be persisted in the ZooKeeperPartitionedJobDB.
+        """
+        # Start with reserving a new partitioned job id based on initial metadata
+        pjob_node_value = self._db.serialize(
+            user_id=user_id,
+            created=Clock.time(),
+            process=process,
+            metadata=metadata,
+            job_options=job_options,
+        )
+        pjob_id = self._db.obtain_new_pjob_id(user_id=user_id, initial_value=pjob_node_value)
+        self._db.set_pjob_status(user_id=user_id, pjob_id=pjob_id, status=STATUS_INSERTED, create=True)
+
+        # Create batch jobs on respective backends, and build the PartitionedJob components along the way
+        subjobs: Dict[str, SubJob] = {}
+        dependencies: Dict[str, List[str]] = {}
+        batch_jobs: Dict[SubGraphId, BatchJob] = {}
+        create_stats = collections.Counter()
+
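+        # `get_replacement` is called by the splitter each time a cross-backend dependency
+        # must be rewritten: the dependency node is replaced with a `load_result` node that
+        # points to the batch job already created for the referenced subgraph.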
+        def get_replacement(node_id: str, node: dict, subgraph_id: SubGraphId) -> dict:
+            # TODO: use `load_stac` iso `load_result`, and use canonical URL?
+            nonlocal batch_jobs
+            job_id = batch_jobs[subgraph_id].job_id
+            return {
+                node_id: {
+                    "process_id": "load_result",
+                    "arguments": {"id": job_id},
+                }
+            }
+
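+        # `split_streaming` yields sub-jobs in dependency order, so by the time
+        # `get_replacement` needs a subgraph's job id, that batch job is already in `batch_jobs`.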
+        for sjob_id, subjob, subjob_dependencies in splitter.split_streaming(
+            process_graph=process["process_graph"], get_replacement=get_replacement
+        ):
+            subjobs[sjob_id] = subjob
+            dependencies[sjob_id] = subjob_dependencies
+            try:
+                # TODO: how to handle job creation errors here? Fail the whole partitioned job, or try to finish what is possible?
+                con = self._backends.get_connection(subjob.backend_id)
+                with con.authenticated_from_request(request=flask.request), con.override(
+                    default_timeout=CONNECTION_TIMEOUT_JOB_START
+                ):
+                    with TimingLogger(title=f"Create batch job {pjob_id=}:{sjob_id} on {con.id=}", logger=_log.info):
+                        job = con.create_job(
+                            process_graph=subjob.process_graph,
+                            title=f"Crossbackend job {pjob_id}:{sjob_id}",
+                            plan=metadata.get("plan"),
+                            budget=metadata.get("budget"),
+                            additional=job_options,
+                        )
+                        _log.info(f"Created {pjob_id}:{sjob_id} on backend {con.id} as batch job {job.job_id}")
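+                # Register the new batch job so `get_replacement` can reference it from
+                # later sub-jobs that depend on this subgraph.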
+                batch_jobs[sjob_id] = job
+                title = f"Partitioned job {pjob_id=} {sjob_id=}"
+                self._db.insert_sjob(
+                    user_id=user_id,
+                    pjob_id=pjob_id,
+                    sjob_id=sjob_id,
+                    subjob=subjob,
+                    title=title,
+                    status=STATUS_CREATED,
+                )
+                self._db.set_backend_job_id(
+                    user_id=user_id, pjob_id=pjob_id, sjob_id=sjob_id, job_id=job.job_id
+                )
+                create_stats[STATUS_CREATED] += 1
+            except Exception as exc:
+                _log.error(f"Creation of {pjob_id}:{sjob_id} failed", exc_info=True)
+                msg = f"Create failed: {exc}"
+                self._db.set_sjob_status(
+                    user_id=user_id, pjob_id=pjob_id, sjob_id=sjob_id, status=STATUS_ERROR, message=msg
+                )
+                create_stats[STATUS_ERROR] += 1
+
+        # TODO: this is currently unused, don't bother building it at all?
+        partitioned_job = PartitionedJob(
+            process=process, metadata=metadata, job_options=job_options, subjobs=subjobs, dependencies=dependencies
+        )
+
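+        # The partitioned job as a whole counts as created if at least one sub-job was created.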
+        pjob_status = STATUS_CREATED if create_stats[STATUS_CREATED] > 0 else STATUS_ERROR
+        self._db.set_pjob_status(
+            user_id=user_id, pjob_id=pjob_id, status=pjob_status, message=repr(create_stats), progress=0
+        )
+
+        return pjob_id
+
     def create_sjobs(self, user_id: str, pjob_id: str, flask_request: flask.Request):
         """Create all sub-jobs on remote back-end for given partitioned job"""
         pjob_metadata = self._db.get_pjob_metadata(user_id=user_id, pjob_id=pjob_id)
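For context, here is a rough sketch of how this new entry point could be driven. It is not part of the diff: the `manager` and `user` objects, the collection-to-backend mapping and the process graph are made up for illustration, and the `CrossBackendSplitter` constructor argument is an assumption about the accompanying crossbackend module, not a confirmed API.

# Hypothetical usage sketch; `manager`, `user` and the process graph are illustrative,
# and the `backend_for_collection` argument is an assumption, not part of this change.
process = {
    "process_graph": {
        "lc1": {"process_id": "load_collection", "arguments": {"id": "S2"}},
        "lc2": {"process_id": "load_collection", "arguments": {"id": "S1"}},
        "merge": {
            "process_id": "merge_cubes",
            "arguments": {"cube1": {"from_node": "lc1"}, "cube2": {"from_node": "lc2"}},
            "result": True,
        },
    }
}
splitter = CrossBackendSplitter(
    # Assumption: callable that maps a collection id to the back-end that should handle it.
    backend_for_collection=lambda collection_id: {"S2": "b1", "S1": "b2"}[collection_id],
)
pjob_id = manager.create_crossbackend_pjob(
    user_id=user.user_id,
    process=process,
    metadata={"title": "Cross-backend example"},
    job_options=None,
    splitter=splitter,
)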