@@ -32,6 +32,7 @@
 import pkg_resources
 import pystac
 import requests
+import reretry
 import shapely.geometry.base
 from deprecated import deprecated
 from geopyspark import LayerType, Pyramid, TiledRasterLayer
@@ -96,6 +97,7 @@
     k8s_get_batch_job_cfg_secret_name,
     truncate_user_id_k8s,
 )
+from openeogeotrellis.integrations.s3proxy.asset_urls import PresignedS3AssetUrls
 from openeogeotrellis.integrations.stac import ResilientStacIO
 from openeogeotrellis.integrations.traefik import Traefik
 from openeogeotrellis.integrations.yarn_jobrunner import YARNBatchJobRunner
@@ -2082,6 +2084,12 @@ def as_boolean_arg(job_option_key: str, default_value: str) -> str:
             )
             log.info(f"mapped job_id {job_id} to application ID {spark_app_id}")
             dbl_registry.set_application_id(job_id=job_id, user_id=user_id, application_id=spark_app_id)
+            dbl_registry.set_results_metadata_uri(
+                job_id=job_id,
+                user_id=user_id,
+                results_metadata_uri=f"s3://{bucket}/{str(job_work_dir).strip('/')}/{JOB_METADATA_FILENAME}",
+            )
+
             status_response = {}
             retry = 0
             while "status" not in status_response and retry < 10:
@@ -2112,9 +2120,24 @@ def as_boolean_arg(job_option_key: str, default_value: str) -> str:
         runner = YARNBatchJobRunner(principal=self._principal, key_tab=self._key_tab)
         runner.set_default_sentinel_hub_credentials(self._default_sentinel_hub_client_id,self._default_sentinel_hub_client_secret)
         vault_token = None if sentinel_hub_client_alias == 'default' else get_vault_token(sentinel_hub_client_alias)
-        application_id = runner.run_job(job_info, job_id, job_work_dir=self.get_job_work_dir(job_id=job_id), log=log, user_id=user_id, api_version=api_version,proxy_user=proxy_user or job_info.get('proxy_user',None), vault_token=vault_token)
+        job_work_dir = self.get_job_work_dir(job_id=job_id)
+        application_id = runner.run_job(
+            job_info,
+            job_id,
+            job_work_dir=job_work_dir,
+            log=log,
+            user_id=user_id,
+            api_version=api_version,
+            proxy_user=proxy_user or job_info.get("proxy_user", None),
+            vault_token=vault_token,
+        )
         with self._double_job_registry as dbl_registry:
             dbl_registry.set_application_id(job_id=job_id, user_id=user_id, application_id=application_id)
+            dbl_registry.set_results_metadata_uri(
+                job_id=job_id,
+                user_id=user_id,
+                results_metadata_uri=f"file://{job_work_dir}/{JOB_METADATA_FILENAME}",
+            )
             dbl_registry.set_status(job_id=job_id, user_id=user_id, status=JOB_STATUS.QUEUED)
 
 
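Both queueing paths now record, at submission time, where the results metadata file will eventually land: the Kubernetes branch points into object storage, the YARN branch onto shared disk. A minimal sketch of the two URI forms, using hypothetical values for `job_work_dir` and `bucket`; `JOB_METADATA_FILENAME` is the backend's constant for the metadata file name (assumed to be `"job_metadata.json"` here):

```python
# Sketch with hypothetical values, mirroring the f-strings in the diff above.
JOB_METADATA_FILENAME = "job_metadata.json"            # assumed constant value
job_work_dir = "/data/projects/openeo/j-20240101abc"   # hypothetical work dir
bucket = "openeo-data"                                 # hypothetical S3 bucket

# Kubernetes branch: metadata ends up in object storage.
s3_uri = f"s3://{bucket}/{str(job_work_dir).strip('/')}/{JOB_METADATA_FILENAME}"
assert s3_uri == "s3://openeo-data/data/projects/openeo/j-20240101abc/job_metadata.json"

# YARN branch: metadata ends up on shared (NFS) disk.
file_uri = f"file://{job_work_dir}/{JOB_METADATA_FILENAME}"
assert file_uri == "file:///data/projects/openeo/j-20240101abc/job_metadata.json"
```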
@@ -2564,31 +2587,14 @@ def get_result_assets(self, job_id: str, user_id: str) -> Dict[str, dict]:
 
         :return: A mapping between a filename and a dict containing information about that file.
         """
-        job_info = self.get_job_info(job_id=job_id, user_id=user_id)
-        if job_info.status != JOB_STATUS.FINISHED:
-            raise JobNotFinishedException
-
-        job_dir = self.get_job_output_dir(job_id=job_id)
+        with self._double_job_registry as registry:
+            job_dict = registry.get_job(job_id=job_id, user_id=user_id)
 
-        results_metadata = None
+        if job_dict["status"] != JOB_STATUS.FINISHED:
+            raise JobNotFinishedException
 
-        if logger.isEnabledFor(logging.DEBUG) and not ConfigParams().use_object_storage:
-            # debug/assert what looks like some kind of NFS latency on Terrascope
-            debuggable_results_metadata = self.load_results_metadata(job_id=job_id, user_id=user_id)
-            if debuggable_results_metadata:  # otherwise, will have logged a warning elsewhere
-                logger.debug(f"successfully loaded results metadata {debuggable_results_metadata}", extra={"job_id": job_id})
+        results_metadata = self.load_results_metadata(job_id, user_id, job_dict)
 
-        try:
-            with self._double_job_registry as registry:
-                job_dict = registry.elastic_job_registry.get_job(job_id, user_id=user_id)
-                if "results_metadata" in job_dict:
-                    results_metadata = job_dict["results_metadata"]
-        except Exception as e:
-            logger.warning(
-                "Could not retrieve result metadata from job tracker %s", e, exc_info=True, extra={"job_id": job_id}
-            )
-        if results_metadata is None or len(results_metadata) == 0:
-            results_metadata = self.load_results_metadata(job_id, user_id)
         out_assets = results_metadata.get("assets", {})
         out_metadata = out_assets.get("out", {})
         bands = [Band(*properties) for properties in out_metadata.get("bands", [])]
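With this hunk, `get_result_assets` simply delegates to `load_results_metadata`, which (as the last hunk below shows) resolves the metadata in a fixed order. A condensed, self-contained sketch of that order; the two callables are hypothetical stand-ins for the real branches of `_load_results_metadata_from_file`:

```python
from typing import Callable, Optional

def resolve_results_metadata(
    job_dict: dict,
    load_from_uri: Callable[[str], dict],      # hypothetical: reads a file:// or s3:// URI
    load_from_output_dir: Callable[[], dict],  # hypothetical: legacy job-dir fallback
) -> dict:
    metadata: Optional[dict] = None
    # 1. URI recorded at submission time (new in this change)
    if "results_metadata_uri" in job_dict:
        metadata = load_from_uri(job_dict["results_metadata_uri"])
    # 2. inline copy kept in the job registry document
    if not metadata and "results_metadata" in job_dict:
        metadata = job_dict["results_metadata"]
    # 3. legacy fallback: derive the path from the job output directory
    if not metadata:
        metadata = load_from_output_dir()
    return metadata
```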
@@ -2610,6 +2616,8 @@ def get_result_assets(self, job_id: str, user_id: str) -> Dict[str, dict]:
         # container that ran the job can already be gone.
         # We only want to apply the cases below when we effectively have a job directory:
         # it should exist and should be a directory.
+        job_dir = self.get_job_output_dir(job_id=job_id)
+
         if job_dir.is_dir():
             if os.path.isfile(job_dir / 'out'):
                 results_dict['out'] = {
@@ -2662,32 +2670,89 @@ def get_result_assets(self, job_id: str, user_id: str) -> Dict[str, dict]:
     def get_results_metadata_path(self, job_id: str) -> Path:
         return self.get_job_output_dir(job_id) / JOB_METADATA_FILENAME
 
-    def load_results_metadata(self, job_id: str, user_id: str) -> dict:
+    def load_results_metadata(self, job_id: str, user_id: str, job_dict: dict = None) -> dict:
+        if job_dict is None:
+            with self._double_job_registry as registry:
+                job_dict = registry.get_job(job_id=job_id, user_id=user_id)
+
+        results_metadata = None
+
+        if "results_metadata_uri" in job_dict:
+            results_metadata = self._load_results_metadata_from_file(job_id, job_dict["results_metadata_uri"])  # TODO: expose a getter?
+
+        if not results_metadata and "results_metadata" in job_dict:
+            logger.debug("Loading results metadata from job registry", extra={"job_id": job_id})
+            results_metadata = job_dict["results_metadata"]
+
+        if not results_metadata:
+            results_metadata = self._load_results_metadata_from_file(job_id, results_metadata_uri=None)
+
+        return results_metadata
+
+    def _load_results_metadata_from_file(self, job_id: str, results_metadata_uri: Optional[str]) -> dict:
         """
-        Reads the metadata json file from the job directory and returns it.
+        Reads the metadata json file either from the job directory or an explicit URI and returns it.
         """
 
-        metadata_file = self.get_results_metadata_path(job_id=job_id)
-
-        if ConfigParams().use_object_storage:
+        def try_get_results_metadata_from_object_storage(path: Union[Path, str], bucket: Optional[str]) -> dict:
             try:
-                contents = get_s3_file_contents(str(metadata_file))
+                contents = get_s3_file_contents(path, bucket)
                 return json.loads(contents)
             except Exception:
                 logger.warning(
-                    "Could not retrieve result metadata from object storage %s",
-                    metadata_file, exc_info=True,
-                    extra={'job_id': job_id})
+                    "Could not retrieve result metadata from object storage %s in bucket %s",
+                    path,
+                    bucket or "[default]",
+                    exc_info=True,
+                    stack_info=True,
+                    extra={"job_id": job_id},
+                )
 
-        try:
-            with open(metadata_file) as f:
-                return json.load(f)
-        except FileNotFoundError:
-            logger.warning("Could not derive result metadata from %s", metadata_file, exc_info=True,
-                           stack_info=True,
-                           extra={'job_id': job_id})
-
-        return {}
+            return {}
+
+        def try_get_results_metadata_from_disk(path: Union[Path, str]) -> dict:
+            @reretry.retry(
+                exceptions=FileNotFoundError,
+                logger=logger,
+                **get_backend_config().read_results_metadata_file_retry_settings,
+            )
+            def read_results_metadata_file():
+                with open(path) as f:
+                    return json.load(f)
+
+            try:
+                return read_results_metadata_file()
+            except FileNotFoundError:
+                logger.warning(
+                    "Could not derive result metadata from %s",
+                    path,
+                    exc_info=True,
+                    stack_info=True,
+                    extra={"job_id": job_id},
+                )
+
+            return {}
+
+        if results_metadata_uri:
+            logger.debug("Loading results metadata from %s", results_metadata_uri, extra={"job_id": job_id})
+            uri_parts = urlparse(results_metadata_uri)
+
+            if uri_parts.scheme == "file":
+                return try_get_results_metadata_from_disk(uri_parts.path)
+            elif uri_parts.scheme == "s3":
+                bucket, key = PresignedS3AssetUrls.get_bucket_key_from_uri(results_metadata_uri)
+                return try_get_results_metadata_from_object_storage(key, bucket)
+            else:
+                raise NotImplementedError(results_metadata_uri)
+
+        metadata_file = self.get_results_metadata_path(job_id=job_id)
+
+        logger.debug("Loading results metadata from %s", metadata_file, extra={"job_id": job_id})
+
+        if ConfigParams().use_object_storage:
+            return try_get_results_metadata_from_object_storage(metadata_file, bucket=None)
+
+        return try_get_results_metadata_from_disk(metadata_file)
 
     def _get_providers(self, job_id: str, user_id: str) -> List[dict]:
         results_metadata = self.load_results_metadata(job_id, user_id)
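The disk path above retries on `FileNotFoundError` via the newly imported `reretry` decorator, replacing the earlier one-shot `open()` that raced against what the removed debug block called "NFS latency on Terrascope". The settings come from `get_backend_config().read_results_metadata_file_retry_settings`, whose actual value is defined in the backend config and not shown in this diff. A minimal sketch, assuming reretry's standard `tries`/`delay`/`backoff` keywords and a hypothetical settings dict:

```python
import json
import reretry

# Hypothetical stand-in for get_backend_config().read_results_metadata_file_retry_settings;
# reretry accepts the same keyword arguments as the classic `retry` package.
retry_settings = {
    "tries": 5,    # give up after 5 attempts
    "delay": 1,    # wait 1 second before the first retry
    "backoff": 2,  # then 2s, 4s, 8s, ...
}

@reretry.retry(exceptions=FileNotFoundError, **retry_settings)
def read_results_metadata_file(path: str) -> dict:
    # Retried while the metadata file has not yet appeared on shared storage.
    with open(path) as f:
        return json.load(f)
```

Note also that for `file://` URIs, `urlparse("file:///data/out.json").path` yields the plain filesystem path `"/data/out.json"`, which is exactly what `try_get_results_metadata_from_disk` receives in the dispatch above.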