Skip to content

Commit f350ef8

Browse files
committed
Replace dependency on tar on system with tarfile module
1 parent dda9909 commit f350ef8

File tree

2 files changed

+34
-16
lines changed

2 files changed

+34
-16
lines changed

examples/example_import_ERA5.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from metocean_api import ts
22

33
# Define TimeSeries-object
4-
df_ts = ts.TimeSeries(lon=109.94, lat=15.51,start_time='2000-01-01', end_time='2000-12-31' , product='ERA5',
4+
df_ts = ts.TimeSeries(lon=109.94, lat=15.51,start_time='2000-01-01', end_time='2000-12-31' , product='ERA5',
55
variable=[ 'significant_height_of_combined_wind_waves_and_swell',
66
'mean_wave_direction',
77
'peak_wave_period'])
88

99

10-
#df_ts = ts.TimeSeries(lon=6, lat=55.7,start_time='2012-01-01', end_time='2012-12-31' , product='GTSM',
10+
#df_ts = ts.TimeSeries(lon=6, lat=55.7,start_time='2012-01-01', end_time='2012-01-31' , product='GTSM',
1111
# variable=['storm_surge_residual','tidal_elevation','total_water_level'])
1212

1313
# list of wind and wave parameters in ERA5:
@@ -20,7 +20,7 @@
2020
# ]
2121

2222
# Import data from thredds.met.no and save it as csv
23-
df_ts.import_data(save_csv=True)
23+
df_ts.import_data(save_csv=True,use_cache=True)
2424

2525
# Load data from a local csv-file
2626
#df_ts.load_data(local_file=df_ts.datafile)

metocean_api/ts/internal/ec/ec_products.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22
from typing import TYPE_CHECKING, Tuple, List
33
import os
4-
import subprocess
4+
import tarfile
55
import pandas as pd
66
import xarray as xr
77
import numpy as np
@@ -99,7 +99,7 @@ def __download_era5_from_cds(self,start_time, end_time, lon, lat, variable, fol
9999
Downloads ERA5 data from the Copernicus Climate Data Store for a
100100
given point and time period
101101
"""
102-
import cdsapi
102+
import cdsapi #Optional dependency
103103
start_time = pd.Timestamp(start_time)
104104
end_time = pd.Timestamp(end_time)
105105
c = cdsapi.Client()
@@ -164,19 +164,15 @@ def import_data(self, ts: TimeSeries, save_csv=True, save_nc=False, use_cache=Fa
164164
"tidal_elevation",
165165
"total_water_level",
166166
]
167-
filenames = self.__download_gtsm_from_cds(ts.start_time, ts.end_time, ts.variable, folder='cache')
167+
filenames = self.__download_gtsm_from_cds(ts.start_time, ts.end_time, ts.variable, folder='cache',use_cache=use_cache)
168168

169169
if not isinstance(filenames, list):
170170
filenames = [filenames]
171171

172172
all_nc_files = []
173173
for filename in filenames:
174-
temppath = os.path.dirname(filename)
175174
# Unpack the tar.gz file.
176-
nc_files = subprocess.run(['tar', '-ztf', filename], stdout=subprocess.PIPE, check=True).stdout.decode('utf-8').split('\n')[0:-1]
177-
nc_files = sorted([ff.strip('\r') for ff in nc_files])
178-
subprocess.run(['tar', '-xzvf', filename, '--directory', temppath], stdout=subprocess.PIPE, check=True) # Extract tar file
179-
all_nc_files.extend([os.path.join(temppath, file) for file in nc_files])
175+
all_nc_files.extend(self.__unpack_files(filename,use_cache))
180176

181177
# Open multiple netCDF files as a single xarray dataset
182178
with xr.open_mfdataset(all_nc_files) as ds:
@@ -193,19 +189,37 @@ def import_data(self, ts: TimeSeries, save_csv=True, save_nc=False, use_cache=Fa
193189

194190
return df
195191

196-
def __download_gtsm_from_cds(self,start_time, end_time, variable, folder='cache') -> str:
192+
def __unpack_files(self, filename, use_cache) -> List[str]:
    """Extract the netCDF members of a downloaded CDS tar archive.

    Uses the stdlib ``tarfile`` module (replacing the former dependency on a
    system ``tar`` binary) and extracts into the same folder as the archive.

    Parameters:
        filename: path to the downloaded ``.tar``/``.tar.gz`` archive.
        use_cache: when True, members already present on disk are reused
            instead of being extracted again.

    Returns:
        Paths of the ``.nc`` files belonging to the archive, in archive order.

    Raises:
        RuntimeError: if the archive cannot be opened or extracted — seen with
            corrupted/partial downloads from CDS; the message suggests deleting
            the file so it is downloaded again.
    """
    temppath = os.path.dirname(filename)
    files_from_tar = []
    # NOTE: tarfile.open is inside the try so that a corrupted archive that
    # fails at open time is also converted into the explanatory RuntimeError.
    try:
        with tarfile.open(filename, "r:*") as tar:
            for member in tar.getmembers():
                if not member.name.endswith(".nc"):
                    continue
                nc_file = os.path.join(temppath, member.name)
                if use_cache and os.path.exists(nc_file):
                    print(f"Reusing cached file {nc_file}")
                else:
                    print(f"Extracting {member.name} from {filename}")
                    tar.extract(member, path=temppath)
                files_from_tar.append(nc_file)
    except (tarfile.TarError, OSError) as e:
        # Seen with corrupted tar files from CDS
        raise RuntimeError(
            f"Error extracting {filename}: {e}, consider deleting the file to download it again"
        ) from e
    return files_from_tar
210+
211+
def __download_gtsm_from_cds(self,start_time, end_time, variable, folder='cache',use_cache=True) -> List[str]:
197212
"""
198213
Downloads GTSM model water level data from the Copernicus Climate Data Store for a
199214
given point and time period
200215
"""
201-
import cdsapi
216+
import cdsapi #Optional dependency
202217
filename = []
203218
filename_list = []
204219
start_time = pd.Timestamp(start_time)
205220
end_time = pd.Timestamp(end_time)
206221
c = cdsapi.Client()
207222

208-
209223
days = pd.date_range(start=start_time , end=end_time, freq='D')
210224
years = days.year
211225
years = years.unique()
@@ -241,8 +255,12 @@ def __download_gtsm_from_cds(self,start_time, end_time, variable, folder='cache
241255

242256
}
243257
print('Download variable:',var, year)
244-
c.retrieve('sis-water-level-change-timeseries-cmip6', cds_command, filename)
245-
filename_list.append(filename)
258+
if use_cache and os.path.exists(filename):
259+
print(f'Reusing cached file {filename}')
260+
else:
261+
print(f'Download variable {var} for year {year}')
262+
c.retrieve('sis-water-level-change-timeseries-cmip6', cds_command, filename)
263+
filename_list.append(filename)
246264
return filename_list
247265

248266
def download_temporary_files(self, ts: TimeSeries, use_cache: bool = False) -> Tuple[List[str], float, float]:

0 commit comments

Comments
 (0)