import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import pandas as pd
import glob
import re
import pytz
import sklearn.metrics as metrics
from . import general_utils, plot_util, crps_util, stats_util
from .logging_util import get_slug, debug, error, info
import matplotlib.dates as mdates
import utide as ut
import scipy.signal as signal


class TidegaugeMultiple:
    """
    This is an object for storage and manipulation of multiple tide gauges
    in a single dataset. This may require some processing of the observations,
    such as interpolation to a common time step.

    This object's dataset should take the form:

        Dimensions:
            id    : The locations dimension. Each port (location) has an index
            t_dim : The time dimension. Each datapoint at each port has an index

        Coordinates:
            longitude (id)    : Longitude values for each port index
            latitude  (id)    : Latitude values for each port index
            time      (t_dim) : Time values for each time index (datetime)

    An example data variable could be ssh, or ntr (non-tidal residual). This
    object can also be used for other instrument types, not just tide gauges;
    for example, moorings.

    Every port index for this object should use the same time coordinates.
    Therefore, timeseries need to be aligned before being placed into the
    object. If any padding is needed, NaNs should be used. NaNs should also
    be used for quality control/data rejection.
    """

    def __init__(self):
        return
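
    # A minimal sketch of building a dataset in the expected form. This is
    # illustrative only: the ports, times and ssh values below are made-up
    # assumptions, not data or helpers from this package.
    #
    #     times = pd.date_range("2020-01-01", periods=24, freq="H")
    #     ds = xr.Dataset(
    #         coords=dict(
    #             longitude=("id", [-3.0, -4.1]),
    #             latitude=("id", [50.1, 53.4]),
    #             time=("t_dim", times),
    #         ),
    #         data_vars=dict(ssh=(["id", "t_dim"], np.random.randn(2, 24))),
    #     )
    #     tg = TidegaugeMultiple()
    #     tg.dataset = ds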

    ##############################################################################
    ###                ~            Plotting             ~                     ###
    ##############################################################################

    ##############################################################################
    ###                ~        Model Comparison         ~                     ###
    ##############################################################################

    ##############################################################################
    ###                ~            Analysis             ~                     ###
    ##############################################################################

    def match_missing_values(self, other, fill_value=np.nan):
        """
        Matches any missing values between two TidegaugeMultiple datasets.
        Where missing values (defined by fill_value) are found in either
        dataset, they are also placed in the corresponding location in the
        other dataset. Returns two new TidegaugeMultiple objects containing
        the masked ssh data. Both datasets must contain an ssh variable and
        only ssh will be masked.
        """

        ds_self = self.dataset
        ds_other = other.dataset

        ssh_self = ds_self.ssh
        ssh_other = ds_other.ssh

        # Ensure a common dimension order of (id, t_dim)
        if ssh_other.dims[0] == "t_dim":
            ssh_other = ssh_other.transpose()
        if ssh_self.dims[0] == "t_dim":
            ssh_self = ssh_self.transpose()

        # Locate missing values in each dataset
        if np.isnan(fill_value):
            ind_self = ssh_self.isnull()
            ind_other = ssh_other.isnull()
        else:
            ind_self = ssh_self == fill_value
            ind_other = ssh_other == fill_value

        # Mask each dataset where the other has missing values
        ds_self["ssh"] = ssh_self.where(~ind_other)
        ds_other["ssh"] = ssh_other.where(~ind_self)

        tg_self = TidegaugeMultiple()
        tg_other = TidegaugeMultiple()

        tg_self.dataset = ds_self
        tg_other.dataset = ds_other

        return tg_self, tg_other
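
    # Illustrative use (the obs and model names are assumptions): mask each
    # dataset wherever the other has gaps, so statistics compare like with like.
    #
    #     obs_masked, model_masked = obs.match_missing_values(model)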

    def harmonic_analysis_utide(
        self, min_datapoints=1000, nodal=False, trend=False, method="ols", conf_int="linear", Rayleigh_min=0.95
    ):
        """
        Does a harmonic analysis for each timeseries inside this object using
        the utide library. All arguments except min_datapoints are passed to
        ut.solve(). Please see the utide website for more information:

        https://pypi.org/project/UTide/

        By default, utide does its harmonic analysis using a set of harmonics
        determined by the Rayleigh criterion. This changes the number of
        harmonics depending on the length and frequency of the time series.

        Output from this routine is not a new dataset, but a list of utide
        analysis objects. These are structures containing, amongst other
        things, amplitudes, phases, constituent names and confidence
        intervals. This list can be passed to reconstruct_tide_utide() in
        this object to create a new TidegaugeMultiple object containing
        reconstructed tide data.

        INPUTS
         min_datapoints : If a time series has fewer valid datapoints than
                          this number, it is omitted from the analysis.
         <all others>   : Inputs to utide.solve(). See website above.

        OUTPUTS
         A list of utide structures from the solve() routine. If a location
         is omitted, its entry will be an empty list [].
        """
        ds = self.dataset
        n_port = ds.dims["id"]
        n_time = ds.dims["t_dim"]
        # Harmonic analysis datenums
        time = mdates.date2num(ds.time.values)

        analyses = []

        for pp in range(n_port):

            # Temporary in-loop dataset for this port
            ds_port = ds.isel(id=pp).load()
            ssh = ds_port.ssh

            number_of_nan = np.sum(np.isnan(ssh.values))

            # Skip ports that are all NaN or have too few valid datapoints
            if number_of_nan == n_time:
                analyses.append([])
                continue

            if (n_time - number_of_nan) < min_datapoints:
                debug(f"Port {pp} omitted: only {n_time - number_of_nan} valid datapoints")
                analyses.append([])
                continue

            # Do harmonic analysis using UTide
            uts_obs = ut.solve(
                time,
                ssh.values,
                lat=ssh.latitude.values,
                nodal=nodal,
                trend=trend,
                method=method,
                conf_int=conf_int,
                Rayleigh_min=Rayleigh_min,
            )

            analyses.append(uts_obs)

        return analyses
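
    # A minimal sketch of inspecting one solution, assuming utide's returned
    # Bunch exposes constituent names in .name and amplitudes in .A (the tg
    # variable and the port index are illustrative):
    #
    #     analyses = tg.harmonic_analysis_utide(min_datapoints=744)
    #     sol = analyses[0]
    #     if len(sol) > 0:
    #         m2_amp = sol.A[list(sol.name).index("M2")]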

    def reconstruct_tide_utide(self, utide_solution_list, constit=None):
        """
        Reconstructs tidal signals for each port using a list of utide
        solutions, as returned by harmonic_analysis_utide(). The constituents
        to reconstruct can be limited using the constit argument, which is
        passed to utide.reconstruct(). Ports with empty solutions are left as
        NaN. Returns a new TidegaugeMultiple object containing an ssh_tide
        variable.
        """

        ds = self.dataset
        n_port = ds.dims["id"]
        n_time = ds.dims["t_dim"]
        # Reconstruction datenums
        time = mdates.date2num(ds.time.values)

        coords = xr.Dataset(ds[list(ds.coords.keys())])
        reconstructed = np.full((n_port, n_time), np.nan)

        for pp in range(n_port):

            # Reconstruct full tidal signal for this port
            pp_solution = utide_solution_list[pp]

            if len(pp_solution) == 0:
                continue

            tide = np.array(ut.reconstruct(time, pp_solution, constit=constit).h)
            reconstructed[pp] = tide

        coords["ssh_tide"] = (["id", "t_dim"], reconstructed)
        tg_return = TidegaugeMultiple()
        tg_return.dataset = coords

        return tg_return
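
    # Illustrative sketch (the constituent subset is an assumption): rebuild
    # only the dominant semidiurnal tide from an earlier analysis.
    #
    #     tg_tide = tg.reconstruct_tide_utide(analyses, constit=["M2", "S2"])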

    def calculate_residuals(self, tg_tide, apply_filter=True, window_length=25, polyorder=3):
        """
        Calculates non-tidal residuals (NTR) by subtracting the reconstructed
        tide in tg_tide (its ssh_tide variable) from this object's ssh. If
        apply_filter is True, each residual series is smoothed with a
        Savitzky-Golay filter using the given window_length and polyorder.
        Returns a new TidegaugeMultiple object containing an ntr variable.
        """

        # NTR: Calculate non-tidal residuals
        ntr = self.dataset.ssh - tg_tide.dataset.ssh_tide
        n_port = self.dataset.dims["id"]

        # NTR: Apply filter if wanted
        if apply_filter:
            for pp in range(n_port):
                ntr[pp, :] = signal.savgol_filter(ntr[pp, :], window_length, polyorder)

        coords = xr.Dataset(self.dataset[list(self.dataset.coords.keys())])
        coords["ntr"] = ntr

        tg_return = TidegaugeMultiple()
        tg_return.dataset = coords
        return tg_return
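
    # Illustrative use: for hourly data, the default window_length=25 smooths
    # over roughly one day. window_length should be odd and must be greater
    # than polyorder for scipy's savgol_filter.
    #
    #     tg_ntr = tg.calculate_residuals(tg_tide, window_length=25, polyorder=3)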

    def threshold_statistics(self, thresholds=np.arange(-0.4, 2, 0.1), peak_separation=12):
        """
        Calculates threshold statistics for each data variable and each port
        in the dataset. For every threshold value, this counts peaks over the
        threshold (peaks being at least peak_separation time steps apart),
        time steps spent over the threshold, and daily/monthly maxima over
        the threshold. Returns a new xarray Dataset with a threshold
        dimension.
        """

        ds = self.dataset
        ds_thresh = xr.Dataset(ds[list(ds.coords.keys())])
        ds_thresh["threshold"] = ("threshold", thresholds)
        var_list = list(ds.keys())
        n_thresholds = len(thresholds)
        n_port = ds.dims["id"]

        for vv in var_list:

            # Allocate a separate array per output variable so they do not
            # share memory
            ds_thresh["peak_count_" + vv] = (["id", "threshold"], np.full((n_port, n_thresholds), np.nan))
            ds_thresh["time_over_threshold_" + vv] = (["id", "threshold"], np.full((n_port, n_thresholds), np.nan))
            ds_thresh["dailymax_count_" + vv] = (["id", "threshold"], np.full((n_port, n_thresholds), np.nan))
            ds_thresh["monthlymax_count_" + vv] = (["id", "threshold"], np.full((n_port, n_thresholds), np.nan))

            for pp in range(n_port):

                debug(f"threshold_statistics: {vv}, port {pp}")

                # Identify peaks for threshold analysis
                data_pp = ds[vv].isel(id=pp)
                if np.sum(np.isnan(data_pp.values)) == ds.dims["t_dim"]:
                    continue

                pk_ind, _ = signal.find_peaks(data_pp.values, distance=peak_separation)
                pk_values = data_pp[pk_ind]

                # Calculate daily and monthly maxima once per port
                ds_daily_max = data_pp.groupby("time.day").max(skipna=True)
                ds_monthly_max = data_pp.groupby("time.month").max(skipna=True)

                # Threshold analysis
                for nn in range(n_thresholds):

                    threshn = thresholds[nn]

                    # Threshold frequency (peaks over threshold)
                    ds_thresh["peak_count_" + vv][pp, nn] = np.sum(pk_values.values >= threshn)

                    # Threshold integral (time over threshold)
                    ds_thresh["time_over_threshold_" + vv][pp, nn] = np.sum(data_pp.values >= threshn)

                    # Number of daily maxima over threshold
                    ds_thresh["dailymax_count_" + vv][pp, nn] = np.sum(ds_daily_max.values >= threshn)

                    # Number of monthly maxima over threshold
                    ds_thresh["monthlymax_count_" + vv][pp, nn] = np.sum(ds_monthly_max.values >= threshn)

        return ds_thresh
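
    # Illustrative use (the threshold range is an assumption): count NTR
    # events exceeding each level between 0 and 1 m, with peaks at least
    # 12 time steps apart.
    #
    #     ds_stats = tg_ntr.threshold_statistics(thresholds=np.arange(0, 1, 0.1))
    #     print(ds_stats.peak_count_ntr.isel(id=0).values)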

    def demean_timeseries(self):
        """Subtracts the time mean from every timeseries and returns the resulting xarray Dataset."""
        return self.dataset - self.dataset.mean(dim="t_dim")

    def obs_operator(self, gridded, time_interp="nearest"):
        """
        Maps a model Gridded object onto this set of tide gauges. For each
        port, the nearest wet model grid cell is found, the model timeseries
        there is extracted, and it is interpolated onto the observation times
        using the given time_interp method. The returned TidegaugeMultiple
        object also contains interp_dist, the haversine distance between each
        port and the model cell used.
        """

        gridded = gridded.dataset
        ds = self.dataset

        # Determine spatial indices
        print("Calculating spatial indices.", flush=True)
        ind_x, ind_y = general_utils.nearest_indices_2d(
            gridded.longitude, gridded.latitude, ds.longitude, ds.latitude, mask=gridded.landmask
        )

        # Extract spatial time series
        print("Calculating time indices.", flush=True)
        extracted = gridded.isel(x_dim=ind_x, y_dim=ind_y)
        extracted = extracted.swap_dims({"dim_0": "id"})

        # Compute data (takes a while...)
        print("Indexing model data at tide gauge locations...", flush=True)
        extracted.load()

        # Check interpolation distances
        print("Calculating interpolation distances.", flush=True)
        interp_dist = general_utils.calculate_haversine_distance(
            extracted.longitude, extracted.latitude, ds.longitude.values, ds.latitude.values
        )

        # Interpolate model onto obs times
        print("Interpolating in time...", flush=True)
        extracted = extracted.rename({"time": "t_dim"})
        extracted = extracted.interp(t_dim=ds.time.values, method=time_interp)

        # Put interp_dist into dataset
        extracted["interp_dist"] = interp_dist
        extracted = extracted.rename_vars({"t_dim": "time"})

        tg_out = TidegaugeMultiple()
        tg_out.dataset = extracted
        return tg_out
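
    # Illustrative use (the nemo variable is an assumption): extract model
    # ssh at the gauge locations from a COAsT Gridded object.
    #
    #     tg_model = tg.obs_operator(nemo, time_interp="nearest")
    #     print(tg_model.dataset.interp_dist.values)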

    def difference(self, other, absolute_diff=True, square_diff=True):
        """
        Differences this object's variables with those in other, returning a
        new TidegaugeMultiple object of diff_* variables and, optionally,
        their absolute values (abs_*) and squares (square_*).
        """

        differenced = self.dataset - other.dataset
        diff_vars = list(differenced.keys())
        save_coords = list(self.dataset.coords.keys())

        for vv in diff_vars:
            differenced = differenced.rename({vv: "diff_" + vv})

        if absolute_diff:
            abs_tmp = np.fabs(differenced)
            diff_vars = list(abs_tmp.keys())
            for vv in diff_vars:
                abs_tmp = abs_tmp.rename({vv: "abs_" + vv})
        else:
            abs_tmp = xr.Dataset()

        if square_diff:
            sq_tmp = np.square(differenced)
            diff_vars = list(sq_tmp.keys())
            for vv in diff_vars:
                sq_tmp = sq_tmp.rename({vv: "square_" + vv})
        else:
            sq_tmp = xr.Dataset()

        differenced = xr.merge((differenced, abs_tmp, sq_tmp, self.dataset[save_coords]))

        return_differenced = TidegaugeMultiple()
        return_differenced.dataset = differenced

        return return_differenced
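
    # An illustrative end-to-end comparison sketch (the obs_masked and
    # model_masked names are assumptions carried over from the
    # match_missing_values example above):
    #
    #     tg_diff = obs_masked.difference(model_masked)
    #     rmse = np.sqrt(tg_diff.dataset.square_diff_ssh.mean(dim="t_dim"))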