
Commit abfbccf

Read the clean currents in
1 parent 4dc7277 commit abfbccf

File tree

6 files changed (+453, -5 lines)


docs/current_denoising/README.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
Additional documentation for the library in `src/current_denoising/`
docs/current_denoising/generation/ioutils.md

Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
ioutils
====

Additional documentation for the io utils in `current_denoising/`.

read_clean_currents
----

This function is more complicated than `read_currents` (which just reads in a whole dat file
and is for reading in noisy currents). The dat files containing the (clean) CMIP simulations
hold the data for many different runs, models, and start years, so the correct one needs to be
specified. This is done with the year/model/name parameters; the name is a special string (e.g.
r1i1p1f1_gn).

It is assumed that the currents don't change much within 5 years, so we have model outputs at
a granularity of every 5 years. The provided year in the metadata is the start year of the run.

#### r (realization_index)
An integer (≥1) distinguishing among members of an ensemble of simulations that
differ only in their initial conditions (e.g., initialized from different points
in a control run). Note that if two different simulations were started from the
same initial conditions, the same realization number should be used for both simulations.
For example, if a historical run with "natural forcing" only and another historical
run that includes anthropogenic forcing were both spawned at the same point in a
control run, both should be assigned the same realization. Also, each so-called
RCP (future scenario) simulation should normally be assigned the same realization
integer as the historical run from which it was initiated.
This allows users to easily splice together the appropriate historical and future runs.

#### i (initialization_index)
An integer (≥1), which should be assigned a value of 1 except to distinguish
simulations performed under the same conditions but with different initialization
procedures. In CMIP6 this index should invariably be assigned the value 1
except for some hindcast and forecast experiments called for by the DCPP activity.
The initialization_index can be used either to distinguish between different
algorithms used to impose initial conditions on a forecast or to distinguish
between different observational datasets used to initialize a forecast.

#### p (physics_index)
An integer (≥1) identifying the physics version used by the model. In the
usual case of a single physics version of a model, this argument should normally
be assigned the value 1, but it is essential that a consistent assignment of
physics_index be used across all simulations performed by a particular model.
Use of "physics_index" is reserved for closely-related model versions (e.g., as
in a "perturbed physics" ensemble) or for the same model run with slightly
different parameterizations (e.g., of cloud physics). Model versions that are
substantially different from one another should be given a different "source_id"
(rather than simply assigning a different value of the physics_index).

#### f (forcing_index)
An integer (≥1) used to distinguish runs conforming to the protocol of a single
CMIP6 experiment, but with different variants of forcing applied. One can, for
example, distinguish between two historical simulations, one forced with the
CMIP6-recommended forcing data sets and another forced by a different dataset,
which might yield information about how forcing uncertainty affects the simulation.

#### Gridding
A grid-label suffix is used to distinguish between the gridding conventions used:
- grid_label = "gn" (output is reported on the native grid, usually but not invariably at grid cell centers)
- grid_label = "gr" (output is not reported on the native grid, but instead is regridded by the modeling group to a "primary grid" of its choosing)
- grid_label = "gm" (global mean output is reported, so data are not gridded)
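Putting the four indices and the grid label together gives variant names like `r1i1p1f1_gn`. A minimal sketch of splitting such a name back into its parts (the `parse_variant_label` helper and its regex are illustrative, not part of the library):

```python
import re


def parse_variant_label(name: str) -> dict:
    """Split a CMIP6-style variant label like 'r1i1p1f1_gn' into its
    realization/initialization/physics/forcing indices and grid label."""
    match = re.fullmatch(r"r(\d+)i(\d+)p(\d+)f(\d+)_(g[nrm])", name)
    if match is None:
        raise ValueError(f"Unrecognised variant label: {name}")
    r, i, p, f, grid = match.groups()
    return {"r": int(r), "i": int(i), "p": int(p), "f": int(f), "grid": grid}


print(parse_variant_label("r1i1p1f1_gn"))
# {'r': 1, 'i': 1, 'p': 1, 'f': 1, 'grid': 'gn'}
```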

pyproject.toml

Lines changed: 6 additions & 1 deletion
@@ -9,7 +9,6 @@ authors = [
 requires-python = ">=3.10"
 dependencies = [
-    "nbdime>=4.0.2",
 ]

 [tool.uv]
@@ -27,22 +26,28 @@ core = [
     "ipykernel>=6.29.5",
     "jupyter>=1.1.1",
     "numpy>=2.2.6",
+    "pandas>=2.3.2",
 ]

 # Test dependencies - a minimal set of dependencies that let us run the tests
 # This enables us to run tests without installing all the dependencies, which is much quicker
+# Basically this just gets us to avoid installing torch
+# Ideally we wouldn't repeat these things in the core dependencies,
+# but I need to figure out how to do it better
 test = [
     "numpy>=2.2.6",
     "pytest>=8.4.1",
     "scikit-image>=0.25.2",
     "matplotlib>=3.10.3",
+    "pandas>=2.3.2",
 ]

 # Formatting, linting, etc.
 # Useful for development, but not required to run the code
 dev = [
     "black>=25.1.0",
     "pylint>=3.3.7",
+    "nbdime>=4.0.2",
 ]

 [build-system]

src/current_denoising/generation/ioutils.py

Lines changed: 164 additions & 0 deletions
@@ -3,8 +3,12 @@
 """

 import pathlib
+from functools import cache

 import numpy as np
+import pandas as pd
+
+from ..plotting.maps import lat_long_grid


 class IOError(Exception):
@@ -43,6 +47,166 @@ def read_currents(path: pathlib.Path) -> np.ndarray:
     return np.flipud(data.reshape(shape))


+@cache
+def _read_clean_current_metadata(metadata_path: pathlib.Path) -> pd.DataFrame:
+    """
+    Read the metadata file for the clean currents .dat file
+
+    :returns: metadata dataframe; model/name/year
+    """
+    # First line is the number of models/runs
+    with open(metadata_path, "r") as f:
+        num_runs = int(f.readline().strip())
+        df = pd.read_csv(f, names=["model", "name", "year"], sep=r"\s+")
+
+    if len(df) != num_runs:
+        raise IOError(
+            f"Metadata file {metadata_path} has {len(df)} rows, but first line says {num_runs}"
+        )
+
+    return df
+
+
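The metadata parser above expects a count on the first line followed by whitespace-separated model/name/year rows. A sketch of that layout with made-up contents (the real file lives alongside the .dat file on the RDSF):

```python
import io

import pandas as pd

# Made-up metadata contents: first line is the number of runs, then
# whitespace-separated model / name / year columns
text = """2
ACCESS-CM2 r1i1p1f1_gn 1950
ACCESS-CM2 r1i1p1f1_gn 1955
"""

f = io.StringIO(text)
num_runs = int(f.readline().strip())
df = pd.read_csv(f, names=["model", "name", "year"], sep=r"\s+")

assert len(df) == num_runs
print(df.iloc[0]["model"], df.iloc[0]["year"])  # ACCESS-CM2 1950
```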
+def _coriolis_parameter(latitudes: np.ndarray) -> np.ndarray:
+    """
+    Calculate the coriolis parameter at each latitude
+    """
+    omega = 7.2921e-5
+    torad = np.pi / 180.0
+
+    return 2 * omega * np.sin(latitudes * torad)
+
+
+def current_speed_from_mdt(mdt: np.ndarray) -> np.ndarray:
+    """
+    Convert geodetic MDT to currents.
+
+    By assuming geostrophic balance, we can take the gradient of the MDT to get the steady-state
+    currents.
+    This requires us to work out the coriolis parameter at each latitude, and to take the gradient
+    of the MDT.
+
+    :param mdt: the mean dynamic topography, in metres, covering the globe.
+
+    :returns: the current speed in m/s
+
+    """
+    g = 9.80665
+    torad = np.pi / 180.0
+    R = 6_371_229.0
+
+    # Find the grid spacing (in m)
+    lats, longs = lat_long_grid(mdt.shape)
+    dlat = np.abs(lats[1] - lats[0]) * torad * R
+    dlong = (
+        np.abs(longs[1] - longs[0]) * torad * R * np.cos(torad * lats)[:, np.newaxis]
+    )
+
+    # Find the coriolis parameter at each latitude
+    f = _coriolis_parameter(lats)
+
+    # Velocities are gradients * coriolis param for geostrophic balance
+    dmdt_dlat = np.gradient(mdt, axis=0) / dlat
+    dmdt_dlon = np.gradient(mdt, axis=1) / dlong
+
+    # u should be negative, but it doesn't matter for speed
+    u = g / f[:, np.newaxis] * dmdt_dlat
+    v = g / f[:, np.newaxis] * dmdt_dlon
+
+    return np.sqrt(u**2 + v**2)
+
+
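For a purely meridional MDT slope, the geostrophic relation used above reduces to speed = (g / |f|) · |d(MDT)/dy|. A quick self-contained check, with an illustrative latitude and slope (the values are made up, not from the data):

```python
import numpy as np

# Illustrative check of the geostrophic speed formula:
# for an MDT sloping only north-south, speed = (g / |f|) * |d(MDT)/dy|
g = 9.80665        # gravitational acceleration, m/s^2
omega = 7.2921e-5  # Earth's rotation rate, rad/s

lat = 45.0                               # degrees; illustrative
f = 2 * omega * np.sin(np.radians(lat))  # coriolis parameter, ~1.03e-4 s^-1
slope = 1e-6                             # MDT gradient, metres per metre; illustrative

speed = g / f * slope
print(round(speed, 4))  # ~0.0951 m/s
```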
+def read_clean_currents(
+    path: pathlib.Path,
+    metadata_path: pathlib.Path,
+    *,
+    year: int,
+    model: str = "ACCESS-CM2",
+    name: str = "r1i1p1f1_gn",
+) -> np.ndarray:
+    """
+    Read clean current data from a .dat file.
+
+    Read a .dat file containing clean current data,
+    given the model/name/year, returning a 720x1440 numpy array giving the current
+    in m/s.
+    Sets land grid points to np.nan.
+    Since the clean current data is stored in a large file containing multiple years and models,
+    we need to choose the correct one.
+
+    Notes on the name convention from the CMIP6 documentation can be found in
+    docs/current_denoising/generation/ioutils.md, or in the original at
+    https://docs.google.com/document/d/1h0r8RZr_f3-8egBMMh7aqLwy3snpD6_MrDz1q8n5XUk.
+
+    :param path: location of the .dat file; clean current data is located in
+        data/projects/SING/richard_stuff/Table2/clean_currents/ on the RDSF
+    :param metadata_path: location of the metadata .csv file describing the contents of the .dat file
+    :param year: start of the 5-year period for which to extract data
+    :param model: the climate model to use
+    :param name: the model variant to use. Name follows the convention
+        {realisation/initialisation/physics/forcing}_grid
+
+    :returns: a numpy array holding current speeds
+    :raises ValueError: if the requested year/model/name is not found in the metadata
+    :raises IOError: if the file is malformed, or has a different length to that expected
+        from the metadata
+
+    """
+    metadata = _read_clean_current_metadata(metadata_path)
+
+    # The dat file contains a header (record length), then the record, then a footer (record length)
+    # We want to find the number of bytes to skip to get to the correct record, which
+    # corresponds to the row number in the metadata file
+
+    # Find the row in the metadata file
+    row = metadata[
+        (metadata["year"] == year)
+        & (metadata["model"] == model)
+        & (metadata["name"] == name)
+    ]
+    if len(row) == 0:
+        raise ValueError(
+            f"Could not find entry for {model=}, {name=}, {year=} in metadata"
+        )
+    if len(row) > 1:
+        raise ValueError(
+            f"Found multiple entries for {model=}, {name=}, {year=} in metadata"
+        )
+
+    # This tells us how many records to skip
+    row_index = row.index[0]
+
+    with open(path, "rb") as f:
+        n_bytes_per_record = np.fromfile(f, dtype=np.int32, count=1)[0]
+
+        # Add the header + footer
+        n_bytes_per_record += 8
+
+        # Check the file is the right size, based on the metadata
+        expected_size = int(n_bytes_per_record) * len(metadata)
+        f.seek(0, 2)  # Seek to end of file
+        actual_size = f.tell()
+        if actual_size != expected_size:
+            raise IOError(
+                f"File size {actual_size} does not match expected {expected_size} from metadata"
+            )
+
+        offset = row_index * n_bytes_per_record
+
+        f.seek(offset)
+        header = np.fromfile(f, dtype=np.int32, count=1)[0]
+        if header + 8 != n_bytes_per_record:
+            raise IOError(
+                f"Record length marker {header} does not match expected {n_bytes_per_record - 8}"
+            )
+
+        retval = np.fromfile(f, dtype="<f4", count=header // 4)
+
+    retval[retval == -1.9e19] = np.nan
+
+    # Make it look right
+    retval = np.flipud(retval.reshape((720, 1440)))
+
+    return current_speed_from_mdt(retval)
+
+
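The header/payload/footer layout the function walks through matches Fortran's sequential unformatted records: a 4-byte record length, the data, then the length repeated. A minimal round-trip sketch with made-up data (the real records hold 720x1440 float32 grids):

```python
import io

import numpy as np

# Write one Fortran-style sequential record: int32 length, payload, int32 length
payload = np.arange(6, dtype="<f4")               # made-up record contents
length = np.array([payload.nbytes], dtype="<i4")  # record length in bytes

buf = io.BytesIO()
buf.write(length.tobytes())   # header
buf.write(payload.tobytes())  # the record itself
buf.write(length.tobytes())   # footer

# Read it back the same way read_clean_currents does
buf.seek(0)
header = np.frombuffer(buf.read(4), dtype="<i4")[0]
data = np.frombuffer(buf.read(header), dtype="<f4")
footer = np.frombuffer(buf.read(4), dtype="<i4")[0]

assert header == footer == payload.nbytes
print(data)  # [0. 1. 2. 3. 4. 5.]
```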
 def _included_indices(
     n_rows: int, tile_size: int, max_latitude: float
 ) -> tuple[int, int]:
