@mgrover1 comment, add some extra tests, allow zarr version specification in the data format

charles-turner-1 · charles-turner-1 · commit 1c17ffc8d32b · 2025-08-11T11:59:08.000+08:00
diff --git a/docs/source/reference/esm-catalog-spec.md b/docs/source/reference/esm-catalog-spec.md
@@ -85,13 +85,13 @@ The column names can optionally be associated with a controlled vocabulary, such
 
 An assets object describes the columns in the CSV file relevant for opening the actual data files.
 
-| Element            | Type   | Description                                                                                                                                                                                                            |
-| ------------------ | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| column_name        | string | **REQUIRED.** The name of the column containing the path to the asset. Must be in the header of the CSV file.                                                                                                          |
-| format             | string | The data format. Valid values are `netcdf`, `zarr`, `opendap` or `reference` ([`kerchunk`](https://github.com/fsspec/kerchunk) reference files). If specified, it means that all data in the catalog is the same type. |
-| format_column_name | string | The column name which contains the data format, allowing for variable data types in one catalog. Mutually exclusive with `format`.                                                                                     |
+| Element            | Type   | Description                                                                                                                                                                                                                              |
+| ------------------ | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| column_name        | string | **REQUIRED.** The name of the column containing the path to the asset. Must be in the header of the CSV file.                                                                                                                            |
+| format             | string | The data format. Valid values are `netcdf`, `zarr`, `zarr2`, `zarr3`, `opendap` or `reference` ([`kerchunk`](https://github.com/fsspec/kerchunk) reference files). If specified, it means that all data in the catalog is the same type. |
+| format_column_name | string | The column name which contains the data format, allowing for variable data types in one catalog. Mutually exclusive with `format`.                                                                                                       |
 
-```{note}
+````{note}
  Zarr v3 is built on asynchronous operations, and requires `xarray_open_kwargs` to contain the following dictionary fragment:
  ```python
    xarray_open_kwargs ={
@@ -103,19 +103,24 @@ An assets object describes the columns in the CSV file relevant for opening the
    },
    ...
  }
- ``````
- In contrast, Zarr v2 is synchronous and instead requires:
- ```python
-   xarray_open_kwargs ={
-     "storage_options" : {
-       "remote_options" : {
-         "async": false,
-         ...
-       }
-   },
-   ...
- }
- ```
+````
+
+In contrast, Zarr v2 is synchronous and instead requires:
+
+```python
+  xarray_open_kwargs ={
+    "storage_options" : {
+      "remote_options" : {
+        "async": false,
+        ...
+      }
+  },
+  ...
+}
+```
+
+If `zarr2` or `zarr3` is specified in the `format` field, the `async` flag will be set automatically. If you specify `zarr` as the format, you must set the `async` flag manually in the `xarray_open_kwargs`.
+
 ```
 
 ### Aggregation Control Object
@@ -137,3 +142,4 @@ An aggregation object describes types of operations done during the aggregation
 | type           | string | **REQUIRED.** Type of aggregation operation to apply. Valid values include: `join_new`, `join_existing`, `union`                                                                                                                                                                                                                                                                                     |
 | attribute_name | string | Name of attribute (column) across which to aggregate.                                                                                                                                                                                                                                                                                                                                                |
 | options        | object | **OPTIONAL.** Aggregration settings that are passed as keywords arguments to [`xarray.concat()`](https://xarray.pydata.org/en/stable/generated/xarray.concat.html) or [`xarray.merge()`](https://xarray.pydata.org/en/stable/generated/xarray.merge.html#xarray.merge). For `join_existing`, it must contain the name of the existing dimension to use (for e.g.: something like `{'dim': 'time'}`). |
+```
diff --git a/intake_esm/cat.py b/intake_esm/cat.py
@@ -54,6 +54,8 @@ class AggregationType(str, enum.Enum):
 class DataFormat(str, enum.Enum):
     netcdf = 'netcdf'
     zarr = 'zarr'
+    zarr2 = 'zarr2'
+    zarr3 = 'zarr3'
     reference = 'reference'
     opendap = 'opendap'
 
diff --git a/intake_esm/source.py b/intake_esm/source.py
@@ -9,7 +9,7 @@
 from intake.source.base import DataSource, Schema
 
 from .cat import Aggregation, DataFormat
-from .utils import OPTIONS
+from .utils import OPTIONS, _set_async_flag
 
 
 class ConcatenationWarning(UserWarning):
@@ -23,7 +23,7 @@ class ESMDataSourceError(Exception):
 def _get_xarray_open_kwargs(data_format, xarray_open_kwargs=None, storage_options=None):
     xarray_open_kwargs = (xarray_open_kwargs or {}).copy()
     _default_open_kwargs = {
-        'engine': 'zarr' if data_format in {'zarr', 'reference'} else 'netcdf4',
+        'engine': 'zarr' if data_format in {'zarr', 'zarr2', 'zarr3', 'reference'} else 'netcdf4',
         'chunks': {},
         'backend_kwargs': {},
         'decode_timedelta': False,
@@ -40,6 +40,8 @@ def _get_xarray_open_kwargs(data_format, xarray_open_kwargs=None, storage_option
     ):
         xarray_open_kwargs['backend_kwargs']['storage_options'] = {} or storage_options
 
+    xarray_open_kwargs = _set_async_flag(data_format, xarray_open_kwargs)
+
     return xarray_open_kwargs
 
 
diff --git a/intake_esm/utils.py b/intake_esm/utils.py
@@ -7,6 +7,12 @@
 import polars as pl
 import zarr
 
+__all__ = [
+    'OPTIONS',
+    'set_options',
+    '_set_async_flag',
+]
+
 
 def show_versions(file=sys.stdout):  # pragma: no cover
     """print the versions of intake-esm and its dependencies.
@@ -67,6 +73,50 @@ def _zarr_async() -> bool:
     return int(zarr.__version__.split('.')[0]) > 2
 
 
+def _set_async_flag(data_format: str, xarray_open_kwargs: dict) -> dict:
+    """
+    Set the ``asynchronous`` storage flag for the ``zarr2``/``zarr3`` formats.
+
+    For those two formats the value returned by ``_zarr_async()`` is written to
+    ``xarray_open_kwargs['backend_kwargs']['storage_options']['remote_options']
+    ['asynchronous']``. Any other format is returned untouched.
+
+    NOTE(review): the flag follows the installed zarr package version (via
+    ``_zarr_async()``), not the ``2``/``3`` suffix of ``data_format`` - confirm.
+
+    Parameters
+    ----------
+    data_format : str
+        The asset data format, e.g. ``'zarr'``, ``'zarr2'`` or ``'zarr3'``.
+    xarray_open_kwargs : dict
+        The xarray open kwargs dictionary that may contain storage options.
+
+    Returns
+    -------
+    dict
+        The xarray open kwargs with the async flag set. All other entries, at
+        every nesting level, are kept; the input dictionaries are not mutated.
+    """
+    if data_format not in {'zarr2', 'zarr3'}:
+        return xarray_open_kwargs
+
+    # Copy each nesting level before writing to it, so that existing entries
+    # (such as other ``storage_options`` keys) are merged with the flag rather
+    # than replaced, and the caller's dictionaries are left as they were. A
+    # missing or ``None`` level is treated as an empty dict.
+    updated = dict(xarray_open_kwargs)
+    backend_kwargs = dict(updated.get('backend_kwargs') or {})
+    storage_options = dict(backend_kwargs.get('storage_options') or {})
+    remote_options = dict(storage_options.get('remote_options') or {})
+
+    remote_options['asynchronous'] = _zarr_async()
+    storage_options['remote_options'] = remote_options
+    backend_kwargs['storage_options'] = storage_options
+    updated['backend_kwargs'] = backend_kwargs
+
+    return updated
+
+
 OPTIONS = {
     'attrs_prefix': 'intake_esm_attrs',
     'dataset_key': 'intake_esm_dataset_key',
diff --git a/requirements.txt b/requirements.txt
@@ -10,4 +10,5 @@ pydantic>=2.0
 pydap!=3.5.5
 requests>=2.24.0
 xarray>=2024.10
-zarr>=3.0.10
+# Allow zarr 2.x or zarr 3.1.0+
+zarr!=3.0.*
diff --git a/tests/sample-catalogs/cesm1-lens-zarr2.json b/tests/sample-catalogs/cesm1-lens-zarr2.json
@@ -0,0 +1,35 @@
+{
+  "esmcat_version": "0.1.0",
+  "id": "sample-cesm1-lens-zarr2",
+  "description": "This is a sample ESM catalog for CESM1-LENS data in zarr v2 format",
+  "catalog_file": "./tests/sample-catalogs/cesm1-lens-aws-zarr.csv",
+  "attributes": [
+    {
+      "column_name": "experiment",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "component",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "frequency",
+      "vocabulary": ""
+    },
+    { "column_name": "variable", "vocabulary": "" }
+  ],
+  "assets": {
+    "column_name": "path",
+    "format": "zarr2"
+  },
+  "aggregation_control": {
+    "variable_column_name": "variable",
+    "groupby_attrs": ["component", "experiment", "frequency"],
+    "aggregations": [
+      {
+        "type": "union",
+        "attribute_name": "variable"
+      }
+    ]
+  }
+}
diff --git a/tests/sample-catalogs/cesm1-lens-zarr3.json b/tests/sample-catalogs/cesm1-lens-zarr3.json
@@ -0,0 +1,35 @@
+{
+  "esmcat_version": "0.1.0",
+  "id": "sample-cesm1-lens-zarr3",
+  "description": "This is a sample ESM catalog for CESM1-LENS data in zarr v3 format",
+  "catalog_file": "./tests/sample-catalogs/cesm1-lens-aws-zarr.csv",
+  "attributes": [
+    {
+      "column_name": "experiment",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "component",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "frequency",
+      "vocabulary": ""
+    },
+    { "column_name": "variable", "vocabulary": "" }
+  ],
+  "assets": {
+    "column_name": "path",
+    "format": "zarr3"
+  },
+  "aggregation_control": {
+    "variable_column_name": "variable",
+    "groupby_attrs": ["component", "experiment", "frequency"],
+    "aggregations": [
+      {
+        "type": "union",
+        "attribute_name": "variable"
+      }
+    ]
+  }
+}
diff --git a/tests/test_cat.py b/tests/test_cat.py
@@ -21,11 +21,19 @@
     sample_pl_df,
     zarr_cat_aws_cesm,
     zarr_cat_pangeo_cmip6,
+    zarr_v2_cat,
+    zarr_v3_cat,
 )
 
 
 @pytest.mark.parametrize(
-    'column_name, format, format_column_name', [('test', 'zarr', None), ('test', 'netcdf', None)]
+    'column_name, format, format_column_name',
+    [
+        ('test', 'zarr', None),
+        ('test', 'zarr2', None),
+        ('test', 'zarr3', None),
+        ('test', 'netcdf', None),
+    ],
 )
 def test_assets(column_name, format, format_column_name):
     a = Assets(column_name=column_name, format=format, format_column_name=format_column_name)
@@ -53,6 +61,8 @@ def test_assets_mutually_exclusive():
         cdf_cat_sample_cmip6_noagg,
         cdf_cat_sample_cesmle,
         multi_variable_cat,
+        zarr_v2_cat,
+        zarr_v3_cat,
     ],
 )
 @pytest.mark.flaky(max_runs=3, min_passes=1)  # Cold start related failures
diff --git a/tests/test_utils.py b/tests/test_utils.py
diff --git a/tests/utils.py b/tests/utils.py