From 9ca15821515516ccd01935eb0e90873970a03cdd Mon Sep 17 00:00:00 2001 From: Guilherme Castelao Date: Mon, 30 Jan 2023 16:16:52 -0700 Subject: [PATCH 01/16] Collect meta files and group by basename Implementing collect_meta(). Let's process per variable, so the first step is to collect all meta and group per variable. --- xmitgcm/dev.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 xmitgcm/dev.py diff --git a/xmitgcm/dev.py b/xmitgcm/dev.py new file mode 100644 index 0000000..2c1fb46 --- /dev/null +++ b/xmitgcm/dev.py @@ -0,0 +1,20 @@ + +import os + +from .utils import parse_meta_file + + +def collect_meta(path: str): + + output = {} + for m in (m for m in os.listdir(path) if m[-5:] == ".meta"): + try: + meta = parse_meta_file(os.path.join(path, m)) + if meta["basename"] not in output: + output[meta["basename"]] = [] + meta["filename"] = m + output[meta["basename"]].append(meta) + except: + print(f"Failed parsing: {m}") + + return output From 0b96244560f1997fad5ae9cd968b09c4afeb2408 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Sat, 29 Jul 2023 07:42:03 -0700 Subject: [PATCH 02/16] Renaming to demo I'll collect everything here until I understand where it fits in the library. --- xmitgcm/{dev.py => demo.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename xmitgcm/{dev.py => demo.py} (100%) diff --git a/xmitgcm/dev.py b/xmitgcm/demo.py similarity index 100% rename from xmitgcm/dev.py rename to xmitgcm/demo.py From 962cacc02f1e34f0f618aa01c7282acc87fdd1b1 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Sat, 29 Jul 2023 07:46:23 -0700 Subject: [PATCH 03/16] A hardcoded DataFile Understanding and validating the connections between zarr and MITgcm's binaries. Once I validated a working reader, I can start to generalize it. 
--- xmitgcm/demo.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/xmitgcm/demo.py b/xmitgcm/demo.py index 2c1fb46..25272c8 100644 --- a/xmitgcm/demo.py +++ b/xmitgcm/demo.py @@ -18,3 +18,56 @@ def collect_meta(path: str): print(f"Failed parsing: {m}") return output + + +class DataFile(UserDict): + def get(self, *args, **kwargs): + print(f"get: {args}") + # import pdb; pdb.set_trace() + return super().get(*args) + + def __getitem__(self, key): + print(f"getitem: {key}") + # if "S/0.0.0" not in self: + # super().__setitem__("S/0.0.0", [f"{path_mitgcm}/S/0.0.0", 0, 4800]) + + if key == ".zgroup": + return json.dumps({"zarr_format": 2}) + elif key == "S/.zattrs": + # How to guess this? + return json.dumps({ + "_ARRAY_DIMENSIONS": ["lon", "lat", "depth"] + }) + elif key == "S/.zarray": + return json.dumps({ + "chunks": [ 20, 30, 2 ], + "compressor": None, # fixed + "dtype": ">f4", + "fill_value": "NaN", # fixed + "filters": None, # fixed + "order": "C", # fixed + "shape": [ 20, 30, 2 ], + "zarr_format": 2 # fixed + }) + # return [f"{path_mitgcm}/S/.zarray", 0, 356] + elif key == "S/0.0.0": + import pdb; pdb.set_trace() + # size can guess from chunks * dtype + # super().__setitem__("S/0.0.0", [f"{path_mitgcm}/S.0000000000.001.001.data", 0, 4800]) + return [f"{path_mitgcm}/S.0000000000.001.001.data", 0, 4800] + return super().__getitem__(key) + + def __contains__(self, item): + print(f"contains: {item}") + if item in (".zgroup", "S/.zattrs", "S/.zarray", "S/0.0.0"): + return True + else: + print(f"Checking on {item}") + # if item == "S/0.0.0": + # # import pdb; pdb.set_trace() + # print(super().__contains__(item)) + return super().__contains__(item) + + def __iter__(self): + print("iter") + yield from (".zgroup", "S/.zattrs", "S/.zarray", "S/0.0.0") From 341594801583f7986ef84ef943cd2a3fd0bd2461 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Thu, 31 Aug 2023 14:00:02 -0600 Subject: [PATCH 04/16] Prototype of VarZ 
(hardcoded) Test driven development. Let's start with a hardcoded behavior and later generalize it as I understand what is required. --- xmitgcm/mds_tilez.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 xmitgcm/mds_tilez.py diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py new file mode 100644 index 0000000..c06e665 --- /dev/null +++ b/xmitgcm/mds_tilez.py @@ -0,0 +1,33 @@ + +from collections.abc import Container +import json + +class VarZ(): + def __init__(self, path: str, varname: str): + self.path = path + self.varname = varname + self._ndims = 1 # From ["time"] + + def __getitem__(self, key): + print(f"getitem: {key}") + if key == ".zattrs": + # How to guess this? + return json.dumps({ + # "_ARRAY_DIMENSIONS": ["lon", "lat", "depth"] + "_ARRAY_DIMENSIONS": ["time"] + }) + elif key == ".zarray": + return json.dumps({ + # "chunks": [ 20, 30, 2 ], + "chunks": [ 2 ], + "compressor": None, # fixed + # "dtype": ">f4", + "dtype": " Date: Thu, 31 Aug 2023 14:19:07 -0600 Subject: [PATCH 05/16] Starting with a coordinate (1D variable) --- xmitgcm/mds_tilez.py | 2 +- xmitgcm/test/test_mds_tilez.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 xmitgcm/test/test_mds_tilez.py diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index c06e665..a359a44 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -14,7 +14,7 @@ def __getitem__(self, key): # How to guess this? 
return json.dumps({ # "_ARRAY_DIMENSIONS": ["lon", "lat", "depth"] - "_ARRAY_DIMENSIONS": ["time"] + "_ARRAY_DIMENSIONS": ["lat"] }) elif key == ".zarray": return json.dumps({ diff --git a/xmitgcm/test/test_mds_tilez.py b/xmitgcm/test/test_mds_tilez.py new file mode 100644 index 0000000..705fe76 --- /dev/null +++ b/xmitgcm/test/test_mds_tilez.py @@ -0,0 +1,27 @@ + +import json +import pytest +import tempfile + +from xmitgcm.mds_tilez import VarZ + +@pytest.fixture(scope="session") +def var_lat(tmp_path_factory): + return tmp_path_factory.mktemp("lat") + + +def test_var_zattrs(var_lat): + v = VarZ(var_lat, 'lat') + + attrs = v['.zattrs'] + attrs = json.loads(attrs) + assert "_ARRAY_DIMENSIONS" in attrs + assert attrs["_ARRAY_DIMENSIONS"] == ["lat"] + + +def test_var_zarray(var_lat): + v = VarZ(var_lat, 'lat') + + attrs = v['.zarray'] + attrs = json.loads(attrs) + assert "chunks" in attrs From 6b42c35a74314cd25719e8ad874474e7d2198234 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Thu, 31 Aug 2023 15:20:00 -0600 Subject: [PATCH 06/16] Retrieving 1D values Using lat as an easy case to start. 
--- xmitgcm/mds_tilez.py | 13 +++++++++---- xmitgcm/test/test_mds_tilez.py | 9 +++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index a359a44..cdeedec 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -2,11 +2,12 @@ from collections.abc import Container import json +import numpy as np + class VarZ(): def __init__(self, path: str, varname: str): self.path = path self.varname = varname - self._ndims = 1 # From ["time"] def __getitem__(self, key): print(f"getitem: {key}") @@ -19,15 +20,19 @@ def __getitem__(self, key): elif key == ".zarray": return json.dumps({ # "chunks": [ 20, 30, 2 ], - "chunks": [ 2 ], + "chunks": [ 3 ], "compressor": None, # fixed # "dtype": ">f4", - "dtype": " Date: Sun, 3 Sep 2023 07:10:05 -0700 Subject: [PATCH 07/16] Identifying requirements for VarZ Still using a hardcoded case to understand the requirements for VarZ. --- xmitgcm/mds_tilez.py | 50 +++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index cdeedec..a637e3c 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -10,14 +10,27 @@ def __init__(self, path: str, varname: str): self.varname = varname def __getitem__(self, key): - print(f"getitem: {key}") - if key == ".zattrs": - # How to guess this? - return json.dumps({ - # "_ARRAY_DIMENSIONS": ["lon", "lat", "depth"] - "_ARRAY_DIMENSIONS": ["lat"] - }) - elif key == ".zarray": + print(f"VarZ.getitem: {key}") + if key == "lat/.zattrs": + return self._zattrs() + elif key == "lat/.zarray": + return self._zarray() + elif key == "lat/0": + return [ f"{self.path}/lat/0", 0, 24] + # np.array([11.,12.,13.]).tobytes() + + def __iter__(self): + print(f"VarZ.__iter__()") + yield from ['lat/.zattrs', 'lat/.zarray', 'lat/0'] + + def _zattrs(self): + # How to guess this? 
+ return json.dumps({ + # "_ARRAY_DIMENSIONS": ["lon", "lat", "depth"] + "_ARRAY_DIMENSIONS": ["lat"] + }) + + def _zarray(self): return json.dumps({ # "chunks": [ 20, 30, 2 ], "chunks": [ 3 ], @@ -31,8 +44,21 @@ def __getitem__(self, key): "shape": [ 3 ], "zarr_format": 2 # fixed }) - elif key == "0": - lat = np.array([10.,11.,12.]).tobytes() - return lat - return super().__getitem__(key) + def items(self): + print(f"VarZ.items()") + yield from [ + # ('lat/.zattrs', ['/Users/castelao/work/projects/others/MIT_tiles/data/example/lat/.zattrs', 0, 50]), + ('lat/.zattrs', self._zattrs()), + ('lat/.zarray', self._zarray()), + ('lat/0', [f"{self.path}/lat/0", 0, 24]) + ] + + def values(self): + print(f"VarZ.values()") + yield from [ + self._zattrs(), + self._zarray(), + [f"{self.path}/lat/0", 0, 24] + ] + From 38fbbc51833d7352e3cbe3de8913e32c2850c4f0 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Sun, 15 Oct 2023 10:59:10 -0600 Subject: [PATCH 08/16] Prototype of a Chunk --- xmitgcm/mds_tilez.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index a637e3c..640935e 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -1,8 +1,34 @@ from collections.abc import Container +from collections import UserDict +from dataclasses import dataclass import json +import os.path +from fsspec.implementations.reference import ReferenceFileSystem import numpy as np +import xarray as xr + +from xmitgcm.utils import parse_meta_file + +@dataclass() +class Chunk(): + """ + Handles one chunk of one variable. 
For instance, it could be: + S.0000000000.001.001.data + S.0000000000.001.001.meta + + metafilename = "/Users/castelao/work/projects/others/MIT_tiles/data/mitgcm/S.0000000000.001.001.meta" + """ + root: str + varname: str + + def __fspath__(self): + return os.path.join(self.root, "S.0000000000.001.001.data") + + def from_meta(filename): + metadata = parse_meta_file(filename) + class VarZ(): def __init__(self, path: str, varname: str): From 1445cda37642b15386d6bc04e77cd2a7f43865c3 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Tue, 17 Oct 2023 18:53:34 -0600 Subject: [PATCH 09/16] Testing new concept for Chunks() --- xmitgcm/mds_tilez.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index 640935e..fd9c491 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -20,11 +20,37 @@ class Chunk(): metafilename = "/Users/castelao/work/projects/others/MIT_tiles/data/mitgcm/S.0000000000.001.001.meta" """ - root: str - varname: str + filename: str + metadata: dict def __fspath__(self): - return os.path.join(self.root, "S.0000000000.001.001.data") + return os.path.join(self.root, self._fdata) + + @property + def varnames(self): + try: + return self._varnames + except: + if "fldList" in metadata: + self._varnames = metadata['fldList'] + else: + self._varnames = os.path.basename(filename).split('.')[0] + return self._varnames + + @property + def index(self): + idx = [] + for d in metadata['dimList']: + idx.append(str((d[1] - 1) // (d[2] - d[1]+1))) + + return ".".join(idx) + + @property + def labels(self): + size = np.prod([(d[2] - d[1] + 1) for d in self.metadata['dimList']]) + size *= self.metadata['dataprec'].itemsize # f32 + + return {f"{v}/{self.index}": [self.filename, i*size, (i+1)*size] for i,v in enumerate(self.varnames)} def from_meta(filename): metadata = parse_meta_file(filename) From 958eb6ceefce121ed06a3f02369792acd0cf5247 Mon Sep 17 00:00:00 2001 From: Gui 
Castelao Date: Sun, 29 Oct 2023 22:51:46 -0600 Subject: [PATCH 10/16] Refactoring Chunk Trying a different concept and adding a few resources. --- xmitgcm/mds_tilez.py | 109 ++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 68 deletions(-) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index fd9c491..ea2901a 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -26,91 +26,64 @@ class Chunk(): def __fspath__(self): return os.path.join(self.root, self._fdata) + def __getitem__(self, key): + return self._labels[key] + + @property + def _labels(self): + size = np.prod([(d[2] - d[1] + 1) for d in self.metadata['dimList']]) + size *= self.metadata['dataprec'].itemsize # f32 + + # return {f"{v}/{self.index}": [self.filename, i*size, (i+1)*size] for i,v in enumerate(self.varnames)} + return {v: [self.filename, i*size, (i+1)*size] for i,v in enumerate(self.varnames)} + @property def varnames(self): try: return self._varnames except: - if "fldList" in metadata: - self._varnames = metadata['fldList'] + if "fldList" in self.metadata: + self._varnames = self.metadata['fldList'] else: - self._varnames = os.path.basename(filename).split('.')[0] + self._varnames = os.path.basename(self.filename).split('.')[0] return self._varnames + @property + def dtype(self): + return self.metadata['dataprec'].str + @property def index(self): idx = [] - for d in metadata['dimList']: + for d in self.metadata['dimList']: idx.append(str((d[1] - 1) // (d[2] - d[1]+1))) return ".".join(idx) @property - def labels(self): - size = np.prod([(d[2] - d[1] + 1) for d in self.metadata['dimList']]) - size *= self.metadata['dataprec'].itemsize # f32 - - return {f"{v}/{self.index}": [self.filename, i*size, (i+1)*size] for i,v in enumerate(self.varnames)} - - def from_meta(filename): - metadata = parse_meta_file(filename) + def missing_value(self): + if self.metadata['dataprec'].kind == 'i': + return int(self.metadata['missingValue']) + else: + return 
float(self.metadata['missingValue']) + @property + def shape(self): + return tuple(s[0] for s in x.metadata['dimList']) -class VarZ(): - def __init__(self, path: str, varname: str): - self.path = path - self.varname = varname + @property + def chunks(self): + return tuple(s[2] for s in x.metadata['dimList']) - def __getitem__(self, key): - print(f"VarZ.getitem: {key}") - if key == "lat/.zattrs": - return self._zattrs() - elif key == "lat/.zarray": - return self._zarray() - elif key == "lat/0": - return [ f"{self.path}/lat/0", 0, 24] - # np.array([11.,12.,13.]).tobytes() - - def __iter__(self): - print(f"VarZ.__iter__()") - yield from ['lat/.zattrs', 'lat/.zarray', 'lat/0'] - - def _zattrs(self): - # How to guess this? - return json.dumps({ - # "_ARRAY_DIMENSIONS": ["lon", "lat", "depth"] - "_ARRAY_DIMENSIONS": ["lat"] - }) - - def _zarray(self): - return json.dumps({ - # "chunks": [ 20, 30, 2 ], - "chunks": [ 3 ], - "compressor": None, # fixed - # "dtype": ">f4", - "dtype": " Date: Sun, 29 Oct 2023 22:54:13 -0600 Subject: [PATCH 11/16] Trying the concept of VarZ() Chunk() is an auxiliary class to organize and collect information from meta, while VarZ keeps track of all chunks for one variable. 
--- xmitgcm/mds_tilez.py | 92 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index ea2901a..15715f3 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -87,3 +87,95 @@ def from_meta(filename): filename=filename.replace(".meta", ".data"), metadata=metadata) return chunk + + +@dataclass() +class VarZ(): + varname: str + data: dict + chunks: tuple + dtype: str + fill_value: float + shape: tuple + + def __getitem__(self, key): + print(f"VarZ.getitem: {key}") + + assert key[:len(self.varname)+1] == f"{self.varname}/" + k = key[len(self.varname)+1:] + if k == f".zattrs": + return self._zattrs() + elif k == f".zarray": + return self._zarray() + else: + ti, xyi = k.split(".", 1) + ts = sorted(v.data.keys())[int(ti)] + return self.data[ts][xyi] + + def __iter__(self): + print(f"VarZ.__iter__()") + yield from [ + f"{self.varname}/.zattrs", + f"{self.varname}/.zarray", + ] + idx_t = sorted(self.data.keys()) + for ts in idx_t: + yield from (f"{self.varname}/{idx_t.index(ts)}.{i}" + for i in self.data[ts]) + + def _zattrs(self): + # How to guess this? 
+ if len(self.shape) == 3: + dims = ["time", "lon", "lat", "depth"] + elif len(self.shape) == 2: + dims = ["time", "lon", "lat"] + + return json.dumps({ + # "_ARRAY_DIMENSIONS": ["lon", "lat", "depth"] + "_ARRAY_DIMENSIONS": dims + }) + + def _zarray(self): + return json.dumps({ + "chunks": self.chunks, + "compressor": None, # fixed + "dtype": self.dtype, + "fill_value": self.fill_value, + "filters": None, # fixed + "order": "C", # fixed + "shape": self.shape, + "zarr_format": 2 # fixed + }) + + def push(self, chunk): + assert self.varname in chunk.varnames + assert self.chunks == chunk.chunks + assert self.dtype == chunk.dtype + assert self.shape == chunk.shape + assert self.fill_value == chunk.missing_value + + if chunk.time_step_number not in self.data: + self.data[chunk.time_step_number] = {} + self.data[chunk.time_step_number][chunk.index] = chunk[self.varname] + + def push_from_meta(self, filename): + d = Chunk.from_meta(filename) + self.push(d) + + @staticmethod + def from_chunk(chunk, varname=None): + if varname is None: + assert len(chunk.varnames) == 1 + varname = chunk.varnames[0] + + assert varname in chunk.varnames + + v = VarZ(varname=varname, data={}, chunks=chunk.chunks, + shape=chunk.shape, + dtype=chunk.dtype, fill_value=chunk.missing_value) + v.push(chunk) + return v + + @staticmethod + def from_meta(filename, varname=None): + return VarZ.from_chunk(Chunk.from_meta(filename)) From 7cc139f12d27c95ae5eeaefcbec030afda58071e Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Wed, 1 Nov 2023 21:03:07 -0600 Subject: [PATCH 12/16] fix: Forgot to redirect to self --- xmitgcm/mds_tilez.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index 15715f3..e37ba42 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -69,11 +69,11 @@ def missing_value(self): @property def shape(self): - return tuple(s[0] for s in x.metadata['dimList']) + return tuple(s[0] for s in 
self.metadata['dimList']) @property def chunks(self): - return tuple(s[2] for s in x.metadata['dimList']) + return tuple(s[2] for s in self.metadata['dimList']) @property def time_step_number(self): From 4d5b28d7fbceb70976104bb1df78a4d53e67f631 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Wed, 1 Nov 2023 21:05:06 -0600 Subject: [PATCH 13/16] feat: TileZ() to map as Zarr structure --- xmitgcm/mds_tilez.py | 66 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index e37ba42..9ec368a 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -4,6 +4,7 @@ from dataclasses import dataclass import json import os.path +import re from fsspec.implementations.reference import ReferenceFileSystem import numpy as np @@ -109,7 +110,7 @@ def __getitem__(self, key): return self._zarray() else: ti, xyi = k.split(".", 1) - ts = sorted(v.data.keys())[int(ti)] + ts = sorted(self.data.keys())[int(ti)] return self.data[ts][xyi] def __iter__(self): @@ -137,13 +138,13 @@ def _zattrs(self): def _zarray(self): return json.dumps({ - "chunks": self.chunks, + "chunks": [1] + list(self.chunks), "compressor": None, # fixed "dtype": self.dtype, "fill_value": self.fill_value, "filters": None, # fixed "order": "C", # fixed - "shape": self.shape, + "shape": [len(self.data.keys())] + list(self.shape), "zarr_format": 2 # fixed }) @@ -179,3 +180,62 @@ def from_chunk(chunk, varname=None): @staticmethod def from_meta(filename, varname=None): return VarZ.from_chunk(Chunk.from_meta(filename)) + + +class TileZ(UserDict): + def __getitem__(self, key): + print(f"TileZ.__getitem__(): {key}") + + if key == ".zgroup": + return self._zgroup() + for v in self.data: + if key.startswith(v): + return self.data[v][key] + + return super().__getitem__(key) + + def __contains__(self, item): + print(f"TileZ.__contains__(): {item}") + if item in (".zgroup"): + return True + for v in self.data: + if item in 
self.data[v]: + return True + return False + + def __iter__(self): + print(f"TileZ.__iter__()") + yield from (".zgroup",) + for v in self.data: + yield from self.data[v].__iter__() + + def _zgroup(self): + return json.dumps({"zarr_format": 2}) + + def get(self, key, default=None): + print(f"TileZ.get(): {key} / {default}") + return self.data.get(key, default) + #return super().get(*args) + + def push(self, chunk): + for v in chunk.varnames: + if v not in self.data: + self.data[v] = VarZ.from_chunk(chunk, varname=v) + self.data[v].push(chunk) + + def push_from_meta(self, filename): + c = Chunk.from_meta(filename) + self.push(c) + + def scan(self, path): + pattern = re.compile('\w+\.\d+\.meta') + filenames = (m for m in os.listdir(path) if pattern.match(m)) + for mfilename in sorted(filenames): + self.push_from_meta(os.path.join(path, mfilename)) + + def values(self): + print(f"TileZ.values()") + yield self._zgroup() + for v in self.data: + # yield from self.data[v].values() + yield from [self.data[v][k] for k in self.data[v]] From dff6bd167a5bc312b4c1dceaf5fae3ecfb198dc6 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Wed, 1 Nov 2023 21:12:41 -0600 Subject: [PATCH 14/16] Missing len() --- xmitgcm/mds_tilez.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index 9ec368a..966fd27 100644 --- a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -217,6 +217,9 @@ def get(self, key, default=None): return self.data.get(key, default) #return super().get(*args) + def len(self): + return len([v for v in self]) + def push(self, chunk): for v in chunk.varnames: if v not in self.data: From 2eb562943ed1f3fc82278c95bb029b1994076df6 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Wed, 1 Nov 2023 21:13:18 -0600 Subject: [PATCH 15/16] feat: method to provide xr.Dataset --- xmitgcm/mds_tilez.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/xmitgcm/mds_tilez.py b/xmitgcm/mds_tilez.py index 966fd27..c9ce9e7 100644 --- 
a/xmitgcm/mds_tilez.py +++ b/xmitgcm/mds_tilez.py @@ -212,6 +212,12 @@ def __iter__(self): def _zgroup(self): return json.dumps({"zarr_format": 2}) + def as_dataset(self): + fs = ReferenceFileSystem(fo=self, target_protocol='file') + mapper = fs.get_mapper("") + ds = xr.open_zarr(mapper, consolidated=False) + return ds + def get(self, key, default=None): print(f"TileZ.get(): {key} / {default}") return self.data.get(key, default) From dd0d419a338a73b504e7b50694483926809154c8 Mon Sep 17 00:00:00 2001 From: Gui Castelao Date: Wed, 1 Nov 2023 21:39:59 -0600 Subject: [PATCH 16/16] Cleaning DataFile() I was testing with @rabernat 's proof of concept. --- xmitgcm/demo.py | 53 ------------------------------------------------- 1 file changed, 53 deletions(-) diff --git a/xmitgcm/demo.py b/xmitgcm/demo.py index 25272c8..2c1fb46 100644 --- a/xmitgcm/demo.py +++ b/xmitgcm/demo.py @@ -18,56 +18,3 @@ def collect_meta(path: str): print(f"Failed parsing: {m}") return output - - -class DataFile(UserDict): - def get(self, *args, **kwargs): - print(f"get: {args}") - # import pdb; pdb.set_trace() - return super().get(*args) - - def __getitem__(self, key): - print(f"getitem: {key}") - # if "S/0.0.0" not in self: - # super().__setitem__("S/0.0.0", [f"{path_mitgcm}/S/0.0.0", 0, 4800]) - - if key == ".zgroup": - return json.dumps({"zarr_format": 2}) - elif key == "S/.zattrs": - # How to guess this? 
- return json.dumps({ - "_ARRAY_DIMENSIONS": ["lon", "lat", "depth"] - }) - elif key == "S/.zarray": - return json.dumps({ - "chunks": [ 20, 30, 2 ], - "compressor": None, # fixed - "dtype": ">f4", - "fill_value": "NaN", # fixed - "filters": None, # fixed - "order": "C", # fixed - "shape": [ 20, 30, 2 ], - "zarr_format": 2 # fixed - }) - # return [f"{path_mitgcm}/S/.zarray", 0, 356] - elif key == "S/0.0.0": - import pdb; pdb.set_trace() - # size can guess from chunks * dtype - # super().__setitem__("S/0.0.0", [f"{path_mitgcm}/S.0000000000.001.001.data", 0, 4800]) - return [f"{path_mitgcm}/S.0000000000.001.001.data", 0, 4800] - return super().__getitem__(key) - - def __contains__(self, item): - print(f"contains: {item}") - if item in (".zgroup", "S/.zattrs", "S/.zarray", "S/0.0.0"): - return True - else: - print(f"Checking on {item}") - # if item == "S/0.0.0": - # # import pdb; pdb.set_trace() - # print(super().__contains__(item)) - return super().__contains__(item) - - def __iter__(self): - print("iter") - yield from (".zgroup", "S/.zattrs", "S/.zarray", "S/0.0.0")