Skip to content

Commit cdedda2

Browse files
committed
Merge branch 'refactor/metadata-package' of https://github.com/d-v-b/zarr-python into refactor/metadata-package
2 parents b8d67fe + 039fd7e commit cdedda2

4 files changed

Lines changed: 162 additions & 28 deletions

File tree

changes/3924.bugfix.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/zarr/storage/_fsspec.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
)
1717
from zarr.core.buffer import Buffer
1818
from zarr.errors import ZarrUserWarning
19-
from zarr.storage._utils import _join_paths, normalize_path
19+
from zarr.storage._utils import _dereference_path
2020

2121
if TYPE_CHECKING:
2222
from collections.abc import AsyncIterator, Iterable
@@ -127,7 +127,7 @@ def __init__(
127127
) -> None:
128128
super().__init__(read_only=read_only)
129129
self.fs = fs
130-
self.path = normalize_path(path)
130+
self.path = path
131131
self.allowed_exceptions = allowed_exceptions
132132

133133
if not self.fs.async_impl:
@@ -282,7 +282,7 @@ async def get(
282282
# docstring inherited
283283
if not self._is_open:
284284
await self._open()
285-
path = _join_paths([self.path, key])
285+
path = _dereference_path(self.path, key)
286286

287287
try:
288288
if byte_range is None:
@@ -329,7 +329,7 @@ async def set(
329329
raise TypeError(
330330
f"FsspecStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead."
331331
)
332-
path = _join_paths([self.path, key])
332+
path = _dereference_path(self.path, key)
333333
# write data
334334
if byte_range:
335335
raise NotImplementedError
@@ -338,7 +338,7 @@ async def set(
338338
async def delete(self, key: str) -> None:
339339
# docstring inherited
340340
self._check_writable()
341-
path = _join_paths([self.path, key])
341+
path = _dereference_path(self.path, key)
342342
try:
343343
await self.fs._rm(path)
344344
except FileNotFoundError:
@@ -354,14 +354,14 @@ async def delete_dir(self, prefix: str) -> None:
354354
)
355355
self._check_writable()
356356

357-
path_to_delete = _join_paths([self.path, prefix])
357+
path_to_delete = _dereference_path(self.path, prefix)
358358

359359
with suppress(*self.allowed_exceptions):
360360
await self.fs._rm(path_to_delete, recursive=True)
361361

362362
async def exists(self, key: str) -> bool:
363363
# docstring inherited
364-
path = _join_paths([self.path, key])
364+
path = _dereference_path(self.path, key)
365365
exists: bool = await self.fs._exists(path)
366366
return exists
367367

@@ -378,7 +378,7 @@ async def get_partial_values(
378378
starts: list[int | None] = []
379379
stops: list[int | None] = []
380380
for key, byte_range in key_ranges:
381-
paths.append(_join_paths([self.path, key]))
381+
paths.append(_dereference_path(self.path, key))
382382
if byte_range is None:
383383
starts.append(None)
384384
stops.append(None)
@@ -429,7 +429,7 @@ async def list_prefix(self, prefix: str) -> AsyncIterator[str]:
429429
yield onefile.removeprefix(f"{self.path}/")
430430

431431
async def getsize(self, key: str) -> int:
432-
path = _join_paths([self.path, key])
432+
path = _dereference_path(self.path, key)
433433
info = await self.fs._info(path)
434434

435435
size = info.get("size")

src/zarr/storage/_utils.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,53 @@ def _join_paths(paths: Iterable[str]) -> str:
185185
return "/".join(filter(lambda v: v != "", paths))
186186

187187

188+
def _dereference_path(root: str, path: str) -> str:
189+
"""
190+
Combine a store-side root with a key into a single fully-qualified path.
191+
192+
Unlike `_join_paths`, this is purpose-built for the case where `root` is
193+
an opaque backend-side prefix that may use `"/"` as a sentinel for "root
194+
of the filesystem" (notably for fsspec's `ReferenceFileSystem`). A
195+
trailing `"/"` is stripped from `root` before joining; if `root` is then
196+
empty, the bare `path` is returned so that joining `"/"` with `"key"`
197+
yields `"key"` rather than `"//key"`. A trailing `"/"` on the result is
198+
also stripped.
199+
200+
Leading slashes on `root` are preserved -- a backend-side path like
201+
`"/home/foo/data.zarr"` is an absolute filesystem path for
202+
`LocalFileSystem` and must not lose its leading separator.
203+
204+
Parameters
205+
----------
206+
root : str
207+
The backend-side root of a store. May be `""`, `"/"`, an absolute
208+
filesystem path, or a backend-specific prefix.
209+
path : str
210+
The key within the store, typically a zarr key like `"zarr.json"`
211+
or `"a/b/c/zarr.json"`.
212+
213+
Returns
214+
-------
215+
str
216+
`root` and `path` joined by a single `"/"`, with the `"/"` sentinel
217+
collapsed and trailing slashes removed.
218+
219+
Examples
220+
--------
221+
```python
222+
from zarr.storage._utils import _dereference_path
223+
_dereference_path("/", "zarr.json") # 'zarr.json'
224+
_dereference_path("", "zarr.json") # 'zarr.json'
225+
_dereference_path("/home/foo", "zarr.json") # '/home/foo/zarr.json'
226+
_dereference_path("/home/foo/", "zarr.json") # '/home/foo/zarr.json'
227+
_dereference_path("bucket/p", "zarr.json") # 'bucket/p/zarr.json'
228+
```
229+
"""
230+
root = root.rstrip("/")
231+
path = f"{root}/{path}" if root else path
232+
return path.rstrip("/")
233+
234+
188235
def _relativize_path(*, path: str, prefix: str) -> str:
189236
"""
190237
Make a "/"-delimited path relative to some prefix. If the prefix is '', then the path is

tests/test_store/test_fsspec.py

Lines changed: 106 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from zarr.errors import ZarrUserWarning
1818
from zarr.storage import FsspecStore
1919
from zarr.storage._fsspec import _make_async
20-
from zarr.storage._utils import normalize_path
2120
from zarr.testing.store import StoreTests
2221

2322
if TYPE_CHECKING:
@@ -287,25 +286,114 @@ def array_roundtrip(store: FsspecStore) -> None:
287286
np.testing.assert_array_equal(arr[:], data)
288287

289288

290-
@pytest.mark.skipif(
291-
parse_version(fsspec.__version__) < parse_version("2024.12.0"),
292-
reason="No AsyncFileSystemWrapper",
289+
@pytest.mark.parametrize(
290+
("root", "key", "expected"),
291+
[
292+
# `"/"` as root collapses so that bare-key backends (notably
293+
# ReferenceFileSystem) get the right key. Regression test for
294+
# https://github.com/zarr-developers/zarr-python/issues/3922 .
295+
("/", "zarr.json", "zarr.json"),
296+
("", "zarr.json", "zarr.json"),
297+
# Trailing slashes on the root are stripped before joining.
298+
("foo/", "zarr.json", "foo/zarr.json"),
299+
("foo", "zarr.json", "foo/zarr.json"),
300+
# Leading slashes on the root are preserved -- absolute filesystem
301+
# paths must stay absolute. Regression test for the titiler-xarray
302+
# breakage that #3924 introduced when `normalize_path` was applied to
303+
# `FsspecStore.path`.
304+
("/home/runner/data.zarr", "zarr.json", "/home/runner/data.zarr/zarr.json"),
305+
("/home/runner/data.zarr/", "zarr.json", "/home/runner/data.zarr/zarr.json"),
306+
# Multi-segment keys.
307+
("/home/foo", "a/b/zarr.json", "/home/foo/a/b/zarr.json"),
308+
("", "a/b/zarr.json", "a/b/zarr.json"),
309+
# Trailing slash on the result is stripped (relevant when key is "").
310+
("/home/foo", "", "/home/foo"),
311+
],
293312
)
294-
@pytest.mark.parametrize("path", ["", "/", "//", "foo", "foo/", "/foo", "/foo/", "//foo//"])
295-
def test_fsspec_store_path_normalization(path: str) -> None:
296-
"""`FsspecStore.path` is normalized to the canonical form, matching
297-
`normalize_path`, regardless of the surface representation the caller
298-
supplies.
299-
300-
Regression test for https://github.com/zarr-developers/zarr-python/issues/3922
301-
-- when a caller passed `path="/"` the leading slash flowed through
302-
unmodified to subsequent `_join_paths([self.path, key])` calls, producing
303-
`"//key"` and missing the underlying object.
313+
def test_dereference_path(root: str, key: str, expected: str) -> None:
314+
"""Verify the contract `_dereference_path` provides for `FsspecStore`.
315+
316+
`FsspecStore.path` is stored verbatim; the join with a key must collapse a
317+
sentinel `"/"` root, strip trailing slashes, and preserve leading
318+
slashes on absolute paths.
319+
"""
320+
from zarr.storage._utils import _dereference_path
321+
322+
assert _dereference_path(root, key) == expected
323+
324+
325+
async def test_fsspec_store_open_group_via_reference_filesystem() -> None:
326+
"""End-to-end regression test for
327+
https://github.com/zarr-developers/zarr-python/issues/3922 .
328+
329+
``ReferenceFileSystem`` keys its refs by bare strings like ``"zarr.json"``.
330+
The bug was that ``FsspecStore(fs=ref_fs, path="/")`` produced
331+
``"//zarr.json"`` at the join site and failed to find the entry, raising
332+
``GroupNotFoundError``. This test pins ``path="/"`` explicitly to keep
333+
coverage even if the default value changes later.
334+
"""
335+
import json
336+
337+
from fsspec.implementations.reference import ReferenceFileSystem
338+
339+
group_json = json.dumps({"zarr_format": 3, "node_type": "group", "attributes": {}})
340+
fs = ReferenceFileSystem(
341+
fo={"version": 1, "refs": {"zarr.json": group_json}},
342+
asynchronous=True,
343+
)
344+
store = FsspecStore(fs=fs, path="/", read_only=True)
345+
group = await zarr.api.asynchronous.open_group(store, mode="r")
346+
assert group.metadata.zarr_format == 3
347+
348+
349+
async def test_fsspec_store_read_array_chunk_via_reference_filesystem() -> None:
350+
"""End-to-end regression test that exercises the byte-range read path
351+
against ``ReferenceFileSystem``.
352+
353+
Beyond opening a group (covered by
354+
``test_fsspec_store_open_group_via_reference_filesystem``), this test
355+
constructs a small zarr v3 array whose chunk lives in the refs dict and
356+
reads it through the store. Path-handling bugs on the byte-range
357+
fetch path (used by kerchunk-style virtualization) would surface here
358+
rather than at metadata-open time.
304359
"""
305-
sync_fs = fsspec.filesystem("memory")
306-
fs = _make_async(sync_fs)
307-
store = FsspecStore(fs=fs, path=path)
308-
assert store.path == normalize_path(path)
360+
import json
361+
362+
import numpy as np
363+
from fsspec.implementations.reference import ReferenceFileSystem
364+
365+
# Construct a minimal v3 zarr: a single 1-D uint8 array of length 4 with
366+
# one chunk of size 4. The chunk bytes are little-endian uint8s 1..4.
367+
array_meta = json.dumps(
368+
{
369+
"zarr_format": 3,
370+
"node_type": "array",
371+
"shape": [4],
372+
"chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [4]}},
373+
"data_type": "uint8",
374+
"chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}},
375+
"fill_value": 0,
376+
"codecs": [{"name": "bytes", "configuration": {"endian": "little"}}],
377+
"attributes": {},
378+
}
379+
)
380+
chunk_bytes = bytes([1, 2, 3, 4])
381+
382+
refs: dict[str, str] = {
383+
"zarr.json": array_meta,
384+
# ReferenceFileSystem accepts raw bytes via base64 encoding or
385+
# latin-1-decoded strings; latin-1 round-trips bytes 1:1.
386+
"c/0": chunk_bytes.decode("latin-1"),
387+
}
388+
389+
fs = ReferenceFileSystem(
390+
fo={"version": 1, "refs": refs},
391+
asynchronous=True,
392+
)
393+
store = FsspecStore(fs=fs, path="/", read_only=True)
394+
array = await zarr.api.asynchronous.open_array(store=store, mode="r")
395+
data = await array.getitem(slice(None))
396+
np.testing.assert_array_equal(data, np.array([1, 2, 3, 4], dtype="uint8"))
309397

310398

311399
@pytest.mark.skipif(

0 commit comments

Comments
 (0)