Skip to content

Commit 1bf0d54

Browse files
authored
fix: Make URI parsing more lenient (#316)
The URI validation logic was too strict in some cases, causing valid lakeFS ref expressions with relative (`~N`, `^N`) or HEAD (`@`) suffixes to be rejected. This commit refactors the relevant regexp to accept these URIs and also makes the relative ref expression parsing more explicit in the regex. Issue: #314
1 parent 6fb9701 commit 1bf0d54

File tree

3 files changed

+84
-9
lines changed

3 files changed

+84
-9
lines changed

src/lakefs_spec/util.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,14 @@ def md5_checksum(lpath: str | os.PathLike[str], blocksize: int = 2**22) -> str:
9797
return file_hash.hexdigest()
9898

9999

100+
# Regex fragments for the successive parts of a lakeFS URI, keyed by a
# human-readable part name. Insertion order matters: the parts are matched
# one after another, front to back, against the remaining URI string.
#
# Only the named groups capture. Auxiliary groups are non-capturing so that a
# match exposes nothing but the named URI parts.
_uri_parts = {
    "protocol": r"^(?:lakefs://)?",  # leading lakefs:// protocol (optional)
    "repository": r"(?P<repository>[a-z0-9][a-z0-9\-]{2,62})/",
    # ref name with optional @, ~N, ^N suffixes
    "ref expression": r"(?P<ref>\w[\w\-.]*(?:(?:[~\^]\d*)*|@)?)/",
    "resource": r"(?P<resource>.*)",
}
106+
107+
100108
def parse(path: str) -> tuple[str, str, str]:
101109
"""
102110
Parses a lakeFS URI in the form ``lakefs://<repo>/<ref>/<resource>``.
@@ -118,16 +126,9 @@ def parse(path: str) -> tuple[str, str, str]:
118126
If the path does not conform to the lakeFS URI format.
119127
"""
120128

121-
uri_parts = {
122-
"protocol": r"^(?:lakefs://)?", # leading lakefs:// protocol (optional)
123-
"repository": r"(?P<repository>[a-z0-9][a-z0-9\-]{2,62})/",
124-
"ref expression": r"(?P<ref>\w[\w\-.^~]*)/",
125-
"resource": r"(?P<resource>.*)",
126-
}
127-
128129
groups: dict[str, str] = {}
129130
start = 0
130-
for group, regex in uri_parts.items():
131+
for group, regex in _uri_parts.items():
131132
# we parse iteratively to improve the error message for the user if an invalid URI is given.
132133
# by going front to back and parsing each part successively, we obtain the current path segment,
133134
# and print it out to the user if it does not conform to our assumption of the lakeFS URI spec.

tests/regression/test_gh_314.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from lakefs import Branch, Repository
2+
3+
from lakefs_spec.spec import LakeFSFileSystem
4+
5+
6+
def test_gh_314(
    fs: LakeFSFileSystem,
    repository: Repository,
    temp_branch: Branch,
) -> None:
    """
    Regression test for GitHub issue 314: Enable `@` and `~N` syntax
    https://github.com/aai-institute/lakefs-spec/issues/314

    Writes two versions of the same file to a temporary branch and checks that
    the ``@`` and ``~1`` ref suffixes both resolve to the first version.
    """
    branch_uri = f"lakefs://{repository.id}/{temp_branch.id}"
    file_uri = f"{branch_uri}/data.txt"

    # First version: upload and commit right away.
    fs.pipe(file_uri, b"data1")
    temp_branch.commit(message="Add data.txt")

    # Second version: uploaded, but not committed yet.
    fs.pipe(file_uri, b"data2")

    # The `@` suffix addresses the committed head, so the first version is expected.
    head_uri = f"{branch_uri}@/data.txt"
    assert fs.read_text(head_uri) == "data1"

    # After committing the second version, `~1` addresses the commit before it.
    temp_branch.commit(message="Update data.txt")
    previous_uri = f"{branch_uri}~1/data.txt"
    assert fs.read_text(previous_uri) == "data1"

tests/test_util.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import re
2+
13
import pytest
24

3-
from lakefs_spec.util import _batched
5+
from lakefs_spec.util import _batched, _uri_parts
46

57

68
def test_batched_empty_iterable():
@@ -26,3 +28,44 @@ def test_batched_batch_size_greater_than_iterable():
2628
def test_batched_invalid_batch_size():
2729
with pytest.raises(ValueError, match="n must be at least one"):
2830
list(_batched([1, 2, 3], 0))
31+
32+
33+
class TestLakeFSUriPartRegexes:
    """Checks the individual ``_uri_parts`` regexes against single URI segments."""

    @pytest.mark.parametrize(
        "repo_name, valid",
        [
            ("my-repo", True),
            ("@@repo", False),
            ("", False),
            ("a", False),
            ("a" * 63, True),
            ("a" * 64, False),
        ],
    )
    def test_repository(self, repo_name: str, valid: bool) -> None:
        """The repository regex matches ``<repo_name>/`` exactly when ``valid`` is true."""
        segment = f"{repo_name}/"
        matched = re.match(_uri_parts["repository"], segment) is not None
        assert matched == valid

    @pytest.mark.parametrize(
        "refexp, valid",
        [
            ("", False),
            ("main", True),
            ("main@", True),
            ("main~", True),
            ("main^", True),
            ("main^2", True),
            ("main^^^", True),
            ("main^1^1", True),
            ("main^1~1", True),
        ],
    )
    def test_ref_expression(self, refexp: str, valid: bool) -> None:
        """The ref expression regex matches ``<refexp>/`` exactly when ``valid`` is true."""
        segment = f"{refexp}/"
        matched = re.match(_uri_parts["ref expression"], segment) is not None
        assert matched == valid

0 commit comments

Comments
 (0)