Skip to content

Add preliminary support for ISO-8601 timestamps via date: archive match pattern (#8715) #8776

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
282d70c
Add preliminary support for ISO-8601 timestamps (no timezones at the …
c-herz Apr 19, 2025
db46cdb
reformatted to pass style checks
degabe Apr 21, 2025
4363bf7
Applied recommended changes from ThomasWald, still working as intende…
degabe Apr 21, 2025
69e8608
fix bug with local timezone attachment not correctly respecting DST
c-herz Apr 21, 2025
5c20d8f
Reformatted for consistency with code style guide
c-herz Apr 22, 2025
6f1bcd4
Added basic test suite for ISO-8601 and Unix timestamp matching
c-herz Apr 22, 2025
4060e94
Merge remote-tracking branch 'origin/dateFilterImprov' into datefilter
c-herz Apr 22, 2025
e9a8c5f
add day-precision filter test for `date:YYYY-MM-DD`
c-herz Apr 22, 2025
470758d
support timezone suffixes in date: patterns and add tests
c-herz Apr 22, 2025
df2d33d
Wildcard working. Done some manual testing, will focus on more rigoro…
degabe Apr 23, 2025
870bf7a
add tests for wildcard support in date: archive match patterns; refor…
c-herz Apr 25, 2025
461df75
fix bug with wildcards in date: match patterns not respecting supplie…
c-herz Apr 25, 2025
9553c35
remove stray testfile.txt
c-herz Apr 25, 2025
409733b
refactor date: pattern parser to use structured bottom-up regex, per …
c-herz Apr 25, 2025
de03806
refactor date: pattern parsing to use helper functions for datetime c…
c-herz Apr 25, 2025
796981c
add explicit time interval matching in date: archive match pattern (w…
c-herz Apr 25, 2025
7b8a194
add duration-based interval support for date: archive match patterns;…
c-herz Apr 25, 2025
8e3f1e4
add support for keyword-based date intervals in archive date: matchin…
c-herz Apr 25, 2025
904853d
refactor time.py: rename internal functions for clarity and consistency
c-herz Apr 25, 2025
6032c4a
add support for ISO week-date and ordinal-date matching in date: arch…
c-herz Apr 25, 2025
9cb5e5f
enhance compile_date_pattern docstring: clarify TIMESTAMP and DURATIO…
c-herz Apr 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
309 changes: 306 additions & 3 deletions src/borg/helpers/time.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import re
from datetime import datetime, timezone, timedelta
from datetime import datetime, timezone, timedelta, date
from zoneinfo import ZoneInfo


def parse_timestamp(timestamp, tzinfo=timezone.utc):
Expand Down Expand Up @@ -159,8 +160,15 @@
following_month, year_of_following_month = get_month_and_year_from_total(total_months + 1)
max_days_in_month = (datetime(year_of_following_month, following_month, 1) - timedelta(1)).day

return datetime(day=min(from_ts.day, max_days_in_month), month=target_month, year=target_year).replace(
tzinfo=from_ts.tzinfo
return datetime(
year=target_year,
month=target_month,
day=min(from_ts.day, max_days_in_month),
hour=from_ts.hour,
minute=from_ts.minute,
second=from_ts.second,
microsecond=from_ts.microsecond,
tzinfo=from_ts.tzinfo,
)


Expand All @@ -185,3 +193,298 @@
def archive_ts_now():
    """Return the current time as a tz-aware UTC datetime, for use as an archive timestamp."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc


class DatePatternError(ValueError):
    """Signals that a ``date:`` archive match pattern could not be parsed."""


def exact_predicate(dt: datetime):
    """
    Build a predicate that is True only for timestamps denoting the same instant as *dt*.

    Both *dt* and the tested timestamp are converted to UTC before they are compared.
    """
    wanted = dt.astimezone(timezone.utc)

    def matches(ts):
        return ts.astimezone(timezone.utc) == wanted

    return matches

Check warning on line 205 in src/borg/helpers/time.py

View check run for this annotation

Codecov / codecov/patch

src/borg/helpers/time.py#L204-L205

Added lines #L204 - L205 were not covered by tests


def interval_predicate(start: datetime, end: datetime):
    """
    Build a predicate that is True for timestamps inside the half-open interval [start, end).

    Both bounds and every tested timestamp are converted to UTC before comparison.
    Raises DatePatternError if *start* lies after *end*.
    """
    lower = start.astimezone(timezone.utc)
    upper = end.astimezone(timezone.utc)
    if lower > upper:
        raise DatePatternError("start date must be before end date")

    def in_interval(ts):
        return lower <= ts.astimezone(timezone.utc) < upper

    return in_interval


def parse_tz(tzstr: str):
    """
    Parse a timezone suffix into a tzinfo object.

    Accepted forms:
      - ``Z``                      -> UTC
      - ``+HH:MM`` / ``-HH:MM``    -> fixed UTC offset
      - ``[Region/Name]``          -> ZoneInfo (surrounding brackets are stripped)

    Returns None when *tzstr* is empty or None.

    Raises DatePatternError if the offset is malformed, lies outside
    -12:00..+14:00, or the zone name cannot be loaded.
    """
    if not tzstr:
        return None
    if tzstr == "Z":
        return timezone.utc
    if tzstr[0] in "+-":
        sign = 1 if tzstr[0] == "+" else -1
        try:
            hh, mm = map(int, tzstr[1:].split(":"))
        except ValueError as err:
            # non-numeric parts, or not exactly one ":" separator
            raise DatePatternError("invalid UTC offset format") from err
        # int() also accepts a signed number, so reject e.g. "+-05:00" explicitly
        if hh < 0 or not (0 <= mm < 60):
            raise DatePatternError("invalid UTC offset format")
        # we do it this way so that, for example, -8:30 is
        # -8 hours and -30 minutes, not -8 hours and +30 minutes
        total_minutes = sign * (hh * 60 + mm)
        # enforce ISO-8601 bounds (-12:00 to +14:00)
        if not (-12 * 60 <= total_minutes <= 14 * 60):
            raise DatePatternError("UTC offset outside ISO-8601 bounds")
        return timezone(timedelta(minutes=total_minutes))
    # [Region/Name]
    try:
        return ZoneInfo(tzstr.strip("[]"))
    except Exception as err:
        # ZoneInfo can fail with more than one exception type (e.g. ZoneInfoNotFoundError,
        # ValueError), so everything is translated here, keeping the cause attached.
        raise DatePatternError("invalid timezone format") from err


def build_datetime_from_groups(gd: dict, tz: timezone) -> datetime:
    """
    Construct a datetime from partial ISO groups, filling missing fields with
    the earliest valid value, and attaching tzinfo.

    *gd* maps the names "year", "month", "day", "hour", "minute" and "second" to
    digit strings; everything except "year" may be missing or None. "second" may
    carry a fractional part ("SS.fff"). *tz* is passed to datetime() as tzinfo
    unchanged (None yields a naive datetime).

    Raises ValueError (from int() or datetime()) if a field is not numeric or is
    out of range.
    """
    year = int(gd["year"])
    month = int(gd.get("month") or 1)
    day = int(gd.get("day") or 1)
    hour = int(gd.get("hour") or 0)
    minute = int(gd.get("minute") or 0)
    second = 0
    microsecond = 0
    sec_str = gd.get("second")
    if sec_str:
        whole, _, frac = sec_str.partition(".")
        second = int(whole)
        if frac:
            # Pad or trim the fraction to exactly six digits. Working on the digit
            # string keeps this exact; going through float() and multiplying can
            # land just below the intended value and truncate one microsecond low.
            microsecond = int(frac[:6].ljust(6, "0"))
    return datetime(year, month, day, hour, minute, second, microsecond, tzinfo=tz)


# Regex for ISO-8601 timestamps (written for re.VERBOSE, pass that flag when matching):
# Accepts both 'T' and space as separators between date and time per RFC-3339/IXDTF.
# Exactly one of the four alternatives matches: ISO week date, ordinal date, Unix epoch
# or calendar date; only the calendar form accepts "*" wildcards.
# NOTE(review): ideas raised in review that are NOT implemented by this pattern:
#   - wildcards for ISO week / ordinal dates (e.g. "*-W*-5" for every Friday)
#   - fractional seconds for the Unix epoch form
#   - the optional timezone suffix is also accepted after "@epoch"; whether that
#     should be allowed is an open question
#   - matching the fractional part of the seconds with its own named group
MAIN_RE = r"""
    ^
    (?:
        # ISO week date: YYYY-Www or YYYY-Www-D (D is the ISO weekday, 1..7)
        (?P<isoweek_year>\d{4})-W(?P<isoweek_week>\d{2})(?:-(?P<isoweek_day>[1-7]))?
      | # Ordinal date: YYYY-DDD
        (?P<ordinal_year>\d{4})-(?P<ordinal_day>\d{3})
      | # Unix epoch
        @(?P<epoch>\d+)
      | # Calendar date
        (?P<year>\d{4}|\*)                                  # year (YYYY or *)
        (?:-                                                # start month/day/time block
            (?P<month>\d{2}|\*)                             # month (MM or *)
            (?:-                                            # start day/time block
                (?P<day>\d{2}|\*)                           # day (DD or *)
                (?:[T ]                                     # date/time separator (T or space)
                    (?P<hour>\d{2}|\*)                      # hour (HH or *)
                    (?:
                        :(?P<minute>\d{2}|\*)               # minute (MM or *)
                        (?:
                            :(?P<second>\d{2}(?:\.\d+)?|\*) # second (SS or SS.fff or *)
                        )?
                    )?
                )?
            )?
        )?
    )
    (?P<tz>Z|[+\-]\d\d:\d\d|\[[^\]]+\])?  # optional timezone suffix (Z, ±HH:MM or [Zone])
    $
"""

# Duration designator: "D" followed by at least one "<number><unit>" term, in this
# fixed order: Y(ears) M(onths) W(eeks) D(ays) h(ours) m(inutes) s(econds).
# Units are case-sensitive: upper-case "M" is months, lower-case "m" is minutes.
# NOTE(review): this is a custom notation, not the ISO-8601 "P3Y6M4DT12H30M5S" form.
# Reviewers asked whether the unit case should be swapped (lower-case ymwd, upper-case
# HMS) and suggested additionally supporting the ISO-8601 "P" form; neither is
# implemented here.
DURATION_RE = re.compile(
    r"^D"
    r"(?=\d)"  # at least one term must follow; a bare "D" is not a duration
    r"(?:(?P<years>\d+)Y)?"
    r"(?:(?P<months>\d+)M)?"
    r"(?:(?P<weeks>\d+)W)?"
    r"(?:(?P<days>\d+)D)?"
    r"(?:(?P<hours>\d+)h)?"
    r"(?:(?P<minutes>\d+)m)?"
    r"(?:(?P<seconds>\d+)s)?"
    r"$"
)


def parse_duration(expr: str) -> tuple[int, timedelta]:
    """
    Split a ``D…`` duration expression into ``(months, timedelta)``.

    Years and months are folded into the month count (12 months per year);
    weeks, days, hours, minutes and seconds are folded into the timedelta.
    Terms that are absent from *expr* count as zero.

    Raises DatePatternError if *expr* does not match DURATION_RE.
    """
    matched = DURATION_RE.match(expr)
    if matched is None:
        raise DatePatternError(f"invalid duration: {expr!r}")

    # absent terms default to "0", so every group converts cleanly
    amounts = {unit: int(text) for unit, text in matched.groupdict(default="0").items()}
    month_count = amounts["years"] * 12 + amounts["months"]
    remainder = timedelta(
        days=amounts["weeks"] * 7 + amounts["days"],
        hours=amounts["hours"],
        minutes=amounts["minutes"],
        seconds=amounts["seconds"],
    )
    return month_count, remainder


def parse_to_interval(expr: str) -> tuple[datetime, datetime]:
    """
    Parse a possibly incomplete ISO-8601 timestamp (with optional timezone) into
    a start and end datetime representing the full interval ``[start, end)``.

    The width of the interval follows the precision of *expr*: a year spans 12
    months, ``YYYY-MM`` one month, a date one day, an ISO week seven days, and so
    on down to one second. A Unix epoch (``@N``) is always interpreted in UTC and
    spans one second. For all other forms the datetimes carry the parsed
    timezone, or are naive if *expr* has no timezone suffix.

    Raises DatePatternError if *expr* is not recognised, contains a wildcard, or
    names a date/time that does not exist (e.g. month 13, day-of-year 400).
    """
    # note: we match the same pattern that supports wildcards so the regex can be reused;
    # wildcards do not describe a single interval and are rejected right below.
    m = re.match(MAIN_RE, expr, re.VERBOSE)
    if not m:
        raise DatePatternError(f"unrecognised date: {expr!r}")

    gd = m.groupdict()
    if any(gd[name] == "*" for name in ("year", "month", "day", "hour", "minute", "second")):
        raise DatePatternError(f"wildcards are not allowed in this position: {expr!r}")
    tz = parse_tz(gd["tz"])
    try:
        return _interval_from_groups(gd, tz)
    except DatePatternError:
        raise
    except (ValueError, OverflowError, OSError) as err:
        # out-of-range fields must surface as a pattern error, not as a bare ValueError
        raise DatePatternError(f"invalid date {expr!r}: {err}") from err


def _interval_from_groups(gd: dict, tz) -> tuple[datetime, datetime]:
    """Turn the groups of a wildcard-free MAIN_RE match into ``(start, end)``."""
    # ISO week-date support (YYYY-Www or YYYY-Www-D)
    if gd["isoweek_year"]:
        # fromisocalendar returns a date; without a weekday the week starts on day 1
        iso_date = date.fromisocalendar(
            int(gd["isoweek_year"]), int(gd["isoweek_week"]), int(gd["isoweek_day"] or 1)
        )
        start = datetime(iso_date.year, iso_date.month, iso_date.day, tzinfo=tz)
        # a given weekday selects that single day, otherwise the whole week matches
        span = timedelta(days=1) if gd["isoweek_day"] else timedelta(weeks=1)
        return start, start + span

    # Ordinal date support (YYYY-DDD)
    if gd["ordinal_year"]:
        y = int(gd["ordinal_year"])
        doy = int(gd["ordinal_day"])
        if doy < 1:
            raise DatePatternError(f"day of year out of range: {doy}")
        start = datetime(y, 1, 1, tzinfo=tz) + timedelta(days=doy - 1)
        if start.year != y:
            # e.g. day 366 of a non-leap year would silently become Jan 1 of the next year
            raise DatePatternError(f"day of year out of range: {doy}")
        return start, start + timedelta(days=1)

    # handle unix-epoch forms directly (always UTC, a timezone suffix is not applied)
    if gd["epoch"]:
        start = datetime.fromtimestamp(int(gd["epoch"]), tz=timezone.utc)
        return start, start + timedelta(seconds=1)

    # calendar form: build the start moment, then widen by the highest precision present
    start = build_datetime_from_groups(gd, tz)
    fixed_spans = (
        ("second", timedelta(seconds=1)),  # fractional or whole second precision
        ("minute", timedelta(minutes=1)),
        ("hour", timedelta(hours=1)),
        ("day", timedelta(days=1)),
    )
    for name, span in fixed_spans:
        if gd[name]:
            return start, start + span
    # month lengths vary, so months and years go through offset_n_months()
    if gd["month"]:
        return start, offset_n_months(start, 1)
    # only the year is left; the calendar alternative of MAIN_RE always provides it
    return start, offset_n_months(start, 12)


def compile_date_pattern(expr: str):
    """
    Compile a ``date:`` match expression into a predicate over archive timestamps.

    Accepts any TIMESTAMP of:
      YYYY
      YYYY-MM
      YYYY-MM-DD
      YYYY-MM-DDTHH (with 'T') or YYYY-MM-DD HH:MM (with space)
      YYYY-MM-DD HH:MM:SS (RFC-3339 space-separated), optionally with a fraction
      YYYY-Www or YYYY-Www-D (ISO week date)
      YYYY-DDD (ordinal date)
      Unix epoch (@123456789)
    …with an optional trailing timezone (Z or ±HH:MM or [Region/City]).

    Also supports:
      TIMESTAMP/TIMESTAMP
      TIMESTAMP/DURATION
      DURATION/TIMESTAMP.

    DURATION is a string of the form:
      D[years]Y[months]M[weeks]W[days]D[hours]h[minutes]m[seconds]s (any combination).
    A bare DURATION (without a TIMESTAMP) is not supported.

    Additionally supports wildcards (`*`) in any calendar field (year, month, day,
    hour, minute, second), but not inside intervals, e.g.:
      "*-04-22"    # April 22 of any year
      "2025-*-01"  # 1st day of any month in 2025
      "*-*-15"     # 15th of every month, any year

    Returns a predicate taking a datetime. For plain timestamps and intervals it is
    True for timestamps in that interval (inclusive, exclusive); in
    TIMESTAMP/TIMESTAMP the interval ends at the *start* of the right-hand
    timestamp. A timestamp with fractional seconds matches that exact instant only.

    Raises DatePatternError if the expression cannot be parsed or names an
    impossible date/time.
    """
    expr = expr.strip()
    try:
        return _compile_date_pattern(expr)
    except DatePatternError:
        raise
    except (ValueError, OverflowError, OSError) as err:
        # Well-formed but impossible values (month 13, Feb 30, a wildcard inside an
        # interval, ...) surface from int()/datetime() as other exception types.
        # Translate them so callers that catch DatePatternError see every bad pattern.
        raise DatePatternError(f"invalid value in {expr!r}: {err}") from err


def _compile_date_pattern(expr: str):
    """Build the predicate for an already stripped expression (errors not yet normalised)."""
    # 1) detect explicit user-defined intervals (split slash outside brackets to allow for [Region/Name])
    parts = re.split(r"/(?![^\[]*\])", expr, maxsplit=1)
    if len(parts) == 2:
        left, right = parts
        left_is_duration = left.startswith("D")
        right_is_duration = right.startswith("D")
        # months are handled separately via offset_n_months() because month lengths vary
        if left_is_duration and not right_is_duration:
            # duration / timestamp: the interval ends at the start of the timestamp
            months, td = parse_duration(left)
            end_dt, _ = parse_to_interval(right)
            start_dt = offset_n_months(end_dt, -months) - td
            return interval_predicate(start_dt, end_dt)
        if right_is_duration and not left_is_duration:
            # timestamp / duration: the interval begins at the start of the timestamp
            start_dt, _ = parse_to_interval(left)
            months, td = parse_duration(right)
            end_dt = offset_n_months(start_dt, months) + td
            return interval_predicate(start_dt, end_dt)
        # timestamp / timestamp: from the start of the left to the start of the right
        start_dt, _ = parse_to_interval(left)
        end_dt, _ = parse_to_interval(right)
        return interval_predicate(start_dt, end_dt)

    m = re.match(MAIN_RE, expr, re.VERBOSE)
    if not m:
        raise DatePatternError(f"unrecognised date: {expr!r}")

    gd = m.groupdict()
    tz = parse_tz(gd["tz"])

    # 2) detect explicit wildcards (*) in any calendar group
    calendar_fields = ("year", "month", "day", "hour", "minute", "second")
    if any(gd[name] == "*" for name in calendar_fields):

        def fixed_value(name):
            # None means "no constraint": the field is a wildcard or was not given
            text = gd[name]
            return None if text in (None, "*") else int(text)

        yi, mi, di, hi, ni = (fixed_value(name) for name in calendar_fields[:5])
        # seconds may carry a fraction, so they are compared as a float window
        sec_text = gd["second"]
        si = None if sec_text in (None, "*") else float(sec_text)

        def wildcard_pred(ts):
            # compare in the pattern's timezone (astimezone(None) uses the local zone)
            dt = ts.astimezone(tz)
            return (
                (yi is None or dt.year == yi)
                and (mi is None or dt.month == mi)
                and (di is None or dt.day == di)
                and (hi is None or dt.hour == hi)
                and (ni is None or dt.minute == ni)
                and (si is None or (si <= dt.second + dt.microsecond / 1e6 < si + 1))
            )

        return wildcard_pred

    # 3) fraction-precision exact match
    if gd["second"] and "." in gd["second"]:
        return exact_predicate(build_datetime_from_groups(gd, tz))

    # 4) remaining precisions: use parse_to_interval to get start/end
    start, end = parse_to_interval(expr)
    return interval_predicate(start, end)
52 changes: 51 additions & 1 deletion src/borg/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@
from .constants import * # NOQA
from .helpers.datastruct import StableDict
from .helpers.parseformat import bin_to_hex, hex_to_bin
from .helpers.time import parse_timestamp, calculate_relative_offset, archive_ts_now
from .helpers.time import (
parse_timestamp,
calculate_relative_offset,
archive_ts_now,
compile_date_pattern,
DatePatternError,
)
from .helpers.errors import Error, CommandError
from .item import ArchiveItem
from .patterns import get_regex_from_pattern
Expand Down Expand Up @@ -198,6 +204,50 @@ def _matching_info_tuples(self, match_patterns, match_end, *, deleted=False):
elif match.startswith("host:"):
wanted_host = match.removeprefix("host:")
archive_infos = [x for x in archive_infos if x.host == wanted_host]
elif match.startswith("date:"):
wanted_date = match.removeprefix("date:")
# resolve keyword tokens for oldest, newest, now
parts = re.split(r"/(?![^\[]*\])", wanted_date, maxsplit=1)
orig_left = parts[0]
orig_right = parts[1] if len(parts) == 2 else None

def resolve_kw(token):
if token == "oldest":
return min(x.ts for x in archive_infos).isoformat(timespec="seconds")
if token == "newest":
return max(x.ts for x in archive_infos).isoformat(timespec="seconds")
if token == "now":
return archive_ts_now().isoformat(timespec="seconds")
return token # token is not a keyword, return it as is

left = resolve_kw(orig_left)
if orig_right is not None:
# interval keyword/keyword or keyword/timestamp or timestamp/keyword
right = resolve_kw(orig_right)
wanted_date = f"{left}/{right}"
elif orig_left in ("oldest", "newest", "now"):
# single keyword: exact match only for that timestamp
dt = parse_timestamp(left)
archive_infos = [x for x in archive_infos if x.ts == dt]
continue
else:
wanted_date = orig_left
# compile and filter
try:
pred = compile_date_pattern(wanted_date)
except DatePatternError as e:
raise CommandError(f"Invalid date pattern: {match} ({e})")
# filter by predicate, but include newest timestamp if it was requested
# This is a bit of a hack to get around the fact that compile_date_pattern
# returns a predicate that is not inclusive of the end date. However,
# oldest/newest should intuitively include the newest archive, hence this hack.
had_newest = "newest" in (orig_left, orig_right)
base_infos = archive_infos
if had_newest and base_infos:
newest_ts = max(x.ts for x in base_infos)
archive_infos = [x for x in archive_infos if pred(x.ts) or x.ts == newest_ts]
else:
archive_infos = [x for x in archive_infos if pred(x.ts)]
else: # do a match on the name
match = match.removeprefix("name:") # accept optional name: prefix
regex = get_regex_from_pattern(match)
Expand Down
Loading
Loading