-
-
Notifications
You must be signed in to change notification settings - Fork 767
Add preliminary support for ISO-8601 timestamps via date: archive match pattern (#8715) #8776
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
282d70c
db46cdb
4363bf7
69e8608
5c20d8f
6f1bcd4
4060e94
e9a8c5f
470758d
df2d33d
870bf7a
461df75
9553c35
409733b
de03806
796981c
7b8a194
8e3f1e4
904853d
6032c4a
9cb5e5f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
import os | ||
import re | ||
from datetime import datetime, timezone, timedelta | ||
from datetime import datetime, timezone, timedelta, date | ||
from zoneinfo import ZoneInfo | ||
|
||
|
||
def parse_timestamp(timestamp, tzinfo=timezone.utc): | ||
|
@@ -159,8 +160,15 @@ | |
following_month, year_of_following_month = get_month_and_year_from_total(total_months + 1) | ||
max_days_in_month = (datetime(year_of_following_month, following_month, 1) - timedelta(1)).day | ||
|
||
return datetime(day=min(from_ts.day, max_days_in_month), month=target_month, year=target_year).replace( | ||
tzinfo=from_ts.tzinfo | ||
return datetime( | ||
year=target_year, | ||
month=target_month, | ||
day=min(from_ts.day, max_days_in_month), | ||
hour=from_ts.hour, | ||
minute=from_ts.minute, | ||
second=from_ts.second, | ||
microsecond=from_ts.microsecond, | ||
tzinfo=from_ts.tzinfo, | ||
) | ||
|
||
|
||
|
@@ -185,3 +193,298 @@ | |
def archive_ts_now():
    """Return the current time as a tz-aware UTC datetime, for use as an archive timestamp."""
    utc = timezone.utc
    return datetime.now(tz=utc)
|
||
|
||
class DatePatternError(ValueError):
    """Error signalling that a ``date:`` archive pattern could not be parsed."""

    pass
|
||
|
||
def exact_predicate(dt: datetime):
    """Build a predicate that is true only for timestamps denoting the same instant as *dt*.

    Both *dt* and every candidate timestamp are converted to UTC before comparing.
    """
    target = dt.astimezone(timezone.utc)

    def matches(ts):
        return ts.astimezone(timezone.utc) == target

    return matches
|
||
|
||
def interval_predicate(start: datetime, end: datetime):
    """Build a predicate matching timestamps in the half-open interval [start, end).

    Both bounds and every candidate timestamp are converted to UTC before comparing.
    Raises DatePatternError when *start* lies after *end*.
    """
    lower = start.astimezone(timezone.utc)
    upper = end.astimezone(timezone.utc)
    if lower > upper:
        raise DatePatternError("start date must be before end date")

    def within(ts):
        moment = ts.astimezone(timezone.utc)
        return lower <= moment < upper

    return within
|
||
|
||
def parse_tz(tzstr: str):
    """
    Parse a timezone suffix into a tzinfo object.

    Accepted forms are ``Z`` (UTC), a UTC offset such as ``+08:00`` or ``-8:30``,
    and a zone name understood by ZoneInfo, optionally wrapped in brackets
    (``[Region/Name]``).

    Returns None when *tzstr* is empty or None.
    Raises DatePatternError when the offset is malformed or outside the
    accepted bounds, or when the zone name cannot be resolved.
    """
    if not tzstr:
        return None
    if tzstr == "Z":
        return timezone.utc
    if tzstr[0] in "+-":
        sign = 1 if tzstr[0] == "+" else -1
        try:
            hh, mm = map(int, tzstr[1:].split(":"))
            # int() accepts its own sign, so "+-5:00" would otherwise slip
            # through as a negative offset; the leading sign is the only one allowed
            if hh < 0 or not (0 <= mm < 60):
                raise ValueError(f"offset component out of range: {tzstr!r}")
        except ValueError as exc:
            raise DatePatternError("invalid UTC offset format") from exc
        # we do it this way so that, for example, -8:30 is
        # -8 hours and -30 minutes, not -8 hours and +30 minutes
        total_minutes = sign * (hh * 60 + mm)
        # enforce ISO-8601 bounds (-12:00 to +14:00)
        if not (-12 * 60 <= total_minutes <= 14 * 60):
            raise DatePatternError("UTC offset outside ISO-8601 bounds")
        return timezone(timedelta(minutes=total_minutes))
    # [Region/Name]
    try:
        return ZoneInfo(tzstr.strip("[]"))
    except (KeyError, ValueError, OSError) as exc:
        # ZoneInfoNotFoundError is a KeyError; malformed keys raise ValueError;
        # unreadable or non-file entries on the tz search path raise OSError
        raise DatePatternError("invalid timezone format") from exc
|
||
|
||
def build_datetime_from_groups(gd: dict, tz: timezone) -> datetime:
    """
    Construct a datetime from partial ISO groups, filling missing fields with
    the earliest valid value, and attaching tzinfo.

    *gd* maps the field names ``year``, ``month``, ``day``, ``hour``, ``minute``
    and ``second`` to digit strings; every field except ``year`` may be absent
    or None. ``second`` may carry a fractional part (``SS.fff``), which is
    truncated to microsecond precision.

    Raises ValueError when a field is not numeric or is out of range for datetime().
    """
    year = int(gd["year"])
    month = int(gd.get("month") or 1)
    day = int(gd.get("day") or 1)
    hour = int(gd.get("hour") or 0)
    minute = int(gd.get("minute") or 0)
    second = 0
    microsecond = 0
    sec_str = gd.get("second")
    if sec_str:
        whole, _, frac = sec_str.partition(".")
        second = int(whole)
        if frac:
            # Pad or trim the fraction to six digits on the digit string itself.
            # Going through float() * 1_000_000 can land just below the intended
            # integer and then truncate one microsecond low.
            microsecond = int(frac[:6].ljust(6, "0"))
    return datetime(year, month, day, hour, minute, second, microsecond, tzinfo=tz)
|
||
|
||
# Regex for ISO-8601 timestamps: | ||
# Accepts both 'T' and space as separators between date and time per RFC-3339/IXDTF. | ||
MAIN_RE = r""" | ||
^ | ||
(?: | ||
# ISO week date: YYYY-Www or YYYY-Www-D | ||
(?P<isoweek_year>\d{4})-W(?P<isoweek_week>\d{2})(?:-(?P<isoweek_day>\d))? | ||
| # Ordinal date: YYYY-DDD | ||
(?P<ordinal_year>\d{4})-(?P<ordinal_day>\d{3}) | ||
| # Unix epoch | ||
@(?P<epoch>\d+) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also supporting fractional seconds here would be amazing! ❤️ Side note: I might read the regex wrong, but this also means |
||
| # Calendar date | ||
(?P<year>\d{4}|\*) # year (YYYY or *) | ||
(?:- # start month/day/time block | ||
(?P<month>\d{2}|\*) # month (MM or *) | ||
(?:- # start day/time block | ||
(?P<day>\d{2}|\*) # day (DD or *) | ||
(?:[T ] # date/time separator (T or space) | ||
(?P<hour>\d{2}|\*) # hour (HH or *) | ||
(?: | ||
:(?P<minute>\d{2}|\*) # minute (MM or *) | ||
(?: | ||
:(?P<second>\d{2}(?:\.\d+)?|\*) # second (SS or SS.fff or *) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yay, that is a nice regex now. :-) You could also deal with fractional seconds as with all other components: it is an optional component, so you can also match it with a named group and later check the groupdict. |
||
)? | ||
)? | ||
)? | ||
)? | ||
)? | ||
) | ||
(?P<tz>Z|[+\-]\d\d:\d\d|\[[^\]]+\])? # optional timezone suffix (Z, ±HH:MM or [Zone]) | ||
$ | ||
""" | ||
|
||
DURATION_RE = re.compile( | ||
r"^D" | ||
r"(?:(?P<years>\d+)Y)?" | ||
r"(?:(?P<months>\d+)M)?" | ||
r"(?:(?P<weeks>\d+)W)?" | ||
r"(?:(?P<days>\d+)D)?" | ||
r"(?:(?P<hours>\d+)h)?" | ||
r"(?:(?P<minutes>\d+)m)?" | ||
r"(?:(?P<seconds>\d+)s)?" | ||
Comment on lines
+306
to
+312
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. didn't we use to have lowercase ymwd and uppercase HMS? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @c-herz, is that a custom format, or did I miss an ISO 8601 or RFC update? I know ISO-8601's However, I kinda like the idea of additionally supporting our own format. Like, why not support |
||
r"$" | ||
) | ||
|
||
|
||
def parse_duration(expr: str) -> tuple[int, timedelta]:
    """
    Split a D… duration expression into calendar months and a fixed timedelta.

    Years and months are folded into a single month count; weeks, days, hours,
    minutes and seconds are combined into the returned timedelta.

    Raises DatePatternError when *expr* does not match DURATION_RE.
    """
    match = DURATION_RE.match(expr)
    if match is None:
        raise DatePatternError(f"invalid duration: {expr!r}")
    # components that were not given come back as "0"
    parts = {name: int(value) for name, value in match.groupdict(default="0").items()}
    month_count = parts["years"] * 12 + parts["months"]
    span = timedelta(
        days=parts["weeks"] * 7 + parts["days"],
        hours=parts["hours"],
        minutes=parts["minutes"],
        seconds=parts["seconds"],
    )
    return month_count, span
|
||
|
||
def parse_to_interval(expr: str) -> tuple[datetime, datetime]:
    """
    Parse a possibly incomplete ISO-8601 timestamp (with optional timezone) into
    a start and end datetime representing the full interval.

    Supported forms are calendar dates/times of any precision, ISO week dates
    (YYYY-Www[-D]), ordinal dates (YYYY-DDD) and Unix epochs (@seconds). The
    interval is as wide as the precision given, e.g. a bare year spans twelve
    months and a full timestamp spans one second.

    Raises DatePatternError when *expr* does not match MAIN_RE, contains a
    wildcard, or names a date or time that does not exist.
    """
    m = re.match(MAIN_RE, expr, re.VERBOSE)
    if not m:
        raise DatePatternError(f"unrecognised date: {expr!r}")

    gd = m.groupdict()
    tz = parse_tz(gd["tz"])
    # MAIN_RE is shared with wildcard matching, but a wildcard has no meaning
    # as an interval bound; reject it here instead of failing inside int("*").
    if any(gd[f] == "*" for f in ("year", "month", "day", "hour", "minute", "second")):
        raise DatePatternError(f"wildcards are not allowed here: {expr!r}")
    try:
        return _interval_from_groups(gd, tz)
    except DatePatternError:
        raise
    except (ValueError, OverflowError, OSError) as exc:
        # out-of-range fields (month 13, week 60, year 0000, huge epoch, ...)
        raise DatePatternError(f"invalid date: {expr!r}") from exc


def _interval_from_groups(gd: dict, tz) -> tuple[datetime, datetime]:
    """Turn the named groups of a wildcard-free MAIN_RE match into (start, end)."""
    # ISO week-date support (YYYY-Www or YYYY-Www-D)
    if gd.get("isoweek_year"):
        weekday = gd.get("isoweek_day")
        # fromisocalendar returns a date
        iso_date = date.fromisocalendar(int(gd["isoweek_year"]), int(gd["isoweek_week"]), int(weekday or 1))
        start = datetime(iso_date.year, iso_date.month, iso_date.day, tzinfo=tz)
        # a given weekday matches that single day, otherwise the whole week
        span = timedelta(days=1) if weekday else timedelta(weeks=1)
        return start, start + span

    # Ordinal date support (YYYY-DDD)
    if gd.get("ordinal_year"):
        y = int(gd["ordinal_year"])
        doy = int(gd["ordinal_day"])
        start = datetime(y, 1, 1, tzinfo=tz) + timedelta(days=doy - 1)
        # day 000, or a day number past the end of the year, would otherwise
        # silently land in a neighbouring year
        if start.year != y:
            raise DatePatternError(f"day of year out of range: {doy}")
        return start, start + timedelta(days=1)

    # handle unix-epoch forms directly
    if gd["epoch"]:
        start = datetime.fromtimestamp(int(gd["epoch"]), tz=timezone.utc)
        return start, start + timedelta(seconds=1)

    # build the start moment
    start = build_datetime_from_groups(gd, tz)
    # determine the end moment based on the highest precision present
    if gd["second"]:
        # fractional or whole second precision
        end = start + timedelta(seconds=1)
    elif gd["minute"]:
        end = start + timedelta(minutes=1)
    elif gd["hour"]:
        end = start + timedelta(hours=1)
    elif gd["day"]:
        end = start + timedelta(days=1)
    elif gd["month"]:
        end = offset_n_months(start, 1)
    elif gd["year"]:
        end = offset_n_months(start, 12)
    else:
        # fallback to one-second window (shouldn't occur)
        end = start + timedelta(seconds=1)
    return start, end
|
||
|
||
def compile_date_pattern(expr: str): | ||
""" | ||
Accepts any TIMESTAMP of: | ||
YYYY | ||
YYYY-MM | ||
YYYY-MM-DD | ||
YYYY-MM-DDTHH (with 'T') or YYYY-MM-DD HH:MM (with space) | ||
YYYY-MM-DD HH:MM:SS (RFC-3339 space-separated) | ||
Unix epoch (@123456789) | ||
…with an optional trailing timezone (Z or ±HH:MM or [Region/City]). | ||
|
||
Also supports: | ||
TIMESTAMP/TIMESTAMP | ||
TIMESTAMP/DURATION | ||
DURATION/TIMESTAMP. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great work! ❤️ Just |
||
DURATION is a string of the form: | ||
D[years]Y[months]M[weeks]W[days]D[hours]h[minutes]m[seconds]s (any combination). | ||
|
||
Additionally supports wildcards (`*`) in year, month, day, hour, minute, or second (or any combination), e.g.: | ||
"*-04-22" # April 22 of any year | ||
"2025-*-01" # 1st day of any month in 2025 | ||
"*-*-15" # 15th of every month, any year | ||
Returns a predicate that is True for timestamps in that interval (start inclusive, end exclusive). | ||
""" | ||
expr = expr.strip() | ||
|
||
# 1) detect explicit user-defined intervals (split slash outside brackets to allow for [Region/Name]) | ||
parts = re.split(r"/(?![^\[]*\])", expr, maxsplit=1) | ||
if len(parts) == 2: | ||
left, right = parts | ||
# duration / timestamp | ||
if left.startswith("D") and not right.startswith("D"): | ||
# months are handled separately via offset_n_months() because month lengths vary | ||
months, td = parse_duration(left) | ||
end_dt, _ = parse_to_interval(right) | ||
start_dt = offset_n_months(end_dt, -months) - td | ||
return interval_predicate(start_dt, end_dt) | ||
# timestamp / duration | ||
if right.startswith("D") and not left.startswith("D"): | ||
start_dt, _ = parse_to_interval(left) | ||
# months are handled separately via offset_n_months() because month lengths vary | ||
months, td = parse_duration(right) | ||
mid_dt = offset_n_months(start_dt, months) | ||
end_dt = mid_dt + td | ||
return interval_predicate(start_dt, end_dt) | ||
# timestamp / timestamp | ||
start_left, _ = parse_to_interval(left) | ||
start_right, _ = parse_to_interval(right) | ||
return interval_predicate(start_left, start_right) | ||
m = re.match(MAIN_RE, expr, re.VERBOSE) | ||
if not m: | ||
raise DatePatternError(f"unrecognised date: {expr!r}") | ||
|
||
gd = m.groupdict() | ||
tz = parse_tz(gd["tz"]) | ||
|
||
# 2) detect explicit wildcards (*) in any named group | ||
wildcard_fields = ("year", "month", "day", "hour", "minute", "second") | ||
if any(gd[f] == "*" for f in wildcard_fields if f in gd): | ||
# build a discrete‐match predicate | ||
yi = None if gd["year"] == "*" else int(gd["year"]) | ||
mi = None if gd["month"] == "*" else int(gd["month"]) if gd["month"] else None | ||
di = None if gd["day"] == "*" else int(gd["day"]) if gd["day"] else None | ||
hi = None if gd["hour"] == "*" else int(gd["hour"]) if gd["hour"] else None | ||
ni = None if gd["minute"] == "*" else int(gd["minute"]) if gd["minute"] else None | ||
si = None | ||
if gd["second"]: | ||
if gd["second"] != "*": | ||
si = float(gd["second"]) | ||
|
||
def wildcard_pred(ts): | ||
dt = ts.astimezone(tz) | ||
return ( | ||
(yi is None or dt.year == yi) | ||
and (mi is None or dt.month == mi) | ||
and (di is None or dt.day == di) | ||
and (hi is None or dt.hour == hi) | ||
and (ni is None or dt.minute == ni) | ||
and (si is None or (si <= dt.second + dt.microsecond / 1e6 < si + 1)) | ||
) | ||
|
||
return wildcard_pred | ||
|
||
# 3) fraction‐precision exact match (a timestamp with fractional seconds matches only that exact instant) | ||
if gd["second"] and "." in gd["second"]: | ||
dt = build_datetime_from_groups(gd, tz) | ||
return exact_predicate(dt) | ||
|
||
# 4) remaining precisions: use parse_to_interval to get start/end | ||
start, end = parse_to_interval(expr) | ||
return interval_predicate(start, end) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This might be a bit silly, but it just got me thinking: Could we support ordinal days with wildcard years? 🤔 Same for ISO weeks, possibly even
*-W*-5
(wow, that looks crazy 😆) to match all archives created on a Friday? Not sure whether users would use it, but if it's possible? WDYT?