Skip to content

Commit 537a311

Browse files
authored
fix!: make repr deterministic for fingerprinting (#4925)
1 parent 40bb6c8 commit 537a311

File tree

3 files changed

+330
-2
lines changed

3 files changed

+330
-2
lines changed
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
"""
2+
When serializing some objects, like `__sqlmesh__vars__`, the order of keys in the dictionary was not deterministic
3+
and therefore this migration applies deterministic sorting to the keys of the dictionary.
4+
"""
5+
6+
import json
7+
import typing as t
8+
from dataclasses import dataclass
9+
10+
from sqlglot import exp
11+
12+
from sqlmesh.utils.migration import index_text_type, blob_text_type
13+
14+
15+
# Make sure `SqlValue` is defined so it can be used by `eval` call in the migration
16+
@dataclass
class SqlValue:
    """Container for a SQL string that was generated from a SQLGlot AST."""

    # The SQL text itself.
    sql: str
21+
22+
23+
def _deterministic_repr(obj: t.Any) -> str:
    """Return a ``repr`` of ``obj`` whose dict keys are rendered in a stable order.

    This is a copy of the function from utils.metaprogramming.

    Dictionaries, at any nesting depth inside dicts, lists and tuples, are
    rebuilt with their items sorted by ``str(key)`` so that equal dicts built in
    a different insertion order render identically. List and tuple element
    order is treated as significant and is preserved.

    Args:
        obj: The object to represent as a string.

    Returns:
        The ``repr`` of the normalized object, or the plain ``repr(obj)`` if
        normalization raises.
    """

    def _sort_key(item: t.Tuple[t.Any, t.Any]) -> t.Tuple[str, str]:
        # ``str`` alone cannot tell ``1`` from ``"1"``; ``repr`` breaks that tie
        # so the result never depends on insertion order.
        key = item[0]
        return str(key), repr(key)

    def _normalize_for_repr(o: t.Any) -> t.Any:
        if isinstance(o, dict):
            return {k: _normalize_for_repr(v) for k, v in sorted(o.items(), key=_sort_key)}
        if isinstance(o, (list, tuple)):
            # Recursively normalize nested structures
            normalized = [_normalize_for_repr(item) for item in o]
            if isinstance(o, tuple) and hasattr(o, "_fields"):
                # namedtuple constructors take one positional argument per
                # field rather than a single iterable.
                return type(o)(*normalized)
            return type(o)(normalized)
        return o

    try:
        return repr(_normalize_for_repr(obj))
    except Exception:
        # Best-effort: fall back to the plain repr instead of failing.
        return repr(obj)
42+
43+
44+
def migrate(state_sync, **kwargs):  # type: ignore
    """Rewrite stored ``python_env`` value payloads using a deterministic repr.

    Every row of the ``_snapshots`` table is read and its ``snapshot`` JSON is
    parsed. For each ``python_env`` entry of kind ``"value"``, the payload is
    evaluated and re-rendered with ``_deterministic_repr``. If at least one
    payload changed, all rows are deleted from the table and every row read is
    inserted again with its re-serialized snapshot JSON.

    Args:
        state_sync: Object exposing the ``engine_adapter`` and ``schema``
            attributes used to reach the state tables.
        **kwargs: Accepted but not used here.
    """
    import pandas as pd

    engine_adapter = state_sync.engine_adapter
    schema = state_sync.schema
    snapshots_table = "_snapshots"
    if schema:
        snapshots_table = f"{schema}.{snapshots_table}"

    migration_needed = False
    new_snapshots = []

    for (
        name,
        identifier,
        version,
        snapshot,
        kind_name,
        updated_ts,
        unpaused_ts,
        ttl_ms,
        unrestorable,
    ) in engine_adapter.fetchall(
        exp.select(
            "name",
            "identifier",
            "version",
            "snapshot",
            "kind_name",
            "updated_ts",
            "unpaused_ts",
            "ttl_ms",
            "unrestorable",
        ).from_(snapshots_table),
        quote_identifiers=True,
    ):
        parsed_snapshot = json.loads(snapshot)
        python_env = parsed_snapshot["node"].get("python_env")

        if python_env:
            for executable in python_env.values():
                if not isinstance(executable, dict) or executable.get("kind") != "value":
                    continue

                old_payload = executable["payload"]
                try:
                    # Try to parse the old payload and re-serialize it deterministically.
                    # NOTE(review): `eval` executes whatever text is stored in the
                    # payload; this assumes the state database content is trusted -- confirm.
                    new_payload = _deterministic_repr(eval(old_payload))
                except Exception:
                    # Best-effort: if the payload can't be evaluated here, leave it as-is.
                    continue

                # Only update if the representation changed
                if old_payload != new_payload:
                    executable["payload"] = new_payload
                    migration_needed = True

        new_snapshots.append(
            {
                "name": name,
                "identifier": identifier,
                "version": version,
                "snapshot": json.dumps(parsed_snapshot),
                "kind_name": kind_name,
                "updated_ts": updated_ts,
                "unpaused_ts": unpaused_ts,
                "ttl_ms": ttl_ms,
                "unrestorable": unrestorable,
            }
        )

    # `migration_needed` is only set while processing a row that is appended
    # right after, so `new_snapshots` is never empty past this guard.
    if not migration_needed:
        return

    engine_adapter.delete_from(snapshots_table, "TRUE")

    index_type = index_text_type(engine_adapter.dialect)
    blob_type = blob_text_type(engine_adapter.dialect)

    engine_adapter.insert_append(
        snapshots_table,
        pd.DataFrame(new_snapshots),
        columns_to_types={
            "name": exp.DataType.build(index_type),
            "identifier": exp.DataType.build(index_type),
            "version": exp.DataType.build(index_type),
            "snapshot": exp.DataType.build(blob_type),
            "kind_name": exp.DataType.build("text"),
            "updated_ts": exp.DataType.build("bigint"),
            "unpaused_ts": exp.DataType.build("bigint"),
            "ttl_ms": exp.DataType.build("bigint"),
            "unrestorable": exp.DataType.build("boolean"),
        },
    )

sqlmesh/utils/metaprogramming.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,9 @@ def is_value(self) -> bool:
425425

426426
@classmethod
427427
def value(cls, v: t.Any, is_metadata: t.Optional[bool] = None) -> Executable:
428-
return Executable(payload=repr(v), kind=ExecutableKind.VALUE, is_metadata=is_metadata)
428+
return Executable(
429+
payload=_deterministic_repr(v), kind=ExecutableKind.VALUE, is_metadata=is_metadata
430+
)
429431

430432

431433
def serialize_env(env: t.Dict[str, t.Any], path: Path) -> t.Dict[str, Executable]:
@@ -633,6 +635,38 @@ def print_exception(
633635
out.write(tb)
634636

635637

638+
def _deterministic_repr(obj: t.Any) -> str:
    """Create a deterministic representation by ensuring consistent ordering before repr().

    For dictionaries, ensures consistent key ordering to prevent non-deterministic
    serialization that affects fingerprinting. Keys are ordered by ``str(key)``,
    with ``repr(key)`` breaking ties between keys that share a string form (such
    as ``1`` and ``"1"``). Uses Python's native repr() logic for all formatting
    to handle edge cases properly.

    Note that this function assumes list/tuple order is significant and therefore does not sort them.
    namedtuple instances are rebuilt field by field so that dicts nested inside them are normalized too.

    Args:
        obj: The object to represent as a string.

    Returns:
        A deterministic string representation of the object. If normalization
        raises, the plain ``repr(obj)`` is returned instead.
    """

    def _sort_key(item: t.Tuple[t.Any, t.Any]) -> t.Tuple[str, str]:
        # ``str`` alone cannot tell ``1`` from ``"1"``; ``repr`` breaks that tie
        # so the result never depends on insertion order.
        key = item[0]
        return str(key), repr(key)

    def _normalize_for_repr(o: t.Any) -> t.Any:
        if isinstance(o, dict):
            return {k: _normalize_for_repr(v) for k, v in sorted(o.items(), key=_sort_key)}
        if isinstance(o, (list, tuple)):
            # Recursively normalize nested structures
            normalized = [_normalize_for_repr(item) for item in o]
            if isinstance(o, tuple) and hasattr(o, "_fields"):
                # namedtuple constructors take one positional argument per
                # field rather than a single iterable.
                return type(o)(*normalized)
            return type(o)(normalized)
        return o

    try:
        return repr(_normalize_for_repr(obj))
    except Exception:
        # Best-effort: fall back to the plain repr instead of failing serialization.
        return repr(obj)
668+
669+
636670
def import_python_file(path: Path, relative_base: Path = Path()) -> types.ModuleType:
637671
relative_path = path.absolute().relative_to(relative_base.absolute())
638672
module_name = str(relative_path.with_suffix("")).replace(os.path.sep, ".")

tests/utils/test_metaprogramming.py

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from sqlmesh.utils.metaprogramming import (
2323
Executable,
2424
ExecutableKind,
25+
_deterministic_repr,
2526
build_env,
2627
func_globals,
2728
normalize_source,
@@ -48,7 +49,7 @@ def test_print_exception(mocker: MockerFixture):
4849
except Exception as ex:
4950
print_exception(ex, test_env, out_mock)
5051

51-
expected_message = r""" File ".*?.tests.utils.test_metaprogramming\.py", line 47, in test_print_exception
52+
expected_message = r""" File ".*?.tests.utils.test_metaprogramming\.py", line 48, in test_print_exception
5253
eval\("test_fun\(\)", env\).*
5354
5455
File '/test/path.py' \(or imported file\), line 2, in test_fun
@@ -457,3 +458,162 @@ def test_serialize_env_with_enum_import_appearing_in_two_functions() -> None:
457458
}
458459

459460
assert serialized_env == expected_env
461+
462+
463+
def test_deterministic_repr_basic_types():
    """Scalars are rendered exactly like the built-in ``repr`` renders them."""
    cases = [
        (42, "42"),
        ("hello", "'hello'"),
        (True, "True"),
        (None, "None"),
        (3.14, "3.14"),
    ]
    for value, expected in cases:
        assert _deterministic_repr(value) == expected
471+
472+
473+
def test_deterministic_repr_dict_ordering():
    """Insertion order of dict keys must not influence the rendered output."""
    # The same mapping, built with three different insertion orders.
    variants = (
        {"c": 3, "a": 1, "b": 2},
        {"a": 1, "b": 2, "c": 3},
        {"b": 2, "c": 3, "a": 1},
    )
    first, second, third = [_deterministic_repr(variant) for variant in variants]

    # Every variant renders to the same, key-sorted text.
    assert first == second == third
    assert first == "{'a': 1, 'b': 2, 'c': 3}"
487+
488+
489+
def test_deterministic_repr_mixed_key_types():
    """Dicts mixing int and str keys render consistently regardless of insertion order."""
    rendered_a = _deterministic_repr({42: "number", "string": "text", 1: "one"})
    rendered_b = _deterministic_repr({"string": "text", 1: "one", 42: "number"})

    # Mixed key types must not break the consistent ordering.
    assert rendered_a == rendered_b
    # Keys are ordered by their string form: "1" < "42" < "string".
    assert rendered_a == "{1: 'one', 42: 'number', 'string': 'text'}"
501+
502+
503+
def test_deterministic_repr_nested_structures():
    """Key sorting applies at every nesting level, including dicts inside lists."""
    variant_a = {"outer": {"z": 26, "a": 1}, "list": [3, {"y": 2, "x": 1}], "simple": "value"}
    variant_b = {"simple": "value", "list": [3, {"x": 1, "y": 2}], "outer": {"a": 1, "z": 26}}

    rendered_a = _deterministic_repr(variant_a)
    rendered_b = _deterministic_repr(variant_b)

    assert rendered_a == rendered_b
    # The structure is kept intact; only the key order is normalized.
    assert rendered_a == (
        "{'list': [3, {'x': 1, 'y': 2}], 'outer': {'a': 1, 'z': 26}, 'simple': 'value'}"
    )
516+
517+
518+
def test_deterministic_repr_lists_and_tuples():
    """Sequence element order is kept while dicts nested inside are key-sorted."""
    # A list keeps its element order.
    rendered_list = _deterministic_repr([{"b": 2, "a": 1}, {"d": 4, "c": 3}])
    assert rendered_list == "[{'a': 1, 'b': 2}, {'c': 3, 'd': 4}]"

    # A tuple keeps its element order and stays a tuple.
    rendered_tuple = _deterministic_repr(({"z": 26, "a": 1}, {"y": 25, "b": 2}))
    assert rendered_tuple == "({'a': 1, 'z': 26}, {'b': 2, 'y': 25})"
531+
532+
533+
def test_deterministic_repr_empty_containers():
    """Empty dict, list and tuple render as their usual literals."""
    for container, expected in (({}, "{}"), ([], "[]"), ((), "()")):
        assert _deterministic_repr(container) == expected
538+
539+
540+
def test_deterministic_repr_special_characters():
    """Quotes, non-ASCII text, newlines and backslashes survive a repr/eval round trip."""
    payload = {
        "quotes": "text with 'single' and \"double\" quotes",
        "unicode": "unicode: ñáéíóú",
        "newlines": "text\nwith\nnewlines",
        "backslashes": "path\\to\\file",
    }

    first_pass = _deterministic_repr(payload)

    # The rendered text must be valid Python that evaluates back to an equal dict.
    assert eval(first_pass) == payload

    # Rendering the same input a second time yields the same text.
    second_pass = _deterministic_repr(payload)
    assert first_pass == second_pass
558+
559+
560+
def test_deterministic_repr_executable_integration():
    """``Executable.value`` payloads do not depend on dict insertion order."""
    # Equal variable mappings that differ only in insertion order.
    vars_a = {"env": "dev", "debug": True, "timeout": 30}
    vars_b = {"timeout": 30, "debug": True, "env": "dev"}

    executable_a = Executable.value(vars_a)
    executable_b = Executable.value(vars_b)

    # Both payloads are the same key-sorted text.
    assert executable_a.payload == executable_b.payload
    assert executable_a.payload == "{'debug': True, 'env': 'dev', 'timeout': 30}"

    # The payload is valid Python that evaluates back to the original mapping.
    round_tripped = eval(executable_a.payload)
    assert round_tripped == vars_a
576+
577+
578+
def test_deterministic_repr_complex_example():
    """A larger nested mapping renders like its manually key-sorted equivalent."""
    source_vars = {
        "database_config": {
            "host": "localhost",
            "port": 5432,
            "credentials": {"username": "admin", "password": "secret"},
        },
        "feature_flags": ["flag_b", "flag_a"],
        "metadata": {
            "version": "1.0.0",
            "environment": "production",
            "tags": {"team": "data", "project": "analytics"},
        },
        42: "numeric_key",
        "arrays": [{"config": {"nested": True, "level": 2}}, {"simple": "value"}],
    }

    # The same data written out with keys already in the expected order at
    # every level; list contents keep their original order.
    sorted_vars = {
        42: "numeric_key",
        "arrays": [{"config": {"level": 2, "nested": True}}, {"simple": "value"}],
        "database_config": {
            "credentials": {"password": "secret", "username": "admin"},
            "host": "localhost",
            "port": 5432,
        },
        "feature_flags": ["flag_b", "flag_a"],
        "metadata": {
            "environment": "production",
            "tags": {"project": "analytics", "team": "data"},
            "version": "1.0.0",
        },
    }

    rendered = _deterministic_repr(source_vars)
    assert rendered == repr(sorted_vars)

    # The rendered text is valid Python that evaluates back to an equal dict.
    round_tripped = eval(rendered)
    assert isinstance(round_tripped, dict)
    assert round_tripped == source_vars

0 commit comments

Comments
 (0)