Skip to content

Commit 1e56590

Browse files
committed
Widen parent start date override if it ends up being later than a child start date override
1 parent 1fb7b50 commit 1e56590

File tree

2 files changed

+196
-7
lines changed

2 files changed

+196
-7
lines changed

sqlmesh/core/context.py

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2925,7 +2925,7 @@ def _calculate_start_override_per_model(
29252925
# If we dont have a minimum number of intervals to consider, then we dont need to adjust the start date on a per-model basis
29262926
return {}
29272927

2928-
start_overrides = {}
2928+
start_overrides: t.Dict[str, datetime] = {}
29292929
end_override_per_model = end_override_per_model or {}
29302930

29312931
plan_execution_time_dt = to_datetime(plan_execution_time)
@@ -2934,14 +2934,42 @@ def _calculate_start_override_per_model(
29342934
plan_end or plan_execution_time_dt, relative_base=plan_execution_time_dt
29352935
)
29362936

2937-
for model_fqn in backfill_model_fqns:
2937+
# we need to take the DAG into account so that parent models can be expanded to cover at least as much as their children
2938+
# for example, A(hourly) <- B(daily)
2939+
# if min_intervals=1, A would have 1 hour and B would have 1 day
2940+
# but B depends on A so in order for B to have 1 valid day, A needs to be expanded to 24 hours
2941+
backfill_dag: DAG[str] = DAG()
2942+
for fqn in backfill_model_fqns:
2943+
backfill_dag.add(
2944+
fqn,
2945+
[
2946+
p.name
2947+
for p in snapshots_by_model_fqn[fqn].parents
2948+
if p.name in backfill_model_fqns
2949+
],
2950+
)
2951+
2952+
# start from the leaf nodes and work back towards the root because the min_start at the root node is determined by the calculated starts in the leaf nodes
2953+
reversed_dag = backfill_dag.reversed
2954+
graph = reversed_dag.graph
2955+
2956+
for model_fqn in reversed_dag:
2957+
# Get the earliest start from all immediate children of this snapshot
2958+
# this works because topological ordering guarantees that they've already been visited
2959+
# and we always set a start override
2960+
min_child_start = min(
2961+
[start_overrides[immediate_child_fqn] for immediate_child_fqn in graph[model_fqn]],
2962+
default=plan_start_dt,
2963+
)
2964+
29382965
snapshot = snapshots_by_model_fqn.get(model_fqn)
2966+
29392967
if not snapshot:
29402968
continue
29412969

29422970
starting_point = end_override_per_model.get(model_fqn, plan_end_dt)
29432971
if node_end := snapshot.node.end:
2944-
# if we dont do this, if the node end is a date (as opposed to a timestamp)
2972+
# if we dont do this, if the node end is a *date* (as opposed to a timestamp)
29452973
# we end up incorrectly winding back an extra day
29462974
node_end_dt = make_exclusive(node_end)
29472975

@@ -2956,10 +2984,7 @@ def _calculate_start_override_per_model(
29562984
# wind back the starting point by :min_intervals intervals to arrive at the minimum snapshot start date
29572985
snapshot_start = snapshot.node.cron_prev(snapshot_start)
29582986

2959-
# only consider this an override if the wound-back start date is earlier than the plan start date
2960-
# if it isnt then the plan already covers :min_intervals intervals for this snapshot
2961-
if snapshot_start < plan_start_dt:
2962-
start_overrides[model_fqn] = snapshot_start
2987+
start_overrides[model_fqn] = min(min_child_start, snapshot_start)
29632988

29642989
return start_overrides
29652990

tests/core/test_context.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2567,3 +2567,167 @@ def _get_missing_intervals(plan: Plan, name: str) -> t.List[t.Tuple[datetime, da
25672567
) == [
25682568
(to_datetime("2020-01-18 00:00:00"), to_datetime("2020-01-18 23:59:59.999999")),
25692569
]
2570+
2571+
2572+
def test_plan_min_intervals_adjusted_for_downstream(tmp_path: Path):
2573+
"""
2574+
Scenario:
2575+
A(hourly) <- B(daily) <- C(weekly)
2576+
<- D(two-hourly)
2577+
E(monthly)
2578+
2579+
We need to ensure that :min_intervals covers at least :min_intervals of all downstream models for the dag to be valid
2580+
In this scenario, if min_intervals=1:
2581+
- A would need to cover at least (7 days * 24 hours) because its downstream model C is weekly. It should also be unaffected by its sibling, E
2582+
- B would need to cover at least 7 days because its downstream model C is weekly
2583+
- C would need to cover at least 1 week because min_intervals: 1
2584+
- D would need to cover at least 2 hours because min_intervals: 1 and should be unaffected by C
2585+
- E is unrelated to A, B, C and D so would need to cover 1 month to satisfy min_intervals: 1.
2586+
- It also ensures that each tree branch has a unique cumulative date, because
2587+
if the dag is iterated purely in topological order with a global min date it would set A to 1 month instead of 1 week
2588+
"""
2589+
2590+
init_example_project(tmp_path, engine_type="duckdb", dialect="duckdb")
2591+
2592+
context = Context(
2593+
paths=tmp_path, config=Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))
2594+
)
2595+
2596+
current_time = to_datetime("2020-02-01 00:00:01")
2597+
2598+
# initial state of example project
2599+
context.plan(auto_apply=True, execution_time=current_time)
2600+
2601+
(tmp_path / "models" / "hourly_model.sql").write_text("""
2602+
MODEL (
2603+
name sqlmesh_example.hourly_model,
2604+
kind INCREMENTAL_BY_TIME_RANGE (
2605+
time_column start_dt,
2606+
batch_size 1
2607+
),
2608+
start '2020-01-01',
2609+
cron '@hourly'
2610+
);
2611+
2612+
select @start_dt as start_dt, @end_dt as end_dt;
2613+
""")
2614+
2615+
(tmp_path / "models" / "two_hourly_model.sql").write_text("""
2616+
MODEL (
2617+
name sqlmesh_example.two_hourly_model,
2618+
kind INCREMENTAL_BY_TIME_RANGE (
2619+
time_column start_dt
2620+
),
2621+
start '2020-01-01',
2622+
cron '0 */2 * * *'
2623+
);
2624+
2625+
select start_dt, end_dt from sqlmesh_example.hourly_model where start_dt between @start_dt and @end_dt;
2626+
""")
2627+
2628+
(tmp_path / "models" / "unrelated_monthly_model.sql").write_text("""
2629+
MODEL (
2630+
name sqlmesh_example.unrelated_monthly_model,
2631+
kind INCREMENTAL_BY_TIME_RANGE (
2632+
time_column start_dt
2633+
),
2634+
start '2020-01-01',
2635+
cron '@monthly'
2636+
);
2637+
2638+
select @start_dt as start_dt, @end_dt as end_dt;
2639+
""")
2640+
2641+
(tmp_path / "models" / "daily_model.sql").write_text("""
2642+
MODEL (
2643+
name sqlmesh_example.daily_model,
2644+
kind INCREMENTAL_BY_TIME_RANGE (
2645+
time_column start_dt
2646+
),
2647+
start '2020-01-01',
2648+
cron '@daily'
2649+
);
2650+
2651+
select start_dt, end_dt from sqlmesh_example.hourly_model where start_dt between @start_dt and @end_dt;
2652+
""")
2653+
2654+
(tmp_path / "models" / "weekly_model.sql").write_text("""
2655+
MODEL (
2656+
name sqlmesh_example.weekly_model,
2657+
kind INCREMENTAL_BY_TIME_RANGE (
2658+
time_column start_dt
2659+
),
2660+
start '2020-01-01',
2661+
cron '@weekly'
2662+
);
2663+
2664+
select start_dt, end_dt from sqlmesh_example.daily_model where start_dt between @start_dt and @end_dt;
2665+
""")
2666+
2667+
context.load()
2668+
2669+
# create a dev env for "1 day ago" with min_intervals=1
2670+
# this should force a week's worth of intervals for every model
2671+
plan = context.plan(
2672+
environment="pr_env",
2673+
start="1 day ago",
2674+
execution_time=current_time,
2675+
min_intervals=1,
2676+
)
2677+
2678+
def _get_missing_intervals(name: str) -> t.List[t.Tuple[datetime, datetime]]:
2679+
snapshot_id = context.get_snapshot(name, raise_if_missing=True).snapshot_id
2680+
snapshot_intervals = next(
2681+
si for si in plan.missing_intervals if si.snapshot_id == snapshot_id
2682+
)
2683+
return [(to_datetime(s), to_datetime(e)) for s, e in snapshot_intervals.merged_intervals]
2684+
2685+
# We only operate on completed intervals, so given the current_time this is the range of the last completed week
2686+
_get_missing_intervals("sqlmesh_example.weekly_model") == [
2687+
(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-01-26 00:00:00"))
2688+
]
2689+
2690+
# The daily model needs to cover the week, so it gets its start date moved back to line up
2691+
_get_missing_intervals("sqlmesh_example.daily_model") == [
2692+
(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-02-01 00:00:00"))
2693+
]
2694+
2695+
# The hourly model needs to cover both the daily model and the weekly model, so it also gets its start date moved back to line up with the weekly model
2696+
assert _get_missing_intervals("sqlmesh_example.hourly_model") == [
2697+
(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-02-01 00:00:00"))
2698+
]
2699+
2700+
# The two-hourly model only needs to cover 2 hours and should be unaffected by the fact its sibling node has a weekly child node
2701+
# However it still gets backfilled for 24 hours because the plan start is 1 day and this satisfies min_intervals: 1
2702+
assert _get_missing_intervals("sqlmesh_example.two_hourly_model") == [
2703+
(to_datetime("2020-01-31 00:00:00"), to_datetime("2020-02-01 00:00:00"))
2704+
]
2705+
2706+
# The unrelated model has no upstream constraints, so its start date doesnt get moved to line up with the weekly model
2707+
# However it still gets backfilled for 24 hours because the plan start is 1 day and this satisfies min_intervals: 1
2708+
_get_missing_intervals("sqlmesh_example.unrelated_monthly_model") == [
2709+
(to_datetime("2020-01-01 00:00:00"), to_datetime("2020-02-01 00:00:00"))
2710+
]
2711+
2712+
# Check that actually running the plan produces the correct result, since missing intervals are re-calculated in the evaluator
2713+
context.apply(plan)
2714+
2715+
assert context.engine_adapter.fetchall(
2716+
"select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.weekly_model"
2717+
) == [(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-01-25 23:59:59.999999"))]
2718+
2719+
assert context.engine_adapter.fetchall(
2720+
"select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.daily_model"
2721+
) == [(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-01-31 23:59:59.999999"))]
2722+
2723+
assert context.engine_adapter.fetchall(
2724+
"select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.hourly_model"
2725+
) == [(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-01-31 23:59:59.999999"))]
2726+
2727+
assert context.engine_adapter.fetchall(
2728+
"select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.two_hourly_model"
2729+
) == [(to_datetime("2020-01-31 00:00:00"), to_datetime("2020-01-31 23:59:59.999999"))]
2730+
2731+
assert context.engine_adapter.fetchall(
2732+
"select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.unrelated_monthly_model"
2733+
) == [(to_datetime("2020-01-01 00:00:00"), to_datetime("2020-01-31 23:59:59.999999"))]

0 commit comments

Comments
 (0)