feat(lsp): add go to definition for ctes (#4543)

benfdking · web-flow · commit a69f1a4d8ceb · 2025-05-26T22:47:39.000+02:00
diff --git a/sqlmesh/lsp/main.py b/sqlmesh/lsp/main.py
@@ -279,21 +279,31 @@ def goto_definition(
                     raise RuntimeError(f"No context found for document: {document.path}")
 
                 references = get_references(self.lsp_context, uri, params.position)
-                return [
-                    types.LocationLink(
-                        target_uri=reference.uri,
-                        target_selection_range=types.Range(
+                location_links = []
+                for reference in references:
+                    # Use target_range if available (for CTEs), otherwise default to start of file
+                    if reference.target_range:
+                        target_range = reference.target_range
+                        target_selection_range = reference.target_range
+                    else:
+                        target_range = types.Range(
                             start=types.Position(line=0, character=0),
                             end=types.Position(line=0, character=0),
-                        ),
-                        target_range=types.Range(
+                        )
+                        target_selection_range = types.Range(
                             start=types.Position(line=0, character=0),
                             end=types.Position(line=0, character=0),
-                        ),
-                        origin_selection_range=reference.range,
+                        )
+
+                    location_links.append(
+                        types.LocationLink(
+                            target_uri=reference.uri,
+                            target_selection_range=target_selection_range,
+                            target_range=target_range,
+                            origin_selection_range=reference.range,
+                        )
                     )
-                    for reference in references
-                ]
+                return location_links
             except Exception as e:
                 ls.show_message(f"Error getting references: {e}", types.MessageType.Error)
                 return []
diff --git a/sqlmesh/lsp/reference.py b/sqlmesh/lsp/reference.py
@@ -6,23 +6,27 @@
 from sqlmesh.lsp.context import LSPContext, ModelTarget, AuditTarget
 from sqlglot import exp
 from sqlmesh.lsp.description import generate_markdown_description
+from sqlglot.optimizer.scope import build_scope
 from sqlmesh.lsp.uri import URI
 from sqlmesh.utils.pydantic import PydanticModel
+from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
 
 
 class Reference(PydanticModel):
     """
-    A reference to a model.
+    A reference to a model or CTE.
 
     Attributes:
         range: The range of the reference in the source file
         uri: The uri of the referenced model
         markdown_description: The markdown description of the referenced model
+        target_range: The range of the definition for go-to-definition (optional, used for CTEs)
     """
 
     range: Range
     uri: str
     markdown_description: t.Optional[str] = None
+    target_range: t.Optional[Range] = None
 
 
 def by_position(position: Position) -> t.Callable[[Reference], bool]:
@@ -88,6 +92,7 @@ def get_model_definitions_for_a_path(
     - Need to normalize it before matching
     - Try get_model before normalization
     - Match to models that the model refers to
+    - Also find CTE references within the query
     """
     path = document_uri.to_path()
     if path.suffix != ".sql":
@@ -126,66 +131,95 @@ def get_model_definitions_for_a_path(
     # Find all possible references
     references = []
 
-    # Get SQL query and find all table references
-    tables = list(query.find_all(exp.Table))
-    if len(tables) == 0:
-        return []
-
     with open(file_path, "r", encoding="utf-8") as file:
         read_file = file.readlines()
 
-    for table in tables:
-        # Normalize the table reference
-        unaliased = table.copy()
-        if unaliased.args.get("alias") is not None:
-            unaliased.set("alias", None)
-        reference_name = unaliased.sql(dialect=dialect)
-        try:
-            normalized_reference_name = normalize_model_name(
-                reference_name,
-                default_catalog=lint_context.context.default_catalog,
-                dialect=dialect,
-            )
-            if normalized_reference_name not in depends_on:
-                continue
-        except Exception:
-            # Skip references that cannot be normalized
-            continue
-
-        # Get the referenced model uri
-        referenced_model = lint_context.context.get_model(
-            model_or_snapshot=normalized_reference_name, raise_if_missing=False
-        )
-        if referenced_model is None:
-            continue
-        referenced_model_path = referenced_model._path
-        # Check whether the path exists
-        if not referenced_model_path.is_file():
-            continue
-        referenced_model_uri = URI.from_path(referenced_model_path)
-
-        # Extract metadata for positioning
-        table_meta = TokenPositionDetails.from_meta(table.this.meta)
-        table_range = _range_from_token_position_details(table_meta, read_file)
-        start_pos = table_range.start
-        end_pos = table_range.end
-
-        # If there's a catalog or database qualifier, adjust the start position
-        catalog_or_db = table.args.get("catalog") or table.args.get("db")
-        if catalog_or_db is not None:
-            catalog_or_db_meta = TokenPositionDetails.from_meta(catalog_or_db.meta)
-            catalog_or_db_range = _range_from_token_position_details(catalog_or_db_meta, read_file)
-            start_pos = catalog_or_db_range.start
-
-        description = generate_markdown_description(referenced_model)
-
-        references.append(
-            Reference(
-                uri=referenced_model_uri.value,
-                range=Range(start=start_pos, end=end_pos),
-                markdown_description=description,
-            )
-        )
+    # Build scope tree to properly handle nested CTEs
+    query = normalize_identifiers(query.copy(), dialect=dialect)
+    root_scope = build_scope(query)
+
+    if root_scope:
+        # Traverse all scopes to find CTE definitions and table references
+        for scope in root_scope.traverse():
+            for table in scope.tables:
+                table_name = table.name
+
+                # Check if this table reference is a CTE in the current scope
+                if cte_scope := scope.cte_sources.get(table_name):
+                    cte = cte_scope.expression.parent
+                    alias = cte.args["alias"]
+                    if isinstance(alias, exp.TableAlias):
+                        identifier = alias.this
+                        if isinstance(identifier, exp.Identifier):
+                            target_range = _range_from_token_position_details(
+                                TokenPositionDetails.from_meta(identifier.meta), read_file
+                            )
+                            table_range = _range_from_token_position_details(
+                                TokenPositionDetails.from_meta(table.this.meta), read_file
+                            )
+                            references.append(
+                                Reference(
+                                    uri=document_uri.value,  # Same file
+                                    range=table_range,
+                                    target_range=target_range,
+                                )
+                            )
+                    continue
+
+                # For non-CTE tables, process as before (external model references)
+                # Normalize the table reference
+                unaliased = table.copy()
+                if unaliased.args.get("alias") is not None:
+                    unaliased.set("alias", None)
+                reference_name = unaliased.sql(dialect=dialect)
+                try:
+                    normalized_reference_name = normalize_model_name(
+                        reference_name,
+                        default_catalog=lint_context.context.default_catalog,
+                        dialect=dialect,
+                    )
+                    if normalized_reference_name not in depends_on:
+                        continue
+                except Exception:
+                    # Skip references that cannot be normalized
+                    continue
+
+                # Get the referenced model uri
+                referenced_model = lint_context.context.get_model(
+                    model_or_snapshot=normalized_reference_name, raise_if_missing=False
+                )
+                if referenced_model is None:
+                    continue
+                referenced_model_path = referenced_model._path
+                # Check whether the path exists
+                if not referenced_model_path.is_file():
+                    continue
+                referenced_model_uri = URI.from_path(referenced_model_path)
+
+                # Extract metadata for positioning
+                table_meta = TokenPositionDetails.from_meta(table.this.meta)
+                table_range = _range_from_token_position_details(table_meta, read_file)
+                start_pos = table_range.start
+                end_pos = table_range.end
+
+                # If there's a catalog or database qualifier, adjust the start position
+                catalog_or_db = table.args.get("catalog") or table.args.get("db")
+                if catalog_or_db is not None:
+                    catalog_or_db_meta = TokenPositionDetails.from_meta(catalog_or_db.meta)
+                    catalog_or_db_range = _range_from_token_position_details(
+                        catalog_or_db_meta, read_file
+                    )
+                    start_pos = catalog_or_db_range.start
+
+                description = generate_markdown_description(referenced_model)
+
+                references.append(
+                    Reference(
+                        uri=referenced_model_uri.value,
+                        range=Range(start=start_pos, end=end_pos),
+                        markdown_description=description,
+                    )
+                )
 
     return references
 
diff --git a/tests/lsp/test_reference_cte.py b/tests/lsp/test_reference_cte.py
@@ -0,0 +1,64 @@
+import re
+from sqlmesh.core.context import Context
+from sqlmesh.lsp.context import LSPContext, ModelTarget
+from sqlmesh.lsp.reference import get_references
+from sqlmesh.lsp.uri import URI
+from lsprotocol.types import Range, Position
+import typing as t
+
+
+def test_cte_parsing():
+    """Check that get_references links each CTE reference to its definition in the same file."""
+    context = Context(paths=["examples/sushi"])
+    lsp_context = LSPContext(context)
+    # Locate the file that defines the sushi.customers model
+    sushi_customers_path = next(
+        path
+        for path, info in lsp_context.map.items()
+        if isinstance(info, ModelTarget) and "sushi.customers" in info.names
+    )
+
+    with open(sushi_customers_path, "r", encoding="utf-8") as file:
+        read_file = file.readlines()
+
+    # Reference to current_marketing; the negative lookahead skips current_marketing_outer
+    ranges = find_ranges_from_regex(read_file, r"current_marketing(?!_outer)")
+    assert len(ranges) == 2  # ranges[0]: CTE definition, ranges[1]: reference to it
+    position = Position(line=ranges[1].start.line, character=ranges[1].start.character + 4)
+    references = get_references(lsp_context, URI.from_path(sushi_customers_path), position)
+    assert len(references) == 1
+    assert references[0].uri == URI.from_path(sushi_customers_path).value
+    assert references[0].markdown_description is None
+    assert (
+        references[0].range.start.line == ranges[1].start.line
+    )  # Origin range sits on the line of the reference (where we clicked)
+    assert (
+        references[0].target_range.start.line == ranges[0].start.line
+    )  # Target range sits on the line of the CTE definition
+
+    # Same checks for the reference to current_marketing_outer
+    ranges = find_ranges_from_regex(read_file, r"current_marketing_outer")
+    assert len(ranges) == 2  # ranges[0]: CTE definition, ranges[1]: reference to it
+    position = Position(line=ranges[1].start.line, character=ranges[1].start.character + 4)
+    references = get_references(lsp_context, URI.from_path(sushi_customers_path), position)
+    assert len(references) == 1
+    assert references[0].uri == URI.from_path(sushi_customers_path).value
+    assert references[0].markdown_description is None
+    assert (
+        references[0].range.start.line == ranges[1].start.line
+    )  # Origin range sits on the line of the reference (where we clicked)
+    assert (
+        references[0].target_range.start.line == ranges[0].start.line
+    )  # Target range sits on the line of the CTE definition
+
+
+def find_ranges_from_regex(read_file: t.List[str], regex: str) -> t.List[Range]:
+    """Return a Range for the first regex match on each line of read_file that has one."""
+    return [
+        Range(
+            start=Position(line=idx, character=hit.start()),
+            end=Position(line=idx, character=hit.end()),
+        )
+        for idx, text in enumerate(read_file)
+        if (hit := re.search(regex, text))
+    ]