
Commit 0ce7203

committed: progress

1 parent 1676f3b commit 0ce7203

File tree

3 files changed: +133 -174 lines changed

crates/pgt_lexer_new/src/lexed.rs
crates/pgt_lexer_new/src/lexer.rs (renamed from crates/pgt_lexer_new/src/lexed_str.rs)
crates/pgt_lexer_new/src/lib.rs


crates/pgt_lexer_new/src/lexed.rs

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
```rust
use pgt_diagnostics::MessageAndDescription;
use pgt_text_size::TextRange;

use crate::SyntaxKind;

/// Internal error type used during lexing
#[derive(Debug, Clone)]
pub struct LexError {
    pub msg: String,
    pub token: u32,
}

/// A specialized diagnostic for lex errors.
#[derive(Clone, Debug, PartialEq)]
pub struct LexDiagnostic {
    /// The location where the error occurred
    pub span: TextRange,
    /// The error message
    pub message: MessageAndDescription,
}

/// Result of lexing a string, providing access to tokens and diagnostics
pub struct Lexed<'a> {
    pub(crate) text: &'a str,
    pub(crate) kind: Vec<SyntaxKind>,
    pub(crate) start: Vec<u32>,
    pub(crate) error: Vec<LexError>,
}

impl<'a> Lexed<'a> {
    /// Returns the number of tokens (excluding EOF)
    pub fn len(&self) -> usize {
        self.kind.len() - 1
    }

    /// Returns true if there are no tokens
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns an iterator over token kinds
    pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ {
        (0..self.len()).map(move |i| self.kind(i))
    }

    /// Returns the kind of token at the given index
    pub fn kind(&self, idx: usize) -> SyntaxKind {
        assert!(idx < self.len());
        self.kind[idx]
    }

    /// Returns the text range of token at the given index
    pub fn range(&self, idx: usize) -> TextRange {
        let range = self.text_range(idx);
        TextRange::new(
            range.start.try_into().unwrap(),
            range.end.try_into().unwrap(),
        )
    }

    /// Returns the text of token at the given index
    pub fn text(&self, idx: usize) -> &str {
        self.range_text(idx..idx + 1)
    }

    /// Returns all lexing errors with their text ranges
    pub fn errors(&self) -> Vec<LexDiagnostic> {
        self.error
            .iter()
            .map(|it| {
                let range = self.text_range(it.token as usize);
                LexDiagnostic {
                    message: it.msg.as_str().into(),
                    span: TextRange::new(
                        range.start.try_into().unwrap(),
                        range.end.try_into().unwrap(),
                    ),
                }
            })
            .collect()
    }

    pub(crate) fn text_range(&self, i: usize) -> std::ops::Range<usize> {
        assert!(i < self.len());
        let lo = self.start[i] as usize;
        let hi = self.start[i + 1] as usize;
        lo..hi
    }

    fn range_text(&self, r: std::ops::Range<usize>) -> &str {
        assert!(r.start < r.end && r.end <= self.len());
        let lo = self.start[r.start] as usize;
        let hi = self.start[r.end] as usize;
        &self.text[lo..hi]
    }
}
```
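
For orientation, here is a minimal sketch of how this accessor API could be consumed once the crate is built; the SQL input, the `main` wrapper, and the `Debug` formatting of `SyntaxKind` are illustrative assumptions, not part of the commit:

```rust
use pgt_lexer_new::lex;

fn main() {
    // Hypothetical input; any SQL string works here.
    let lexed = lex("select 1;");

    // Walk token kinds together with their ranges and source text
    // (assumes SyntaxKind and TextRange implement Debug).
    for (idx, kind) in lexed.tokens().enumerate() {
        println!("{:?} at {:?}: {:?}", kind, lexed.range(idx), lexed.text(idx));
    }

    // Lexing errors are materialized as LexDiagnostic values with spans.
    for diagnostic in lexed.errors() {
        println!("{:?}", diagnostic);
    }
}
```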

crates/pgt_lexer_new/src/lexed_str.rs renamed to crates/pgt_lexer_new/src/lexer.rs

Lines changed: 32 additions & 106 deletions
```diff
@@ -1,115 +1,60 @@
-// based on https://github.com/rust-lang/rust-analyzer/blob/d8887c0758bbd2d5f752d5bd405d4491e90e7ed6/crates/parser/src/lexed_str.rs
-
-use std::ops;
-
 use pgt_tokenizer::tokenize;

 use crate::SyntaxKind;
+use crate::lexed::{LexError, Lexed};

-pub struct LexedStr<'a> {
+/// Lexer that processes input text into tokens and diagnostics
+pub struct Lexer<'a> {
     text: &'a str,
     kind: Vec<SyntaxKind>,
     start: Vec<u32>,
     error: Vec<LexError>,
-}
-
-struct LexError {
-    msg: String,
-    token: u32,
-}
-
-impl<'a> LexedStr<'a> {
-    pub fn new(text: &'a str) -> LexedStr<'a> {
-        let mut conv = Converter::new(text);
-
-        for token in tokenize(&text[conv.offset..]) {
-            let token_text = &text[conv.offset..][..token.len as usize];
-
-            conv.extend_token(&token.kind, token_text);
-        }
-
-        conv.finalize_with_eof()
-    }
-
-    pub fn len(&self) -> usize {
-        self.kind.len() - 1
-    }
-
-    pub fn kind(&self, i: usize) -> SyntaxKind {
-        assert!(i < self.len());
-        self.kind[i]
-    }
-
-    pub fn text(&self, i: usize) -> &str {
-        self.range_text(i..i + 1)
-    }
-
-    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
-        assert!(r.start < r.end && r.end <= self.len());
-        let lo = self.start[r.start] as usize;
-        let hi = self.start[r.end] as usize;
-        &self.text[lo..hi]
-    }
-
-    // Naming is hard.
-    pub fn text_range(&self, i: usize) -> ops::Range<usize> {
-        assert!(i < self.len());
-        let lo = self.start[i] as usize;
-        let hi = self.start[i + 1] as usize;
-        lo..hi
-    }
-
-    pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
-        self.error
-            .iter()
-            .map(|it| (it.token as usize, it.msg.as_str()))
-    }
-
-    fn push(&mut self, kind: SyntaxKind, offset: usize) {
-        self.kind.push(kind);
-        self.start.push(offset as u32);
-    }
-}
-
-struct Converter<'a> {
-    res: LexedStr<'a>,
     offset: usize,
 }

-impl<'a> Converter<'a> {
-    fn new(text: &'a str) -> Self {
+impl<'a> Lexer<'a> {
+    /// Create a new lexer for the given text
+    pub fn new(text: &'a str) -> Self {
         Self {
-            res: LexedStr {
-                text,
-                kind: Vec::new(),
-                start: Vec::new(),
-                error: Vec::new(),
-            },
+            text,
+            kind: Vec::new(),
+            start: Vec::new(),
+            error: Vec::new(),
             offset: 0,
         }
     }

-    fn finalize_with_eof(mut self) -> LexedStr<'a> {
-        self.res.push(SyntaxKind::EOF, self.offset);
-        self.res
+    /// Lex the input text and return the result
+    pub fn lex(mut self) -> Lexed<'a> {
+        for token in tokenize(&self.text[self.offset..]) {
+            let token_text = &self.text[self.offset..][..token.len as usize];
+            self.extend_token(&token.kind, token_text);
+        }
+
+        // Add EOF token
+        self.push(SyntaxKind::EOF, 0, None);
+
+        Lexed {
+            text: self.text,
+            kind: self.kind,
+            start: self.start,
+            error: self.error,
+        }
     }

     fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) {
-        self.res.push(kind, self.offset);
+        self.kind.push(kind);
+        self.start.push(self.offset as u32);
         self.offset += len;

         if let Some(err) = err {
-            let token = self.res.len() as u32;
+            let token = (self.kind.len() - 1) as u32;
             let msg = err.to_owned();
-            self.res.error.push(LexError { msg, token });
+            self.error.push(LexError { msg, token });
         }
     }

     fn extend_token(&mut self, kind: &pgt_tokenizer::TokenKind, token_text: &str) {
-        // A note on an intended tradeoff:
-        // We drop some useful information here (see patterns with double dots `..`)
-        // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
-        // being `u16` that come from `rowan::SyntaxKind`.
         let mut err = "";

         let syntax_kind = {
@@ -121,25 +66,13 @@ impl<'a> Converter<'a> {
                 }
                 SyntaxKind::COMMENT
             }
-
-            // whitespace
             pgt_tokenizer::TokenKind::Space => SyntaxKind::SPACE,
             pgt_tokenizer::TokenKind::Tab => SyntaxKind::TAB,
             pgt_tokenizer::TokenKind::Newline => SyntaxKind::NEWLINE,
             pgt_tokenizer::TokenKind::CarriageReturn => SyntaxKind::CARRIAGE_RETURN,
             pgt_tokenizer::TokenKind::VerticalTab => SyntaxKind::VERTICAL_TAB,
             pgt_tokenizer::TokenKind::FormFeed => SyntaxKind::FORM_FEED,
             pgt_tokenizer::TokenKind::Ident => {
-                // TODO: check for max identifier length
-                //
-                // see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
-                // The system uses no more than NAMEDATALEN-1 bytes of an
-                // identifier; longer names can be written in commands, but
-                // they will be truncated. By default, NAMEDATALEN is 64 so
-                // the maximum identifier length is 63 bytes. If this limit
-                // is problematic, it can be raised by changing the
-                // NAMEDATALEN constant in src/include/pg_config_manual.h.
-                // see: https://github.com/postgres/postgres/blob/e032e4c7ddd0e1f7865b246ec18944365d4f8614/src/include/pg_config_manual.h#L29
                 SyntaxKind::from_keyword(token_text).unwrap_or(SyntaxKind::IDENT)
             }
             pgt_tokenizer::TokenKind::Literal { kind, .. } => {
@@ -214,43 +147,36 @@ impl<'a> Converter<'a> {
                     if !terminated {
                         err = "Missing trailing `'` symbol to terminate the string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::STRING
                 }
                 pgt_tokenizer::LiteralKind::ByteStr { terminated } => {
                     if !terminated {
                         err = "Missing trailing `'` symbol to terminate the hex bit string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::BYTE_STRING
                 }
                 pgt_tokenizer::LiteralKind::BitStr { terminated } => {
                     if !terminated {
-                        err = "Missing trailing `\'` symbol to terminate the bit string literal";
+                        err = "Missing trailing `'` symbol to terminate the bit string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::BIT_STRING
                 }
                 pgt_tokenizer::LiteralKind::DollarQuotedString { terminated } => {
                     if !terminated {
-                        // TODO: we could be fancier and say the ending string we're looking for
                         err = "Unterminated dollar quoted string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::DOLLAR_QUOTED_STRING
                 }
                 pgt_tokenizer::LiteralKind::UnicodeEscStr { terminated } => {
                     if !terminated {
                         err = "Missing trailing `'` symbol to terminate the unicode escape string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::BYTE_STRING
                 }
                 pgt_tokenizer::LiteralKind::EscStr { terminated } => {
                     if !terminated {
-                        err = "Missing trailing `\'` symbol to terminate the escape string literal";
+                        err = "Missing trailing `'` symbol to terminate the escape string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::ESC_STRING
                 }
             };
```
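
The builder-style flow above replaces the old `LexedStr::new`/`Converter` pair: construction and lexing are now two explicit steps, and each error is stored with the index of its offending token so `Lexed::errors()` can later turn it into a span. A hedged usage sketch follows; the input string is illustrative, and the exact diagnostic text is simply whatever `extend_token` sets above:

```rust
use pgt_lexer_new::Lexer;

fn main() {
    // An unterminated string literal should be recorded as a lex error.
    let lexed = Lexer::new("select 'oops").lex();

    assert!(!lexed.errors().is_empty());
    for diagnostic in lexed.errors() {
        println!("{:?} -> {:?}", diagnostic.span, diagnostic.message);
    }
}
```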

crates/pgt_lexer_new/src/lib.rs

Lines changed: 5 additions & 68 deletions
```diff
@@ -1,78 +1,15 @@
 mod codegen;
 mod diagnostics;
-mod lexed_str;
-
-use diagnostics::LexError;
-use lexed_str::LexedStr;
-use pgt_text_size::TextRange;
+mod lexed;
+mod lexer;

 pub use crate::codegen::syntax_kind::SyntaxKind;
-
-/// Result of lexing a string, providing access to tokens and diagnostics
-///
-/// Thin wrapper around LexedStr for better API ergonomics
-pub struct Lexed<'a> {
-    inner: LexedStr<'a>,
-}
-
-impl Lexed<'_> {
-    /// Returns the number of tokens (excluding EOF)
-    pub fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    /// Returns true if there are no tokens
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Returns an iterator over token kinds
-    pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ {
-        (0..self.len()).map(move |i| self.inner.kind(i))
-    }
-
-    /// Returns the kind of token at the given index
-    pub fn kind(&self, idx: usize) -> SyntaxKind {
-        self.inner.kind(idx)
-    }
-
-    /// Returns the text range of token at the given index
-    pub fn range(&self, idx: usize) -> TextRange {
-        let range = self.inner.text_range(idx);
-        TextRange::new(
-            range.start.try_into().unwrap(),
-            range.end.try_into().unwrap(),
-        )
-    }
-
-    /// Returns the text of token at the given index
-    pub fn text(&self, idx: usize) -> &str {
-        self.inner.text(idx)
-    }
-
-    /// Returns all lexing errors with their text ranges
-    pub fn errors(&self) -> Vec<LexError> {
-        self.inner
-            .errors()
-            .map(|(i, msg)| {
-                let range = self.inner.text_range(i);
-                LexError {
-                    message: msg.into(),
-                    span: TextRange::new(
-                        range.start.try_into().unwrap(),
-                        range.end.try_into().unwrap(),
-                    ),
-                }
-            })
-            .collect()
-    }
-}
+pub use crate::lexed::{LexDiagnostic, Lexed};
+pub use crate::lexer::Lexer;

 /// Lex the input string into tokens and diagnostics
 pub fn lex(input: &str) -> Lexed {
-    Lexed {
-        inner: LexedStr::new(input),
-    }
+    Lexer::new(input).lex()
 }

 #[cfg(test)]
```
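
With this change the public surface of the crate reduces to `lex`, `Lexer`, `Lexed`, `LexDiagnostic`, and `SyntaxKind`. A sketch of a test that could sit in the existing `#[cfg(test)]` module, assuming it imports `lex` from the crate root; the input and expectations are illustrative:

```rust
#[test]
fn lexes_valid_sql_without_diagnostics() {
    let lexed = lex("select 1;");

    // EOF is pushed internally but excluded from len().
    assert!(lexed.len() > 0);
    assert!(!lexed.is_empty());

    // Well-formed input should produce no lex errors.
    assert!(lexed.errors().is_empty());
}
```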
