Commit 2cdc659 ("progress"), 1 parent: fb1594c

File tree: 8 files changed (+270, -4 lines)


.claude/settings.local.json

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(grep:*)",
+      "Bash(rg:*)",
+      "Bash(cargo test:*)",
+      "Bash(cargo run:*)"
+    ],
+    "deny": []
+  }
+}

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

crates/pgt_diagnostics/src/display/message.rs

Lines changed: 9 additions & 0 deletions
@@ -47,6 +47,15 @@ impl From<String> for MessageAndDescription {
     }
 }
 
+impl From<&str> for MessageAndDescription {
+    fn from(description: &str) -> Self {
+        Self {
+            message: markup! { {description} }.to_owned(),
+            description: description.into(),
+        }
+    }
+}
+
 impl From<MarkupBuf> for MessageAndDescription {
     fn from(message: MarkupBuf) -> Self {
         let description = markup_to_string(&message);
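
For reference, a minimal usage sketch of what the new From<&str> impl enables (not part of this commit; the message text is illustrative):

    use pgt_diagnostics::MessageAndDescription;

    // A plain &str now converts directly into a MessageAndDescription;
    // this is what allows `msg.into()` in `Lexed::errors()` later in this commit.
    let message: MessageAndDescription = "Missing trailing character".into();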

crates/pgt_lexer_new/Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -12,7 +12,9 @@ version = "0.0.0"
 
 
 [dependencies]
+pgt_diagnostics.workspace = true
 pgt_lexer_new_codegen.workspace = true
+pgt_text_size.workspace = true
 pgt_tokenizer.workspace = true
 
 [dev-dependencies]

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+use pgt_lexer_new::{SyntaxKind, lex};
+
+fn main() {
+    let sql = "SELECT id, name FROM users WHERE active = true;";
+    let lexed = lex(sql);
+
+    println!("Total tokens: {}", lexed.len());
+    println!("\nToken details:");
+
+    // Iterate over tokens
+    for (idx, kind) in lexed.tokens().enumerate() {
+        // Skip whitespace for cleaner output
+        if matches!(
+            kind,
+            SyntaxKind::SPACE | SyntaxKind::TAB | SyntaxKind::NEWLINE
+        ) {
+            continue;
+        }
+
+        let range = lexed.range(idx);
+        let text = lexed.text(idx);
+
+        println!("  [{:3}] {:?} @ {:?} = {:?}", idx, kind, range, text);
+    }
+
+    // Check for errors
+    let errors = lexed.errors();
+    if !errors.is_empty() {
+        println!("\nLexing errors:");
+        for error in errors {
+            println!("  Error at {:?}: {}", error.span, error.message);
+        }
+    } else {
+        println!("\nNo lexing errors found.");
+    }
+
+    // Example: Find all identifiers
+    println!("\nIdentifiers found:");
+    for (idx, kind) in lexed.tokens().enumerate() {
+        if kind == SyntaxKind::IDENT {
+            println!("  - {} at {:?}", lexed.text(idx), lexed.range(idx));
+        }
+    }
+}

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+use pgt_diagnostics::{Diagnostic, MessageAndDescription};
+use pgt_text_size::TextRange;
+
+/// A specialized diagnostic for lex errors.
+#[derive(Clone, Debug, Diagnostic, PartialEq)]
+#[diagnostic(category = "syntax", severity = Error)]
+pub struct LexError {
+    /// The location where the error occurred
+    #[location(span)]
+    pub span: TextRange,
+    #[message]
+    #[description]
+    pub message: MessageAndDescription,
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::lex;
+
+    #[test]
+    fn finds_lex_errors() {
+        // Test with an unterminated block comment
+        let input = "/* unterminated comment";
+        let lexed = lex(input);
+        let errors = lexed.errors();
+
+        // Should have an error for the unterminated block comment
+        assert!(!errors.is_empty());
+        assert!(errors[0].message.to_string().contains("Missing trailing"));
+        assert!(errors[0].span.start() < errors[0].span.end());
+
+        // Test with an unterminated string
+        let input2 = "SELECT 'unterminated string";
+        let lexed2 = lex(input2);
+        let errors2 = lexed2.errors();
+
+        // Should have an error for the unterminated string
+        assert!(!errors2.is_empty());
+        assert!(errors2[0].message.to_string().contains("Missing trailing"));
+    }
+}

crates/pgt_lexer_new/src/lexed_str.rs

Lines changed: 4 additions & 4 deletions
@@ -31,20 +31,20 @@ impl<'a> LexedStr<'a> {
         conv.finalize_with_eof()
     }
 
-    pub(crate) fn len(&self) -> usize {
+    pub fn len(&self) -> usize {
         self.kind.len() - 1
     }
 
-    pub(crate) fn kind(&self, i: usize) -> SyntaxKind {
+    pub fn kind(&self, i: usize) -> SyntaxKind {
        assert!(i < self.len());
        self.kind[i]
    }
 
-    pub(crate) fn text(&self, i: usize) -> &str {
+    pub fn text(&self, i: usize) -> &str {
        self.range_text(i..i + 1)
    }
 
-    pub(crate) fn range_text(&self, r: ops::Range<usize>) -> &str {
+    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
        assert!(r.start < r.end && r.end <= self.len());
        let lo = self.start[r.start] as usize;
        let hi = self.start[r.end] as usize;

crates/pgt_lexer_new/src/lib.rs

Lines changed: 157 additions & 0 deletions
@@ -1,4 +1,161 @@
 mod codegen;
+mod diagnostics;
 mod lexed_str;
 
+use diagnostics::LexError;
+use lexed_str::LexedStr;
+use pgt_text_size::TextRange;
+
 pub use crate::codegen::syntax_kind::SyntaxKind;
+
+/// Result of lexing a string, providing access to tokens and diagnostics
+pub struct Lexed<'a> {
+    inner: LexedStr<'a>,
+}
+
+impl<'a> Lexed<'a> {
+    /// Returns the number of tokens (excluding EOF)
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// Returns true if there are no tokens
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns an iterator over token kinds
+    pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ {
+        (0..self.len()).map(move |i| self.inner.kind(i))
+    }
+
+    /// Returns the kind of token at the given index
+    pub fn kind(&self, idx: usize) -> SyntaxKind {
+        self.inner.kind(idx)
+    }
+
+    /// Returns the text range of token at the given index
+    pub fn range(&self, idx: usize) -> TextRange {
+        let range = self.inner.text_range(idx);
+        TextRange::new(
+            range.start.try_into().unwrap(),
+            range.end.try_into().unwrap(),
+        )
+    }
+
+    /// Returns the text of token at the given index
+    pub fn text(&self, idx: usize) -> &str {
+        self.inner.text(idx)
+    }
+
+    /// Returns all lexing errors with their text ranges
+    pub fn errors(&self) -> Vec<LexError> {
+        self.inner
+            .errors()
+            .map(|(i, msg)| {
+                let range = self.inner.text_range(i);
+                LexError {
+                    message: msg.into(),
+                    span: TextRange::new(
+                        range.start.try_into().unwrap(),
+                        range.end.try_into().unwrap(),
+                    ),
+                }
+            })
+            .collect()
+    }
+}
+
+/// Lex the input string into tokens and diagnostics
+pub fn lex(input: &str) -> Lexed {
+    Lexed {
+        inner: LexedStr::new(input),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_basic_lexing() {
+        let input = "SELECT * FROM users WHERE id = 1;";
+        let lexed = lex(input);
+
+        // Check we have tokens
+        assert!(!lexed.is_empty());
+
+        // Iterate over tokens and collect identifiers
+        let mut identifiers = Vec::new();
+        for (idx, kind) in lexed.tokens().enumerate() {
+            if kind == SyntaxKind::IDENT {
+                identifiers.push((lexed.text(idx), lexed.range(idx)));
+            }
+        }
+
+        // Should find at least "users" and "id" as identifiers
+        assert!(identifiers.len() >= 2);
+    }
+
+    #[test]
+    fn test_lexing_with_errors() {
+        let input = "SELECT 'unterminated string";
+        let lexed = lex(input);
+
+        // Should have tokens
+        assert!(!lexed.is_empty());
+
+        // Should have an error for unterminated string
+        let errors = lexed.errors();
+        assert!(!errors.is_empty());
+        // Check the error message exists
+        assert!(!errors[0].message.to_string().is_empty());
+    }
+
+    #[test]
+    fn test_token_ranges() {
+        let input = "SELECT id";
+        let lexed = lex(input);
+
+        // First token should be a keyword (SELECT gets parsed as a keyword)
+        let _first_kind = lexed.kind(0);
+        assert_eq!(u32::from(lexed.range(0).start()), 0);
+        assert_eq!(u32::from(lexed.range(0).end()), 6);
+        assert_eq!(lexed.text(0), "SELECT");
+
+        // Find the id token
+        for (idx, kind) in lexed.tokens().enumerate() {
+            if kind == SyntaxKind::IDENT && lexed.text(idx) == "id" {
+                assert_eq!(u32::from(lexed.range(idx).start()), 7);
+                assert_eq!(u32::from(lexed.range(idx).end()), 9);
+            }
+        }
+    }
+
+    #[test]
+    fn test_empty_input() {
+        let input = "";
+        let lexed = lex(input);
+        assert!(lexed.is_empty());
+        assert_eq!(lexed.len(), 0);
+    }
+
+    #[test]
+    fn test_whitespace_handling() {
+        let input = " SELECT \n id ";
+        let lexed = lex(input);
+
+        // Collect non-whitespace tokens
+        let mut non_whitespace = Vec::new();
+        for (idx, kind) in lexed.tokens().enumerate() {
+            if !matches!(
+                kind,
+                SyntaxKind::SPACE | SyntaxKind::TAB | SyntaxKind::NEWLINE
+            ) {
+                non_whitespace.push(lexed.text(idx));
+            }
+        }
+
+        assert_eq!(non_whitespace.len(), 2); // SELECT and id
+    }
+}
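
Taken together, the public surface added here is small: lex(), plus Lexed::len()/is_empty()/tokens()/kind()/text()/range() and errors(). A minimal consumer sketch (not part of the commit; the function name is illustrative), using only the items added above:

    use pgt_lexer_new::{SyntaxKind, lex};

    // Count identifier tokens and report how many lexing errors were found.
    fn idents_and_errors(sql: &str) -> (usize, usize) {
        let lexed = lex(sql);
        let idents = lexed.tokens().filter(|k| *k == SyntaxKind::IDENT).count();
        let errors = lexed.errors().len();
        (idents, errors)
    }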
