
Commit 0ce7203

committed: progress

1 parent 1676f3b commit 0ce7203

File tree

3 files changed: +133 -174 lines changed

crates/pgt_lexer_new/src/lexed.rs
crates/pgt_lexer_new/src/lexer.rs (renamed from crates/pgt_lexer_new/src/lexed_str.rs)
crates/pgt_lexer_new/src/lib.rs


crates/pgt_lexer_new/src/lexed.rs

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
```rust
use pgt_diagnostics::MessageAndDescription;
use pgt_text_size::TextRange;

use crate::SyntaxKind;

/// Internal error type used during lexing
#[derive(Debug, Clone)]
pub struct LexError {
    pub msg: String,
    pub token: u32,
}

/// A specialized diagnostic for lex errors.
#[derive(Clone, Debug, PartialEq)]
pub struct LexDiagnostic {
    /// The location where the error occurred
    pub span: TextRange,
    /// The error message
    pub message: MessageAndDescription,
}

/// Result of lexing a string, providing access to tokens and diagnostics
pub struct Lexed<'a> {
    pub(crate) text: &'a str,
    pub(crate) kind: Vec<SyntaxKind>,
    pub(crate) start: Vec<u32>,
    pub(crate) error: Vec<LexError>,
}

impl<'a> Lexed<'a> {
    /// Returns the number of tokens (excluding EOF)
    pub fn len(&self) -> usize {
        self.kind.len() - 1
    }

    /// Returns true if there are no tokens
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns an iterator over token kinds
    pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ {
        (0..self.len()).map(move |i| self.kind(i))
    }

    /// Returns the kind of token at the given index
    pub fn kind(&self, idx: usize) -> SyntaxKind {
        assert!(idx < self.len());
        self.kind[idx]
    }

    /// Returns the text range of token at the given index
    pub fn range(&self, idx: usize) -> TextRange {
        let range = self.text_range(idx);
        TextRange::new(
            range.start.try_into().unwrap(),
            range.end.try_into().unwrap(),
        )
    }

    /// Returns the text of token at the given index
    pub fn text(&self, idx: usize) -> &str {
        self.range_text(idx..idx + 1)
    }

    /// Returns all lexing errors with their text ranges
    pub fn errors(&self) -> Vec<LexDiagnostic> {
        self.error
            .iter()
            .map(|it| {
                let range = self.text_range(it.token as usize);
                LexDiagnostic {
                    message: it.msg.as_str().into(),
                    span: TextRange::new(
                        range.start.try_into().unwrap(),
                        range.end.try_into().unwrap(),
                    ),
                }
            })
            .collect()
    }

    pub(crate) fn text_range(&self, i: usize) -> std::ops::Range<usize> {
        assert!(i < self.len());
        let lo = self.start[i] as usize;
        let hi = self.start[i + 1] as usize;
        lo..hi
    }

    fn range_text(&self, r: std::ops::Range<usize>) -> &str {
        assert!(r.start < r.end && r.end <= self.len());
        let lo = self.start[r.start] as usize;
        let hi = self.start[r.end] as usize;
        &self.text[lo..hi]
    }
}
```
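
For orientation, here is a minimal sketch of how this accessor API could be consumed once the crate is built; the SQL input, the `main` wrapper, and the `Debug` formatting of `SyntaxKind` are illustrative assumptions, not part of the commit:

```rust
use pgt_lexer_new::lex;

fn main() {
    // Hypothetical input; any SQL string works here.
    let lexed = lex("select 1;");

    // Walk token kinds together with their ranges and source text
    // (assumes SyntaxKind and TextRange implement Debug).
    for (idx, kind) in lexed.tokens().enumerate() {
        println!("{:?} at {:?}: {:?}", kind, lexed.range(idx), lexed.text(idx));
    }

    // Lexing errors are materialized as LexDiagnostic values with spans.
    for diagnostic in lexed.errors() {
        println!("{:?}", diagnostic);
    }
}
```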

crates/pgt_lexer_new/src/lexed_str.rs renamed to crates/pgt_lexer_new/src/lexer.rs

Lines changed: 32 additions & 106 deletions
```diff
@@ -1,115 +1,60 @@
-// based on https://github.com/rust-lang/rust-analyzer/blob/d8887c0758bbd2d5f752d5bd405d4491e90e7ed6/crates/parser/src/lexed_str.rs
-
-use std::ops;
-
 use pgt_tokenizer::tokenize;

 use crate::SyntaxKind;
+use crate::lexed::{LexError, Lexed};

-pub struct LexedStr<'a> {
+/// Lexer that processes input text into tokens and diagnostics
+pub struct Lexer<'a> {
     text: &'a str,
     kind: Vec<SyntaxKind>,
     start: Vec<u32>,
     error: Vec<LexError>,
-}
-
-struct LexError {
-    msg: String,
-    token: u32,
-}
-
-impl<'a> LexedStr<'a> {
-    pub fn new(text: &'a str) -> LexedStr<'a> {
-        let mut conv = Converter::new(text);
-
-        for token in tokenize(&text[conv.offset..]) {
-            let token_text = &text[conv.offset..][..token.len as usize];
-
-            conv.extend_token(&token.kind, token_text);
-        }
-
-        conv.finalize_with_eof()
-    }
-
-    pub fn len(&self) -> usize {
-        self.kind.len() - 1
-    }
-
-    pub fn kind(&self, i: usize) -> SyntaxKind {
-        assert!(i < self.len());
-        self.kind[i]
-    }
-
-    pub fn text(&self, i: usize) -> &str {
-        self.range_text(i..i + 1)
-    }
-
-    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
-        assert!(r.start < r.end && r.end <= self.len());
-        let lo = self.start[r.start] as usize;
-        let hi = self.start[r.end] as usize;
-        &self.text[lo..hi]
-    }
-
-    // Naming is hard.
-    pub fn text_range(&self, i: usize) -> ops::Range<usize> {
-        assert!(i < self.len());
-        let lo = self.start[i] as usize;
-        let hi = self.start[i + 1] as usize;
-        lo..hi
-    }
-
-    pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
-        self.error
-            .iter()
-            .map(|it| (it.token as usize, it.msg.as_str()))
-    }
-
-    fn push(&mut self, kind: SyntaxKind, offset: usize) {
-        self.kind.push(kind);
-        self.start.push(offset as u32);
-    }
-}
-
-struct Converter<'a> {
-    res: LexedStr<'a>,
     offset: usize,
 }

-impl<'a> Converter<'a> {
-    fn new(text: &'a str) -> Self {
+impl<'a> Lexer<'a> {
+    /// Create a new lexer for the given text
+    pub fn new(text: &'a str) -> Self {
         Self {
-            res: LexedStr {
-                text,
-                kind: Vec::new(),
-                start: Vec::new(),
-                error: Vec::new(),
-            },
+            text,
+            kind: Vec::new(),
+            start: Vec::new(),
+            error: Vec::new(),
             offset: 0,
         }
     }

-    fn finalize_with_eof(mut self) -> LexedStr<'a> {
-        self.res.push(SyntaxKind::EOF, self.offset);
-        self.res
+    /// Lex the input text and return the result
+    pub fn lex(mut self) -> Lexed<'a> {
+        for token in tokenize(&self.text[self.offset..]) {
+            let token_text = &self.text[self.offset..][..token.len as usize];
+            self.extend_token(&token.kind, token_text);
+        }
+
+        // Add EOF token
+        self.push(SyntaxKind::EOF, 0, None);
+
+        Lexed {
+            text: self.text,
+            kind: self.kind,
+            start: self.start,
+            error: self.error,
+        }
     }

     fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) {
-        self.res.push(kind, self.offset);
+        self.kind.push(kind);
+        self.start.push(self.offset as u32);
         self.offset += len;

         if let Some(err) = err {
-            let token = self.res.len() as u32;
+            let token = (self.kind.len() - 1) as u32;
             let msg = err.to_owned();
-            self.res.error.push(LexError { msg, token });
+            self.error.push(LexError { msg, token });
         }
     }

     fn extend_token(&mut self, kind: &pgt_tokenizer::TokenKind, token_text: &str) {
-        // A note on an intended tradeoff:
-        // We drop some useful information here (see patterns with double dots `..`)
-        // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
-        // being `u16` that come from `rowan::SyntaxKind`.
         let mut err = "";

         let syntax_kind = {
@@ -121,25 +66,13 @@ impl<'a> Converter<'a> {
                 }
                 SyntaxKind::COMMENT
             }
-
-            // whitespace
             pgt_tokenizer::TokenKind::Space => SyntaxKind::SPACE,
             pgt_tokenizer::TokenKind::Tab => SyntaxKind::TAB,
             pgt_tokenizer::TokenKind::Newline => SyntaxKind::NEWLINE,
             pgt_tokenizer::TokenKind::CarriageReturn => SyntaxKind::CARRIAGE_RETURN,
             pgt_tokenizer::TokenKind::VerticalTab => SyntaxKind::VERTICAL_TAB,
             pgt_tokenizer::TokenKind::FormFeed => SyntaxKind::FORM_FEED,
             pgt_tokenizer::TokenKind::Ident => {
-                // TODO: check for max identifier length
-                //
-                // see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
-                // The system uses no more than NAMEDATALEN-1 bytes of an
-                // identifier; longer names can be written in commands, but
-                // they will be truncated. By default, NAMEDATALEN is 64 so
-                // the maximum identifier length is 63 bytes. If this limit
-                // is problematic, it can be raised by changing the
-                // NAMEDATALEN constant in src/include/pg_config_manual.h.
-                // see: https://github.com/postgres/postgres/blob/e032e4c7ddd0e1f7865b246ec18944365d4f8614/src/include/pg_config_manual.h#L29
                 SyntaxKind::from_keyword(token_text).unwrap_or(SyntaxKind::IDENT)
             }
             pgt_tokenizer::TokenKind::Literal { kind, .. } => {
@@ -214,43 +147,36 @@ impl<'a> Converter<'a> {
                     if !terminated {
                         err = "Missing trailing `'` symbol to terminate the string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::STRING
                 }
                 pgt_tokenizer::LiteralKind::ByteStr { terminated } => {
                     if !terminated {
                         err = "Missing trailing `'` symbol to terminate the hex bit string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::BYTE_STRING
                 }
                 pgt_tokenizer::LiteralKind::BitStr { terminated } => {
                     if !terminated {
-                        err = "Missing trailing `\'` symbol to terminate the bit string literal";
+                        err = "Missing trailing `'` symbol to terminate the bit string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::BIT_STRING
                 }
                 pgt_tokenizer::LiteralKind::DollarQuotedString { terminated } => {
                     if !terminated {
-                        // TODO: we could be fancier and say the ending string we're looking for
                         err = "Unterminated dollar quoted string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::DOLLAR_QUOTED_STRING
                 }
                 pgt_tokenizer::LiteralKind::UnicodeEscStr { terminated } => {
                     if !terminated {
                         err = "Missing trailing `'` symbol to terminate the unicode escape string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::BYTE_STRING
                 }
                 pgt_tokenizer::LiteralKind::EscStr { terminated } => {
                     if !terminated {
-                        err = "Missing trailing `\'` symbol to terminate the escape string literal";
+                        err = "Missing trailing `'` symbol to terminate the escape string literal";
                     }
-                    // TODO: rust analzyer checks for un-escaped strings, we should too
                     SyntaxKind::ESC_STRING
                 }
             };
```
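
The builder-style flow above replaces the old `LexedStr::new`/`Converter` pair: construction and lexing are now two explicit steps, and each error is stored with the index of its offending token so `Lexed::errors()` can later turn it into a span. A hedged usage sketch follows; the input string is illustrative, and the exact diagnostic text is simply whatever `extend_token` sets above:

```rust
use pgt_lexer_new::Lexer;

fn main() {
    // An unterminated string literal should be recorded as a lex error.
    let lexed = Lexer::new("select 'oops").lex();

    assert!(!lexed.errors().is_empty());
    for diagnostic in lexed.errors() {
        println!("{:?} -> {:?}", diagnostic.span, diagnostic.message);
    }
}
```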

crates/pgt_lexer_new/src/lib.rs

Lines changed: 5 additions & 68 deletions
```diff
@@ -1,78 +1,15 @@
 mod codegen;
 mod diagnostics;
-mod lexed_str;
-
-use diagnostics::LexError;
-use lexed_str::LexedStr;
-use pgt_text_size::TextRange;
+mod lexed;
+mod lexer;

 pub use crate::codegen::syntax_kind::SyntaxKind;
-
-/// Result of lexing a string, providing access to tokens and diagnostics
-///
-/// Thin wrapper around LexedStr for better API ergonomics
-pub struct Lexed<'a> {
-    inner: LexedStr<'a>,
-}
-
-impl Lexed<'_> {
-    /// Returns the number of tokens (excluding EOF)
-    pub fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    /// Returns true if there are no tokens
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Returns an iterator over token kinds
-    pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ {
-        (0..self.len()).map(move |i| self.inner.kind(i))
-    }
-
-    /// Returns the kind of token at the given index
-    pub fn kind(&self, idx: usize) -> SyntaxKind {
-        self.inner.kind(idx)
-    }
-
-    /// Returns the text range of token at the given index
-    pub fn range(&self, idx: usize) -> TextRange {
-        let range = self.inner.text_range(idx);
-        TextRange::new(
-            range.start.try_into().unwrap(),
-            range.end.try_into().unwrap(),
-        )
-    }
-
-    /// Returns the text of token at the given index
-    pub fn text(&self, idx: usize) -> &str {
-        self.inner.text(idx)
-    }
-
-    /// Returns all lexing errors with their text ranges
-    pub fn errors(&self) -> Vec<LexError> {
-        self.inner
-            .errors()
-            .map(|(i, msg)| {
-                let range = self.inner.text_range(i);
-                LexError {
-                    message: msg.into(),
-                    span: TextRange::new(
-                        range.start.try_into().unwrap(),
-                        range.end.try_into().unwrap(),
-                    ),
-                }
-            })
-            .collect()
-    }
-}
+pub use crate::lexed::{LexDiagnostic, Lexed};
+pub use crate::lexer::Lexer;

 /// Lex the input string into tokens and diagnostics
 pub fn lex(input: &str) -> Lexed {
-    Lexed {
-        inner: LexedStr::new(input),
-    }
+    Lexer::new(input).lex()
 }

 #[cfg(test)]
```
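
With this change the public surface of the crate reduces to `lex`, `Lexer`, `Lexed`, `LexDiagnostic`, and `SyntaxKind`. A sketch of a test that could sit in the existing `#[cfg(test)]` module, assuming it imports `lex` from the crate root; the input and expectations are illustrative:

```rust
#[test]
fn lexes_valid_sql_without_diagnostics() {
    let lexed = lex("select 1;");

    // EOF is pushed internally but excluded from len().
    assert!(lexed.len() > 0);
    assert!(!lexed.is_empty());

    // Well-formed input should produce no lex errors.
    assert!(lexed.errors().is_empty());
}
```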
