@@ -1,115 +1,60 @@
-// based on https://github.com/rust-lang/rust-analyzer/blob/d8887c0758bbd2d5f752d5bd405d4491e90e7ed6/crates/parser/src/lexed_str.rs
-
-use std::ops;
-
 use pgt_tokenizer::tokenize;
 
 use crate::SyntaxKind;
+use crate::lexed::{LexError, Lexed};
 
-pub struct LexedStr<'a> {
+/// Lexer that processes input text into tokens and diagnostics
+pub struct Lexer<'a> {
     text: &'a str,
     kind: Vec<SyntaxKind>,
     start: Vec<u32>,
     error: Vec<LexError>,
-}
-
-struct LexError {
-    msg: String,
-    token: u32,
-}
-
-impl<'a> LexedStr<'a> {
-    pub fn new(text: &'a str) -> LexedStr<'a> {
-        let mut conv = Converter::new(text);
-
-        for token in tokenize(&text[conv.offset..]) {
-            let token_text = &text[conv.offset..][..token.len as usize];
-
-            conv.extend_token(&token.kind, token_text);
-        }
-
-        conv.finalize_with_eof()
-    }
-
-    pub fn len(&self) -> usize {
-        self.kind.len() - 1
-    }
-
-    pub fn kind(&self, i: usize) -> SyntaxKind {
-        assert!(i < self.len());
-        self.kind[i]
-    }
-
-    pub fn text(&self, i: usize) -> &str {
-        self.range_text(i..i + 1)
-    }
-
-    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
-        assert!(r.start < r.end && r.end <= self.len());
-        let lo = self.start[r.start] as usize;
-        let hi = self.start[r.end] as usize;
-        &self.text[lo..hi]
-    }
-
-    // Naming is hard.
-    pub fn text_range(&self, i: usize) -> ops::Range<usize> {
-        assert!(i < self.len());
-        let lo = self.start[i] as usize;
-        let hi = self.start[i + 1] as usize;
-        lo..hi
-    }
-
-    pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
-        self.error
-            .iter()
-            .map(|it| (it.token as usize, it.msg.as_str()))
-    }
-
-    fn push(&mut self, kind: SyntaxKind, offset: usize) {
-        self.kind.push(kind);
-        self.start.push(offset as u32);
-    }
-}
-
-struct Converter<'a> {
-    res: LexedStr<'a>,
     offset: usize,
 }
 
-impl<'a> Converter<'a> {
-    fn new(text: &'a str) -> Self {
+impl<'a> Lexer<'a> {
+    /// Create a new lexer for the given text
+    pub fn new(text: &'a str) -> Self {
         Self {
-            res: LexedStr {
-                text,
-                kind: Vec::new(),
-                start: Vec::new(),
-                error: Vec::new(),
-            },
+            text,
+            kind: Vec::new(),
+            start: Vec::new(),
+            error: Vec::new(),
             offset: 0,
         }
     }
 
-    fn finalize_with_eof(mut self) -> LexedStr<'a> {
-        self.res.push(SyntaxKind::EOF, self.offset);
-        self.res
+    /// Lex the input text and return the result
+    pub fn lex(mut self) -> Lexed<'a> {
+        for token in tokenize(&self.text[self.offset..]) {
+            let token_text = &self.text[self.offset..][..token.len as usize];
+            self.extend_token(&token.kind, token_text);
+        }
+
+        // Add EOF token
+        self.push(SyntaxKind::EOF, 0, None);
+
+        Lexed {
+            text: self.text,
+            kind: self.kind,
+            start: self.start,
+            error: self.error,
+        }
     }
 
     fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) {
-        self.res.push(kind, self.offset);
+        self.kind.push(kind);
+        self.start.push(self.offset as u32);
         self.offset += len;
 
         if let Some(err) = err {
-            let token = self.res.len() as u32;
+            let token = (self.kind.len() - 1) as u32;
             let msg = err.to_owned();
-            self.res.error.push(LexError { msg, token });
+            self.error.push(LexError { msg, token });
         }
     }
 
     fn extend_token(&mut self, kind: &pgt_tokenizer::TokenKind, token_text: &str) {
-        // A note on an intended tradeoff:
-        // We drop some useful information here (see patterns with double dots `..`)
-        // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
-        // being `u16` that come from `rowan::SyntaxKind`.
         let mut err = "";
 
         let syntax_kind = {
@@ -121,25 +66,13 @@ impl<'a> Converter<'a> {
                     }
                     SyntaxKind::COMMENT
                 }
-
-                // whitespace
                 pgt_tokenizer::TokenKind::Space => SyntaxKind::SPACE,
                 pgt_tokenizer::TokenKind::Tab => SyntaxKind::TAB,
                 pgt_tokenizer::TokenKind::Newline => SyntaxKind::NEWLINE,
                 pgt_tokenizer::TokenKind::CarriageReturn => SyntaxKind::CARRIAGE_RETURN,
                 pgt_tokenizer::TokenKind::VerticalTab => SyntaxKind::VERTICAL_TAB,
                 pgt_tokenizer::TokenKind::FormFeed => SyntaxKind::FORM_FEED,
                 pgt_tokenizer::TokenKind::Ident => {
-                    // TODO: check for max identifier length
-                    //
-                    // see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
-                    // The system uses no more than NAMEDATALEN-1 bytes of an
-                    // identifier; longer names can be written in commands, but
-                    // they will be truncated. By default, NAMEDATALEN is 64 so
-                    // the maximum identifier length is 63 bytes. If this limit
-                    // is problematic, it can be raised by changing the
-                    // NAMEDATALEN constant in src/include/pg_config_manual.h.
-                    // see: https://github.com/postgres/postgres/blob/e032e4c7ddd0e1f7865b246ec18944365d4f8614/src/include/pg_config_manual.h#L29
                     SyntaxKind::from_keyword(token_text).unwrap_or(SyntaxKind::IDENT)
                 }
                 pgt_tokenizer::TokenKind::Literal { kind, .. } => {
@@ -214,43 +147,36 @@ impl<'a> Converter<'a> {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::STRING
             }
             pgt_tokenizer::LiteralKind::ByteStr { terminated } => {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the hex bit string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::BYTE_STRING
             }
             pgt_tokenizer::LiteralKind::BitStr { terminated } => {
                 if !terminated {
-                    err = "Missing trailing `\'` symbol to terminate the bit string literal";
+                    err = "Missing trailing `'` symbol to terminate the bit string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::BIT_STRING
             }
             pgt_tokenizer::LiteralKind::DollarQuotedString { terminated } => {
                 if !terminated {
-                    // TODO: we could be fancier and say the ending string we're looking for
                     err = "Unterminated dollar quoted string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::DOLLAR_QUOTED_STRING
             }
             pgt_tokenizer::LiteralKind::UnicodeEscStr { terminated } => {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the unicode escape string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::BYTE_STRING
             }
             pgt_tokenizer::LiteralKind::EscStr { terminated } => {
                 if !terminated {
-                    err = "Missing trailing `\'` symbol to terminate the escape string literal";
+                    err = "Missing trailing `'` symbol to terminate the escape string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::ESC_STRING
             }
         };
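
For context, a minimal sketch of how the refactored API might be driven. It assumes the `lexed` module imported above gives `Lexed` the same accessors that were deleted from `LexedStr` here (`len`, `kind`, `text_range`, `errors`) and that `Lexer` is reachable from the caller's scope; neither is shown in this diff.

    // Hypothetical driver; the `Lexed` accessors used below are
    // assumptions carried over from the removed `LexedStr` methods.
    fn dump_tokens(input: &str) {
        // `lex` consumes the builder and moves its vectors into `Lexed`,
        // so no state is cloned.
        let lexed = Lexer::new(input).lex();

        // Kinds and start offsets live in parallel vectors; the trailing
        // EOF entry lets `text_range` compute `start[i]..start[i + 1]`.
        for i in 0..lexed.len() {
            println!("{:?} @ {:?}", lexed.kind(i), lexed.text_range(i));
        }

        // Errors refer back to their token by index.
        for (token, msg) in lexed.errors() {
            eprintln!("error at token {token}: {msg}");
        }
    }

The `self.push(SyntaxKind::EOF, 0, None)` call preserves that invariant: the EOF entry contributes a final `start` offset without consuming input, which is why the old `LexedStr::len` returned `kind.len() - 1`.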