Skip to content

Commit 8c0678a

Browse files
committed
progress
1 parent 5b3322e commit 8c0678a

File tree

3 files changed

+53
-19
lines changed

3 files changed

+53
-19
lines changed

crates/pgt_tokenizer/src/cursor.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@ impl<'a> Cursor<'a> {
3333
self.chars.clone().next().unwrap_or(EOF_CHAR)
3434
}
3535

36+
/// Peeks the symbol after the next one in the input stream without consuming it.
37+
/// If the requested position doesn't exist, `EOF_CHAR` is returned.
38+
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
39+
/// it should be checked with `is_eof` method.
40+
pub(crate) fn second(&self) -> char {
41+
self.chars.clone().nth(1).unwrap_or(EOF_CHAR)
42+
}
43+
3644
/// Checks if there is nothing more to consume.
3745
pub(crate) fn is_eof(&self) -> bool {
3846
self.chars.as_str().is_empty()

crates/pgt_tokenizer/src/lib.rs

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,10 @@ const fn is_tab(c: char) -> bool {
3030
)
3131
}
3232

33-
const fn is_newline(c: char) -> bool {
33+
const fn is_line_ending(c: char) -> bool {
3434
matches!(
35-
c, '\n' // newline
36-
)
37-
}
38-
39-
const fn is_carriage_return(c: char) -> bool {
40-
matches!(
41-
c, '\r' // carriage return
35+
c,
36+
'\n' | '\r' // newline or carriage return
4237
)
4338
}
4439

@@ -81,15 +76,7 @@ impl Cursor<'_> {
8176
TokenKind::Tab
8277
}
8378

84-
c if is_newline(c) => {
85-
self.eat_while(is_newline);
86-
TokenKind::Newline
87-
}
88-
89-
c if is_carriage_return(c) => {
90-
self.eat_while(is_carriage_return);
91-
TokenKind::CarriageReturn
92-
}
79+
c if is_line_ending(c) => self.line_ending_sequence(c),
9380

9481
c if is_vertical_tab(c) => {
9582
self.eat_while(is_vertical_tab);
@@ -254,6 +241,43 @@ impl Cursor<'_> {
254241
}
255242
}
256243

244+
// Consumes a run of consecutive line endings and counts the logical line breaks in it.
245+
//
246+
// Postgres considers a DOS-style \r\n sequence as two successive newlines, but we care about
247+
// logical line breaks, so \r\n counts as a single logical line break.
248+
fn line_ending_sequence(&mut self, prev: char) -> TokenKind {
249+
// already consumed first line ending character (\n or \r)
250+
let mut line_breaks = 1;
251+
252+
// started with \r, check if it's part of \r\n
253+
if prev == '\r' && self.first() == '\n' {
254+
// consume the \n - \r\n still counts as 1 logical line break
255+
self.bump();
256+
}
257+
258+
// continue checking for more line endings
259+
loop {
260+
match self.first() {
261+
'\r' if self.second() == '\n' => {
262+
self.bump(); // consume \r
263+
self.bump(); // consume \n
264+
line_breaks += 1;
265+
}
266+
'\n' => {
267+
self.bump();
268+
line_breaks += 1;
269+
}
270+
'\r' => {
271+
self.bump();
272+
line_breaks += 1;
273+
}
274+
_ => break,
275+
}
276+
}
277+
278+
TokenKind::LineEnding { count: line_breaks }
279+
}
280+
257281
fn prefixed_string(
258282
&mut self,
259283
mk_kind: fn(bool) -> LiteralKind,

crates/pgt_tokenizer/src/token.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@ pub enum TokenKind {
1414
/// Whitespace characters.
1515
Space,
1616
Tab,
17-
Newline,
18-
CarriageReturn,
1917
VerticalTab,
2018
FormFeed,
19+
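/// A run of one or more line endings (`\n`, `\r`, or `\r\n`); `count` is the number of logical line breaks, with `\r\n` counted once.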
20+
LineEnding {
21+
count: usize,
22+
},
2123
/// Identifier
2224
///
2325
/// case-sensitive

0 commit comments

Comments
 (0)