
Commit b101a87

refactor(es/lexer): Share skip_block_comment (#10549)
This change may introduce a minor regression, but I consider it acceptable.
1 parent 7bae1b8 commit b101a87
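
At a high level, the commit turns `skip_block_comment` from a method that every lexer implementation had to provide into a provided (default) method on the shared `Lexer` trait, so `swc_ecma_lexer` and `swc_ecma_parser` no longer carry duplicate copies. Below is a minimal sketch of that pattern with placeholder names (`LexerBefore`, `LexerAfter`, `bump_bytes`), not the actual swc trait definitions; the real change is in crates/swc_ecma_lexer/src/common/lexer/mod.rs further down.

```rust
// Minimal sketch of the refactor pattern (placeholder names, not swc's traits).

// Before: the trait only declared the method, so each lexer crate had to
// supply, and therefore duplicate, its own implementation.
trait LexerBefore {
    fn bump_bytes(&mut self, n: usize);
    fn skip_block_comment(&mut self); // required method
}

// After: the trait ships a default body, and implementors inherit it.
trait LexerAfter {
    fn bump_bytes(&mut self, n: usize);

    fn skip_block_comment(&mut self) {
        // The shared scanning logic lives here once, instead of once per crate.
        self.bump_bytes(2); // consume the leading "/*", then scan for "*/"
    }
}
```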

10 files changed (+130, -264 lines)


.changeset/fresh-insects-crash.md

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+---
+swc_ecma_lexer: patch
+swc_ecma_parser: patch
+swc_core: patch
+---
+
+refactor(ecma/lexer): common `skip_block_comment`

crates/swc_ecma_lexer/src/common/lexer/mod.rs

Lines changed: 117 additions & 6 deletions

@@ -1,6 +1,7 @@
 use std::borrow::Cow;

 use char::{Char, CharExt};
+use comments_buffer::{BufferedComment, BufferedCommentKind};
 use either::Either::{self, Left, Right};
 use num_bigint::BigInt as BigIntValue;
 use num_traits::{Num as NumTrait, ToPrimitive};
@@ -9,6 +10,7 @@ use smartstring::{LazyCompact, SmartString};
 use state::State;
 use swc_atoms::Atom;
 use swc_common::{
+    comments::{Comment, CommentKind},
     input::{Input, StringInput},
     BytePos, Span,
 };
@@ -49,8 +51,6 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     fn atom<'b>(&self, s: impl Into<Cow<'b, str>>) -> swc_atoms::Atom;
     fn push_error(&self, error: crate::error::Error);
     fn buf(&self) -> std::rc::Rc<std::cell::RefCell<String>>;
-    // TODO: invest why there has regression if implement this by trait
-    fn skip_block_comment(&mut self);

     #[inline(always)]
     #[allow(clippy::misnamed_getters)]
@@ -198,14 +198,14 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         while idx < len {
             let b = *unsafe { bytes.get_unchecked(idx) };
             if b == b'\r' || b == b'\n' {
-                self.state_mut().set_had_line_break(true);
+                self.state_mut().mark_had_line_break();
                 break;
             } else if b > 127 {
                 // non-ASCII case: Check for Unicode line termination characters
                 let s = unsafe { input_str.get_unchecked(idx..) };
                 if let Some(first_char) = s.chars().next() {
                     if first_char == '\u{2028}' || first_char == '\u{2029}' {
-                        self.state_mut().set_had_line_break(true);
+                        self.state_mut().mark_had_line_break();
                         break;
                     }
                     idx += first_char.len_utf8() - 1; // `-1` will incrumented
@@ -250,6 +250,117 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         }
     }

+    /// Expects current char to be '/' and next char to be '*'.
+    fn skip_block_comment(&mut self) {
+        let start = self.cur_pos();
+
+        debug_assert_eq!(self.cur(), Some('/'));
+        debug_assert_eq!(self.peek(), Some('*'));
+
+        self.input_mut().bump_bytes(2);
+
+        // jsdoc
+        let slice_start = self.cur_pos();
+
+        // Check if there's an asterisk at the beginning (JSDoc style)
+        let mut was_star = if self.input().is_byte(b'*') {
+            self.bump();
+            true
+        } else {
+            false
+        };
+
+        let mut is_for_next =
+            self.state().had_line_break() || !self.state().can_have_trailing_comment();
+
+        // Optimization for finding block comment end position
+        let input_str = self.input().as_str();
+        let bytes = input_str.as_bytes();
+        let mut pos = 0;
+        let len = bytes.len();
+        let mut should_mark_had_line_break = false;
+
+        // Byte-based scanning for faster search
+        while pos < len {
+            let b = *unsafe { bytes.get_unchecked(pos) };
+
+            if was_star && b == b'/' {
+                if should_mark_had_line_break {
+                    self.state_mut().mark_had_line_break();
+                }
+                // Found comment end: "*/"
+                self.input_mut().bump_bytes(pos + 1);
+
+                let end = self.cur_pos();
+
+                self.skip_space::<false>();
+
+                // Check if this is a comment before semicolon
+                if !self.state().had_line_break() && self.input().is_byte(b';') {
+                    is_for_next = false;
+                }
+
+                if self.comments_buffer().is_some() {
+                    let src = unsafe {
+                        // Safety: We got slice_start and end from self.input so those are valid.
+                        self.input_mut().slice(slice_start, end)
+                    };
+                    let s = &src[..src.len() - 2];
+                    let cmt = Comment {
+                        kind: CommentKind::Block,
+                        span: Span::new(start, end),
+                        text: self.atom(s),
+                    };
+
+                    let _ = self.input().peek();
+                    if is_for_next {
+                        self.comments_buffer_mut()
+                            .unwrap()
+                            .push_pending_leading(cmt);
+                    } else {
+                        let pos = self.state().prev_hi();
+                        self.comments_buffer_mut().unwrap().push(BufferedComment {
+                            kind: BufferedCommentKind::Trailing,
+                            pos,
+                            comment: cmt,
+                        });
+                    }
+                }
+
+                return;
+            }
+
+            // Check for line break characters - ASCII case
+            if b == b'\r' || b == b'\n' {
+                should_mark_had_line_break = true;
+            }
+            // Check for Unicode line breaks (rare case)
+            else if b > 127 {
+                let remaining = &input_str[pos..];
+                if let Some(c) = remaining.chars().next() {
+                    if c == '\u{2028}' || c == '\u{2029}' {
+                        should_mark_had_line_break = true;
+                    }
+                    // Skip multibyte characters
+                    pos += c.len_utf8() - 1; // `-1` will incrumented below
+                }
+            }
+
+            was_star = b == b'*';
+            pos += 1;
+        }
+
+        if should_mark_had_line_break {
+            self.state_mut().mark_had_line_break();
+        }
+
+        // If we reached here, it's an unterminated block comment
+        self.input_mut().bump_bytes(len); // skip remaining
+        let end = self.input().end_pos();
+        let span = Span::new(end, end);
+        self.emit_error_span(span, SyntaxError::UnterminatedBlockComment)
+    }
+
     /// Skip comments or whitespaces.
     ///
     /// See https://tc39.github.io/ecma262/#sec-white-space
@@ -270,7 +381,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {

         self.input_mut().bump_bytes(offset as usize);
         if newline {
-            self.state_mut().set_had_line_break(true);
+            self.state_mut().mark_had_line_break();
         }

         if LEX_COMMENTS && self.input().is_byte(b'/') {
@@ -1213,7 +1324,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {

                cooked_slice_start = self.cur_pos();
            } else if c.is_line_terminator() {
-                self.state_mut().set_had_line_break(true);
+                self.state_mut().mark_had_line_break();

                consume_cooked!();

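
The core of the new shared routine is a byte-level scan for the closing `*/` that also records whether the comment spans a line break, including the Unicode separators U+2028 and U+2029. The function below is an illustrative, dependency-free sketch of that scanning strategy; the name `find_block_comment_end` and the simplified return type are assumptions for the example, and it omits the comment buffering, JSDoc handling, and error reporting that the real trait method performs.

```rust
/// Illustrative only: given `rest`, the text immediately after the opening
/// "/*", return the byte offset just past the closing "*/" plus whether a
/// line break was seen inside the comment, or `None` if unterminated.
fn find_block_comment_end(rest: &str) -> Option<(usize, bool)> {
    let bytes = rest.as_bytes();
    let len = bytes.len();
    let mut pos = 0;
    let mut was_star = false;
    let mut had_line_break = false;

    while pos < len {
        let b = bytes[pos];

        if was_star && b == b'/' {
            // The previous byte was '*', so "*/" ends at `pos`.
            return Some((pos + 1, had_line_break));
        }

        if b == b'\r' || b == b'\n' {
            had_line_break = true;
        } else if b > 127 {
            // Non-ASCII: decode one char so U+2028 / U+2029 are caught and the
            // remaining bytes of a multibyte character are skipped.
            if let Some(c) = rest[pos..].chars().next() {
                if c == '\u{2028}' || c == '\u{2029}' {
                    had_line_break = true;
                }
                pos += c.len_utf8() - 1; // `-1`: incremented again below
            }
        }

        was_star = b == b'*';
        pos += 1;
    }

    None // unterminated block comment
}

fn main() {
    assert_eq!(find_block_comment_end("* jsdoc */ rest"), Some((10, false)));
    assert_eq!(find_block_comment_end("line1\nline2*/"), Some((13, true)));
    assert_eq!(find_block_comment_end("never closed"), None);
}
```

The deferred `should_mark_had_line_break` flag in the real code plays the same role as `had_line_break` here: the lexer state is updated once rather than on every line terminator encountered.
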
crates/swc_ecma_lexer/src/common/lexer/state.rs

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ pub trait State: Clone {
     fn set_is_expr_allowed(&mut self, is_expr_allowed: bool);
     fn set_next_regexp(&mut self, start: Option<BytePos>);
     fn had_line_break(&self) -> bool;
-    fn set_had_line_break(&mut self, had_line_break: bool);
+    fn mark_had_line_break(&mut self);
     fn had_line_break_before_last(&self) -> bool;
     fn token_contexts(&self) -> &crate::TokenContexts;
     fn mut_token_contexts(&mut self) -> &mut crate::TokenContexts;

crates/swc_ecma_lexer/src/common/parser/pat.rs

Lines changed: 1 addition & 1 deletion

@@ -775,7 +775,7 @@ pub fn parse_unique_formal_params<'a>(p: &mut impl Parser<'a>) -> PResult<Vec<Pa
     parse_formal_params(p)
 }

-pub fn parse_paren_items_as_params<'a, P: Parser<'a>>(
+pub(super) fn parse_paren_items_as_params<'a, P: Parser<'a>>(
     p: &mut P,
     mut exprs: Vec<AssignTargetOrSpread>,
     trailing_comma: Option<Span>,

crates/swc_ecma_lexer/src/lexer/mod.rs

Lines changed: 0 additions & 6 deletions

@@ -32,7 +32,6 @@ mod state;
 mod table;
 #[cfg(test)]
 mod tests;
-pub mod util;

 #[derive(Clone)]
 pub struct Lexer<'a> {
@@ -119,11 +118,6 @@ impl<'a> crate::common::lexer::Lexer<'a, TokenAndSpan> for Lexer<'a> {
         self.atoms.atom(s)
     }

-    #[inline(always)]
-    fn skip_block_comment(&mut self) {
-        self.skip_block_comment();
-    }
-
     #[inline(always)]
     fn buf(&self) -> std::rc::Rc<std::cell::RefCell<String>> {
         self.buf.clone()
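
The forwarding method removed above looks self-recursive at first glance, but it was not: inside the trait impl, `self.skip_block_comment()` resolved to the same-named inherent method (previously defined in the now-deleted lexer/util.rs), because Rust prefers inherent methods over trait methods during method resolution. Below is a self-contained sketch of that resolution rule using hypothetical `Widget`/`Run` names rather than swc code.

```rust
struct Widget;

impl Widget {
    // Inherent method; in swc this role was played by the duplicated
    // `skip_block_comment` in each crate's lexer/util.rs.
    fn run(&mut self) -> &'static str {
        "inherent impl"
    }
}

trait Run {
    fn run(&mut self) -> &'static str;
}

impl Run for Widget {
    fn run(&mut self) -> &'static str {
        // Resolves to the inherent `Widget::run`, not back into this trait
        // method, so there is no infinite recursion.
        self.run()
    }
}

fn main() {
    let mut w = Widget;
    // Calling through the trait still lands in the inherent method.
    assert_eq!(Run::run(&mut w), "inherent impl");
}
```

With the shared default method on the trait, neither the forwarding impl nor the inherent duplicate is needed, which is why both this block and lexer/util.rs are deleted.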

crates/swc_ecma_lexer/src/lexer/state.rs

Lines changed: 2 additions & 2 deletions

@@ -70,8 +70,8 @@
     }

     #[inline(always)]
-    fn set_had_line_break(&mut self, had_line_break: bool) {
-        self.had_line_break = had_line_break;
+    fn mark_had_line_break(&mut self) {
+        self.had_line_break = true;
     }

     #[inline(always)]

crates/swc_ecma_lexer/src/lexer/util.rs

Lines changed: 0 additions & 121 deletions
This file was deleted.

crates/swc_ecma_parser/src/lexer/mod.rs

Lines changed: 0 additions & 6 deletions

@@ -24,7 +24,6 @@ mod jsx;
 mod state;
 mod table;
 mod token;
-pub mod util;

 pub(crate) use token::{NextTokenAndSpan, Token, TokenAndSpan, TokenValue};

@@ -115,11 +114,6 @@ impl<'a> swc_ecma_lexer::common::lexer::Lexer<'a, TokenAndSpan> for Lexer<'a> {
         self.atoms.atom(s)
     }

-    #[inline(always)]
-    fn skip_block_comment(&mut self) {
-        self.skip_block_comment();
-    }
-
     #[inline(always)]
     fn buf(&self) -> std::rc::Rc<std::cell::RefCell<String>> {
         self.buf.clone()

crates/swc_ecma_parser/src/lexer/state.rs

Lines changed: 2 additions & 2 deletions

@@ -349,8 +349,8 @@ impl swc_ecma_lexer::common::lexer::state::State for State {
     }

     #[inline(always)]
-    fn set_had_line_break(&mut self, had_line_break: bool) {
-        self.had_line_break = had_line_break;
+    fn mark_had_line_break(&mut self) {
+        self.had_line_break = true;
     }

     #[inline(always)]
