 RE_WHITESPACE = re.compile(r"[ \n\r\t]+")
 RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
 RE_INDEX = re.compile(r"-?[0-9]+")
-RE_INT = re.compile(r"-?[0-9]+")
-RE_EXPONENT = re.compile(r"[eE][+-]?[0-9]+")
-RE_NEGATIVE_EXPONENT = re.compile(r"[eE]-[0-9]+")
+RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
+# RE_FLOAT includes numbers with a negative exponent and no decimal point.
+RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")
 RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
-RE_AND = re.compile(r"&&")
-RE_OR = re.compile(r"\|\|")
-RE_TRUE = re.compile(r"true")
-RE_FALSE = re.compile(r"false")
-RE_NULL = re.compile(r"null")
-RE_ESCAPE = re.compile(r"\\[bfnrtu/]")
+ESCAPES = frozenset(["b", "f", "n", "r", "t", "u", "/", "\\"])
 
 
 class Lexer:
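
Note on the number tokens: the three old patterns (RE_INT, RE_EXPONENT, RE_NEGATIVE_EXPONENT) are folded into two. An integer with a non-negative exponent still lexes as an INT, while a decimal point or a negative exponent produces a FLOAT, per the comment above. A minimal sketch of how the two new patterns classify literals (re.fullmatch is used here purely for illustration; the lexer itself matches at a position):

    import re

    RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
    RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")

    for lit in ["42", "1e3", "1E+3", "1.5", "1.5e-2", "2e-4"]:
        kind = "FLOAT" if RE_FLOAT.fullmatch(lit) else "INT" if RE_INT.fullmatch(lit) else "?"
        print(f"{lit!r:10} -> {kind}")
    # 42, 1e3 and 1E+3 are INTs; 1.5, 1.5e-2 and 2e-4 (negative exponent) are FLOATs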
@@ -77,13 +72,13 @@ def emit(self, t: TokenType) -> None:
 
     def next(self) -> str:
         """Return the next character, or the empty string if no more characters."""
-        if self.pos >= len(self.query):
+        try:
+            c = self.query[self.pos]
+            self.pos += 1
+            return c
+        except IndexError:
             return ""
 
-        c = self.query[self.pos]
-        self.pos += 1
-        return c
-
     def ignore(self) -> None:
         """Ignore characters up to the pointer."""
         self.start = self.pos
@@ -100,18 +95,16 @@ def backup(self) -> None:
 
     def peek(self) -> str:
         """Return the next character without advancing the pointer."""
-        c = self.next()
-        if c:
-            self.backup()
-        return c
-
-    def accept(self, pattern: Pattern[str]) -> bool:
-        """Increment the pointer if the current character matches _pattern_."""
-        c = self.next()
-        if pattern.match(c):
+        try:
+            return self.query[self.pos]
+        except IndexError:
+            return ""
+
+    def accept(self, s: str) -> bool:
+        """Increment the pointer if the current position starts with _s_."""
+        if self.query.startswith(s, self.pos):
+            self.pos += len(s)
             return True
-        if c:
-            self.backup()
         return False
 
     def accept_match(self, pattern: Pattern[str]) -> bool:
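
Note on the scanning primitives: next and peek now use try/except IndexError (EAFP) instead of an explicit bounds check, and accept takes a literal string rather than a compiled pattern, delegating to str.startswith with a start offset so multi-character operators like && and || are consumed in one call without allocating a temporary slice. A tiny standalone illustration of the startswith idiom:

    query = "a && b"
    pos = 2  # pointer sitting on the first '&'

    if query.startswith("&&", pos):  # match at an offset, no slicing required
        pos += 2
    assert pos == 4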
@@ -140,7 +133,16 @@ def ignore_whitespace(self) -> bool:
 
     def error(self, msg: str) -> None:
         """Emit an error token."""
-        self.tokens.append(Token(TokenType.ERROR, msg, self.pos, self.query))
+        # Include the offending text and its position for better error messages.
+        self.tokens.append(
+            Token(
+                TokenType.ERROR,
+                self.query[self.start : self.pos],
+                self.start,
+                self.query,
+                msg,
+            )
+        )
 
 
 StateFn = Callable[[Lexer], Optional["StateFn"]]
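
Note on error reporting: an error token previously stored the message where its value belongs. It now carries the offending slice of the query as its value, the token's start offset, and the message as a separate field, which is what tokenize (last hunk below) raises with. A sketch of a Token shape consistent with the new constructor call (the field order is inferred from the call above, not taken from the real module):

    from dataclasses import dataclass

    @dataclass
    class Token:  # hypothetical shape, inferred from the Token(...) call in this hunk
        type_: str
        value: str         # the offending text: self.query[self.start : self.pos]
        index: int         # where the token starts in the query
        query: str         # the full query, kept for error context
        message: str = ""  # human-readable description of the error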
@@ -150,7 +152,6 @@ def lex_root(l: Lexer) -> Optional[StateFn]:  # noqa: D103
     c = l.next()
 
     if c != "$":
-        l.backup()
         l.error(f"expected '$', found {c!r}")
         return None
 
@@ -180,9 +181,8 @@ def lex_segment(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0911
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment
 
-    # default
-    l.backup()
     if l.filter_depth:
+        l.backup()
         return lex_inside_filter
 
     l.error(f"expected '.', '..' or a bracketed selection, found {c!r}")
@@ -204,21 +204,21 @@ def lex_descendant_segment(l: Lexer) -> Optional[StateFn]:  # noqa: D103
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment
 
-    # default
     l.backup()
 
     if l.accept_match(RE_PROPERTY):
         l.emit(TokenType.PROPERTY)
         return lex_segment
 
+    l.next()
     l.error(f"unexpected descendant selection token {c!r}")
     return None
 
 
 def lex_shorthand_selector(l: Lexer) -> Optional[StateFn]:  # noqa: D103
     l.ignore()  # ignore dot
 
-    if l.ignore_whitespace():
+    if l.accept_match(RE_WHITESPACE):
         l.error("unexpected whitespace after dot")
         return None
 
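
Note on the whitespace check in lex_shorthand_selector: ignore_whitespace consumes whitespace and resets start, so the error token emitted afterwards would have an empty value under the new error method. accept_match(RE_WHITESPACE) consumes without resetting start, keeping the offending whitespace inside the error token. A hypothetical trace, assuming the Lexer fields used in this diff:

    # query = "$.  name", start == pos == 2 (just past the dot)
    #
    #   l.accept_match(RE_WHITESPACE)   # pos -> 4, start stays 2
    #   l.error("unexpected whitespace after dot")
    #   # -> Token(ERROR, value="  ", index=2, query=query, message=...)
    #
    # l.ignore_whitespace() would also set start = 4, yielding an empty value.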
@@ -318,11 +318,9 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
         return lex_inside_bracketed_segment
 
     if c == "'":
-        # String literal
         return lex_single_quoted_string_inside_filter_expression
 
     if c == '"':
-        # String literal
         return lex_double_quoted_string_inside_filter_expression
 
     if c == "(":
@@ -388,61 +386,31 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
             l.emit(TokenType.GT)
             continue
 
-        # default
         l.backup()
 
-        # numbers
-        if l.accept_match(RE_INT):
-            if l.peek() == ".":
-                # A float
-                l.next()
-                if not l.accept_match(RE_INT):
-                    l.error("a fractional digit is required after a decimal point")
-                    return None
-
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.FLOAT)
-                continue
-
-            # An int, or float if exponent is negative
-            if l.accept_match(RE_NEGATIVE_EXPONENT):
-                l.emit(TokenType.FLOAT)
-            else:
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.INT)
-            continue
-
-        if l.accept_match(RE_AND):
+        if l.accept("&&"):
             l.emit(TokenType.AND)
-            continue
-
-        if l.accept_match(RE_OR):
+        elif l.accept("||"):
             l.emit(TokenType.OR)
-            continue
-
-        if l.accept_match(RE_TRUE):
+        elif l.accept("true"):
             l.emit(TokenType.TRUE)
-            continue
-
-        if l.accept_match(RE_FALSE):
+        elif l.accept("false"):
            l.emit(TokenType.FALSE)
-            continue
-
-        if l.accept_match(RE_NULL):
+        elif l.accept("null"):
            l.emit(TokenType.NULL)
-            continue
-
-        # functions
-        if l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
+        elif l.accept_match(RE_FLOAT):
+            l.emit(TokenType.FLOAT)
+        elif l.accept_match(RE_INT):
+            l.emit(TokenType.INT)
+        elif l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
             # Keep track of parentheses for this function call.
             l.paren_stack.append(1)
             l.emit(TokenType.FUNCTION)
             l.next()
             l.ignore()  # ignore LPAREN
-            continue
-
-        l.error(f"unexpected filter selector token {c!r}")
-        return None
+        else:
+            l.error(f"unexpected filter selector token {c!r}")
+            return None
 
 
 def lex_string_factory(quote: str, state: StateFn) -> StateFn:
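
Note on the rewritten literal matching: the old peek-and-branch number scanner goes away because RE_INT and RE_FLOAT now encode the whole grammar, and the if/continue chain collapses into one if/elif ladder. Order matters twice here: RE_FLOAT must be tried before RE_INT, otherwise "1.5" would lex as the INT "1" with a stray ".5" left behind, and the keyword checks (true, false, null) must precede the function-name match, which would otherwise consume them. A quick demonstration of the first point, using the patterns defined at the top of the diff:

    import re

    RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
    RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")

    print(RE_INT.match("1.5").group())    # '1'   -- stops at the decimal point
    print(RE_FLOAT.match("1.5").group())  # '1.5' -- so RE_FLOAT is tried first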
@@ -467,16 +435,15 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
             return state
 
         while True:
-            head = l.query[l.pos : l.pos + 2]
             c = l.next()
 
-            if head in ("\\\\", f"\\{quote}"):
-                l.next()
-                continue
-
-            if c == "\\" and not RE_ESCAPE.match(head):
-                l.error("invalid escape")
-                return None
+            if c == "\\":
+                peeked = l.peek()
+                if peeked in ESCAPES or peeked == quote:
+                    l.next()
+                else:
+                    l.error("invalid escape")
+                    return None
 
             if not c:
                 l.error(f"unclosed string starting at index {l.start}")
@@ -522,6 +489,6 @@ def tokenize(query: str) -> List[Token]:
     lexer.run()
 
     if tokens and tokens[-1].type_ == TokenType.ERROR:
-        raise JSONPathSyntaxError(tokens[-1].value, token=tokens[-1])
+        raise JSONPathSyntaxError(tokens[-1].message, token=tokens[-1])
 
     return tokens
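
Finally, because the error message moved out of the token's value, tokenize raises JSONPathSyntaxError from the new message field. A hedged usage sketch (the import paths here are assumptions, not taken from the diff):

    from jsonpath.lexer import tokenize                   # module path assumed
    from jsonpath.exceptions import JSONPathSyntaxError   # module path assumed

    try:
        for token in tokenize("$.store.book[?@.price < 10]"):
            print(token.type_, repr(token.value))
    except JSONPathSyntaxError as err:
        print("syntax error:", err)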