@@ -2,30 +2,45 @@ package io.github.cybercodernaj.parkour.lexer.internal
2
2
3
3
import io.github.cybercodernaj.parkour.datasource.TextSource
4
4
import io.github.cybercodernaj.parkour.exceptions.LexicalException
5
+ import io.github.cybercodernaj.parkour.lexer.LexerBuilder
5
6
import io.github.cybercodernaj.parkour.utils.Position
6
7
7
8
/* *
9
+ * # Lexer
10
+ *
8
11
* The lexer is responsible for converting the given string into a stream of [Token]s.
9
- * The lexer take in multiple settings that configure how it behaves.
12
+ * The lexer takes in multiple settings via the [LexerBuilder] that configure how it behaves.
10
13
* It will perform lexical analysis on a line-by-line basis and return the next unconsumed token.
11
- * A newline character is **always** separates a token.
14
+ * A newline character **always** separates tokens unless it is inside a multiline comment.
15
+ *
16
+ * ## Literals
17
+ *
18
+ * There are only three types of literals the lexer manages.
19
+ * 1. Integer literals are normally lexed as a run of digits, optionally containing underscores.
20
+ * 2. Floating literals are normally lexed with a mandatory decimal point and an optional exponent.
21
+ * 3. String literals are normally lexed verbatim until the opening delimiter is matched again.
22
+ *
23
+ * Additionally, escape sequences are required to input special characters inside string literals.
12
24
*
13
25
* @constructor Creates a lexer with the provided properties.
14
- * @param ignorePattern characters that satisfy this regex would be skipped. (Default: "\s+")
26
+ * @param ignorePattern characters that satisfy this regex would be skipped.
15
27
* @param singleLineComments The regex that defines how a single-line comment starts.
16
28
* Once identified, the lexer will skip the remaining line. (Default: null)
17
29
* @param multilineComments A pair of regexes, the starting pattern and the ending pattern for a
18
30
* multiline comment block. (Default: null)
19
- * @param identifiers A regex string that defines the rules for defining a name. (Default: "[a-zA-Z_]\w*")
31
+ * @param identifiers A regex string that defines the rules for defining a name.
20
32
* @param hardKeywords A set of strings that are considered hard keywords.
21
33
* Hard keywords are characters and symbols that give a particular meaning to a program.
22
34
* They may not be used as identifiers. (Default: [])
23
35
* @param operators A set of strings that are considered as operators.
24
36
* Operators are characters and symbols that may perform arithmetic or logical operations. (Default: [])
25
37
* @param separators A set of strings that are considered as separators.
26
38
* Separators are characters and symbols that act like delimiters to separate other meaningful elements. (Default: [])
27
- * @param literals The configuration of literals. Literals denote constant values
28
- * such as numbers, strings, and characters. (Default: see [Literals])
39
+ * @param integerLiteral a regex that detects an integer literal.
40
+ * @param floatingLiteral a regex that detects a floating point number literal.
41
+ * @param singleLineString a set of strings that denote the start and end enclosing strings.
42
+ * The lexer will throw a [LexicalException] when a string literal is not terminated in the same line.
43
+ * @param escapeSequences a list of regexes, each matching an escape sequence. On a match, the paired function returns a Char based on the matched string.
29
44
*
30
45
* @author Nishant Aanjaney Jalan
31
46
* @since 0.1.0
@@ -38,9 +53,27 @@ class Lexer internal constructor(
38
53
private val hardKeywords : Set <String > = emptySet(),
39
54
private val operators : Set <String > = emptySet(),
40
55
private val separators : Set <String > = emptySet(),
41
- private val literals : Literals = Literals ()
56
+ private val integerLiteral : Regex ? = Defaults .integerLiterals,
57
+ private val floatingLiteral : Regex ? = Defaults .floatingLiterals,
58
+ private val singleLineString : Set <String > = Defaults .singleLineString,
59
+ private val escapeSequences : List <Pair <Regex , (String ) - > Char >> = Defaults .escapeSequences,
42
60
) {
61
+ /* *
62
+ * A list of common patterns and lists of items that most programming languages and
63
+ * data serialization formats use.
64
+ *
65
+ * @author Nishant Aanjaney Jalan
66
+ * @since 0.2.0
67
+ */
43
68
object Defaults {
69
+ /* *
70
+ * ignorePattern defines the text that the lexer skips over.
71
+ * The part of the string that matches this regex will be ignored.
72
+ * This acts like a token separator.
73
+ *
74
+ * @author Nishant Aanjaney Jalan
75
+ * @since 0.2.0
76
+ */
44
77
val ignorePattern = Regex (""" \s+""" )
45
78
val identifiers = Regex (""" [a-zA-Z_]\w*""" )
46
79
val integerLiterals = Regex (""" [-+]?[0-9_]+""" )
@@ -164,7 +197,7 @@ class Lexer internal constructor(
164
197
}
165
198
166
199
private fun tryLiterals (): Token .Literal ? {
167
- (position pointsAt literals. floatingLiteral)
200
+ (position pointsAt floatingLiteral)
168
201
?.let { match ->
169
202
if (match.value.isBlank())
170
203
return null
@@ -177,7 +210,7 @@ class Lexer internal constructor(
177
210
} ? : throw LexicalException (" Double regex is badly formed." )
178
211
}
179
212
180
- (position pointsAt literals. integerLiteral)
213
+ (position pointsAt integerLiteral)
181
214
?.let { match ->
182
215
if (match.value.isBlank())
183
216
return null
@@ -190,14 +223,14 @@ class Lexer internal constructor(
190
223
} ? : throw LexicalException (" Int regex is badly formed. Tried parsing ${match.value} to an integer" )
191
224
}
192
225
193
- val stringStart = position pointsAtSome literals. singleLineString
226
+ val stringStart = position pointsAtSome singleLineString
194
227
if (stringStart != null ) {
195
228
val stringLit = StringBuilder ().append(currentLine[position.col])
196
229
val start = position++
197
230
if (position.col >= currentLine.length)
198
231
throw LexicalException (" String not closed in the given line" )
199
232
while (currentLine[position.col].toString() != stringStart) {
200
- val matches = literals. escapeSequences.mapNotNull { (regex, getEscapeChar) ->
233
+ val matches = escapeSequences.mapNotNull { (regex, getEscapeChar) ->
201
234
val result = (position pointsAt regex) ? : return @mapNotNull null
202
235
result.value to getEscapeChar(result.value)
203
236
}
0 commit comments