|
| 1 | +-- Copyright 2025, Didier Willis |
| 2 | +-- (Donated to Scintillua, under the same license.) |
| 3 | +-- See LICENSE. |
| 4 | +-- |
| 5 | +-- Lexer for the SIL "TeX-like" syntax used by the SILE typesetting system. |
| 6 | +-- |
| 7 | +-- The SILE typesetting system (https://sile-typesetter.org/) can support |
| 8 | +-- different input formats, via dedicated "inputters". |
| 9 | +-- It is not tied to a given input syntax. |
| 10 | +-- Yet, the "SIL in TeX-like flavor" format is a common one, included in the |
| 11 | +-- core distribution and used in most of the examples and documentation. |
| 12 | +-- |
| 13 | +-- SIL TeX-like is similar to LaTeX, but with a parity with SIL XML. |
| 14 | +-- I.e. \foo[attr=val]{...} is equivalent to <foo attr="val">...</foo>. |
| 15 | +-- Environments are syntactic sugar for commands. |
| 16 | +-- I.e. \begin[attr=val]{foo}...\end{foo} is the same as \foo[attr=val]{...}. |
| 17 | +-- Rules below try to respect this parity. |
| 18 | + |
-- `lexer` and `lpeg` are expected to be in scope already when this file is
-- loaded; keep local handles to them and to the LPeg constructors used below.
local lexer = lexer
local P, S, R = lpeg.P, lpeg.S, lpeg.R
local Cb, Cg, Ct, Cmt = lpeg.Cb, lpeg.Cg, lpeg.Ct, lpeg.Cmt

-- The lexer being defined, and its whitespace rule (reused inside parameter lists).
local lex = lexer.new(...)
local ws = lex:get_rule('whitespace')

-- 1. Syntax bits.

-- A SIL identifier is made of alphanumeric runs joined by single ':' or '-'
-- characters. The real syntax has further restrictions (e.g. no leading
-- digit), deliberately ignored here to keep the pattern simple.
local identifier_part = lexer.alnum^1
local identifier = identifier_part * (S(':-') * identifier_part)^0
| 30 | + |
-- Hard-coded "pass-through" commands/environments, whose content is not SIL.
-- Maps the command/environment name to the name of the lexer used for its content.
local reserved_specials = {
  -- Fluent does have a syntax of its own, but it is treated as plain text here.
  ftl = 'text',
  lua = 'lua',
  -- A dedicated (La)TeX math-only lexer would be needed to do this properly.
  math = 'tex',
  raw = 'text',
  script = 'lua',
  -- No entry for "sil": its content is SIL, i.e. what this lexer handles by default.
  xml = 'xml',
  use = 'lua',
}
-- The remaining reserved keywords ("comment" and "begin"/"end") are not listed
-- here: they are handled directly by the rules below.
| 44 | + |
-- Parameters: bracketed, comma-separated key=value pairs, e.g. [attr=val,other="quoted"].
local eq = lex:tag(lexer.OPERATOR, '=')
-- An unquoted value runs up to the next ',', ';' or ']'.
local simple_value = (P(1) - S(',;]'))^1
local quoted_value = lexer.range('"', false, false)
local param_key = lex:tag(lexer.ATTRIBUTE, identifier)
local param_value = lex:tag(lexer.STRING, quoted_value + simple_value)
local param = param_key * eq * param_value
-- Separator between two parameters, with optional whitespace on both sides.
local param_separator = ws^0 * lex:tag(lexer.OPERATOR, ',') * ws^0
local param_list = param * (param_separator * param)^0
local bracketed_params = lex:tag(lexer.OPERATOR, '[') * ws^0 * param_list^0 * ws^0
  * lex:tag(lexer.OPERATOR, ']')
-- Zero or more bracketed groups: this pattern also succeeds when there are no parameters.
local optparams = bracketed_params^0
| 52 | + |
-- 2. Comments.
-- Three forms are tagged as comments:
--   % ... up to the end of the line
--   \begin[...]{comment} ... \end{comment}
--   \comment[...]{ ... }
local comment_env_open = P('\\begin') * optparams * P('{comment}')
local comment_env_close = P('\\end{comment}')
local line_comment = lexer.to_eol('%')
local env_comment = lexer.range(comment_env_open, comment_env_close)
local cmd_comment_body = lexer.range('{', '}', false, false, true)
local cmd_comment = P('\\comment') * optparams * cmd_comment_body
lex:add_rule('comment', lex:tag(lexer.COMMENT, line_comment + env_comment + cmd_comment))
| 58 | + |
| 59 | +-- 3. Special reserved pass-through commands/environments. |
| 60 | + |
-- Match-time callback: tells whether the tracked brace level is back to zero.
-- The first two arguments passed by the caller are ignored.
-- @param level the current brace level (a numeric string, or a number)
-- @return true when the level converts to 0, false otherwise
local function check_exit_brace_level(_, _, level)
  return tonumber(level) == 0
end
| 65 | + |
-- Builds a pattern that reads the 'brace_level' named capture, adds `increment`
-- to it, and stores the result back under the same name.
-- The level is kept as a string, and converted to a number only for the addition.
-- @param increment the amount added to the current level (e.g. 1 or -1)
-- @return the LPeg pattern performing the update
local function increment_brace_level(increment)
  local stored_level = Cb('brace_level')
  local updated_level = Cmt(stored_level, function(_, _, level)
    -- Returning true makes the match-time capture succeed, with the new level as its value.
    return true, tostring(tonumber(level) + increment)
  end)
  return Cg(updated_level, 'brace_level')
end
| 74 | + |
-- Starts the brace tracking: stores the string '0' in the 'brace_level' named capture.
local init_brace_level = Cg(Ct('') / '0', 'brace_level')
-- Succeeds only when the 'brace_level' named capture is back to zero.
local is_exit_brace = Cmt(Cb('brace_level'), check_exit_brace_level)
| 77 | + |
-- Matches when an identifier would continue at the current position, i.e. on
-- an alphanumeric character, or on ':' or '-' followed by one.
-- Only used as a negative lookahead, so that a reserved name such as \use is
-- not recognized at the start of a longer command name such as \useful.
local identifier_continues = lexer.alnum + S(':-') * lexer.alnum

-- Adds to an embedded lexer the two rules that keep the 'brace_level' counter
-- up to date whenever a brace is met in the embedded content.
-- @param embedder the embedded lexer receiving the rules
-- @param tagged when true, the braces are tagged as operators of the embedded
--   lexer; otherwise they are matched without any tag
local function add_brace_tracking(embedder, tagged)
  local open, close = P('{'), P('}')
  if tagged then
    open, close = embedder:tag(lexer.OPERATOR, '{'), embedder:tag(lexer.OPERATOR, '}')
  end
  embedder:add_rule('sil_brace_open', open * increment_brace_level(1))
  embedder:add_rule('sil_brace_close', close * increment_brace_level(-1))
end

-- pairs() gives no ordering guarantee, and the rules below are added in
-- iteration order: go through the reserved names in sorted order, so that the
-- resulting lexer is the same on every load.
local reserved_names = {}
for name in pairs(reserved_specials) do
  reserved_names[#reserved_names + 1] = name
end
table.sort(reserved_names)

for _, name in ipairs(reserved_names) do
  local lang = reserved_specials[name]
  -- Order matters: environments, commands with arguments, commands without arguments.
  -- We need alt names for multiple embeddings and rules.
  local base_rule_id = name .. '_' .. lang

  -- 3.1. Reserved environments.
  -- Ex. \begin{lua} ... Lua code ... \end{lua}
  local env_embedder = lexer.load(lang, base_rule_id .. '_env')
  lex:embed(
    env_embedder,
    lex:tag(lexer.FUNCTION_BUILTIN, '\\begin') * optparams
      * lex:tag(lexer.OPERATOR, '{') * lex:tag(lexer.FUNCTION_BUILTIN, name) * lex:tag(lexer.OPERATOR, '}'),
    lex:tag(lexer.FUNCTION_BUILTIN, '\\end')
      * lex:tag(lexer.OPERATOR, '{') * lex:tag(lexer.FUNCTION_BUILTIN, name) * lex:tag(lexer.OPERATOR, '}'))

  -- 3.2. Reserved commands.
  -- Ex. \lua{... Lua code ...}
  -- The hard trick here is that we want to keep track of the paired braces,
  -- in order to exit the embedding on the right closing brace.
  local cmd_embedder = lang == 'text'
    and lexer.new(base_rule_id .. '_cmd') -- pseudo-lexer for text
    or lexer.load(lang, base_rule_id .. '_cmd') -- real lexer for Lua, TeX, XML
  if lang == 'lua' then
    -- We hack the Lua lexer to intercept and handle the pairs of braces,
    -- i.e. we remove them from the 'operator' rule and handle them separately.
    cmd_embedder:modify_rule('operator', cmd_embedder:tag(lexer.OPERATOR, '..' + S('+-*/%^#=<>&|~;:,.[]()')))
    add_brace_tracking(cmd_embedder, true)
  elseif lang == 'tex' then
    -- We hack the TeX math lexer to intercept and handle the pairs of braces,
    -- i.e. we remove them from the 'operator' rule and handle them separately.
    -- We also take the opportunity to remove some operators not expected in math mode,
    -- and to add some extra operators for math mode.
    cmd_embedder:modify_rule('operator', cmd_embedder:tag(lexer.OPERATOR, S('&()[]')))
    cmd_embedder:add_rule('operator_math', cmd_embedder:tag(lexer.OPERATOR .. ".math", S('+-=^_')))
    add_brace_tracking(cmd_embedder, true)
  else
    -- We just need to keep track of the braces for the XML and text lexers,
    -- without any special marking.
    add_brace_tracking(cmd_embedder, false)
  end
  lex:embed(
    cmd_embedder,
    -- The brace level is reset to zero right before the opening brace of the command.
    lex:tag(lexer.FUNCTION_BUILTIN, '\\' .. name) * optparams * init_brace_level * lex:tag(lexer.FUNCTION_BUILTIN, '{'),
    -- Only a closing brace met at level zero ends the embedding.
    lex:tag(lexer.FUNCTION_BUILTIN, '}' * is_exit_brace)
  )

  -- 3.3. Reserved commands without arguments (must come after the commands with arguments).
  -- Ex. \use[module=packages.highlighter]
  -- The negative lookahead keeps this rule from matching the beginning of a
  -- longer, regular command name (e.g. \useful), which is left to the generic
  -- command rule.
  lex:add_rule(
    base_rule_id .. '_cmd_no_arg',
    lex:tag(lexer.FUNCTION_BUILTIN, P('\\' .. name) * -identifier_continues) * optparams
  )
end
| 149 | + |
-- 4. Sections (tagged separately, for mere convenience / visibility).
-- As of 0.15.9, the default book class of SILE provides chapter, section and
-- subsection. The resilient.book class (from the 3rd-party resilient.sile
-- module) adds part, appendix, subsubsection, as well as frontmatter,
-- mainmatter and backmatter.
local sections = lex:word_match('sections')
local section_names = {
  'frontmatter', 'mainmatter', 'backmatter',
  'part', 'chapter', 'appendix',
  'section', 'subsection', 'subsubsection',
}
lex:set_word_list('sections', section_names)
local section_command = lex:tag('command.section', '\\' * sections)
lex:add_rule('section', section_command * optparams)
| 161 | + |
-- 5. Regular commands/environments.
-- Rules are added in this order: environment start, environment end, commands.

-- The {name} part following \begin or \end.
local env_cmd = lex:tag(lexer.OPERATOR, '{') * lex:tag(lexer.TAG, identifier) * lex:tag(lexer.OPERATOR, '}')

-- \begin[...]{name}
local env_begin = lex:tag(lexer.FUNCTION_BUILTIN, '\\begin')
lex:add_rule('environment_start', env_begin * optparams * env_cmd)

-- \end{name}
local env_end = lex:tag(lexer.FUNCTION_BUILTIN, '\\end')
lex:add_rule('environment_end', env_end * env_cmd)

-- \name[...]
local command_name = lex:tag(lexer.TAG, '\\' * identifier)
lex:add_rule('command', command_name * optparams)
| 174 | + |
-- 6. Groups.
-- Braces not consumed by any of the rules above are tagged as operators.
local group_brace = S('{}')
lex:add_rule('operator', lex:tag(lexer.OPERATOR, group_brace))

-- Line comment prefix for this language.
lexer.property['scintillua.comment'] = '%'

return lex
0 commit comments