Skip to content

Commit 0c950a0

Browse files
Omikhleia, Didier Willis
authored and committed
feat: SIL TeX-like lexer (SILE)
1 parent e3f2637 commit 0c950a0

File tree

1 file changed

+180
-0
lines changed

1 file changed

+180
-0
lines changed

lexers/sil.lua

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
-- Copyright 2025, Didier Willis
2+
-- (Donated to Scintillua, under the same license.)
3+
-- See LICENSE.
4+
--
5+
-- Lexer for the SIL "TeX-like" syntax used by the SILE typesetting system.
6+
--
7+
-- The SILE typesetting system (https://sile-typesetter.org/) can support
8+
-- different input formats, via dedicated "inputters".
9+
-- It is not tied to a given input syntax.
10+
-- Yet, the "SIL in TeX-like flavor" format is a common one, included in the
11+
-- core distribution and used in most of the examples and documentation.
12+
--
13+
-- SIL TeX-like is similar to LaTeX, but with a parity with SIL XML.
14+
-- I.e. \foo[attr=val]{...} is equivalent to <foo attr="val">...</foo>.
15+
-- Environments are syntactic sugar for commands.
16+
-- I.e. \begin[attr=val]{foo}...\end{foo} is the same as \foo[attr=val]{...}.
17+
-- Rules below try to respect this parity.
18+
19+
-- `lexer` and `lpeg` are globals supplied by the host environment; alias them
-- and the LPeg constructors used in this file as locals.
local lexer = lexer
local P, S, R = lpeg.P, lpeg.S, lpeg.R
local Cb, Cg, Ct, Cmt = lpeg.Cb, lpeg.Cg, lpeg.Ct, lpeg.Cmt

-- The lexer object being built, and its 'whitespace' rule (reused inside
-- parameter lists).
local lex = lexer.new(...)
local ws = lex:get_rule('whitespace')
24+
25+
-- 1. Syntax bits.
26+
27+
-- SIL identifiers are made of letters and digits, plus the ':' and '-' characters.
-- The real grammar has further constraints (e.g. no leading digit); this
-- simplified pattern only requires ':' and '-' to sit between alphanumeric runs.
local alnum_run = lexer.alnum^1
local identifier = alnum_run * (S(':-') * alnum_run)^0

-- Reserved hard-coded "pass-through" commands/environments, mapped to the name
-- of the lexer used for their content.
local reserved_specials = {
  ftl = 'text', -- Fluent has a syntax of its own, but it is not handled here.
  lua = 'lua',
  math = 'tex', -- A (La)TeX math-only lexer would be needed to handle this properly.
  raw = 'text',
  script = 'lua',
  use = 'lua',
  xml = 'xml',
  -- 'sil' is deliberately absent: it is the default here, so it needs no rule.
}
-- "comment" and "begin"/"end" are reserved keywords too;
-- they are handled by the rules further down.
44+
45+
-- Parameters: an optional bracketed list of key=value pairs,
-- e.g. [key=value,other="quoted value"].
local eq = lex:tag(lexer.OPERATOR, '=')
-- An unquoted value runs up to the next separator (',' or ';') or the closing ']'.
local simple_value = (P(1) - S(',;]'))^1
-- A double-quoted value (the two `false` flags: not restricted to a single
-- line, no escape sequences).
local quoted_value = lexer.range('"', false, false)
local param = lex:tag(lexer.ATTRIBUTE, identifier) * eq * lex:tag(lexer.STRING, quoted_value + simple_value)
-- Both ',' and ';' separate pairs. Unquoted values already stop at either
-- character, so without ';' here a list such as [a=1;b=2] could never match
-- and the whole bracket group would be left unhighlighted.
local param_sep = lex:tag(lexer.OPERATOR, S(',;'))
local param_list = param * (ws^0 * param_sep * ws^0 * param)^0
-- At most one bracketed group, whose list may be empty.
local optparams = (lex:tag(lexer.OPERATOR, '[') * ws^0 * param_list^-1 * ws^0 * lex:tag(lexer.OPERATOR, ']'))^-1
52+
53+
-- 2. Comments.
-- Three forms are recognized: '%' up to the end of the line, the
-- \begin{comment} ... \end{comment} environment, and the \comment{...}
-- command (whose body may contain balanced nested braces).
local line_comment = lexer.to_eol('%')
local comment_env_open = P('\\begin') * optparams * P('{comment}')
local env_comment = lexer.range(comment_env_open, P('\\end{comment}'))
local braced_body = lexer.range('{', '}', false, false, true)
local cmd_comment = P('\\comment') * optparams * braced_body
local any_comment = line_comment + env_comment + cmd_comment
lex:add_rule('comment', lex:tag(lexer.COMMENT, any_comment))
58+
59+
-- 3. Special reserved pass-through commands/environments.
60+
61+
--- Match-time predicate: succeeds only when the tracked brace depth is zero.
-- Used as an lpeg.Cmt callback; its first two arguments are ignored.
-- @param current_level The current brace depth (string or number).
-- @return true when the depth converts to 0, false otherwise.
local function check_exit_brace_level(_, _, current_level)
  local depth = tonumber(current_level)
  if depth ~= 0 then
    return false
  end
  return true
end
65+
66+
--- Build a pattern that shifts the tracked brace depth by a fixed amount.
-- The depth lives, as a string, in the named group capture 'brace_level':
-- the pattern reads the latest value back, adds `increment` to it, and stores
-- the result under the same name.
-- @param increment Amount added to the depth (1 for '{', -1 for '}').
-- @return An LPeg pattern producing the updated 'brace_level' group.
local function increment_brace_level(increment)
  local bump = function(_, _, level)
    -- `true` makes the match-time capture succeed; the second value becomes
    -- the new content of the group.
    return true, tostring(tonumber(level) + increment)
  end
  local new_level = Cmt(Cb('brace_level'), bump)
  return Cg(new_level, 'brace_level')
end
74+
75+
-- Seeds the 'brace_level' group capture with "0" when an embedding starts.
local init_brace_level = Cg(lpeg.Cc('0'), 'brace_level')
-- Succeeds only when the tracked depth is back to zero, i.e. the next '}' is
-- the one closing the command itself.
local is_exit_brace = Cmt(Cb('brace_level'), check_exit_brace_level)
77+
78+
--- Register the brace-tracking rules on an embedded (child) lexer.
-- Every '{' raises and every '}' lowers the 'brace_level' counter, so that the
-- end rule of the embedding can tell the command's own closing brace from
-- nested ones.
-- @param embedder The child lexer to extend.
-- @param tagged When true, braces are tagged as operators; otherwise they are
--   matched without any tag.
local function add_brace_rules(embedder, tagged)
  local open, close = P('{'), P('}')
  if tagged then
    open = embedder:tag(lexer.OPERATOR, open)
    close = embedder:tag(lexer.OPERATOR, close)
  end
  embedder:add_rule('sil_brace_open', open * increment_brace_level(1))
  embedder:add_rule('sil_brace_close', close * increment_brace_level(-1))
end

-- A reserved name must not be followed by more identifier characters.
-- Without this guard the argument-less rule would also fire on the prefix of
-- a longer command (e.g. \scriptsize would be split into \script + "size").
local name_boundary = -(lexer.alnum + S(':-') * lexer.alnum)

for name, lang in pairs(reserved_specials) do
  -- Order matters: environments, commands with arguments, commands without arguments.
  -- We need alternate names since the same lexer is embedded several times.
  local base_rule_id = name .. '_' .. lang

  -- 3.1. Reserved environments.
  -- Ex. \begin{lua} ... Lua code ... \end{lua}
  local env_embedder = lexer.load(lang, base_rule_id .. '_env')
  local env_name = lex:tag(lexer.OPERATOR, '{')
    * lex:tag(lexer.FUNCTION_BUILTIN, name)
    * lex:tag(lexer.OPERATOR, '}')
  lex:embed(
    env_embedder,
    lex:tag(lexer.FUNCTION_BUILTIN, '\\begin') * optparams * env_name,
    lex:tag(lexer.FUNCTION_BUILTIN, '\\end') * env_name
  )

  -- 3.2. Reserved commands.
  -- Ex. \lua{... Lua code ...}
  -- The hard trick here is that we want to keep track of the paired braces,
  -- in order to exit the embedding on the right closing brace.
  local cmd_embedder = lang == 'text'
    and lexer.new(base_rule_id .. '_cmd') -- pseudo-lexer for text
    or lexer.load(lang, base_rule_id .. '_cmd') -- real lexer for Lua, TeX, XML
  if lang == 'lua' then
    -- We hack the Lua lexer to intercept and handle the pairs of braces,
    -- i.e. we remove them from the 'operator' rule and handle them separately.
    cmd_embedder:modify_rule('operator', cmd_embedder:tag(lexer.OPERATOR, '..' + S('+-*/%^#=<>&|~;:,.[]()')))
    add_brace_rules(cmd_embedder, true)
  elseif lang == 'tex' then
    -- We hack the TeX math lexer to intercept and handle the pairs of braces,
    -- i.e. we remove them from the 'operator' rule and handle them separately.
    -- We also take the opportunity to remove some operators not expected in
    -- math mode, and to add some extra operators for math mode.
    cmd_embedder:modify_rule('operator', cmd_embedder:tag(lexer.OPERATOR, S('&()[]')))
    cmd_embedder:add_rule('operator_math', cmd_embedder:tag(lexer.OPERATOR .. ".math", S('+-=^_')))
    add_brace_rules(cmd_embedder, true)
  else
    -- We just need to keep track of the braces for the XML and text lexers,
    -- without any special marking.
    add_brace_rules(cmd_embedder, false)
  end
  lex:embed(
    cmd_embedder,
    -- Start: \name[params]{ -- the depth counter is reset just before the brace.
    lex:tag(lexer.FUNCTION_BUILTIN, '\\' .. name) * optparams * init_brace_level * lex:tag(lexer.FUNCTION_BUILTIN, '{'),
    -- End: a '}' seen while the depth counter is zero.
    lex:tag(lexer.FUNCTION_BUILTIN, '}' * is_exit_brace)
  )

  -- 3.3. Reserved commands without arguments (must come after the commands with arguments).
  -- Ex. \use[module=packages.highlighter]
  lex:add_rule(
    base_rule_id .. '_cmd_no_arg',
    lex:tag(lexer.FUNCTION_BUILTIN, P('\\' .. name) * name_boundary) * optparams
  )
end
149+
150+
-- 4. Sections (highlighted separately, for mere convenience / visibility).
-- As of 0.15.9, SILE's default book class has chapter, section and subsection.
-- The resilient.book class, from the 3rd-party resilient.sile module, adds
-- part, appendix, subsubsection, plus frontmatter, mainmatter and backmatter.
local section_names = {
  'frontmatter',
  'mainmatter',
  'backmatter',
  'part',
  'chapter',
  'appendix',
  'section',
  'subsection',
  'subsubsection',
}
local sections = lex:word_match('sections')
lex:set_word_list('sections', section_names)
local section_cmd = lex:tag('command.section', '\\' * sections)
lex:add_rule('section', section_cmd * optparams)
161+
162+
-- 5. Regular commands/environments.
-- Order matters: the environment rules are added before the generic command rule.
local brace_open = lex:tag(lexer.OPERATOR, '{')
local brace_close = lex:tag(lexer.OPERATOR, '}')
-- The braced environment name following \begin or \end.
local env_cmd = brace_open * lex:tag(lexer.TAG, identifier) * brace_close
local begin_kw = lex:tag(lexer.FUNCTION_BUILTIN, '\\begin')
local end_kw = lex:tag(lexer.FUNCTION_BUILTIN, '\\end')
lex:add_rule('environment_start', begin_kw * optparams * env_cmd)
lex:add_rule('environment_end', end_kw * env_cmd)
-- Any other backslash-prefixed identifier, with its optional parameters.
lex:add_rule('command', lex:tag(lexer.TAG, '\\' * identifier) * optparams)
174+
175+
-- 6. Groups.
-- Braces not consumed by any earlier rule are tagged as plain operators.
local group_brace = lex:tag(lexer.OPERATOR, S('{}'))
lex:add_rule('operator', group_brace)

-- Declare '%' as the comment prefix through the 'scintillua.comment' property.
lexer.property['scintillua.comment'] = '%'

return lex

0 commit comments

Comments
 (0)