Skip to content

Commit 45854a4

Browse files
committed
Migrate to Automa v1
1 parent a98d262 commit 45854a4

File tree

8 files changed

+53
-101
lines changed

8 files changed

+53
-101
lines changed

Project.toml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "FASTX"
22
uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12"
33
authors = ["Sabrina J. Ward <sabrinajward@protonmail.com>", "Jakob N. Nissen <jakobnybonissen@gmail.com>"]
4-
version = "2.1.2"
4+
version = "2.1.3"
55

66
[weakdeps]
77
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
@@ -10,7 +10,6 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
1010
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
1111
BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
1212
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
13-
ScanByte = "7b38b023-a4d7-4c5e-8d43-3f3097f304eb"
1413
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
1514
StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
1615
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
@@ -19,10 +18,9 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
1918
BioSequencesExt = "BioSequences"
2019

2120
[compat]
22-
Automa = "0.8"
21+
Automa = "1"
2322
BioGenerics = "0.1.2"
2423
BioSequences = "3"
25-
ScanByte = "0.4"
2624
PrecompileTools = "1"
2725
StringViews = "1"
2826
TranscodingStreams = "0.9.5"

src/FASTX.jl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,6 @@ julia> seqsize(parse(FASTA.Record, ">hdr\\nαβγδϵ"))
8787
"""
8888
function seqsize end
8989

90-
const UTF8 = Union{AbstractVector{UInt8}, String, SubString{String}}
91-
9290
# line is nothing if the reader does not have line information after random IO access.
9391
@noinline function throw_parser_error(
9492
data::Vector{UInt8},
@@ -147,7 +145,7 @@ end
147145

148146
CONTEXT = Automa.CodeGenContext(
149147
generator=:goto,
150-
vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte)
148+
vars=Automa.Variables(;p=:p, p_end=:p_end, cs=:cs, data=:data, mem=:mem, byte=:byte)
151149
)
152150

153151
include("fasta/fasta.jl")

src/fasta/fasta.jl

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,13 @@ Module under FASTX with code related to FASTA files.
88
"""
99
module FASTA
1010

11-
import Automa
12-
import Automa.RegExp: @re_str
13-
import Automa.Stream: @mark, @markpos, @relpos, @abspos
11+
using Automa: Automa, @re_str, @mark, @markpos, @relpos, @abspos, onenter!, onexit!, onall!
1412
import BioGenerics: BioGenerics
1513
import StringViews: StringView
1614
import TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
15+
import ..FASTX: identifier, description, sequence, seqsize, truncate, memcmp, appendfrom!, CONTEXT, throw_parser_error
1716

18-
# Trivial use, I only use it here because it's a dep of Automa anyway.
19-
# Can be removed with no big problems
20-
using ScanByte: memchr, ByteSet
21-
import ..FASTX: identifier, description, sequence, UTF8, seqsize, throw_parser_error, truncate, memcmp, appendfrom!, CONTEXT
17+
const Re = Automa.RegExp
2218

2319
include("record.jl")
2420
include("readrecord.jl")

src/fasta/index.jl

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -100,24 +100,17 @@ function Base.show(io::IO, index::Index)
100100
end
101101

102102
index_machine = let
103-
re = Automa.RegExp
104103
newline = let
105-
lf = re"\n"
106-
lf.actions[:enter] = [:countline]
107-
re.opt('\r') * lf
104+
lf = onenter!(re"\n", :countline)
105+
Re.opt('\r') * lf
108106
end
109107

110108
# The specs refer to the SAM specs, which contain this regex
111-
name = re"[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*"
112-
name.actions[:enter] = [:mark]
113-
name.actions[:exit] = [:name]
114-
115-
number = re"[0-9]+"
116-
number.actions[:all] = [:digit]
117-
number.actions[:exit] = [:number]
109+
name = onexit!(onenter!(re"[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*", :mark), :name)
110+
number = onexit!(onall!(re"[0-9]+", :digit), :number)
118111

119112
line = name * re"\t" * number * re"\t" * number * re"\t" * number * re"\t" * number
120-
fai = re.opt(line * re.rep(newline * line)) * re.rep(newline)
113+
fai = Re.opt(line * Re.rep(newline * line)) * Re.rep(newline)
121114
Automa.compile(fai)
122115
end
123116

@@ -173,7 +166,6 @@ index_actions = Dict{Symbol, Expr}(
173166
error("Error when parsing FAI file: Unexpected byte at index $p (line $linenum col $col)")
174167
end
175168

176-
ctx = Automa.CodeGenContext(vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte))
177169
@eval function read_faidx(data::Vector{UInt8})
178170
start = 0
179171
linenum = 1
@@ -183,15 +175,9 @@ ctx = Automa.CodeGenContext(vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te,
183175
linebases = 0
184176
linebases_num = 0
185177
vectors = (Int[], Int[], UInt[])
186-
$(Automa.generate_init_code(ctx, index_machine))
187-
p_eof = p_end = length(data)
188178

189-
GC.@preserve data begin
190-
$(Automa.generate_exec_code(ctx, index_machine, index_actions))
191-
end
179+
$(Automa.generate_code(CONTEXT, index_machine, index_actions))
192180

193-
# TODO: Rely on Automa's new error code
194-
iszero(cs) || throw_index_error(data, linenum, p)
195181
return Index(names, vectors...)
196182
end
197183

@@ -303,10 +289,9 @@ returncode = quote
303289
return Index(names, lengths, offsets, encoded_linebases)
304290
end
305291

306-
Automa.Stream.generate_reader(
292+
Automa.generate_reader(
307293
:faidx_,
308294
machine,
309-
arguments = (),
310295
actions = index_fasta_actions,
311296
context = CONTEXT,
312297
initcode = initcode,

src/fasta/reader.jl

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,8 +155,6 @@ function seekrecord(reader::Reader, i::Integer)
155155
nothing
156156
end
157157

158-
const UNALLOWED_BYTESET = Val(ByteSet((UInt8('\n'), UInt8('\r'), UInt8('>'))))
159-
160158
"""
161159
extract(reader::Reader, name::AbstractString, range::Union{Nothing, UnitRange})
162160
@@ -218,11 +216,16 @@ function extract(
218216
resize!(buffer, total_bases)
219217

220218
# Now check that there are no bad bytes in our buffer
221-
# Note: This ByteSet must correspond to the allowed bytes in
219+
# Note: The disallowed bytes must be exactly those excluded from sequence lines in
222220
# the FASTA machine to ensure we can seek the same FASTA files we can read
223-
badpos = memchr(buffer, UNALLOWED_BYTESET)
224-
if badpos !== nothing
225-
error("Invalid byte in FASTA sequence line: $(buffer[badpos])")
221+
bad_byte = false
222+
for byte in buffer
223+
bad_byte |= (
224+
(byte === UInt8('\r')) |
225+
(byte === UInt8('\n')) |
226+
(byte === UInt8('>'))
227+
)
228+
bad_byte && error("Invalid byte in FASTA sequence line: '>', '\\r' or '\\n'")
226229
end
227230

228231
# Return the Reader to a usable state after having messed with its

src/fasta/readrecord.jl

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10,55 +10,41 @@
1010
# This implies all whitespace except newlines, including trailing whitespace, is part
1111
# of the sequence.
1212
machine = let
13-
re = Automa.RegExp
14-
1513
hspace = re"[ \t\v]"
1614
newline = let
17-
lf = re"\n"
18-
lf.actions[:enter] = [:countline]
19-
re.opt('\r') * lf
15+
lf = onenter!(re"\n", :countline)
16+
Re.opt('\r') * lf
2017
end
2118
space = hspace | newline
2219

2320
# Identifier: Leading non-space
24-
identifier = re.rep(re.any() \ re.space())
25-
identifier.actions[:enter] = [:mark]
26-
# Action: Store length of identifier
27-
identifier.actions[:exit] = [:identifier]
21+
identifier = onexit!(onenter!(Re.rep(Re.any() \ Re.space()), :mark), :identifier)
2822

2923
# Description here includes trailing whitespace.
3024
# This is needed for the FSM, since the description can contain arbitrary
3125
# whitespace, the only way to know the description ends is to encounter a newline.
3226
# NB: Make sure to also change the Index machine to match this if you change it.
33-
description = identifier * re.opt(hspace * re"[^\r\n]*")
34-
35-
# Action: Store length of description and append description to record.data
36-
description.actions[:exit] = [:description]
27+
description = onexit!(identifier * Re.opt(hspace * re"[^\r\n]*"), :description)
3728

3829
# Header: '>' then description
3930
header = re">" * description
4031

4132
# Sequence line: Anything except \r, \n and >
42-
# Note: Must be consistent with the ByteSet in reader.jl used for seeking
43-
sequence_line = re"[^\n\r>]+"
44-
sequence_line.actions[:enter] = [:mark]
45-
# Action: Append letters to sequence_line
46-
sequence_line.actions[:exit] = [:seqline]
33+
# Note: Must be consistent with the disallowed bytes in reader.jl used for seeking
34+
sequence_line = onexit!(onenter!(re"[^\n\r>]+", :mark), :seqline)
4735

4836
# Sequence: This is intentionally very liberal with whitespace.
4937
# Any trailing whitespace is simply considered part of the sequence.
5038
# Is this bad? Maybe.
51-
sequence = re.rep1(re.opt(sequence_line) * re.rep1(newline))
39+
sequence = Re.rep1(Re.opt(sequence_line) * Re.rep1(newline))
5240

5341
# We have sequence_eof to allow the final sequence to not end in whitespace
54-
sequence_eof = re.opt(sequence_line) * re.rep(re.rep1(newline) * re.opt(sequence_line))
42+
sequence_eof = Re.opt(sequence_line) * Re.rep(Re.rep1(newline) * Re.opt(sequence_line))
5543

56-
record = header * newline * sequence
57-
record.actions[:exit] = [:record]
58-
record_eof = header * newline * sequence_eof
59-
record_eof.actions[:exit] = [:record]
44+
record = onexit!(header * newline * sequence, :record)
45+
record_eof = onexit!(header * newline * sequence_eof, :record)
6046

61-
fasta = re.rep(space) * re.rep(record) * re.opt(record_eof)
47+
fasta = Re.rep(space) * Re.rep(record) * Re.opt(record_eof)
6248

6349
Automa.compile(fasta)
6450
end
@@ -106,7 +92,7 @@ end
10692

10793
returncode = :(return cs, linenum, found)
10894

109-
Automa.Stream.generate_reader(
95+
Automa.generate_reader(
11096
:readrecord!,
11197
machine,
11298
arguments = (:(record::Record), :(state::Tuple{Int,Int})),
@@ -120,7 +106,7 @@ Automa.Stream.generate_reader(
120106
validator_actions = Dict(k => quote nothing end for k in keys(actions))
121107
validator_actions[:countline] = :(linenum += 1)
122108

123-
Automa.Stream.generate_reader(
109+
Automa.generate_reader(
124110
:validate_fasta,
125111
machine,
126112
arguments = (),

src/fastq/fastq.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ Module under FASTX with code related to FASTQ files.
55
"""
66
module FASTQ
77

8-
import Automa
9-
import Automa.RegExp: @re_str
10-
import Automa.Stream: @mark, @markpos, @relpos, @abspos
8+
using Automa: Automa, @re_str, @mark, @markpos, @relpos, @abspos, onenter!, onexit!
119
import BioGenerics: BioGenerics
1210
import StringViews: StringView
1311
import TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
14-
import ..FASTX: identifier, description, sequence, UTF8, seqsize, throw_parser_error, truncate, memcmp, appendfrom!, CONTEXT
12+
import ..FASTX: identifier, description, sequence, seqsize, truncate, memcmp, appendfrom!, CONTEXT
13+
14+
const Re = Automa.RegExp
1515

1616
include("quality.jl")
1717
include("record.jl")

src/fastq/readrecord.jl

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,34 @@
11
machine = let
2-
re = Automa.RegExp
3-
42
hspace = re"[ \t\v]"
53

64
header1 = let
7-
identifier = re.rep(re.any() \ re.space())
8-
identifier.actions[:enter] = [:mark]
9-
identifier.actions[:exit] = [:header1_identifier]
5+
identifier = onexit!(onenter!(Re.rep(Re.any() \ Re.space()), :mark), :header1_identifier)
106

117
# Description here means "after whitespace", not whole line
12-
description = re.cat(re.any() \ re.space(), re"[^\r\n]*")
13-
re.cat('@', identifier, re.opt(re.cat(re.rep1(hspace), re.opt(description))))
8+
description = (Re.any() \ Re.space()) * re"[^\r\n]*"
9+
'@' * identifier * Re.opt(Re.rep1(hspace) * Re.opt(description))
1410
end
15-
header1.actions[:exit] = [:header1_description]
11+
onexit!(header1, :header1_description)
1612

17-
sequence = re"[A-z]*"
18-
sequence.actions[:enter] = [:mark]
19-
sequence.actions[:exit] = [:sequence]
13+
sequence = onexit!(onenter!(re"[A-z]*", :mark), :sequence)
2014

2115
# The pattern recognized by header2 should be identical to header1
2216
# with the only difference being that h1 is split into identifier
2317
# and description
2418
header2 = let
25-
description2 = re"[^\r\n]+"
26-
description2.actions[:enter] = [:mark]
27-
description2.actions[:exit] = [:header2_description]
28-
re.cat('+', re.opt(description2))
19+
description2 = onexit!(onenter!(re"[^\r\n]+", :mark), :header2_description)
20+
'+' * Re.opt(description2)
2921
end
3022

31-
quality = re"[!-~]*"
32-
quality.actions[:enter] = [:mark]
33-
quality.actions[:exit] = [:quality]
23+
quality = onexit!(onenter!(re"[!-~]*", :mark), :quality)
3424

3525
newline = let
36-
lf = re"\n"
37-
lf.actions[:enter] = [:countline]
38-
re.cat(re.opt('\r'), lf)
26+
lf = onenter!(re"\n", :countline)
27+
Re.opt('\r') * lf
3928
end
4029

41-
record = re.cat(header1, newline, sequence, newline, header2, newline, quality)
42-
record.actions[:enter] = [:mark]
43-
record.actions[:exit] = [:record]
44-
45-
fastq = re.opt(record) * re.rep(newline * record) * re.opt(newline)
30+
record = onexit!(onenter!(header1 * newline * sequence * newline * header2 * newline * quality, :mark), :record)
31+
fastq = Re.opt(record) * Re.rep(newline * record) * Re.opt(newline)
4632

4733
Automa.compile(fastq)
4834
end
@@ -121,7 +107,7 @@ end
121107

122108
returncode = :(return cs, linenum, found)
123109

124-
Automa.Stream.generate_reader(
110+
Automa.generate_reader(
125111
:readrecord!,
126112
machine,
127113
arguments = (:(record::Record), :(state::Tuple{Int,Int})),
@@ -174,7 +160,7 @@ initcode = quote
174160
headerbuffer = Vector{UInt8}(undef, 1024)
175161
end
176162

177-
Automa.Stream.generate_reader(
163+
Automa.generate_reader(
178164
:validate_fastq,
179165
machine,
180166
arguments = (),

0 commit comments

Comments
 (0)