@@ -47,12 +47,12 @@ function lemmatize!(d::AbstractDict, lemmatizer)
47
47
end
48
48
49
49
"""
    tokenizer(text::AbstractString, regexp=r"\\w+")

Return a lazy generator over every substring of `text` matched by `regexp`.

Each element is the `match` field of a `RegexMatch` produced by `eachmatch`,
so nothing is materialized until the generator is iterated or `collect`ed.
With the default pattern, every maximal run of word characters is one token.
"""
function tokenizer(text::AbstractString, regexp=r"\w+")
    # A generator (not a comprehension) keeps tokenization lazy for long texts.
    return (m.match for m in eachmatch(regexp, text))
end
52
52
53
"""
    tokenizer_eng(text::AbstractString, regexp=r"\\b[\\w']+\\b")

Return a lazy generator of English tokens found in `text`.

Tokens are the substrings matched by `regexp`; the default pattern keeps
apostrophes inside a word (so contractions stay in one piece). A token that
ends with `'s` has that two-character suffix removed; every other token is
yielded unchanged.
"""
function tokenizer_eng(text::AbstractString, regexp=r"\b[\w']+\b")
    return (
        # `prevind(..., end, 2)` steps back two characters (not bytes), so the
        # suffix is dropped safely even after multi-byte characters.
        endswith(m.match, "'s") ? m.match[1:prevind(m.match, end, 2)] : m.match
        for m in eachmatch(regexp, text)
    )
end
57
57
58
58
# ISO 639-3 macrolanguages
@@ -98,10 +98,9 @@ Count words in text. And save results into `counter`.
98
98
`text_or_counter` can be a String, a Vector of Strings, an opened file (IO) or a Dict.
99
99
`regexp` is a regular expression to partially match and filter words. For example, `regexp=r"\S(?:[\s\S]*\S)?"` will trim whitespace and then eliminate empty words.
100
100
"""
101
- function countwords (words, counts; language= :auto ,
101
+ function countwords (words, counts; lemmatizer = :auto , language= :auto ,
102
102
regexp= r" (?:\S [\s\S ]*)?[^0-9_\W ](?:[\s\S ]*\S )?" , counter= Dict {String,Int} ())
103
103
# strip whitespace and filter out pure punctuation and number string
104
- language = detect_language (words, language)
105
104
for (w, c) in zip (words, counts)
106
105
if regexp != = nothing
107
106
m = match (regexp, w)
@@ -113,21 +112,26 @@ function countwords(words, counts; language=:auto,
113
112
counter[w] = get (counter, w, 0 ) + c
114
113
end
115
114
end
116
- lemmatizer_ = get (LEMMATIZERS, language, LEMMATIZERS[" _default_" ])
117
- lemmatize! (counter, lemmatizer_)
115
+ if lemmatizer == :auto
116
+ language = detect_language (words, language)
117
+ lemmatizer = get (LEMMATIZERS, language, LEMMATIZERS[" _default_" ])
118
+ end
119
+ lemmatize! (counter, lemmatizer)
118
120
counter
119
121
end
120
# Count the words of a raw text.
#
# Keywords:
# - `tokenizer`: a callable applied to `text` to produce the words, or `:auto`
#   to look one up in `TOKENIZERS` by the detected language, falling back to
#   the `"_default_"` entry when the language has no dedicated tokenizer.
# - `language`: forwarded to `detect_language` (only when `tokenizer` is
#   `:auto`) and then on to the word-level `countwords` method.
# - `kargs...`: forwarded unchanged to the word-level `countwords` method.
function countwords(text::AbstractString; tokenizer=:auto, language=:auto, kargs...)
    if tokenizer == :auto
        # Language detection is only needed to choose a built-in tokenizer.
        language = detect_language(text, language)
        if !haskey(TOKENIZERS, language)
            @info "No dedicated built-in tokenizer for $(language); using basic tokenizer instead"
        end
        tokenizer = get(TOKENIZERS, language, TOKENIZERS["_default_"])
    end
    return countwords(tokenizer(text); language=language, kargs...)
end
128
# Convenience methods that normalize the accepted input shapes into the
# `countwords(words, counts; kargs...)` form.

# A dictionary of word => weight: keys are the words, values are the counts.
countwords(counter::AbstractDict{<:AbstractString,<:Real}; kargs...) =
    countwords(keys(counter), values(counter); kargs...)

# A tuple is splatted into positional arguments, e.g. `(words, counts)`.
countwords(wordscounts::Tuple; kargs...) = countwords(wordscounts...; kargs...)

# Any other iterable of words: every occurrence counts once.
countwords(words; kargs...) = countwords(words, Iterators.repeated(1); kargs...)

# A vector of pairs / tuples / vectors: first element is the word, second the count.
function countwords(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...)
    return countwords(first.(counter), [v[2] for v in counter]; kargs...)
end
@@ -234,7 +238,7 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
234
238
235
239
language = detect_language (keys (counter), language)
236
240
if ! haskey (STOPWORDS, language)
237
- @warn " No built-in stopwords for $(language) !"
241
+ @info " No built-in stopwords for $(language) !"
238
242
end
239
243
stopwords == :auto && (stopwords = get (STOPWORDS, language, nothing ))
240
244
stopwords === nothing && (stopwords = Set {String} ())
277
281
278
282
function processtext (text; language= :auto , kargs... )
279
283
language = detect_language (text, language)
280
- cwkw = (:counter , :regexp )
284
+ cwkw = (:counter , :regexp , :tokenizer , :lemmatizer )
281
285
processtext (
282
286
countwords (text; language= language, filter (kw -> first (kw) ∈ cwkw, kargs)... );
283
287
language= language,
0 commit comments