Skip to content

Commit f310ce3

Browse files
committed
fix(textprocessing): improve tokenizer
1 parent 43ec72f commit f310ce3

File tree

3 files changed

+35
-18
lines changed

3 files changed

+35
-18
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "WordCloud"
22
uuid = "6385f0a0-cb03-45b6-9089-4e0acc74b26b"
33
authors = ["guoyongzhi <guo-yong-zhi@outlook.com>"]
4-
version = "1.2.2"
4+
version = "1.3.0"
55

66
[deps]
77
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"

src/textprocessing.jl

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,12 @@ function lemmatize!(d::AbstractDict, lemmatizer)
4747
end
4848

4949
function tokenizer(text::AbstractString, regexp=r"\w+")
50-
[text[i] for i in findall(regexp, text)]
50+
(m.match for m in eachmatch(regexp, text))
5151
end
5252

53-
function tokenizer_eng(text::AbstractString, regexp=r"\b\w+(?:'\w+)*\b")
54-
indices = findall(regexp, text)
55-
[endswith(text[i], "'s") ? text[i][1:prevind(text[i], end, 2)] : text[i] for i in indices]
53+
function tokenizer_eng(text::AbstractString, regexp=r"\b[\w']+\b")
54+
ms = eachmatch(regexp, text)
55+
(endswith(m.match, "'s") ? m.match[1:prevind(m.match, end, 2)] : m.match for m in ms)
5656
end
5757

5858
# ISO 639-3 macrolanguages
@@ -98,10 +98,9 @@ Count words in text. And save results into `counter`.
9898
`text_or_counter` can be a String, a Vector of Strings, an opened file (IO) or a Dict.
9999
`regexp` is a regular expression to partially match and filter words. For example, `regexp=r"\S(?:[\s\S]*\S)?"` will trim whitespaces then eliminate empty words.
100100
"""
101-
function countwords(words, counts; language=:auto,
101+
function countwords(words, counts; lemmatizer=:auto, language=:auto,
102102
regexp=r"(?:\S[\s\S]*)?[^0-9_\W](?:[\s\S]*\S)?", counter=Dict{String,Int}())
103103
# strip whitespace and filter out pure punctuation and number string
104-
language = detect_language(words, language)
105104
for (w, c) in zip(words, counts)
106105
if regexp !== nothing
107106
m = match(regexp, w)
@@ -113,21 +112,26 @@ function countwords(words, counts; language=:auto,
113112
counter[w] = get(counter, w, 0) + c
114113
end
115114
end
116-
lemmatizer_ = get(LEMMATIZERS, language, LEMMATIZERS["_default_"])
117-
lemmatize!(counter, lemmatizer_)
115+
if lemmatizer == :auto
116+
language = detect_language(words, language)
117+
lemmatizer = get(LEMMATIZERS, language, LEMMATIZERS["_default_"])
118+
end
119+
lemmatize!(counter, lemmatizer)
118120
counter
119121
end
120-
function countwords(text::AbstractString; language=:auto, kargs...)
121-
language = detect_language(text, language)
122-
if !haskey(TOKENIZERS, language)
123-
@warn "No built-in tokenizer for $(language)!"
122+
function countwords(text::AbstractString; tokenizer=:auto, language=:auto, kargs...)
123+
if tokenizer == :auto
124+
language = detect_language(text, language)
125+
if !haskey(TOKENIZERS, language)
126+
@info "No dedicated built-in tokenizer for $(language); using basic tokenizer instead"
127+
end
128+
tokenizer = get(TOKENIZERS, language, TOKENIZERS["_default_"])
124129
end
125-
tokenizer_ = get(TOKENIZERS, language, TOKENIZERS["_default_"])
126-
countwords(tokenizer_(text); language=language, kargs...)
130+
countwords(tokenizer(text); language=language, kargs...)
127131
end
128-
countwords(words::AbstractVector{<:AbstractString}; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
129132
countwords(counter::AbstractDict{<:AbstractString,<:Real}; kargs...) = countwords(keys(counter), values(counter); kargs...)
130133
countwords(wordscounts::Tuple; kargs...) = countwords(wordscounts...; kargs...)
134+
countwords(words; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
131135
function countwords(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...)
132136
countwords(first.(counter), [v[2] for v in counter]; kargs...)
133137
end
@@ -234,7 +238,7 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
234238

235239
language = detect_language(keys(counter), language)
236240
if !haskey(STOPWORDS, language)
237-
@warn "No built-in stopwords for $(language)!"
241+
@info "No built-in stopwords for $(language)!"
238242
end
239243
stopwords == :auto && (stopwords = get(STOPWORDS, language, nothing))
240244
stopwords === nothing && (stopwords = Set{String}())
@@ -277,7 +281,7 @@ end
277281

278282
function processtext(text; language=:auto, kargs...)
279283
language = detect_language(text, language)
280-
cwkw = (:counter, :regexp)
284+
cwkw = (:counter, :regexp, :tokenizer, :lemmatizer)
281285
processtext(
282286
countwords(text; language=language, filter(kw -> first(kw) ∈ cwkw, kargs)...);
283287
language=language,

test/test_textprocessing.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,19 @@
99
words, weights = WordCloud.TextProcessing.processtext(c)
1010
@test !("to" in words) # stopwords
1111

12+
tokenizer_eng = WordCloud.TextProcessing.tokenizer_eng
13+
tokenizer_default = WordCloud.TextProcessing.tokenizer
14+
@test tokenizer_default(" a man の 书本\n 1234") .|> strip == ["a", "man", "", "书本", "1234"]
15+
@test tokenizer_eng(" a book in 1994\n") .|> strip == ["a", "book", "in", "1994"]
16+
@test tokenizer_eng(" the 'best-book' in 1994\n") .|> strip == ["the", "best", "book", "in", "1994"]
17+
@test tokenizer_eng("")|>collect == tokenizer_eng(" ")|>collect == tokenizer_eng(" ,")|>collect == []
18+
@test tokenizer_eng(" a _int_var3") .|> strip == ["a", "_int_var3"]
19+
@test tokenizer_eng("bob's book") .|> strip == ["bob", "book"]
20+
@test tokenizer_eng("bob's 'book' 'book'") .|> strip == ["bob", "book", "book"]
21+
@test tokenizer_eng("abc'de fg'h'ij k'l") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
22+
@test tokenizer_eng("abc'de', fg'h'ij' k'l'") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
23+
@test tokenizer_eng(" abc'de'. fg'h'ij',k'l'") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
24+
1225
lemmatizer_eng = WordCloud.TextProcessing.lemmatizer_eng
1326
lemmatize! = WordCloud.TextProcessing.lemmatize!
1427
@test lemmatizer_eng("Cars") == "Car"

0 commit comments

Comments
 (0)