Skip to content

Commit dce0067

Browse files
committed
feat(textprocessing): unknown languages use eng stopwords
1 parent fb3bdbd commit dce0067

File tree

2 files changed

+4
-1
lines changed

2 files changed

+4
-1
lines changed

src/textprocessing.jl

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -242,10 +242,12 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
242242

243243
if stopwords == :auto
244244
language = detect_language(keys(counter), language)
245+
lk = language
245246
if !haskey(STOPWORDS, language)
246247
@info "No built-in stopwords for $(language)!"
248+
lk = "eng"
247249
end
248-
stopwords = get(STOPWORDS, language, nothing)
250+
stopwords = STOPWORDS[lk]
249251
end
250252
stopwords === nothing && (stopwords = Set{String}())
251253
stopwords isa AbstractSet || (stopwords = Set(stopwords))

test/test_textprocessing.jl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,7 @@
7373
@test processtext("word cloud is a cloud", language="en", stopwords_extra=["word"])[1] |> only == "cloud"
7474
# settokenizer! ...
7575
WordCloud.settokenizer!("mylang", t->split(t, "a"))
76+
WordCloud.setstopwords!("mylang", [])
7677
@test Set(processtext("bananais", language="mylang")[1]) == Set(["b", "n", "is"])
7778
WordCloud.setlemmatizer!("mylang", uppercase)
7879
@test Set(processtext("bananais", language="mylang")[1]) == Set(["B", "N", "IS"])

0 commit comments

Comments
 (0)