Skip to content

Commit 6414e72

Browse files
Several improvements to token filter types (#4291) (#4332)
(cherry picked from commit f5dca08) Co-authored-by: Josh Mock <joshua.mock@elastic.co>
1 parent f99ac88 commit 6414e72

File tree

5 files changed

+313
-45
lines changed

5 files changed

+313
-45
lines changed

specification/_types/analysis/StopWords.ts

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,50 @@
1717
* under the License.
1818
*/
1919

20+
/**
 * Predefined stop word language identifiers.
 * A member of this enum is one of the two forms accepted by the `StopWords`
 * type in this file (the other being an explicit `string[]` of stop words).
 * Each member name is written with leading and trailing underscores,
 * e.g. `_arabic_` or `_thai_`.
 * NOTE(review): `_none_` presumably selects an empty stop word list — confirm
 * against the Elasticsearch stop token filter documentation.
 */
export enum StopWordLanguage {
21+
_arabic_,
22+
_armenian_,
23+
_basque_,
24+
_bengali_,
25+
_brazilian_,
26+
_bulgarian_,
27+
_catalan_,
28+
_cjk_,
29+
_czech_,
30+
_danish_,
31+
_dutch_,
32+
_english_,
33+
_estonian_,
34+
_finnish_,
35+
_french_,
36+
_galician_,
37+
_german_,
38+
_greek_,
39+
_hindi_,
40+
_hungarian_,
41+
_indonesian_,
42+
_irish_,
43+
_italian_,
44+
_latvian_,
45+
_lithuanian_,
46+
_norwegian_,
47+
_persian_,
48+
_portuguese_,
49+
_romanian_,
50+
_russian_,
51+
_serbian_,
52+
_sorani_,
53+
_spanish_,
54+
_swedish_,
55+
_thai_,
56+
_turkish_,
57+
_none_
58+
}
59+
2060
/**
2161
* Language value, such as _arabic_ or _thai_. Defaults to _english_.
2262
* Each language value corresponds to a predefined list of stop words in Lucene. See Stop words by language for supported language values and their stop words.
2363
* Also accepts an array of stop words.
2464
* @class_serializer: StopWordsFormatter
2565
*/
26-
export type StopWords = string | string[]
66+
export type StopWords = StopWordLanguage | string[]

specification/_types/analysis/kuromoji-plugin.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import { integer } from '@_types/Numeric'
2121
import { CharFilterBase } from './char_filters'
22+
import { StopWords } from './StopWords'
2223
import { TokenizerBase } from './tokenizers'
2324
import { TokenFilterBase } from './token_filters'
2425

@@ -28,6 +29,11 @@ export class KuromojiAnalyzer {
2829
user_dictionary?: string
2930
}
3031

32+
/**
 * Token filter definition identified by the literal type `ja_stop`,
 * declared alongside the other kuromoji plugin analysis components.
 */
export class JaStopTokenFilter extends TokenFilterBase {
33+
type: 'ja_stop'
34+
/**
 * Optional stop words: either a predefined language value or an explicit
 * array of words, as allowed by the `StopWords` type.
 */
stopwords?: StopWords
35+
}
36+
3137
export class KuromojiIterationMarkCharFilter extends CharFilterBase {
3238
type: 'kuromoji_iteration_mark'
3339
normalize_kana: boolean

specification/_types/analysis/languages.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,30 @@
1818
*/
1919

2020
export enum SnowballLanguage {
21+
Arabic,
2122
Armenian,
2223
Basque,
2324
Catalan,
2425
Danish,
2526
Dutch,
2627
English,
28+
Estonian,
2729
Finnish,
2830
French,
2931
German,
3032
German2,
3133
Hungarian,
3234
Italian,
35+
Irish,
3336
Kp,
37+
Lithuanian,
3438
Lovins,
3539
Norwegian,
3640
Porter,
3741
Portuguese,
3842
Romanian,
3943
Russian,
44+
Serbian,
4045
Spanish,
4146
Swedish,
4247
Turkish

specification/_types/analysis/nori-plugin.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919

2020
import { TokenizerBase } from './tokenizers'
21+
import { TokenFilterBase } from './token_filters'
2122

2223
export enum NoriDecompoundMode {
2324
discard,
@@ -32,3 +33,9 @@ export class NoriTokenizer extends TokenizerBase {
3233
user_dictionary?: string
3334
user_dictionary_rules?: string[]
3435
}
36+
37+
/**
 * Token filter definition identified by the literal type
 * `nori_part_of_speech`, declared alongside the nori plugin tokenizer.
 */
export class NoriPartOfSpeechTokenFilter extends TokenFilterBase {
38+
type: 'nori_part_of_speech'
39+
/** An array of part-of-speech tags that should be removed. */
40+
stoptags?: string[]
41+
}

0 commit comments

Comments
 (0)