|
1 | 1 | use hashbrown::HashMap;
|
2 | 2 |
|
| 3 | +use constants::{MAX_TOTAL_DISTANCE, MAX_TRIGRAM_DISTANCE}; |
| 4 | +use info::Info; |
3 | 5 | use lang::*;
|
| 6 | +use options::{List, Options}; |
4 | 7 | use script::*;
|
5 | 8 | use trigrams::*;
|
6 |
| -use info::Info; |
7 |
| -use options::{Options, List}; |
8 |
| -use constants::{MAX_TRIGRAM_DISTANCE, MAX_TOTAL_DISTANCE}; |
9 | 9 |
|
10 | 10 | /// Detect a language and a script by a given text.
|
11 | 11 | ///
|
@@ -39,56 +39,66 @@ pub fn detect_lang_with_options(text: &str, options: &Options) -> Option<Lang> {
|
39 | 39 |
|
40 | 40 | pub fn detect_with_options(text: &str, options: &Options) -> Option<Info> {
|
41 | 41 | detect_script(text).and_then(|script| {
|
42 |
| - detect_lang_based_on_script(text, options, script).map( |(lang, confidence)| { |
43 |
| - Info { lang, script, confidence } |
| 42 | + detect_lang_based_on_script(text, options, script).map(|(lang, confidence)| Info { |
| 43 | + lang, |
| 44 | + script, |
| 45 | + confidence, |
44 | 46 | })
|
45 | 47 | })
|
46 | 48 | }
|
47 | 49 |
|
48 |
| -fn detect_lang_based_on_script(text: &str, options: &Options, script : Script) -> Option<(Lang, f64)> { |
| 50 | +fn detect_lang_based_on_script( |
| 51 | + text: &str, |
| 52 | + options: &Options, |
| 53 | + script: Script, |
| 54 | +) -> Option<(Lang, f64)> { |
49 | 55 | match script {
|
50 |
| - Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS), |
51 |
| - Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS), |
| 56 | + Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS), |
| 57 | + Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS), |
52 | 58 | Script::Devanagari => detect_lang_in_profiles(text, options, DEVANAGARI_LANGS),
|
53 |
| - Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS), |
54 |
| - Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS), |
55 |
| - Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS), |
56 |
| - Script::Mandarin => Some((Lang::Cmn, 1.0)), |
57 |
| - Script::Bengali => Some((Lang::Ben, 1.0)), |
58 |
| - Script::Hangul => Some((Lang::Kor, 1.0)), |
59 |
| - Script::Georgian => Some((Lang::Kat, 1.0)), |
60 |
| - Script::Greek => Some((Lang::Ell, 1.0)), |
61 |
| - Script::Kannada => Some((Lang::Kan, 1.0)), |
62 |
| - Script::Tamil => Some((Lang::Tam, 1.0)), |
63 |
| - Script::Thai => Some((Lang::Tha, 1.0)), |
64 |
| - Script::Gujarati => Some((Lang::Guj, 1.0)), |
65 |
| - Script::Gurmukhi => Some((Lang::Pan, 1.0)), |
66 |
| - Script::Telugu => Some((Lang::Tel, 1.0)), |
| 59 | + Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS), |
| 60 | + Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS), |
| 61 | + Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS), |
| 62 | + Script::Mandarin => Some((Lang::Cmn, 1.0)), |
| 63 | + Script::Bengali => Some((Lang::Ben, 1.0)), |
| 64 | + Script::Hangul => Some((Lang::Kor, 1.0)), |
| 65 | + Script::Georgian => Some((Lang::Kat, 1.0)), |
| 66 | + Script::Greek => Some((Lang::Ell, 1.0)), |
| 67 | + Script::Kannada => Some((Lang::Kan, 1.0)), |
| 68 | + Script::Tamil => Some((Lang::Tam, 1.0)), |
| 69 | + Script::Thai => Some((Lang::Tha, 1.0)), |
| 70 | + Script::Gujarati => Some((Lang::Guj, 1.0)), |
| 71 | + Script::Gurmukhi => Some((Lang::Pan, 1.0)), |
| 72 | + Script::Telugu => Some((Lang::Tel, 1.0)), |
67 | 73 | Script::Malayalam => Some((Lang::Mal, 1.0)),
|
68 |
| - Script::Oriya => Some((Lang::Ori, 1.0)), |
69 |
| - Script::Myanmar => Some((Lang::Mya, 1.0)), |
70 |
| - Script::Sinhala => Some((Lang::Sin, 1.0)), |
71 |
| - Script::Khmer => Some((Lang::Khm, 1.0)), |
72 |
| - Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)) |
| 74 | + Script::Oriya => Some((Lang::Ori, 1.0)), |
| 75 | + Script::Myanmar => Some((Lang::Mya, 1.0)), |
| 76 | + Script::Sinhala => Some((Lang::Sin, 1.0)), |
| 77 | + Script::Khmer => Some((Lang::Khm, 1.0)), |
| 78 | + Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)), |
73 | 79 | }
|
74 | 80 | }
|
75 | 81 |
|
76 |
| -fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : LangProfileList) -> Option<(Lang, f64)> { |
77 |
| - let mut lang_distances : Vec<(Lang, u32)> = vec![]; |
| 82 | +fn detect_lang_in_profiles( |
| 83 | + text: &str, |
| 84 | + options: &Options, |
| 85 | + lang_profile_list: LangProfileList, |
| 86 | +) -> Option<(Lang, f64)> { |
| 87 | + let mut lang_distances: Vec<(Lang, u32)> = vec![]; |
78 | 88 | let trigrams = get_trigrams_with_positions(text);
|
79 | 89 |
|
80 | 90 | for &(ref lang, lang_trigrams) in lang_profile_list {
|
81 | 91 | match options.list {
|
82 | 92 | Some(List::White(ref whitelist)) if !whitelist.contains(lang) => continue,
|
83 | 93 | Some(List::Black(ref blacklist)) if blacklist.contains(lang) => continue,
|
84 |
| - _ => {}, |
| 94 | + _ => {} |
85 | 95 | }
|
86 | 96 | let dist = calculate_distance(lang_trigrams, &trigrams);
|
87 | 97 | lang_distances.push(((*lang), dist));
|
88 | 98 | }
|
89 | 99 |
|
90 | 100 | // Sort languages by distance
|
91 |
| - lang_distances.sort_by_key(|key| key.1 ); |
| 101 | + lang_distances.sort_by_key(|key| key.1); |
92 | 102 |
|
93 | 103 | // Return None if lang_distances is empty
|
94 | 104 | // Return the only language with is_reliable=true if there is only 1 item
|
@@ -117,37 +127,36 @@ fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : La
|
117 | 127 | // * Text really matches one of the languages.
|
118 | 128 | //
|
119 | 129 | // Number 500.0 is based on experiments and common sense expectations.
|
120 |
| - let mut confidence = (score1 as f64) / 500.0; |
| 130 | + let mut confidence = f64::from(score1) / 500.0; |
121 | 131 | if confidence > 1.0 {
|
122 | 132 | confidence = 1.0;
|
123 | 133 | }
|
124 | 134 | return Some((lang_dist1.0, confidence));
|
125 | 135 | }
|
126 | 136 |
|
127 |
| - let rate = (score1 - score2) as f64 / (score2 as f64); |
| 137 | + let rate = f64::from(score1 - score2) / f64::from(score2); |
128 | 138 |
|
129 | 139 | // Hyperbola function. Everything that is above the function has confidence = 1.0
|
130 | 140 | // If rate is below, confidence is calculated proportionally.
|
131 | 141 | // Numbers 12.0 and 0.05 are obtained experimentally, so the function represents common sense.
|
132 | 142 | //
|
133 | 143 | let confident_rate = (12.0 / trigrams.len() as f64) + 0.05;
|
134 |
| - let confidence = |
135 |
| - if rate > confident_rate { |
136 |
| - 1.0 |
137 |
| - } else { |
138 |
| - rate / confident_rate |
139 |
| - }; |
| 144 | + let confidence = if rate > confident_rate { |
| 145 | + 1.0 |
| 146 | + } else { |
| 147 | + rate / confident_rate |
| 148 | + }; |
140 | 149 |
|
141 | 150 | Some((lang_dist1.0, confidence))
|
142 | 151 | }
|
143 | 152 |
|
144 |
| -fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 { |
| 153 | +fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 { |
145 | 154 | let mut total_dist = 0u32;
|
146 | 155 |
|
147 | 156 | for (i, &trigram) in lang_trigrams.iter().enumerate() {
|
148 | 157 | let dist = match text_trigrams.get(trigram) {
|
149 | 158 | Some(&n) => (n as i32 - i as i32).abs() as u32,
|
150 |
| - None => MAX_TRIGRAM_DISTANCE |
| 159 | + None => MAX_TRIGRAM_DISTANCE, |
151 | 160 | };
|
152 | 161 | total_dist += dist;
|
153 | 162 | }
|
@@ -186,7 +195,16 @@ mod tests {
|
186 | 195 | assert_eq!(info.lang, Lang::Tgl);
|
187 | 196 |
|
188 | 197 | // with blacklist
|
189 |
| - let blacklist = vec![Lang::Tgl, Lang::Jav, Lang::Nld, Lang::Uzb, Lang::Swe, Lang::Nob, Lang::Ceb, Lang::Ilo]; |
| 198 | + let blacklist = vec![ |
| 199 | + Lang::Tgl, |
| 200 | + Lang::Jav, |
| 201 | + Lang::Nld, |
| 202 | + Lang::Uzb, |
| 203 | + Lang::Swe, |
| 204 | + Lang::Nob, |
| 205 | + Lang::Ceb, |
| 206 | + Lang::Ilo, |
| 207 | + ]; |
190 | 208 | let options = Options::new().set_blacklist(blacklist);
|
191 | 209 | let output = detect_with_options(text, &options);
|
192 | 210 | assert_eq!(output.is_some(), true);
|
@@ -224,7 +242,9 @@ mod tests {
|
224 | 242 | let info = detect("qwertyuioasdfghjklzxcvbnm").unwrap();
|
225 | 243 | assert!(!info.is_reliable());
|
226 | 244 |
|
227 |
| - let info = detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm").unwrap(); |
| 245 | + let info = |
| 246 | + detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm") |
| 247 | + .unwrap(); |
228 | 248 | assert!(!info.is_reliable());
|
229 | 249 |
|
230 | 250 | // 1000 chars of randomly generated Cyrillic text
|
|
0 commit comments