Skip to content

Commit 73e1e2f

Browse files
authored
Merge pull request #28 from greyblake/prepare-v06
Prepare for v 0.6.0
2 parents e23fb7d + 201c89e commit 73e1e2f

File tree

16 files changed

+359
-261
lines changed

16 files changed

+359
-261
lines changed

.travis.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,11 @@ language: rust
22
rust:
33
- 1.30.1
44
- stable
5+
install:
6+
- rustup component add rustfmt-preview
7+
- rustup component add clippy-preview
8+
script:
9+
- cargo fmt -- --check
10+
- cargo clippy -- -D warnings
11+
- cargo test
12+
- cargo package

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ include = [
1414
"src/**/*",
1515
"test/**/*",
1616
"misc/data.json",
17-
"misc/supported_laguages.csv",
17+
"misc/supported_languages.csv",
18+
"templates/lang.rs",
1819
"build.rs",
1920
"Cargo.toml",
2021
"README.md"

benches/example.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#[macro_use]
22
extern crate bencher;
3-
extern crate whatlang;
43
extern crate serde_json;
4+
extern crate whatlang;
55

66
use bencher::Bencher;
77
use std::collections::HashMap;

build.rs

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
extern crate csv;
2-
extern crate skeptic;
3-
extern crate serde_json;
42
extern crate serde;
3+
extern crate serde_json;
4+
extern crate skeptic;
55
#[macro_use]
66
extern crate serde_derive;
77
extern crate tera;
88

9-
use std::io::{Write, BufReader, BufWriter};
109
use std::collections::HashMap;
10+
use std::env;
1111
use std::fs::File;
12+
use std::io::{BufReader, BufWriter, Write};
1213
use std::path::Path;
13-
use std::env;
1414

1515
const DATA_PATH: &'static str = "misc/data.json";
1616
const SUPPORTED_LANG_PATH: &'static str = "misc/supported_languages.csv";
@@ -53,16 +53,21 @@ fn generate_source_files() {
5353

5454
fn load_data() -> (Vec<LangInfo>, HashMap<String, Vec<Lang>>) {
5555
let data_file = BufReader::new(File::open(DATA_PATH).unwrap());
56-
let mut lang_reader = csv::ReaderBuilder::new().flexible(true).from_path(SUPPORTED_LANG_PATH).unwrap();
56+
let mut lang_reader = csv::ReaderBuilder::new()
57+
.flexible(true)
58+
.from_path(SUPPORTED_LANG_PATH)
59+
.unwrap();
5760

5861
let mut lang_infos: Vec<LangInfo> = lang_reader.deserialize().map(Result::unwrap).collect();
5962
lang_infos.sort_by(|left, right| left.code.cmp(&right.code));
6063

61-
let supported_lang_codes: HashMap<String, LangInfo> = lang_infos.iter()
64+
let supported_lang_codes: HashMap<String, LangInfo> = lang_infos
65+
.iter()
6266
.map(|lang| (lang.code.clone(), lang.clone()))
6367
.collect();
6468

65-
let lang_data: HashMap<String, HashMap<String, String>> = serde_json::from_reader(data_file).unwrap();
69+
let lang_data: HashMap<String, HashMap<String, String>> =
70+
serde_json::from_reader(data_file).unwrap();
6671

6772
let mut scripts: HashMap<String, Vec<Lang>> = HashMap::with_capacity(lang_data.len());
6873
let mut all_langs: Vec<Lang> = Vec::new();
@@ -75,23 +80,36 @@ fn load_data() -> (Vec<LangInfo>, HashMap<String, Vec<Lang>>) {
7580
let lang = Lang {
7681
info: (*info).clone(),
7782
script: script.clone(),
78-
trigrams: trigrams.split('|').map(Into::into).collect()
83+
trigrams: trigrams.split('|').map(Into::into).collect(),
7984
};
8085
if lang.trigrams.len() != TRIGRAM_COUNT {
81-
panic!("Language {} has {} trigrams, instead of {}", code, lang.trigrams.len(), TRIGRAM_COUNT);
86+
panic!(
87+
"Language {} has {} trigrams, instead of {}",
88+
code,
89+
lang.trigrams.len(),
90+
TRIGRAM_COUNT
91+
);
8292
}
8393

8494
all_langs.push(lang.clone());
85-
scripts.entry(script.clone()).or_insert_with(Vec::new).push(lang);
95+
scripts
96+
.entry(script.clone())
97+
.or_insert_with(Vec::new)
98+
.push(lang);
8699
}
87100
}
88101

89102
(lang_infos, scripts)
90103
}
91104

92-
fn render_lang_rs(buf: &mut BufWriter<File>, lang_infos: &[LangInfo], scripts: &HashMap<String, Vec<Lang>>) {
105+
fn render_lang_rs(
106+
buf: &mut BufWriter<File>,
107+
lang_infos: &[LangInfo],
108+
scripts: &HashMap<String, Vec<Lang>>,
109+
) {
93110
let mut tera = tera::Tera::default();
94-
tera.add_template_file(TEMPLATE_LANG_RS_PATH, Some("lang.rs")).unwrap();
111+
tera.add_template_file(TEMPLATE_LANG_RS_PATH, Some("lang.rs"))
112+
.unwrap();
95113

96114
let mut ctx = tera::Context::new();
97115
ctx.insert("lang_infos", lang_infos);

examples/cli.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ use whatlang::detect;
66
fn main() {
77
let mut text = String::new();
88
println!("Please enter a text:");
9-
io::stdin().read_line(&mut text).expect("Failed to read line");
9+
io::stdin()
10+
.read_line(&mut text)
11+
.expect("Failed to read line");
1012

1113
if let Some(info) = detect(&text) {
1214
println!("Language: {}", info.lang());
@@ -16,4 +18,3 @@ fn main() {
1618
println!("Cannot recognize a language :(");
1719
}
1820
}
19-

src/detect.rs

Lines changed: 63 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
use hashbrown::HashMap;
22

3+
use constants::{MAX_TOTAL_DISTANCE, MAX_TRIGRAM_DISTANCE};
4+
use info::Info;
35
use lang::*;
6+
use options::{List, Options};
47
use script::*;
58
use trigrams::*;
6-
use info::Info;
7-
use options::{Options, List};
8-
use constants::{MAX_TRIGRAM_DISTANCE, MAX_TOTAL_DISTANCE};
99

1010
/// Detect a language and a script by a given text.
1111
///
@@ -39,56 +39,66 @@ pub fn detect_lang_with_options(text: &str, options: &Options) -> Option<Lang> {
3939

4040
pub fn detect_with_options(text: &str, options: &Options) -> Option<Info> {
4141
detect_script(text).and_then(|script| {
42-
detect_lang_based_on_script(text, options, script).map( |(lang, confidence)| {
43-
Info { lang, script, confidence }
42+
detect_lang_based_on_script(text, options, script).map(|(lang, confidence)| Info {
43+
lang,
44+
script,
45+
confidence,
4446
})
4547
})
4648
}
4749

48-
fn detect_lang_based_on_script(text: &str, options: &Options, script : Script) -> Option<(Lang, f64)> {
50+
fn detect_lang_based_on_script(
51+
text: &str,
52+
options: &Options,
53+
script: Script,
54+
) -> Option<(Lang, f64)> {
4955
match script {
50-
Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS),
51-
Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS),
56+
Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS),
57+
Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS),
5258
Script::Devanagari => detect_lang_in_profiles(text, options, DEVANAGARI_LANGS),
53-
Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS),
54-
Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS),
55-
Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS),
56-
Script::Mandarin => Some((Lang::Cmn, 1.0)),
57-
Script::Bengali => Some((Lang::Ben, 1.0)),
58-
Script::Hangul => Some((Lang::Kor, 1.0)),
59-
Script::Georgian => Some((Lang::Kat, 1.0)),
60-
Script::Greek => Some((Lang::Ell, 1.0)),
61-
Script::Kannada => Some((Lang::Kan, 1.0)),
62-
Script::Tamil => Some((Lang::Tam, 1.0)),
63-
Script::Thai => Some((Lang::Tha, 1.0)),
64-
Script::Gujarati => Some((Lang::Guj, 1.0)),
65-
Script::Gurmukhi => Some((Lang::Pan, 1.0)),
66-
Script::Telugu => Some((Lang::Tel, 1.0)),
59+
Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS),
60+
Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS),
61+
Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS),
62+
Script::Mandarin => Some((Lang::Cmn, 1.0)),
63+
Script::Bengali => Some((Lang::Ben, 1.0)),
64+
Script::Hangul => Some((Lang::Kor, 1.0)),
65+
Script::Georgian => Some((Lang::Kat, 1.0)),
66+
Script::Greek => Some((Lang::Ell, 1.0)),
67+
Script::Kannada => Some((Lang::Kan, 1.0)),
68+
Script::Tamil => Some((Lang::Tam, 1.0)),
69+
Script::Thai => Some((Lang::Tha, 1.0)),
70+
Script::Gujarati => Some((Lang::Guj, 1.0)),
71+
Script::Gurmukhi => Some((Lang::Pan, 1.0)),
72+
Script::Telugu => Some((Lang::Tel, 1.0)),
6773
Script::Malayalam => Some((Lang::Mal, 1.0)),
68-
Script::Oriya => Some((Lang::Ori, 1.0)),
69-
Script::Myanmar => Some((Lang::Mya, 1.0)),
70-
Script::Sinhala => Some((Lang::Sin, 1.0)),
71-
Script::Khmer => Some((Lang::Khm, 1.0)),
72-
Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0))
74+
Script::Oriya => Some((Lang::Ori, 1.0)),
75+
Script::Myanmar => Some((Lang::Mya, 1.0)),
76+
Script::Sinhala => Some((Lang::Sin, 1.0)),
77+
Script::Khmer => Some((Lang::Khm, 1.0)),
78+
Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)),
7379
}
7480
}
7581

76-
fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : LangProfileList) -> Option<(Lang, f64)> {
77-
let mut lang_distances : Vec<(Lang, u32)> = vec![];
82+
fn detect_lang_in_profiles(
83+
text: &str,
84+
options: &Options,
85+
lang_profile_list: LangProfileList,
86+
) -> Option<(Lang, f64)> {
87+
let mut lang_distances: Vec<(Lang, u32)> = vec![];
7888
let trigrams = get_trigrams_with_positions(text);
7989

8090
for &(ref lang, lang_trigrams) in lang_profile_list {
8191
match options.list {
8292
Some(List::White(ref whitelist)) if !whitelist.contains(lang) => continue,
8393
Some(List::Black(ref blacklist)) if blacklist.contains(lang) => continue,
84-
_ => {},
94+
_ => {}
8595
}
8696
let dist = calculate_distance(lang_trigrams, &trigrams);
8797
lang_distances.push(((*lang), dist));
8898
}
8999

90100
// Sort languages by distance
91-
lang_distances.sort_by_key(|key| key.1 );
101+
lang_distances.sort_by_key(|key| key.1);
92102

93103
// Return None if lang_distances is empty
94104
// Return the only language with is_reliable=true if there is only 1 item
@@ -117,37 +127,36 @@ fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : La
117127
// * Text really matches one of the languages.
118128
//
119129
// Number 500.0 is based on experiments and common sense expectations.
120-
let mut confidence = (score1 as f64) / 500.0;
130+
let mut confidence = f64::from(score1) / 500.0;
121131
if confidence > 1.0 {
122132
confidence = 1.0;
123133
}
124134
return Some((lang_dist1.0, confidence));
125135
}
126136

127-
let rate = (score1 - score2) as f64 / (score2 as f64);
137+
let rate = f64::from(score1 - score2) / f64::from(score2);
128138

129139
// Hyperbola function. Everything that is above the function has confidence = 1.0
130140
// If rate is below, confidence is calculated proportionally.
131141
// Numbers 12.0 and 0.05 are obtained experimentally, so the function represents common sense.
132142
//
133143
let confident_rate = (12.0 / trigrams.len() as f64) + 0.05;
134-
let confidence =
135-
if rate > confident_rate {
136-
1.0
137-
} else {
138-
rate / confident_rate
139-
};
144+
let confidence = if rate > confident_rate {
145+
1.0
146+
} else {
147+
rate / confident_rate
148+
};
140149

141150
Some((lang_dist1.0, confidence))
142151
}
143152

144-
fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
153+
fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
145154
let mut total_dist = 0u32;
146155

147156
for (i, &trigram) in lang_trigrams.iter().enumerate() {
148157
let dist = match text_trigrams.get(trigram) {
149158
Some(&n) => (n as i32 - i as i32).abs() as u32,
150-
None => MAX_TRIGRAM_DISTANCE
159+
None => MAX_TRIGRAM_DISTANCE,
151160
};
152161
total_dist += dist;
153162
}
@@ -186,7 +195,16 @@ mod tests {
186195
assert_eq!(info.lang, Lang::Tgl);
187196

188197
// with blacklist
189-
let blacklist = vec![Lang::Tgl, Lang::Jav, Lang::Nld, Lang::Uzb, Lang::Swe, Lang::Nob, Lang::Ceb, Lang::Ilo];
198+
let blacklist = vec![
199+
Lang::Tgl,
200+
Lang::Jav,
201+
Lang::Nld,
202+
Lang::Uzb,
203+
Lang::Swe,
204+
Lang::Nob,
205+
Lang::Ceb,
206+
Lang::Ilo,
207+
];
190208
let options = Options::new().set_blacklist(blacklist);
191209
let output = detect_with_options(text, &options);
192210
assert_eq!(output.is_some(), true);
@@ -224,7 +242,9 @@ mod tests {
224242
let info = detect("qwertyuioasdfghjklzxcvbnm").unwrap();
225243
assert!(!info.is_reliable());
226244

227-
let info = detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm").unwrap();
245+
let info =
246+
detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm")
247+
.unwrap();
228248
assert!(!info.is_reliable());
229249

230250
// 1000 chars of randomly generated Cyrillic text

src/detector.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
use lang::Lang;
2-
use script::Script;
3-
use script::detect_script;
1+
use detect;
42
use info::Info;
3+
use lang::Lang;
54
use options::Options;
6-
use detect;
5+
use script::detect_script;
6+
use script::Script;
77

88
/// Configurable structure that holds detection options and provides functions
99
/// to detect language and script.
@@ -72,7 +72,10 @@ mod tests {
7272
#[test]
7373
fn test_detect_script() {
7474
// Russian, Cyrillic
75-
assert_eq!(Detector::new().detect_script("Кириллица"), Some(Script::Cyrillic));
75+
assert_eq!(
76+
Detector::new().detect_script("Кириллица"),
77+
Some(Script::Cyrillic)
78+
);
7679
}
7780

7881
#[test]

src/info.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ const RELIABLE_CONFIDENCE_THRESHOLD: f64 = 0.8;
88
pub struct Info {
99
pub(crate) lang: Lang,
1010
pub(crate) script: Script,
11-
pub(crate) confidence: f64
11+
pub(crate) confidence: f64,
1212
}
1313

1414
impl Info {

src/lang.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ impl Lang {
3333
/// use whatlang::Lang;
3434
/// assert_eq!(Lang::Ukr.name(), "Українська");
3535
/// ```
36-
pub fn name(&self) -> &'static str {
37-
lang_to_name(*self)
36+
pub fn name(self) -> &'static str {
37+
lang_to_name(self)
3838
}
3939

4040
/// Get a human readable name of the language in English.
@@ -44,8 +44,8 @@ impl Lang {
4444
/// use whatlang::Lang;
4545
/// assert_eq!(Lang::Deu.eng_name(), "German");
4646
/// ```
47-
pub fn eng_name(&self) -> &'static str {
48-
lang_to_eng_name(*self)
47+
pub fn eng_name(self) -> &'static str {
48+
lang_to_eng_name(self)
4949
}
5050
}
5151

0 commit comments

Comments
 (0)