From 77cfccb3926f8dc4f0478de395eaefdc92d24c06 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 15 May 2021 19:29:27 -0500 Subject: [PATCH 1/2] refactor(varcon): Clarify check's meanings --- crates/typos-dict/verify/src/main.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/typos-dict/verify/src/main.rs b/crates/typos-dict/verify/src/main.rs index 907571343..89b4994e9 100644 --- a/crates/typos-dict/verify/src/main.rs +++ b/crates/typos-dict/verify/src/main.rs @@ -6,8 +6,8 @@ use structopt::StructOpt; fn generate(file: &mut W, dict: &[u8]) { let mut wtr = csv::Writer::from_writer(file); - let disallowed_typos = disallowed_typos(); - let related_words = related_words(); + let disallowed_typos = varcon_words(); + let word_variants = proper_word_variants(); let mut reader = csv::ReaderBuilder::new() .has_headers(false) @@ -19,7 +19,7 @@ fn generate(file: &mut W, dict: &[u8]) { if disallowed_typos.contains(&unicase::UniCase::new(typo)) { continue; } - let correction = related_words + let correction = word_variants .get(correction) .and_then(|words| find_best_match(typo, correction, words)) .unwrap_or(correction); @@ -28,7 +28,9 @@ fn generate(file: &mut W, dict: &[u8]) { wtr.flush().unwrap(); } -fn disallowed_typos() -> HashSet> { +fn varcon_words() -> HashSet> { + // Even include improper ones because we should be letting varcon handle that rather than our + // dictionary varcon::VARCON .iter() .flat_map(|c| c.entries.iter()) @@ -37,7 +39,7 @@ fn disallowed_typos() -> HashSet> { .collect() } -fn related_words() -> HashMap<&'static str, HashSet<&'static str>> { +fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> { let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new(); for entry in varcon::VARCON.iter().flat_map(|c| c.entries.iter()) { let variants: HashSet<_> = entry @@ -57,11 +59,11 @@ fn related_words() -> HashMap<&'static str, HashSet<&'static str>> { fn find_best_match<'c>( typo: &'c str, correction: &'c str, - related_words: &HashSet<&'static str>, + word_variants: &HashSet<&'static str>, ) -> Option<&'c str> { - assert!(!related_words.contains(correction)); + assert!(!word_variants.contains(correction)); let current = edit_distance::edit_distance(typo, correction); - let mut matches: Vec<_> = related_words + let mut matches: Vec<_> = word_variants .iter() .map(|r| (edit_distance::edit_distance(typo, r), *r)) .filter(|(d, _)| *d < current) From 04e55e4e853a9a970840524963ef7fbb5d717bc4 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Mon, 17 May 2021 21:23:03 -0500 Subject: [PATCH 2/2] fix(dict): Correctly connect dict with varcon We had a bug where `finallizes` with EnGb would not correct to `finalises` --- src/dict.rs | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/src/dict.rs b/src/dict.rs index 7466593ee..e52453271 100644 --- a/src/dict.rs +++ b/src/dict.rs @@ -35,8 +35,14 @@ impl BuiltIn { let word = word_token.token(); let mut corrections = if let Some(correction) = self.correct_with_dict(word) { - self.correct_with_vars(word) - .unwrap_or_else(|| Status::Corrections(vec![Cow::Borrowed(correction)])) + match self.correct_with_vars(correction) { + Some(Status::Valid) => Status::Corrections(vec![Cow::Borrowed(correction)]), + Some(correction @ Status::Corrections(_)) => correction, + Some(Status::Invalid) => { + unreachable!("correct_with_vars should always have valid suggestions") + } + None => Status::Corrections(vec![Cow::Borrowed(correction)]), + } } else { self.correct_with_vars(word)? }; @@ -244,6 +250,75 @@ impl<'i, 'w, D: typos::Dictionary> typos::Dictionary for Override<'i, 'w, D> { mod test { use super::*; + #[cfg(feature = "dict")] + #[test] + fn test_dict_correct() { + let dict = BuiltIn::new(crate::config::Locale::default()); + let correction = dict.correct_word(typos::tokens::Word::new_unchecked( + "finallizes", + typos::tokens::Case::Lower, + 0, + )); + assert_eq!( + correction, + Some(Status::Corrections(vec!["finalizes".into()])) + ); + } + + #[cfg(feature = "vars")] + #[test] + fn test_varcon_no_locale() { + let dict = BuiltIn::new(crate::config::Locale::En); + let correction = dict.correct_word(typos::tokens::Word::new_unchecked( + "finalizes", + typos::tokens::Case::Lower, + 0, + )); + assert_eq!(correction, Some(Status::Valid)); + } + + #[cfg(feature = "vars")] + #[test] + fn test_varcon_same_locale() { + let dict = BuiltIn::new(crate::config::Locale::EnUs); + let correction = dict.correct_word(typos::tokens::Word::new_unchecked( + "finalizes", + typos::tokens::Case::Lower, + 0, + )); + assert_eq!(correction, Some(Status::Valid)); + } + + #[cfg(feature = "vars")] + #[test] + fn test_varcon_different_locale() { + let dict = BuiltIn::new(crate::config::Locale::EnGb); + let correction = dict.correct_word(typos::tokens::Word::new_unchecked( + "finalizes", + typos::tokens::Case::Lower, + 0, + )); + assert_eq!( + correction, + Some(Status::Corrections(vec!["finalises".into()])) + ); + } + + #[cfg(all(feature = "dict", feature = "vars"))] + #[test] + fn test_dict_to_varcon() { + let dict = BuiltIn::new(crate::config::Locale::EnGb); + let correction = dict.correct_word(typos::tokens::Word::new_unchecked( + "finallizes", + typos::tokens::Case::Lower, + 0, + )); + assert_eq!( + correction, + Some(Status::Corrections(vec!["finalises".into()])) + ); + } + #[test] fn test_case_correct() { let cases = [