Skip to content

Commit

Permalink
Merge pull request #251 from epage/vars
Browse files Browse the repository at this point in the history
fix(dict): Correctly connect dict with varcon
  • Loading branch information
epage authored May 18, 2021
2 parents fa7ce95 + 04e55e4 commit 444d2cc
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 10 deletions.
18 changes: 10 additions & 8 deletions crates/typos-dict/verify/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ use structopt::StructOpt;
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let mut wtr = csv::Writer::from_writer(file);

let disallowed_typos = disallowed_typos();
let related_words = related_words();
let disallowed_typos = varcon_words();
let word_variants = proper_word_variants();

let mut reader = csv::ReaderBuilder::new()
.has_headers(false)
Expand All @@ -19,7 +19,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
continue;
}
let correction = related_words
let correction = word_variants
.get(correction)
.and_then(|words| find_best_match(typo, correction, words))
.unwrap_or(correction);
Expand All @@ -28,7 +28,9 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
wtr.flush().unwrap();
}

fn disallowed_typos() -> HashSet<unicase::UniCase<&'static str>> {
fn varcon_words() -> HashSet<unicase::UniCase<&'static str>> {
// Even include improper ones because we should be letting varcon handle that rather than our
// dictionary
varcon::VARCON
.iter()
.flat_map(|c| c.entries.iter())
Expand All @@ -37,7 +39,7 @@ fn disallowed_typos() -> HashSet<unicase::UniCase<&'static str>> {
.collect()
}

fn related_words() -> HashMap<&'static str, HashSet<&'static str>> {
fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> {
let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();
for entry in varcon::VARCON.iter().flat_map(|c| c.entries.iter()) {
let variants: HashSet<_> = entry
Expand All @@ -57,11 +59,11 @@ fn related_words() -> HashMap<&'static str, HashSet<&'static str>> {
fn find_best_match<'c>(
typo: &'c str,
correction: &'c str,
related_words: &HashSet<&'static str>,
word_variants: &HashSet<&'static str>,
) -> Option<&'c str> {
assert!(!related_words.contains(correction));
assert!(!word_variants.contains(correction));
let current = edit_distance::edit_distance(typo, correction);
let mut matches: Vec<_> = related_words
let mut matches: Vec<_> = word_variants
.iter()
.map(|r| (edit_distance::edit_distance(typo, r), *r))
.filter(|(d, _)| *d < current)
Expand Down
79 changes: 77 additions & 2 deletions src/dict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,14 @@ impl BuiltIn {

let word = word_token.token();
let mut corrections = if let Some(correction) = self.correct_with_dict(word) {
self.correct_with_vars(word)
.unwrap_or_else(|| Status::Corrections(vec![Cow::Borrowed(correction)]))
match self.correct_with_vars(correction) {
Some(Status::Valid) => Status::Corrections(vec![Cow::Borrowed(correction)]),
Some(correction @ Status::Corrections(_)) => correction,
Some(Status::Invalid) => {
unreachable!("correct_with_vars should always have valid suggestions")
}
None => Status::Corrections(vec![Cow::Borrowed(correction)]),
}
} else {
self.correct_with_vars(word)?
};
Expand Down Expand Up @@ -244,6 +250,75 @@ impl<'i, 'w, D: typos::Dictionary> typos::Dictionary for Override<'i, 'w, D> {
mod test {
use super::*;

/// With the default locale, the typo `finallizes` is corrected to `finalizes`.
#[cfg(feature = "dict")]
#[test]
fn test_dict_correct() {
    let builtin = BuiltIn::new(crate::config::Locale::default());
    let word = typos::tokens::Word::new_unchecked("finallizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(word);

    let expected = Some(Status::Corrections(vec!["finalizes".into()]));
    assert_eq!(actual, expected);
}

/// With `Locale::En`, the word `finalizes` is reported as valid.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_no_locale() {
    let builtin = BuiltIn::new(crate::config::Locale::En);
    let word = typos::tokens::Word::new_unchecked("finalizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(word);

    assert_eq!(actual, Some(Status::Valid));
}

/// With `Locale::EnUs`, the word `finalizes` is reported as valid.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_same_locale() {
    let builtin = BuiltIn::new(crate::config::Locale::EnUs);
    let word = typos::tokens::Word::new_unchecked("finalizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(word);

    assert_eq!(actual, Some(Status::Valid));
}

/// With `Locale::EnGb`, the word `finalizes` is corrected to `finalises`.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_different_locale() {
    let builtin = BuiltIn::new(crate::config::Locale::EnGb);
    let word = typos::tokens::Word::new_unchecked("finalizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(word);

    let expected = Some(Status::Corrections(vec!["finalises".into()]));
    assert_eq!(actual, expected);
}

/// With `Locale::EnGb` and both the `dict` and `vars` features enabled, the typo
/// `finallizes` is corrected to `finalises`.
#[cfg(all(feature = "dict", feature = "vars"))]
#[test]
fn test_dict_to_varcon() {
    let builtin = BuiltIn::new(crate::config::Locale::EnGb);
    let word = typos::tokens::Word::new_unchecked("finallizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(word);

    let expected = Some(Status::Corrections(vec!["finalises".into()]));
    assert_eq!(actual, expected);
}

#[test]
fn test_case_correct() {
let cases = [
Expand Down

0 comments on commit 444d2cc

Please sign in to comment.