From af66072272a260232982fde994654aa787286e58 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Wed, 17 Apr 2019 07:30:17 -0600 Subject: [PATCH] feat(dict): Perform case-insensitive comparisons --- Cargo.lock | 17 +++++++++++++++++ Cargo.toml | 4 +++- build.rs | 18 ++++++++++++------ src/dict.rs | 30 +++++++++++++++++++++++------- 4 files changed, 55 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bb7c02631..ed4833601 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -162,6 +162,7 @@ dependencies = [ "serde_derive 1.0.85 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.36 (registry+https://github.com/rust-lang/crates.io-index)", "structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", + "unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -434,6 +435,7 @@ version = "0.7.24" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -786,6 +788,14 @@ name = "ucd-util" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "unicase" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "unicode-segmentation" version = "1.2.1" @@ -819,6 +829,11 @@ name = "vec_map" version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "version_check" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "void" version = "1.0.2" @@ -955,12 +970,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = 
"c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" "checksum treeline 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f741b240f1a48843f9b8e0444fb55fb2a4ff67293b50a9179dfd5ea67f8d41" "checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" +"checksum unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7f4765f83163b74f957c797ad9253caf97f103fb064d3999aea9568d09fc8a33" "checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" "checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" "checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" +"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" "checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" "checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1" "checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = 
"92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" diff --git a/Cargo.toml b/Cargo.toml index 367c68529..39cf09628 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ failure = "0.1" structopt = "0.2" clap = "2" ignore = "0.4" -phf = "0.7" +phf = { version = "0.7", features = ["unicase"] } regex = "1.0" lazy_static = "1.2.0" grep-searcher = "0.1" @@ -28,6 +28,7 @@ serde = "1.0" serde_derive = "1.0" serde_json = "1.0" itertools = "0.8" +unicase = "1.1" [dev-dependencies] assert_fs = "0.10" @@ -35,3 +36,4 @@ assert_fs = "0.10" [build-dependencies] phf_codegen = "0.7" csv = "1.0" +unicase = "1.1" diff --git a/build.rs b/build.rs index ecbdcd576..8458fcbdf 100644 --- a/build.rs +++ b/build.rs @@ -10,15 +10,21 @@ fn main() { let mut file = BufWriter::new(File::create(&path).unwrap()); println!("rerun-if-changed=./assets/words.csv"); - write!(&mut file, "static DICTIONARY: phf::Map<&'static str, &'static str> = ").unwrap(); + write!(&mut file, "use unicase::UniCase;").unwrap(); + write!( + &mut file, + "static DICTIONARY: phf::Map, &'static str> = " + ) + .unwrap(); let mut builder = phf_codegen::Map::new(); - let records: Vec<_> = csv::Reader::from_reader(CORPUS).records().map(|r| r.unwrap()).collect(); + let records: Vec<_> = csv::Reader::from_reader(CORPUS) + .records() + .map(|r| r.unwrap()) + .collect(); for record in &records { let value = format!(r#""{}""#, &record[1]); - builder.entry(&record[0], &value); + builder.entry(unicase::UniCase(&record[0]), &value); } - builder - .build(&mut file) - .unwrap(); + builder.build(&mut file).unwrap(); write!(&mut file, ";\n").unwrap(); } diff --git a/src/dict.rs b/src/dict.rs index 365e39da7..fbc7df9ee 100644 --- a/src/dict.rs +++ b/src/dict.rs @@ -1,18 +1,34 @@ include!(concat!(env!("OUT_DIR"), "/codegen.rs")); -pub struct Dictionary { -} +pub struct Dictionary {} impl Dictionary { pub fn new() -> Self { - Dictionary { } + Dictionary {} + } + + pub fn correct_str<'s, 'w>(&'s self, word: &'w str) -> 
Option<&'s str> { + map_lookup(&DICTIONARY, word) } - pub fn correct_str<'s>(&'s self, word: &str) -> Option<&'s str> { - DICTIONARY.get(word).map(|s| *s) + pub fn correct_bytes<'s, 'w>(&'s self, word: &'w [u8]) -> Option<&'s str> { + std::str::from_utf8(word) + .ok() + .and_then(|word| self.correct_str(word)) } +} - pub fn correct_bytes<'s>(&'s self, word: &[u8]) -> Option<&'s str> { - std::str::from_utf8(word).ok().and_then(|word| DICTIONARY.get(word)).map(|s| *s) +fn map_lookup( + map: &'static phf::Map<UniCase<&'static str>, &'static str>, + key: &str, +) -> Option<&'static str> { + // This transmute should be safe as `get` will not store the reference with + // the expanded lifetime. This is due to `Borrow` being overly strict and + // can't have an impl for `&'static str` to `Borrow<&'a str>`. + // + // See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548 + unsafe { + let key = ::std::mem::transmute::<_, &'static str>(key); + map.get(&UniCase(key)).map(|s| *s) } }