Skip to content

Commit

Permalink
feat(dict): Perform case-insensitive comparisons
Browse files Browse the repository at this point in the history
  • Loading branch information
epage committed Jun 14, 2019
1 parent 719cc7d commit af66072
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 14 deletions.
17 changes: 17 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,20 @@ failure = "0.1"
structopt = "0.2"
clap = "2"
ignore = "0.4"
phf = "0.7"
phf = { version = "0.7", features = ["unicase"] }
regex = "1.0"
lazy_static = "1.2.0"
grep-searcher = "0.1"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
itertools = "0.8"
unicase = "1.1"

[dev-dependencies]
assert_fs = "0.10"

[build-dependencies]
phf_codegen = "0.7"
csv = "1.0"
unicase = "1.1"
18 changes: 12 additions & 6 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,21 @@ fn main() {
let mut file = BufWriter::new(File::create(&path).unwrap());

println!("rerun-if-changed=./assets/words.csv");
write!(&mut file, "static DICTIONARY: phf::Map<&'static str, &'static str> = ").unwrap();
write!(&mut file, "use unicase::UniCase;").unwrap();
write!(
&mut file,
"static DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
)
.unwrap();
let mut builder = phf_codegen::Map::new();
let records: Vec<_> = csv::Reader::from_reader(CORPUS).records().map(|r| r.unwrap()).collect();
let records: Vec<_> = csv::Reader::from_reader(CORPUS)
.records()
.map(|r| r.unwrap())
.collect();
for record in &records {
let value = format!(r#""{}""#, &record[1]);
builder.entry(&record[0], &value);
builder.entry(unicase::UniCase(&record[0]), &value);
}
builder
.build(&mut file)
.unwrap();
builder.build(&mut file).unwrap();
write!(&mut file, ";\n").unwrap();
}
30 changes: 23 additions & 7 deletions src/dict.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,34 @@
include!(concat!(env!("OUT_DIR"), "/codegen.rs"));

pub struct Dictionary {
}
pub struct Dictionary {}

impl Dictionary {
pub fn new() -> Self {
Dictionary { }
Dictionary {}
}

/// Looks up `word` in the generated `DICTIONARY` and returns the stored
/// replacement, if any.
///
/// The lookup is delegated to `map_lookup`, which wraps the key in `UniCase`
/// so that the comparison ignores case.
pub fn correct_str<'s, 'w>(&'s self, word: &'w str) -> Option<&'s str> {
    let dictionary = &DICTIONARY;
    map_lookup(dictionary, word)
}

pub fn correct_str<'s>(&'s self, word: &str) -> Option<&'s str> {
DICTIONARY.get(word).map(|s| *s)
/// Byte-slice variant of `correct_str`.
///
/// Returns `None` when `word` is not valid UTF-8; otherwise forwards the
/// decoded text to `correct_str` and returns its result unchanged.
pub fn correct_bytes<'s, 'w>(&'s self, word: &'w [u8]) -> Option<&'s str> {
    match std::str::from_utf8(word) {
        Ok(text) => self.correct_str(text),
        // Non-UTF-8 input cannot match any dictionary key.
        Err(_) => None,
    }
}
}

pub fn correct_bytes<'s>(&'s self, word: &[u8]) -> Option<&'s str> {
std::str::from_utf8(word).ok().and_then(|word| DICTIONARY.get(word)).map(|s| *s)
/// Looks up `key` in `map`, comparing through `UniCase`, and returns the
/// associated value if an entry exists.
///
/// The map is keyed by `UniCase<&'static str>`, and `Borrow` is too strict to
/// allow querying it with a `UniCase<&'a str>` of a shorter lifetime, so the
/// key's lifetime is extended for the duration of the call.
///
/// See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
fn map_lookup(
    map: &'static phf::Map<UniCase<&'static str>, &'static str>,
    key: &str,
) -> Option<&'static str> {
    // SAFETY: only the lifetime of the reference changes; source and target
    // are both `&str`. This relies on `get` not storing the key reference
    // past the call. The value handed back is the map's own `&'static str`
    // entry, which this function never derives from `key`.
    let key: &'static str = unsafe { ::std::mem::transmute::<&str, &'static str>(key) };
    map.get(&UniCase(key)).cloned()
}

0 comments on commit af66072

Please sign in to comment.