Skip to content

Commit

Permalink
Fix dictionary index case-sensitivity inconsistencies
Browse files Browse the repository at this point in the history
  • Loading branch information
welps committed Jun 12, 2020
1 parent 47354b8 commit 4cd7136
Show file tree
Hide file tree
Showing 8 changed files with 421 additions and 52 deletions.
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ filetime = "0.2.10"
fxhash = "0.2.1"
rand_core = "0.5.1"
rand_xoshiro = "0.4.0"
caseless = "0.2.1"

[dependencies.getopts]
version = "0.2.21"
Expand Down
276 changes: 239 additions & 37 deletions src/dictionary/indexing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@ use levenshtein::levenshtein;

use super::errors::DictError;
use super::errors::DictError::*;
use caseless::default_case_fold_str;

/// A dictionary index: the entries parsed so far, an optional reader for the
/// part of the index that has not been parsed yet, and the index settings.
///
/// The index is partially loaded if `state` isn't `None`.
pub struct Index<R: BufRead> {
/// Entries parsed so far, in the order in which they were read.
pub entries: Vec<Entry>,
/// Reader from which the rest of the index can still be parsed.
/// `load_and_find` takes it out (leaving `None`) when it resumes parsing.
pub state: Option<R>,
/// Settings that were in effect while this index was parsed.
pub settings: Settings,
}

#[derive(Debug, Clone)]
Expand All @@ -40,48 +42,36 @@ pub struct Entry {
pub original: Option<String>,
}

/// Index-wide options; they correspond to options detailed in `dictfmt`.
///
/// The default value has both flags cleared, which is also what the index
/// parser starts from before it has seen any metadata entry.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Settings {
    /// Set by the index parser when the index contains a
    /// `00-database-allchars` entry.
    pub all_characters: bool,
    /// Set by the index parser when the index contains a case-sensitivity
    /// metadata entry. While this is `false` the parser case-folds every
    /// headword before storing it.
    pub case_sensitive: bool,
}

/// Lookup operations offered by a dictionary index; implemented in this file
/// by `Index`.
pub trait IndexReader {
/// Looks up `headword`; takes `&mut self` so that an implementation can load
/// more of the index first. `Index` parses whatever its `state` reader still
/// holds and then performs the same lookup as `find`.
fn load_and_find(&mut self, headword: &str, fuzzy: bool) -> Vec<Entry>;
/// Looks up `headword` among the entries that are already loaded.
/// In `Index`, `fuzzy` selects every headword within a Levenshtein distance
/// of 1; otherwise only exact matches are returned.
fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry>;
/// Returns the index settings by value.
fn settings(&self) -> Settings;
}

// NOTE(review): this span appears to come from a rendered diff, so superseded
// lines and their replacements sit side by side (the two `if let Ok` lines in
// `load_and_find`, and the inlined search in `find` followed by the call to the
// free function `find`) — confirm against the real file before editing.
impl<R: BufRead> IndexReader for Index<R> {
/// Finishes loading the index if a reader is still pending, then looks up
/// `headword` via `find`.
fn load_and_find(&mut self, headword: &str, fuzzy: bool) -> Vec<Entry> {
// `take()` leaves `None` in `state`, so the remainder is parsed at most once.
if let Some(br) = self.state.take() {
if let Ok(mut index) = parse_index(br, false) {
if let Ok(mut index) = parse_index_with_settings(br, false, Option::Some(&self.settings)) {
// Only the `Ok` case is handled: a parse error is dropped and the
// entries loaded so far are left untouched.
self.entries.append(&mut index.entries);
}
}
self.find(headword, fuzzy)
}

/// Looks up `headword` among the entries currently held in `self.entries`;
/// `state` is not consulted.
fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry> {
if fuzzy {
self.entries.iter().filter(|entry| levenshtein(headword, &entry.headword) <= 1).cloned().collect()
} else {
if let Ok(mut i) = self.entries.binary_search_by_key(&headword, |entry| &entry.headword) {
let mut results = vec![self.entries[i].clone()];
let j = i;
while i > 0 {
i -= 1;
if self.entries[i].headword != headword {
break;
}
results.insert(0, self.entries[i].clone());
}
i = j;
while i < self.entries.len() - 1 {
i += 1;
if self.entries[i].headword != headword {
break;
}
results.push(self.entries[i].clone());
}
results
} else {
Vec::new()
}
}
find(self.entries.as_ref(), headword, fuzzy)
}

/// Returns a clone of the settings stored on this index.
fn settings(&self) -> Settings {
self.settings.clone()
}
}

Expand Down Expand Up @@ -136,32 +126,89 @@ fn parse_line(line: &str, line_number: usize) -> Result<(&str, u64, u64, Option<

/// Parse the index for a dictionary from a given BufRead compatible object.
/// When `lazy` is `true`, the loop stops once all the metadata entries are parsed
/// (judging by `parse_index_with_settings`, the headword that ends the metadata
/// run is still stored before the loop stops).
///
/// This calls `parse_index_with_settings` with `None` for the settings.
// NOTE(review): this span appears to come from a rendered diff; the first
// signature and the `info` line below look like the superseded version —
// confirm against the real file.
pub fn parse_index<B: BufRead>(mut br: B, lazy: bool) -> Result<Index<B>, DictError> {
let mut info = false;
pub fn parse_index<B: BufRead>(br: B, lazy: bool) -> Result<Index<B>, DictError> {
parse_index_with_settings(br, lazy, None)
}

// parse_index_with_settings accounts for the following possibilities:
// - lazy parse -> parse index metadata (00-database-*)
// - full parse -> parse whole index
// - resume parse -> resume from lazy parse
fn parse_index_with_settings<B: BufRead>(mut br: B, lazy: bool, settings: Option<&Settings>) -> Result<Index<B>, DictError> {
let mut found_metadata = false;
let mut settings_created = false;
let mut entries = Vec::new();
let mut line_number = 0;
let mut line = String::new();

let mut s = Settings{all_characters: false, case_sensitive: false};

if let Some(settings) = settings {
s = settings.clone();
found_metadata = true;
settings_created = true;
}

while let Ok(nb) = br.read_line(&mut line) {
if nb == 0 {
break;
}
let (headword, offset, size, original) = parse_line(line.trim_end(), line_number)?;
if lazy {
if !info && (headword.starts_with("00-database-") || headword.starts_with("00database")) {
info = true;
} else if info && !headword.starts_with("00-database-") && !headword.starts_with("00database") {
break;

if !found_metadata && (headword.starts_with("00-database-") || headword.starts_with("00database")) {
found_metadata = true;
} else if found_metadata && !settings_created && !headword.starts_with("00-database-") && !headword.starts_with("00database") {

// A DICT index may not be case-sensitive, but the indexed headwords may not have been casefolded
// Therefore if the index is not case-sensitive, we will have to casefold all headwords ourselves along with the query
let all_chars = !find(entries.as_ref(), "00-database-allchars", false).is_empty();

let word = if all_chars {
"00-database-case-sensitive"
} else {
"00databasecasesensitive"
};

let case_sensitive = !find(entries.as_ref(),word, false).is_empty();
s.all_characters = all_chars;
s.case_sensitive = case_sensitive;

settings_created = true;

// It is possible for headwords to precede the 00-database- entries so we need to go back and clean them up
for mut entry in entries.iter_mut() {
let formatted_entry = &mut Entry{
headword: default_case_fold_str(&entry.headword),
offset: entry.offset,
size: entry.size,
original: entry.original.clone()
};

entry = formatted_entry;
}
}

let formatted_word: String;

if !s.case_sensitive {
formatted_word = default_case_fold_str(headword.as_ref());
} else {
formatted_word = headword.to_string();
}

entries.push(Entry {
headword: headword.to_string(),
headword: formatted_word,
offset,
size,
original: original.map(String::from),
});
line_number += 1;
line.clear();

// Break *after* current headword is committed for lazy load
if lazy && settings_created {
break;
}
}

let state = if lazy {
Expand All @@ -170,12 +217,167 @@ pub fn parse_index<B: BufRead>(mut br: B, lazy: bool) -> Result<Index<B>, DictEr
None
};

Ok(Index { entries, state })
Ok(Index{entries, state, settings: s})
}

/// Parse the index for a dictionary from a given path.
///
/// The file is opened, wrapped in a `BufReader` and handed to `parse_index`
/// together with `lazy`.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, or whatever error
/// `parse_index` reports.
// NOTE(review): this span appears to come from a rendered diff; the two
// signatures below look like the superseded and the replacement version —
// confirm against the real file.
pub fn parse_index_from_file<P: AsRef<Path>>(path: P, lazy: bool) -> Result<Index<BufReader<File>>, DictError> {
pub fn parse_index_from_file(path: impl AsRef<Path>, lazy: bool) -> Result<Index<BufReader<File>>, DictError> {
let file = File::open(path)?;
let reader = BufReader::new(file);
parse_index(reader, lazy)
}

/// Looks up `headword` in `entries` and returns clones of the matching entries
/// in index order.
///
/// * `fuzzy == true`: linear scan that keeps every entry whose headword is
///   within a Levenshtein distance of 1 of `headword`.
/// * `fuzzy == false`: exact match located with a binary search. `entries`
///   must therefore be sorted by headword (plain `str` ordering); on unsorted
///   input matches can be missed. All adjacent entries that share the headword
///   are returned.
///
/// Returns an empty vector when nothing matches.
fn find(entries: &[Entry], headword: &str, fuzzy: bool) -> Vec<Entry> {
    if fuzzy {
        return entries
            .iter()
            .filter(|entry| levenshtein(headword, &entry.headword) <= 1)
            .cloned()
            .collect();
    }

    let hit = match entries.binary_search_by_key(&headword, |entry| entry.headword.as_str()) {
        Ok(position) => position,
        Err(_) => return Vec::new(),
    };

    // The binary search may land anywhere inside a run of equal headwords, so
    // widen the hit in both directions and copy the whole run in one go
    // instead of inserting at the front of the result one entry at a time.
    let start = entries[..hit]
        .iter()
        .rposition(|entry| entry.headword != headword)
        .map_or(0, |last_mismatch| last_mismatch + 1);
    let end = entries[hit + 1..]
        .iter()
        .position(|entry| entry.headword != headword)
        .map_or(entries.len(), |first_mismatch| hit + 1 + first_mismatch);

    entries[start..end].to_vec()
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Empty;

    const PATH_CASE_SENSITIVE_INDEX: &str = "src/dictionary/testdata/case_sensitive_dict.index";
    const PATH_CASE_INSENSITIVE_INDEX: &str = "src/dictionary/testdata/case_insensitive_dict.index";

    /// Builds an entry that has no `original` spelling attached.
    fn entry(headword: &str, offset: u64, size: u64) -> Entry {
        Entry {
            headword: String::from(headword),
            offset,
            size,
            original: None,
        }
    }

    /// Headword of the first parsed entry; panics on an empty index.
    fn first_headword(entries: &[Entry]) -> &str {
        &entries[0].headword
    }

    /// Headword of the last parsed entry; panics on an empty index.
    fn last_headword(entries: &[Entry]) -> &str {
        &entries.last().unwrap().headword
    }

    /// Asserts that an exact lookup of `headword` succeeds and that the first
    /// hit carries exactly that headword.
    fn assert_exact_hit<I: IndexReader>(index: &I, headword: &str) {
        let hits = index.find(headword, false);
        assert!(!hits.is_empty());
        assert_eq!(hits.first().unwrap().headword, headword);
    }

    #[test]
    fn test_index_find() {
        let index: Index<Empty> = Index {
            entries: vec![entry("bar", 0, 8), entry("baz", 8, 4), entry("foo", 12, 4)],
            state: None,
            settings: Settings { all_characters: false, case_sensitive: false },
        };

        // A headword that is not in the index yields nothing.
        assert!(index.find("apples", false).is_empty());

        // Exact lookup returns the single matching entry.
        let exact = index.find("baz", false);
        assert!(!exact.is_empty());
        assert_eq!(exact.len(), 1);
        assert_eq!(exact.first().unwrap().headword, "baz");

        // Fuzzy lookup returns both headwords one edit away, in index order.
        let fuzzy = index.find("bas", true);
        assert!(!fuzzy.is_empty());
        assert_eq!(fuzzy.len(), 2);
        assert_eq!(fuzzy.first().unwrap().headword, "bar");
    }

    #[test]
    // Make sure that a lazy load does not inadvertently skip a word when it returns to BufRead
    fn test_index_load_and_find() {
        let mut index = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true)
            .expect("lazy parse of the case-insensitive index failed");

        assert_eq!(first_headword(&index.entries), "00-database-allchars");
        assert_eq!(last_headword(&index.entries), "bar");

        for headword in ["bar", "foo"].iter() {
            assert!(!index.load_and_find(headword, false).is_empty());
        }
    }

    #[test]
    fn test_parse_index_from_file() {
        let index = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false)
            .expect("full parse of the case-insensitive index failed");

        assert_eq!(first_headword(&index.entries), "00-database-allchars");
        assert_eq!(last_headword(&index.entries), "あいおい");
    }

    #[test]
    fn test_parse_index_from_file_lazy() {
        let index = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true)
            .expect("lazy parse of the case-insensitive index failed");

        assert_eq!(first_headword(&index.entries), "00-database-allchars");
        assert_eq!(last_headword(&index.entries), "bar");
    }

    #[test]
    fn test_parse_index_from_file_handles_case_insensitivity() {
        let index = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false)
            .expect("full parse of the case-insensitive index failed");

        assert_exact_hit(&index, "bar");

        // straße should fold to strasse
        // https://www.w3.org/International/wiki/Case_folding
        assert_exact_hit(&index, "strasse");
    }

    #[test]
    fn test_parse_index_from_file_handles_case_sensitivity() {
        let index = parse_index_from_file(PATH_CASE_SENSITIVE_INDEX, false)
            .expect("full parse of the case-sensitive index failed");

        assert_exact_hit(&index, "Bar");
        assert_exact_hit(&index, "straße");
    }
}

Loading

0 comments on commit 4cd7136

Please sign in to comment.