Skip to content

Commit

Permalink
First probably-working implementation of phrase-query document filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
Kerollmops committed Dec 9, 2019
1 parent 2eeae7c commit daeb226
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 17 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions meilisearch-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ env_logger = "0.7.0"
fst = { version = "0.3.5", default-features = false }
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.6.0"
itertools = "0.8.2" # kill me please
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.8"
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.2" }
Expand Down
71 changes: 65 additions & 6 deletions meilisearch-core/src/bucket_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,9 @@ pub fn bucket_sort<'c>(
let before_raw_documents_building = Instant::now();
let mut raw_documents = Vec::new();
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
raw_documents.push(RawDocument {
raw_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
});
if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) {
raw_documents.push(raw_document);
}
}
debug!("creating {} candidates documents took {:.02?}",
raw_documents.len(),
Expand Down Expand Up @@ -149,6 +147,57 @@ pub struct RawDocument<'a, 'tag> {
pub processed_distances: Vec<Option<u8>>,
}

impl<'a, 'tag> RawDocument<'a, 'tag> {
    /// Builds a `RawDocument` from the bare matches of a single document,
    /// filtering out documents that do not satisfy the phrase (split-word)
    /// queries among the automatons.
    ///
    /// Returns `None` when any phrase-query constraint fails, which drops
    /// the whole document from the candidate set.
    ///
    /// NOTE(review): `phrase_query` is an `Option<(u16, u16)>`; from the
    /// usage below the first element is the word's position inside the
    /// phrase (only 0 and 1 are reachable, i.e. two-word splits — see the
    /// `unreachable!()` arm) — TODO confirm the second element is the
    /// phrase length.
    fn new<'txn>(
        raw_matches: &'a mut [BareMatch<'tag>],
        automatons: &[QueryWordAutomaton],
        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
    ) -> Option<RawDocument<'a, 'tag>>
    {
        // Order by query index so that the two halves of a split word
        // become adjacent in the slice and can be checked pairwise.
        raw_matches.sort_unstable_by_key(|m| m.query_index);

        // debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches);

        // Tracks the query index of the last seen phrase first-half,
        // consumed (`take`) when its second half is validated.
        let mut previous_word = None;
        for i in 0..raw_matches.len() {
            let a = &raw_matches[i];
            let auta = &automatons[a.query_index as usize];

            match auta.phrase_query {
                // First half of a split word: the very next match must be
                // its second half, and the two postings lists must contain
                // at least one pair of consecutive positions.
                Some((0, _)) => {
                    previous_word = Some(a.query_index);
                    // `?`: if this first half is the document's last match,
                    // the second half is missing — reject the document.
                    let b = raw_matches.get(i + 1)?;
                    if a.query_index + 1 != b.query_index {
                        return None;
                    }

                    let pla = &postings_lists[a.postings_list];
                    let plb = &postings_lists[b.postings_list];

                    // Merge-join both postings lists on (attribute,
                    // word_index) with the left side shifted by one, so a
                    // `Both` entry means the two halves appear side by side
                    // in the same attribute.
                    let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
                        a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
                    });

                    // No adjacent pair anywhere in the document: reject it.
                    if !iter.any(|eb| eb.is_both()) { return None }
                },
                // Second half of a split word: must immediately follow the
                // matching first half recorded above.
                Some((1, _)) => {
                    if previous_word.take() != Some(a.query_index - 1) {
                        return None;
                    }
                },
                // Only two-word phrase splits are produced (positions 0/1).
                Some((_, _)) => unreachable!(),
                None => (),
            }
        }

        // All phrase constraints hold; defer match/distance processing.
        Some(RawDocument {
            raw_matches,
            processed_matches: Vec::new(),
            processed_distances: Vec::new(),
        })
    }
}

pub struct BareMatch<'tag> {
pub document_id: DocumentId,
pub query_index: u16,
Expand Down Expand Up @@ -186,6 +235,15 @@ pub struct PostingsListView<'txn> {
len: usize,
}

impl fmt::Debug for PostingsListView<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("PostingsListView")
.field("input", &std::str::from_utf8(&self.input).unwrap())
.field("postings_list", &self.as_ref())
.finish()
}
}

impl<'txn> PostingsListView<'txn> {
pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = postings_list.len();
Expand Down Expand Up @@ -275,6 +333,7 @@ fn fetch_matches<'txn, 'tag>(
let input = Rc::from(input);
let postings_list = Rc::new(postings_list);
let postings_list_view = PostingsListView::new(input, postings_list);

let mut offset = 0;
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {

Expand Down Expand Up @@ -442,7 +501,7 @@ fn construct_automatons2(
}
}

if false && n == 1 {
if true && n == 1 {
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
let mut left_automaton = QueryWordAutomaton::exact(left);
left_automaton.phrase_query = Some((0, 2));
Expand Down
50 changes: 39 additions & 11 deletions meilisearch-core/src/criterion2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,42 @@ pub trait Criterion {
}
}

fn prepare_query_distances(
documents: &mut [RawDocument],
fn prepare_query_distances<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
postings_lists: &PostingsListsArena<'tag, 'txn>,
) {
for document in documents {
if !document.processed_distances.is_empty() { continue }

// debug!("{:?}", document.raw_matches[0].document_id);

let mut processed = Vec::new();
for m in document.raw_matches.iter() {
let mut raw_matches = document.raw_matches.iter().peekable();
while let Some(m) = raw_matches.next() {

// let automaton = &automatons[m.query_index as usize];

// debug!("{:?} {:?}", m, automaton);
// debug!("{:?}", &postings_lists[m.postings_list]);

// match automaton.phrase_query {
// Some((0, len)) => {
// match raw_matches.peek() {
// Some(BareMatch { query_index, .. }) => {
// if *query_index != m.query_index + 1 {
// raw_matches.next();
// continue
// }
// },
// None => continue,
// }
// },
// Some((_, _)) => continue,
// None => (),
// }

// FIXME we really need to take splitted words into account
// those must be seen at the same level as the non-splitteds
// if automatons[m.query_index as usize].phrase_query.is_some() {
Expand All @@ -73,6 +99,8 @@ fn prepare_query_distances(
}
}

// debug!("{:?}", processed);

document.processed_distances = processed;
}
}
Expand All @@ -82,14 +110,14 @@ pub struct Typo;
impl Criterion for Typo {
fn name(&self) -> &str { "typo" }

fn prepare(
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument],
postings_lists: &mut PostingsListsArena,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
prepare_query_distances(documents, query_enhancer, automatons);
prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
}

fn evaluate(
Expand Down Expand Up @@ -140,14 +168,14 @@ pub struct Words;
impl Criterion for Words {
fn name(&self) -> &str { "words" }

fn prepare(
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument],
postings_lists: &mut PostingsListsArena,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
prepare_query_distances(documents, query_enhancer, automatons);
prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
}

fn evaluate(
Expand Down

0 comments on commit daeb226

Please sign in to comment.