Skip to content

Commit

Permalink
Speed up top-k retrieval on filtered conjunctions. (#13994)
Browse files Browse the repository at this point in the history
A while back we added an optimized bulk scorer that implements block-max AND,
this yielded a good speedup on nightly benchmarks, see annotation `FP` at
https://benchmarks.mikemccandless.com/AndHighHigh.html. With this PR, filtered
conjunctions now also run through this optimized bulk scorer by doing two
things:
 - It flattens inner conjunctions. This makes queries initially written as
   something like `+(+term1 +term2) #filter` rewritten to
   `+term1 +term2 #filter`.
 - It evaluates queries that have a mix of MUST and FILTER clauses evaluated
   through `BlockMaxConjunctionBulkScorer` by treating FILTER clauses as
   scoring clauses that produce a score of 0.
  • Loading branch information
jpountz authored Nov 18, 2024
1 parent a0e1eee commit 4400d55
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 24 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ Optimizations
longs that would pack two integers. We are now moving back to integers to be
able to take advantage of 2x more lanes with the vector API. (Adrien Grand)

* GITHUB#13994: Speed up top-k retrieval of filtered conjunctions.
(Adrien Grand)

Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
Expand Down
50 changes: 46 additions & 4 deletions lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,11 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException {
return new MatchNoDocsQuery("empty BooleanQuery");
}

// Queries with no positive clauses have no matches
if (clauses.size() == clauseSets.get(Occur.MUST_NOT).size()) {
return new MatchNoDocsQuery("pure negative BooleanQuery");
}

// optimize 1-clause queries
if (clauses.size() == 1) {
BooleanClause c = clauses.get(0);
Expand All @@ -283,8 +288,6 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException {
// no scoring clauses, so return a score of 0
return new BoostQuery(new ConstantScoreQuery(query), 0);
case MUST_NOT:
// no positive clauses
return new MatchNoDocsQuery("pure negative BooleanQuery");
default:
throw new AssertionError();
}
Expand Down Expand Up @@ -539,8 +542,7 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException {
builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
boolean actuallyRewritten = false;
for (BooleanClause clause : clauses) {
if (clause.occur() == Occur.SHOULD && clause.query() instanceof BooleanQuery) {
BooleanQuery innerQuery = (BooleanQuery) clause.query();
if (clause.occur() == Occur.SHOULD && clause.query() instanceof BooleanQuery innerQuery) {
if (innerQuery.isPureDisjunction()) {
actuallyRewritten = true;
for (BooleanClause innerClause : innerQuery.clauses()) {
Expand All @@ -558,6 +560,46 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException {
}
}

// Inline required / prohibited clauses. This helps run filtered conjunctive queries more
// efficiently by providing all clauses to the block-max AND scorer.
{
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
boolean actuallyRewritten = false;
for (BooleanClause outerClause : clauses) {
if (outerClause.isRequired() && outerClause.query() instanceof BooleanQuery innerQuery) {
// Inlining prohibited clauses is not legal if the query is a pure negation, since pure
// negations have no matches. It works because the inner BooleanQuery would have first
// rewritten to a MatchNoDocsQuery if it only had prohibited clauses.
assert innerQuery.getClauses(Occur.MUST_NOT).size() != innerQuery.clauses().size();
if (innerQuery.getMinimumNumberShouldMatch() == 0
&& innerQuery.getClauses(Occur.SHOULD).isEmpty()) {

actuallyRewritten = true;
for (BooleanClause innerClause : innerQuery) {
Occur innerOccur = innerClause.occur();
if (innerOccur == Occur.FILTER
|| innerOccur == Occur.MUST_NOT
|| outerClause.occur() == Occur.MUST) {
builder.add(innerClause);
} else {
assert outerClause.occur() == Occur.FILTER && innerOccur == Occur.MUST;
// In this case we need to change the occur of the inner query from MUST to FILTER.
builder.add(innerClause.query(), Occur.FILTER);
}
}
} else {
builder.add(outerClause);
}
} else {
builder.add(outerClause);
}
}
if (actuallyRewritten) {
return builder.build();
}
}

// SHOULD clause count less than or equal to minimumNumberShouldMatch
// Important(this can only be processed after nested clauses have been flattened)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -333,10 +333,15 @@ private BulkScorer requiredBulkScorer() throws IOException {
requiredScoring.add(ss.get(leadCost));
}
if (scoreMode == ScoreMode.TOP_SCORES
&& requiredNoScoring.isEmpty()
&& requiredScoring.size() > 1
// Only specialize top-level conjunctions for clauses that don't have a two-phase iterator.
&& requiredNoScoring.stream().map(Scorer::twoPhaseIterator).allMatch(Objects::isNull)
&& requiredScoring.stream().map(Scorer::twoPhaseIterator).allMatch(Objects::isNull)) {
// Turn all filters into scoring clauses with a score of zero, so that
// BlockMaxConjunctionBulkScorer is applicable.
for (Scorer filter : requiredNoScoring) {
requiredScoring.add(new ConstantScoreScorer(0f, ScoreMode.COMPLETE, filter.iterator()));
}
return new BlockMaxConjunctionBulkScorer(maxDoc, requiredScoring);
}
if (scoreMode != ScoreMode.TOP_SCORES
Expand Down
139 changes: 120 additions & 19 deletions lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java
Original file line number Diff line number Diff line change
Expand Up @@ -330,8 +330,8 @@ public void testDeeplyNestedBooleanRewriteShouldClauses() throws IOException {
int depth = TestUtil.nextInt(random(), 10, 30);
TestRewriteQuery rewriteQueryExpected = new TestRewriteQuery();
TestRewriteQuery rewriteQuery = new TestRewriteQuery();
Query expectedQuery =
new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER).build();
BooleanQuery.Builder expectedQueryBuilder =
new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER);
Query deepBuilder =
new BooleanQuery.Builder()
.add(rewriteQuery, Occur.SHOULD)
Expand All @@ -345,21 +345,19 @@ public void testDeeplyNestedBooleanRewriteShouldClauses() throws IOException {
.add(tq, Occur.SHOULD)
.add(deepBuilder, Occur.SHOULD);
deepBuilder = bq.build();
BooleanQuery.Builder expectedBq = new BooleanQuery.Builder().add(tq, Occur.FILTER);
expectedQueryBuilder.add(tq, Occur.FILTER);
if (i == depth) {
expectedBq.add(rewriteQuery, Occur.FILTER);
} else {
expectedBq.add(expectedQuery, Occur.FILTER);
expectedQueryBuilder.add(rewriteQuery, Occur.FILTER);
}
expectedQuery = expectedBq.build();
}
BooleanQuery bq = new BooleanQuery.Builder().add(deepBuilder, Occur.FILTER).build();
expectedQuery = new BoostQuery(new ConstantScoreQuery(expectedQuery), 0.0f);
Query expectedQuery =
new BoostQuery(new ConstantScoreQuery(expectedQueryBuilder.build()), 0.0f);
Query rewritten = searcher.rewrite(bq);
assertEquals(expectedQuery, rewritten);
// the SHOULD clauses cause more rewrites because they incrementally change to `MUST` and then
// `FILTER`
assertEquals("Depth=" + depth, depth + 1, rewriteQuery.numRewrites);
// `FILTER`, plus the flattening of required clauses
assertEquals("Depth=" + depth, depth * 2, rewriteQuery.numRewrites);
}

public void testDeeplyNestedBooleanRewrite() throws IOException {
Expand All @@ -369,27 +367,26 @@ public void testDeeplyNestedBooleanRewrite() throws IOException {
int depth = TestUtil.nextInt(random(), 10, 30);
TestRewriteQuery rewriteQueryExpected = new TestRewriteQuery();
TestRewriteQuery rewriteQuery = new TestRewriteQuery();
Query expectedQuery =
new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER).build();
BooleanQuery.Builder expectedQueryBuilder =
new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER);
Query deepBuilder = new BooleanQuery.Builder().add(rewriteQuery, Occur.MUST).build();
for (int i = depth; i > 0; i--) {
TermQuery tq = termQueryFunction.apply(i);
BooleanQuery.Builder bq =
new BooleanQuery.Builder().add(tq, Occur.MUST).add(deepBuilder, Occur.MUST);
deepBuilder = bq.build();
BooleanQuery.Builder expectedBq = new BooleanQuery.Builder().add(tq, Occur.FILTER);
expectedQueryBuilder.add(tq, Occur.FILTER);
if (i == depth) {
expectedBq.add(rewriteQuery, Occur.FILTER);
} else {
expectedBq.add(expectedQuery, Occur.FILTER);
expectedQueryBuilder.add(rewriteQuery, Occur.FILTER);
}
expectedQuery = expectedBq.build();
}
BooleanQuery bq = new BooleanQuery.Builder().add(deepBuilder, Occur.FILTER).build();
expectedQuery = new BoostQuery(new ConstantScoreQuery(expectedQuery), 0.0f);
Query expectedQuery =
new BoostQuery(new ConstantScoreQuery(expectedQueryBuilder.build()), 0.0f);
Query rewritten = searcher.rewrite(bq);
assertEquals(expectedQuery, rewritten);
assertEquals("Depth=" + depth, 1, rewriteQuery.numRewrites);
// `depth` rewrites because of the flattening
assertEquals("Depth=" + depth, depth, rewriteQuery.numRewrites);
}

public void testRemoveMatchAllFilter() throws IOException {
Expand Down Expand Up @@ -691,6 +688,110 @@ public void testFlattenInnerDisjunctions() throws IOException {
assertSame(query, searcher.rewrite(query));
}

public void testFlattenInnerConjunctions() throws IOException {
IndexSearcher searcher = newSearcher(new MultiReader());

Query inner =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.build();
Query query =
new BooleanQuery.Builder()
.add(inner, Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.FILTER)
.build();
Query expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.FILTER)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));

query =
new BooleanQuery.Builder()
.setMinimumNumberShouldMatch(0)
.add(inner, Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.setMinimumNumberShouldMatch(0)
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));

query =
new BooleanQuery.Builder()
.add(inner, Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST_NOT)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST_NOT)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));

inner =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.FILTER)
.build();
query =
new BooleanQuery.Builder()
.add(inner, Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));

inner =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.FILTER)
.build();
query =
new BooleanQuery.Builder()
.add(inner, Occur.FILTER)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "quux")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));

inner =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST_NOT)
.build();
query =
new BooleanQuery.Builder()
.add(inner, Occur.FILTER)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST_NOT)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));
}

public void testDiscardShouldClauses() throws IOException {
IndexSearcher searcher = newSearcher(new MultiReader());

Expand Down

0 comments on commit 4400d55

Please sign in to comment.