From 4400d552970e481e0b1fd08eafd908cc2edd3e95 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 18 Nov 2024 08:51:35 +0100 Subject: [PATCH] Speed up top-k retrieval on filtered conjunctions. (#13994) A while back we added an optimized bulk scorer that implements block-max AND, this yielded a good speedup on nightly benchmarks, see annotation `FP` at https://benchmarks.mikemccandless.com/AndHighHigh.html. With this PR, filtered conjunctions now also run through this optimized bulk scorer by doing two things: - It flattens inner conjunctions. This makes queries initially written as something like `+(+term1 +term2) #filter` rewritten to `+term1 +term2 #filter`. - It evaluates queries that have a mix of MUST and FILTER clauses evaluated through `BlockMaxConjunctionBulkScorer` by treating FILTER clauses as scoring clauses that produce a score of 0. --- lucene/CHANGES.txt | 3 + .../apache/lucene/search/BooleanQuery.java | 50 ++++++- .../lucene/search/BooleanScorerSupplier.java | 7 +- .../lucene/search/TestBooleanRewrites.java | 139 +++++++++++++++--- 4 files changed, 175 insertions(+), 24 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 66f13d264e5a..547b83081455 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -93,6 +93,9 @@ Optimizations longs that would pack two integers. We are now moving back to integers to be able to take advantage of 2x more lanes with the vector API. (Adrien Grand) +* GITHUB#13994: Speed up top-k retrieval of filtered conjunctions. + (Adrien Grand) + Bug Fixes --------------------- * GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java index 0d6daa07ba1e..b50b0530a2d1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java @@ -268,6 +268,11 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { return new MatchNoDocsQuery("empty BooleanQuery"); } + // Queries with no positive clauses have no matches + if (clauses.size() == clauseSets.get(Occur.MUST_NOT).size()) { + return new MatchNoDocsQuery("pure negative BooleanQuery"); + } + // optimize 1-clause queries if (clauses.size() == 1) { BooleanClause c = clauses.get(0); @@ -283,8 +288,6 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { // no scoring clauses, so return a score of 0 return new BoostQuery(new ConstantScoreQuery(query), 0); case MUST_NOT: - // no positive clauses - return new MatchNoDocsQuery("pure negative BooleanQuery"); default: throw new AssertionError(); } @@ -539,8 +542,7 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch); boolean actuallyRewritten = false; for (BooleanClause clause : clauses) { - if (clause.occur() == Occur.SHOULD && clause.query() instanceof BooleanQuery) { - BooleanQuery innerQuery = (BooleanQuery) clause.query(); + if (clause.occur() == Occur.SHOULD && clause.query() instanceof BooleanQuery innerQuery) { if (innerQuery.isPureDisjunction()) { actuallyRewritten = true; for (BooleanClause innerClause : innerQuery.clauses()) { @@ -558,6 +560,46 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { } } + // Inline required / prohibited clauses. This helps run filtered conjunctive queries more + // efficiently by providing all clauses to the block-max AND scorer. + { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch); + boolean actuallyRewritten = false; + for (BooleanClause outerClause : clauses) { + if (outerClause.isRequired() && outerClause.query() instanceof BooleanQuery innerQuery) { + // Inlining prohibited clauses is not legal if the query is a pure negation, since pure + // negations have no matches. It works because the inner BooleanQuery would have first + // rewritten to a MatchNoDocsQuery if it only had prohibited clauses. + assert innerQuery.getClauses(Occur.MUST_NOT).size() != innerQuery.clauses().size(); + if (innerQuery.getMinimumNumberShouldMatch() == 0 + && innerQuery.getClauses(Occur.SHOULD).isEmpty()) { + + actuallyRewritten = true; + for (BooleanClause innerClause : innerQuery) { + Occur innerOccur = innerClause.occur(); + if (innerOccur == Occur.FILTER + || innerOccur == Occur.MUST_NOT + || outerClause.occur() == Occur.MUST) { + builder.add(innerClause); + } else { + assert outerClause.occur() == Occur.FILTER && innerOccur == Occur.MUST; + // In this case we need to change the occur of the inner query from MUST to FILTER. + builder.add(innerClause.query(), Occur.FILTER); + } + } + } else { + builder.add(outerClause); + } + } else { + builder.add(outerClause); + } + } + if (actuallyRewritten) { + return builder.build(); + } + } + // SHOULD clause count less than or equal to minimumNumberShouldMatch // Important(this can only be processed after nested clauses have been flattened) { diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java index a8169ad227f1..7a53bc9a4852 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java @@ -333,10 +333,15 @@ private BulkScorer requiredBulkScorer() throws IOException { requiredScoring.add(ss.get(leadCost)); } if (scoreMode == ScoreMode.TOP_SCORES - && requiredNoScoring.isEmpty() && requiredScoring.size() > 1 // Only specialize top-level conjunctions for clauses that don't have a two-phase iterator. + && requiredNoScoring.stream().map(Scorer::twoPhaseIterator).allMatch(Objects::isNull) && requiredScoring.stream().map(Scorer::twoPhaseIterator).allMatch(Objects::isNull)) { + // Turn all filters into scoring clauses with a score of zero, so that + // BlockMaxConjunctionBulkScorer is applicable. + for (Scorer filter : requiredNoScoring) { + requiredScoring.add(new ConstantScoreScorer(0f, ScoreMode.COMPLETE, filter.iterator())); + } return new BlockMaxConjunctionBulkScorer(maxDoc, requiredScoring); } if (scoreMode != ScoreMode.TOP_SCORES diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java index 9083a62e37a1..f36c7539c7dc 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java @@ -330,8 +330,8 @@ public void testDeeplyNestedBooleanRewriteShouldClauses() throws IOException { int depth = TestUtil.nextInt(random(), 10, 30); TestRewriteQuery rewriteQueryExpected = new TestRewriteQuery(); TestRewriteQuery rewriteQuery = new TestRewriteQuery(); - Query expectedQuery = - new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER).build(); + BooleanQuery.Builder expectedQueryBuilder = + new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER); Query deepBuilder = new BooleanQuery.Builder() .add(rewriteQuery, Occur.SHOULD) @@ -345,21 +345,19 @@ public void testDeeplyNestedBooleanRewriteShouldClauses() throws IOException { .add(tq, Occur.SHOULD) .add(deepBuilder, Occur.SHOULD); deepBuilder = bq.build(); - BooleanQuery.Builder expectedBq = new BooleanQuery.Builder().add(tq, Occur.FILTER); + expectedQueryBuilder.add(tq, Occur.FILTER); if (i == depth) { - expectedBq.add(rewriteQuery, Occur.FILTER); - } else { - expectedBq.add(expectedQuery, Occur.FILTER); + expectedQueryBuilder.add(rewriteQuery, Occur.FILTER); } - expectedQuery = expectedBq.build(); } BooleanQuery bq = new BooleanQuery.Builder().add(deepBuilder, Occur.FILTER).build(); - expectedQuery = new BoostQuery(new ConstantScoreQuery(expectedQuery), 0.0f); + Query expectedQuery = + new BoostQuery(new ConstantScoreQuery(expectedQueryBuilder.build()), 0.0f); Query rewritten = searcher.rewrite(bq); assertEquals(expectedQuery, rewritten); // the SHOULD clauses cause more rewrites because they incrementally change to `MUST` and then - // `FILTER` - assertEquals("Depth=" + depth, depth + 1, rewriteQuery.numRewrites); + // `FILTER`, plus the flattening of required clauses + assertEquals("Depth=" + depth, depth * 2, rewriteQuery.numRewrites); } public void testDeeplyNestedBooleanRewrite() throws IOException { @@ -369,27 +367,26 @@ public void testDeeplyNestedBooleanRewrite() throws IOException { int depth = TestUtil.nextInt(random(), 10, 30); TestRewriteQuery rewriteQueryExpected = new TestRewriteQuery(); TestRewriteQuery rewriteQuery = new TestRewriteQuery(); - Query expectedQuery = - new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER).build(); + BooleanQuery.Builder expectedQueryBuilder = + new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER); Query deepBuilder = new BooleanQuery.Builder().add(rewriteQuery, Occur.MUST).build(); for (int i = depth; i > 0; i--) { TermQuery tq = termQueryFunction.apply(i); BooleanQuery.Builder bq = new BooleanQuery.Builder().add(tq, Occur.MUST).add(deepBuilder, Occur.MUST); deepBuilder = bq.build(); - BooleanQuery.Builder expectedBq = new BooleanQuery.Builder().add(tq, Occur.FILTER); + expectedQueryBuilder.add(tq, Occur.FILTER); if (i == depth) { - expectedBq.add(rewriteQuery, Occur.FILTER); - } else { - expectedBq.add(expectedQuery, Occur.FILTER); + expectedQueryBuilder.add(rewriteQuery, Occur.FILTER); } - expectedQuery = expectedBq.build(); } BooleanQuery bq = new BooleanQuery.Builder().add(deepBuilder, Occur.FILTER).build(); - expectedQuery = new BoostQuery(new ConstantScoreQuery(expectedQuery), 0.0f); + Query expectedQuery = + new BoostQuery(new ConstantScoreQuery(expectedQueryBuilder.build()), 0.0f); Query rewritten = searcher.rewrite(bq); assertEquals(expectedQuery, rewritten); - assertEquals("Depth=" + depth, 1, rewriteQuery.numRewrites); + // `depth` rewrites because of the flattening + assertEquals("Depth=" + depth, depth, rewriteQuery.numRewrites); } public void testRemoveMatchAllFilter() throws IOException { @@ -691,6 +688,110 @@ public void testFlattenInnerDisjunctions() throws IOException { assertSame(query, searcher.rewrite(query)); } + public void testFlattenInnerConjunctions() throws IOException { + IndexSearcher searcher = newSearcher(new MultiReader()); + + Query inner = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST) + .build(); + Query query = + new BooleanQuery.Builder() + .add(inner, Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.FILTER) + .build(); + Query expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.FILTER) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + query = + new BooleanQuery.Builder() + .setMinimumNumberShouldMatch(0) + .add(inner, Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .setMinimumNumberShouldMatch(0) + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + query = + new BooleanQuery.Builder() + .add(inner, Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST_NOT) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST_NOT) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + inner = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.FILTER) + .build(); + query = + new BooleanQuery.Builder() + .add(inner, Occur.MUST) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.FILTER) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + inner = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.FILTER) + .build(); + query = + new BooleanQuery.Builder() + .add(inner, Occur.FILTER) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.FILTER) + .add(new TermQuery(new Term("foo", "quux")), Occur.FILTER) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + + inner = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.MUST) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST_NOT) + .build(); + query = + new BooleanQuery.Builder() + .add(inner, Occur.FILTER) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + expectedRewritten = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "bar")), Occur.FILTER) + .add(new TermQuery(new Term("foo", "quux")), Occur.MUST_NOT) + .add(new TermQuery(new Term("foo", "baz")), Occur.MUST) + .build(); + assertEquals(expectedRewritten, searcher.rewrite(query)); + } + public void testDiscardShouldClauses() throws IOException { IndexSearcher searcher = newSearcher(new MultiReader());