-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add BaseKnnVectorsFormatTestCase.testRecall() and fix old codecs #13910
Changes from 1 commit
b581275
df712f5
dd9b77e
bee57d8
ac85cf2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -102,6 +102,11 @@ public void testSearch() throws Exception { | |
} | ||
} | ||
|
||
@Override | ||
public void testRecall() { | ||
// ignore this test since this class always returns no results from search | ||
} | ||
Comment on lines
+106
to
+108
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still think that the underlying flat format should allow |
||
|
||
public void testQuantizedVectorsWriteAndRead() throws Exception { | ||
// create lucene directory with codec | ||
int numVectors = 1 + random().nextInt(50); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -562,7 +562,7 @@ private void add( | |
String idString = Integer.toString(id); | ||
doc.add(new StringField("id", idString, Field.Store.YES)); | ||
doc.add(new SortedDocValuesField("id", new BytesRef(idString))); | ||
// XSSystem.out.println("add " + idString + " " + Arrays.toString(vector)); | ||
// System.out.println("add " + idString + " " + Arrays.toString(vector)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just delete the line? |
||
iw.updateDocument(new Term("id", idString), doc); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,15 +16,21 @@ | |
*/ | ||
package org.apache.lucene.tests.index; | ||
|
||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; | ||
import static java.nio.charset.StandardCharsets.UTF_8; | ||
import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; | ||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.ByteArrayOutputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.concurrent.atomic.AtomicReference; | ||
|
@@ -70,6 +76,10 @@ | |
import org.apache.lucene.index.VectorEncoding; | ||
import org.apache.lucene.index.VectorSimilarityFunction; | ||
import org.apache.lucene.search.DocIdSetIterator; | ||
import org.apache.lucene.search.IndexSearcher; | ||
import org.apache.lucene.search.KnnFloatVectorQuery; | ||
import org.apache.lucene.search.MatchAllDocsQuery; | ||
import org.apache.lucene.search.ScoreDoc; | ||
import org.apache.lucene.search.Sort; | ||
import org.apache.lucene.search.SortField; | ||
import org.apache.lucene.search.TopDocs; | ||
|
@@ -1906,4 +1916,122 @@ public void testMismatchedFields() throws Exception { | |
|
||
IOUtils.close(reader, w2, dir1, dir2); | ||
} | ||
|
||
/** | ||
* Test that the query is a viable approximation to exact search. This test is designed to uncover | ||
* gross failures only, not to represent the true expected recall. | ||
*/ | ||
public void testRecall() throws IOException { | ||
VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be really neat if this went through each one. Quantization will do special things for different similarity functions and exercising each of those paths would be good. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agreed, we want to test all the things |
||
int dim = 16; | ||
try (Directory indexStore = getKnownIndexStore("field", dim, vectorSimilarityFunction); | ||
IndexReader reader = DirectoryReader.open(indexStore)) { | ||
IndexSearcher searcher = newSearcher(reader); | ||
float[] queryEmbedding = new float[dim]; | ||
String queryString = "Apache License"; | ||
computeLineEmbedding(queryString, queryEmbedding); | ||
// computeLineEmbedding(" END OF TERMS AND CONDITIONS", queryEmbedding); | ||
// pass match-all "filter" to force full traversal, bypassing graph | ||
KnnFloatVectorQuery exactQuery = | ||
new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, I think for more consistent runs, we may want to have multiple query embeddings that we test with and gather recall over. I just think having a single query might be very flaky in the long run. |
||
// indexed 421 lines from LICENSE.txt | ||
// indexed 157 lines from NOTICE.txt | ||
Comment on lines
+1944
to
+1945
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we are adding two files like this, I wonder if we should simply take real vectors from a real embedding model and put them in the resources folder. Maybe we can use glove or My concern is that having such a simplistic vector with so few dimensions might not actually be useful. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lemme see if I can find bugs with the "dictionary" PR? We can always beef this up with more realistic data |
||
assertEquals(578, searcher.count(exactQuery)); // Same for exact search | ||
KnnFloatVectorQuery query = new KnnFloatVectorQuery("field", queryEmbedding, 10); | ||
assertEquals(10, searcher.count(query)); // Expect some results without timeout | ||
TopDocs results = searcher.search(query, 10); | ||
Set<Integer> resultDocs = new HashSet<>(); | ||
for (ScoreDoc scoreDoc : results.scoreDocs) { | ||
/* | ||
System.out.println( | ||
"result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); | ||
*/ | ||
resultDocs.add(scoreDoc.doc); | ||
} | ||
TopDocs expected = searcher.search(exactQuery, 10); | ||
// int i = 0; | ||
int recalled = 0; | ||
for (ScoreDoc scoreDoc : expected.scoreDocs) { | ||
/* | ||
System.out.println( | ||
"expected " | ||
+ i++ | ||
+ ": " | ||
+ reader.storedFields().document(scoreDoc.doc) | ||
+ " " | ||
+ scoreDoc); | ||
*/ | ||
if (resultDocs.contains(scoreDoc.doc)) { | ||
++recalled; | ||
} | ||
} | ||
assertTrue("recall should be at least 5/10, got " + recalled, recalled >= 5); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would hope recall should be within some known parameter. It would be good to know if recall improved or worsened. Either case could show an unexpected change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it could be tricky to be very precise given the range of codec options. I guess we could specialize per codec? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think having an |
||
/* | ||
assertEquals(queryString, reader.storedFields().document(results.scoreDocs[0].doc).get("text")); | ||
assertEquals("Copyright (c) 2006 Dawid Weiss", reader.storedFields().document(results.scoreDocs[1].doc).get("text")); | ||
assertEquals(queryString, reader.storedFields().document(expected.scoreDocs[0].doc).get("text")); | ||
assertEquals("Copyright (c) 2006 Dawid Weiss", reader.storedFields().document(expected.scoreDocs[1].doc).get("text")); | ||
*/ | ||
} | ||
} | ||
|
||
/** Creates a new directory and adds documents with the given vectors as kNN vector fields */ | ||
Directory getKnownIndexStore( | ||
String field, int dimension, VectorSimilarityFunction vectorSimilarityFunction) | ||
throws IOException { | ||
Directory indexStore = newDirectory(random()); | ||
IndexWriter writer = new IndexWriter(indexStore, newIndexWriterConfig()); | ||
float[] scratch = new float[dimension]; | ||
for (String file : List.of("LICENSE.txt", "NOTICE.txt")) { | ||
try (InputStream in = BaseKnnVectorsFormatTestCase.class.getResourceAsStream(file); | ||
BufferedReader reader = new BufferedReader(new InputStreamReader(in, UTF_8))) { | ||
String line; | ||
int lineNo = -1; | ||
while ((line = reader.readLine()) != null) { | ||
line = line.strip(); | ||
if (line.isEmpty()) { | ||
continue; | ||
} | ||
++lineNo; | ||
Document doc = new Document(); | ||
doc.add( | ||
new KnnFloatVectorField( | ||
field, computeLineEmbedding(line, scratch), vectorSimilarityFunction)); | ||
doc.add(new StoredField("text", line)); | ||
doc.add(new StringField("id", file + "." + lineNo, Field.Store.YES)); | ||
writer.addDocument(doc); | ||
if (random().nextBoolean()) { | ||
// Add some documents without a vector | ||
addDocuments(writer, "id" + lineNo + ".", randomIntBetween(1, 5)); | ||
} | ||
} | ||
// System.out.println("indexed " + (lineNo + 1) + " lines from " + file); | ||
} | ||
} | ||
// Add some documents without a vector nor an id | ||
addDocuments(writer, null, 5); | ||
writer.close(); | ||
return indexStore; | ||
} | ||
|
||
private float[] computeLineEmbedding(String line, float[] vector) { | ||
Arrays.fill(vector, 0); | ||
for (int i = 0; i < line.length(); i++) { | ||
char c = line.charAt(i); | ||
vector[i % vector.length] += c / ((float) (i + 1) / vector.length); | ||
} | ||
VectorUtil.l2normalize(vector, false); | ||
return vector; | ||
} | ||
|
||
private void addDocuments(IndexWriter writer, String idBase, int count) throws IOException { | ||
for (int i = 0; i < count; i++) { | ||
Document doc = new Document(); | ||
doc.add(new StringField("other", "value", Field.Store.NO)); | ||
if (idBase != null) { | ||
doc.add(new StringField("id", idBase + i, Field.Store.YES)); | ||
} | ||
writer.addDocument(doc); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since Lucene90 didn't support sparse vector values, I am not sure this is strictly necessary. But I can understand it from a consistency standpoint.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, that's a relief! I couldn't remember if we had that or not. At any rate it is possible to create a sparse 90 index in tests now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is odd? When a Codec format is replaced, we move the "read only" part to
backwards-codecs
module, and the full read/write original codec totest-framework
so that unit tests can produce a 9.0 index and run modern tests (even new unit tests added since 9.0) against such "old" indices. But that read/write codec format should not be able to produce an index that the original 9.0 (core) format was not able to produce. They should be the same original code ... so maybe sparse vector values were in fact writable in 9.0 (by accident?)?