-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add BaseKnnVectorsFormatTestCase.testRecall() and fix old codecs #13910
Changes from 1 commit
b581275
df712f5
dd9b77e
bee57d8
ac85cf2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -102,6 +102,11 @@ public void testSearch() throws Exception { | |
} | ||
} | ||
|
||
@Override | ||
public void testRecall() { | ||
// ignore this test since this class always returns no results from search | ||
} | ||
Comment on lines
+106
to
+108
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still think that the underlying flat format should allow |
||
|
||
public void testQuantizedVectorsWriteAndRead() throws Exception { | ||
// create lucene directory with codec | ||
int numVectors = 1 + random().nextInt(50); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -562,7 +562,7 @@ private void add( | |
String idString = Integer.toString(id); | ||
doc.add(new StringField("id", idString, Field.Store.YES)); | ||
doc.add(new SortedDocValuesField("id", new BytesRef(idString))); | ||
// XSSystem.out.println("add " + idString + " " + Arrays.toString(vector)); | ||
// System.out.println("add " + idString + " " + Arrays.toString(vector)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just delete the line? |
||
iw.updateDocument(new Term("id", idString), doc); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,15 +16,21 @@ | |
*/ | ||
package org.apache.lucene.tests.index; | ||
|
||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; | ||
import static java.nio.charset.StandardCharsets.UTF_8; | ||
import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; | ||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.ByteArrayOutputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.concurrent.atomic.AtomicReference; | ||
|
@@ -70,6 +76,10 @@ | |
import org.apache.lucene.index.VectorEncoding; | ||
import org.apache.lucene.index.VectorSimilarityFunction; | ||
import org.apache.lucene.search.DocIdSetIterator; | ||
import org.apache.lucene.search.IndexSearcher; | ||
import org.apache.lucene.search.KnnFloatVectorQuery; | ||
import org.apache.lucene.search.MatchAllDocsQuery; | ||
import org.apache.lucene.search.ScoreDoc; | ||
import org.apache.lucene.search.Sort; | ||
import org.apache.lucene.search.SortField; | ||
import org.apache.lucene.search.TopDocs; | ||
|
@@ -1906,4 +1916,122 @@ public void testMismatchedFields() throws Exception { | |
|
||
IOUtils.close(reader, w2, dir1, dir2); | ||
} | ||
|
||
/** | ||
* Test that the query is a viable approximation to exact search. This test is designed to uncover | ||
* gross failures only, not to represent the true expected recall. | ||
*/ | ||
public void testRecall() throws IOException { | ||
VectorSimilarityFunction vectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be really neat if this went through each one. Quantization will do special things for different similarity functions and exercising each of those paths would be good. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agreed, we want to test all the things |
||
int dim = 16; | ||
try (Directory indexStore = getKnownIndexStore("field", dim, vectorSimilarityFunction); | ||
IndexReader reader = DirectoryReader.open(indexStore)) { | ||
IndexSearcher searcher = newSearcher(reader); | ||
float[] queryEmbedding = new float[dim]; | ||
String queryString = "Apache License"; | ||
computeLineEmbedding(queryString, queryEmbedding); | ||
// computeLineEmbedding(" END OF TERMS AND CONDITIONS", queryEmbedding); | ||
// pass match-all "filter" to force full traversal, bypassing graph | ||
KnnFloatVectorQuery exactQuery = | ||
new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, I think for more consistent runs, we may want to have multiple query embeddings that we test with and gather recall over. I just think having a single query might be very flaky in the long run. |
||
// indexed 421 lines from LICENSE.txt | ||
// indexed 157 lines from NOTICE.txt | ||
Comment on lines
+1944
to
+1945
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we are adding two files like this, I wonder if we should simply take real vectors from a real embedding model and put them in the resources folder. Maybe we can use glove or My concern is that having such a simplistic vector with so few dimensions might not actually be useful. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lemme see if I can find bugs with the "dictionary" PR? We can always beef this up with more realistic data |
||
assertEquals(578, searcher.count(exactQuery)); // Same for exact search | ||
KnnFloatVectorQuery query = new KnnFloatVectorQuery("field", queryEmbedding, 10); | ||
assertEquals(10, searcher.count(query)); // Expect some results without timeout | ||
TopDocs results = searcher.search(query, 10); | ||
Set<Integer> resultDocs = new HashSet<>(); | ||
for (ScoreDoc scoreDoc : results.scoreDocs) { | ||
/* | ||
System.out.println( | ||
"result " + i++ + ": " + reader.storedFields().document(scoreDoc.doc) + " " + scoreDoc); | ||
*/ | ||
resultDocs.add(scoreDoc.doc); | ||
} | ||
TopDocs expected = searcher.search(exactQuery, 10); | ||
// int i = 0; | ||
int recalled = 0; | ||
for (ScoreDoc scoreDoc : expected.scoreDocs) { | ||
/* | ||
System.out.println( | ||
"expected " | ||
+ i++ | ||
+ ": " | ||
+ reader.storedFields().document(scoreDoc.doc) | ||
+ " " | ||
+ scoreDoc); | ||
*/ | ||
if (resultDocs.contains(scoreDoc.doc)) { | ||
++recalled; | ||
} | ||
} | ||
assertTrue("recall should be at least 5/10, got " + recalled, recalled >= 5); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would hope recall should be within some known parameter. It would be good to know if recall improved or worsened. Either case could show an unexpected change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it could be tricky to be very precise given the range of codec options. I guess we could specialize per codec? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think having an |
||
/* | ||
assertEquals(queryString, reader.storedFields().document(results.scoreDocs[0].doc).get("text")); | ||
assertEquals("Copyright (c) 2006 Dawid Weiss", reader.storedFields().document(results.scoreDocs[1].doc).get("text")); | ||
assertEquals(queryString, reader.storedFields().document(expected.scoreDocs[0].doc).get("text")); | ||
assertEquals("Copyright (c) 2006 Dawid Weiss", reader.storedFields().document(expected.scoreDocs[1].doc).get("text")); | ||
*/ | ||
} | ||
} | ||
|
||
/** Creates a new directory and adds documents with the given vectors as kNN vector fields */ | ||
Directory getKnownIndexStore( | ||
String field, int dimension, VectorSimilarityFunction vectorSimilarityFunction) | ||
throws IOException { | ||
Directory indexStore = newDirectory(random()); | ||
IndexWriter writer = new IndexWriter(indexStore, newIndexWriterConfig()); | ||
float[] scratch = new float[dimension]; | ||
for (String file : List.of("LICENSE.txt", "NOTICE.txt")) { | ||
try (InputStream in = BaseKnnVectorsFormatTestCase.class.getResourceAsStream(file); | ||
BufferedReader reader = new BufferedReader(new InputStreamReader(in, UTF_8))) { | ||
String line; | ||
int lineNo = -1; | ||
while ((line = reader.readLine()) != null) { | ||
line = line.strip(); | ||
if (line.isEmpty()) { | ||
continue; | ||
} | ||
++lineNo; | ||
Document doc = new Document(); | ||
doc.add( | ||
new KnnFloatVectorField( | ||
field, computeLineEmbedding(line, scratch), vectorSimilarityFunction)); | ||
doc.add(new StoredField("text", line)); | ||
doc.add(new StringField("id", file + "." + lineNo, Field.Store.YES)); | ||
writer.addDocument(doc); | ||
if (random().nextBoolean()) { | ||
// Add some documents without a vector | ||
addDocuments(writer, "id" + lineNo + ".", randomIntBetween(1, 5)); | ||
} | ||
} | ||
// System.out.println("indexed " + (lineNo + 1) + " lines from " + file); | ||
} | ||
} | ||
// Add some documents without a vector nor an id | ||
addDocuments(writer, null, 5); | ||
writer.close(); | ||
return indexStore; | ||
} | ||
|
||
private float[] computeLineEmbedding(String line, float[] vector) { | ||
Arrays.fill(vector, 0); | ||
for (int i = 0; i < line.length(); i++) { | ||
char c = line.charAt(i); | ||
vector[i % vector.length] += c / ((float) (i + 1) / vector.length); | ||
} | ||
VectorUtil.l2normalize(vector, false); | ||
return vector; | ||
} | ||
|
||
private void addDocuments(IndexWriter writer, String idBase, int count) throws IOException { | ||
for (int i = 0; i < count; i++) { | ||
Document doc = new Document(); | ||
doc.add(new StringField("other", "value", Field.Store.NO)); | ||
if (idBase != null) { | ||
doc.add(new StringField("id", idBase + i, Field.Store.YES)); | ||
} | ||
writer.addDocument(doc); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since Lucene90 didn't support sparse vector values, I am not sure this is strictly necessary. But I can understand it from a consistency standpoint.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, that's a relief! I couldn't remember if we had that or not. At any rate it is possible to create a sparse 90 index in tests now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is odd? When a Codec format is replaced, we move the "read only" part to
backwards-codecs
module, and the full read/write original codec totest-framework
so that unit tests can produce a 9.0 index and run modern tests (even new unit tests added since 9.0) against such "old" indices. But that read/write codec format should not be able to produce an index that the original 9.0 (core) format was not able to produce. They should be the same original code ... so maybe sparse vector values were in fact writable in 9.0 (by accident?)?