diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a6e8073576..58d4df5c7574 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -287,7 +287,7 @@ New Features Improvements --------------------- -(No changes) +* GITHUB#16271: Add detail to KNN no-match explanations. (Jakub Slowinski) Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java index 4bdd821ccba0..2218a4bb3953 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java @@ -149,9 +149,64 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { topK = runSearchTasks(tasks, taskExecutor, perLeafResults, leafReaderContexts); } if (topK.scoreDocs.length == 0) { - return MatchNoDocsQuery.INSTANCE; + return new MatchNoDocsQuery("No documents matched the nearest-neighbor search"); } - return DocAndScoreQuery.createDocAndScoreQuery(reader, topK, reentryCount); + return DocAndScoreQuery.createDocAndScoreQuery( + reader, topK, reentryCount, noMatchExplainer(topK, filterWeight)); + } + + /** Builds the explainer for documents this query did not collect, capturing minTopKScore. */ + private DocAndScoreQuery.NoMatchExplainer noMatchExplainer(TopDocs topK, Weight filterWeight) { + // topK is score-descending, so the lowest collected score is the last entry. + final float minTopKScore = topK.scoreDocs[topK.scoreDocs.length - 1].score; + return (context, doc, topN) -> + explainNotCollected(context, doc, topN, filterWeight, minTopKScore); + } + + /** Explains why a doc was not collected, by recomputing its score. null when no vectors. */ + private Explanation explainNotCollected( + LeafReaderContext context, int doc, int topN, Weight filterWeight, float minTopKScore) + throws IOException { + String prefix = "Not in top " + topN + " doc(s): "; + FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field); + if (fi == null || fi.getVectorDimension() == 0) { + return null; + } + VectorScorer vectorScorer = createVectorScorer(context, fi); + if (vectorScorer == null) { + return null; + } + if (vectorScorer.iterator().advance(doc) != doc) { + return Explanation.noMatch(prefix + "no vector value in field \"" + field + "\""); + } + if (filterWeight != null && docPassesFilter(filterWeight, context, doc) == false) { + return Explanation.noMatch(prefix + "excluded by filter"); + } + float score = vectorScorer.score(); + if (score < minTopKScore) { + return Explanation.noMatch(prefix + "score " + score + " < minTopKScore " + minTopKScore); + } + // Score meets the cutoff but the doc was not collected (tie-break, recall miss, or rescoring). + return Explanation.noMatch( + prefix + + "score " + + score + + " >= minTopKScore " + + minTopKScore + + " (tie-break or approximate-search miss)"); + } + + private static boolean docPassesFilter(Weight filterWeight, LeafReaderContext context, int doc) + throws IOException { + Scorer scorer = filterWeight.scorer(context); + if (scorer == null) { + return false; + } + TwoPhaseIterator twoPhase = scorer.twoPhaseIterator(); + if (twoPhase != null) { + return twoPhase.approximation().advance(doc) == doc && twoPhase.matches(); + } + return scorer.iterator().advance(doc) == doc; } private TopDocs runSearchTasks( diff --git a/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java index cf71ca8d8c20..d4dcea8a2d41 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java @@ -25,10 +25,17 @@ import java.util.Objects; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.ReaderUtil; /** A query that wraps precomputed documents and scores */ class DocAndScoreQuery extends Query { + /** Optional hook to explain why a doc is missing. Returns null for the generic message. */ + @FunctionalInterface + interface NoMatchExplainer { + Explanation explain(LeafReaderContext context, int doc, int topN) throws IOException; + } + private final int[] docs; private final float[] scores; private final float maxScore; @@ -36,6 +43,8 @@ class DocAndScoreQuery extends Query { private final long visited; private final Object contextIdentity; private final int reentryCount; + // Only used in explain(). Omitted from equals()/hashCode(). + private final NoMatchExplainer noMatchExplainer; /** * Constructor @@ -60,6 +69,18 @@ class DocAndScoreQuery extends Query { long visited, Object contextIdentity, int reentryCount) { + this(docs, scores, maxScore, segmentStarts, visited, contextIdentity, reentryCount, null); + } + + DocAndScoreQuery( + int[] docs, + float[] scores, + float maxScore, + int[] segmentStarts, + long visited, + Object contextIdentity, + int reentryCount, + NoMatchExplainer noMatchExplainer) { this.docs = docs; this.scores = scores; this.maxScore = maxScore; @@ -67,9 +88,15 @@ class DocAndScoreQuery extends Query { this.visited = visited; this.contextIdentity = contextIdentity; this.reentryCount = reentryCount; + this.noMatchExplainer = noMatchExplainer; } static Query createDocAndScoreQuery(IndexReader reader, TopDocs topK, int reentryCount) { + return createDocAndScoreQuery(reader, topK, reentryCount, null); + } + + static Query createDocAndScoreQuery( + IndexReader reader, TopDocs topK, int reentryCount, NoMatchExplainer noMatchExplainer) { int len = topK.scoreDocs.length; assert len > 0; float maxScore = topK.scoreDocs[0].score; @@ -88,7 +115,8 @@ static Query createDocAndScoreQuery(IndexReader reader, TopDocs topK, int reentr segmentStarts, topK.totalHits.value(), reader.getContext().id(), - reentryCount); + reentryCount, + noMatchExplainer); } static int[] findSegmentStarts(List leaves, int[] docs) { @@ -121,12 +149,21 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo } return new Weight(this) { @Override - public Explanation explain(LeafReaderContext context, int doc) { + public Explanation explain(LeafReaderContext context, int doc) throws IOException { int found = Arrays.binarySearch(docs, doc + context.docBase); if (found < 0) { - return Explanation.noMatch("not in top " + docs.length + " docs"); + // Defer to the originating query for a richer reason, but only when this leaf belongs + // to the reader this query was built against. + if (noMatchExplainer != null + && ReaderUtil.getTopLevelContext(context).id() == contextIdentity) { + Explanation enriched = noMatchExplainer.explain(context, doc, docs.length); + if (enriched != null) { + return enriched; + } + } + return Explanation.noMatch("Not in top " + docs.length + " doc(s)"); } - return Explanation.match(scores[found] * boost, "within top " + docs.length + " docs"); + return Explanation.match(scores[found] * boost, "Within top " + docs.length + " doc(s)"); } @Override @@ -218,7 +255,7 @@ public boolean isCacheable(LeafReaderContext ctx) { @Override public String toString(String field) { - return "DocAndScoreQuery[" + docs[0] + ",...][" + scores[0] + ",...]," + maxScore; + return "DocAndScoreQuery[" + docs.length + " doc(s), maxScore=" + maxScore + "]"; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index f2c7acd3f6d3..7fcf44afebdd 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -169,6 +169,8 @@ public void testEmptyIndex() throws IOException { assertMatches(searcher, kvq, 0); Query q = searcher.rewrite(kvq); assertTrue(q instanceof MatchNoDocsQuery); + assertEquals( + "MatchNoDocsQuery(\"No documents matched the nearest-neighbor search\")", q.toString()); } } @@ -433,13 +435,16 @@ public void testExplain() throws IOException { // scores vary widely due to quantization assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5); assertEquals(0, matched.getDetails().length); - assertEquals("within top 3 docs", matched.getDescription()); + assertEquals("Within top 3 doc(s)", matched.getDescription()); - Explanation nomatch = searcher.explain(query, 5); + // Doc 0 ({0,0}) is farthest from the query {2,3}, so it is reliably ranked out. + Explanation nomatch = searcher.explain(query, 0); assertFalse(nomatch.isMatch()); assertEquals(0f, nomatch.getValue()); assertEquals(0, matched.getDetails().length); - assertEquals("not in top 3 docs", nomatch.getDescription()); + assertTrue( + nomatch.getDescription(), + nomatch.getDescription().startsWith("Not in top 3 doc(s): score ")); } } } @@ -462,13 +467,86 @@ public void testExplainMultipleSegments() throws IOException { // scores vary widely due to quantization assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5); assertEquals(0, matched.getDetails().length); - assertEquals("within top 3 docs", matched.getDescription()); + assertEquals("Within top 3 doc(s)", matched.getDescription()); - Explanation nomatch = searcher.explain(query, 4); + // Doc 0 ({0,0}) is farthest from the query {2,3}, so it is reliably ranked out. + Explanation nomatch = searcher.explain(query, 0); assertFalse(nomatch.isMatch()); assertEquals(0f, nomatch.getValue()); assertEquals(0, matched.getDetails().length); - assertEquals("not in top 3 docs", nomatch.getDescription()); + assertTrue( + nomatch.getDescription(), + nomatch.getDescription().startsWith("Not in top 3 doc(s): score ")); + } + } + } + + public void testExplainFiltered() throws IOException { + try (Directory d = newDirectoryForTest()) { + try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { + for (int j = 0; j < 5; j++) { + Document doc = new Document(); + doc.add(getKnnVectorField("field", new float[] {j, j})); + doc.add(new IntPoint("tag", j)); + w.addDocument(doc); + } + // Doc 5 passes the filter (tag in range) but has no vector. + Document noVector = new Document(); + noVector.add(new IntPoint("tag", 1)); + w.addDocument(noVector); + } + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = new IndexSearcher(reader); + // Filter to docs 0-2, so docs 3 and 4 cannot be collected. + Query filter = IntPoint.newRangeQuery("tag", 0, 2); + AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3, filter); + + // Doc 4 has a vector but fails the filter. + Explanation filtered = searcher.explain(query, 4); + assertFalse(filtered.isMatch()); + assertEquals("Not in top 3 doc(s): excluded by filter", filtered.getDescription()); + + // Doc 5 passes the filter but has no vector, so blame the missing vector, not the filter. + Explanation noVector = searcher.explain(query, 5); + assertFalse(noVector.isMatch()); + assertEquals( + "Not in top 3 doc(s): no vector value in field \"field\"", noVector.getDescription()); + } + } + } + + public void testExplainTieBreak() throws IOException { + try (Directory d = newDirectoryForTest()) { + // All five docs share one vector, so they all score equally. With top k = 3, two are dropped + // due to tie-break. + try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { + for (int j = 0; j < 5; j++) { + Document doc = new Document(); + doc.add(getKnnVectorField("field", new float[] {1, 1})); + w.addDocument(doc); + } + w.forceMerge(1); + } + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = new IndexSearcher(reader); + AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {1, 1}, 3); + + Set collected = new HashSet<>(); + for (ScoreDoc sd : searcher.search(query, 3).scoreDocs) { + collected.add(sd.doc); + } + int dropped = -1; + for (int doc = 0; doc < 5; doc++) { + if (collected.contains(doc) == false) { + dropped = doc; + break; + } + } + Explanation nomatch = searcher.explain(query, dropped); + assertFalse(nomatch.isMatch()); + String description = nomatch.getDescription(); + assertTrue(description, description.startsWith("Not in top 3 doc(s): score ")); + assertTrue(description, description.endsWith(" (tie-break or approximate-search miss)")); } } } @@ -1072,7 +1150,8 @@ void assertDocScoreQueryToString(Query query) { // Since a forceMerge could occur in this test, we must not assert that a specific doc_id is // matched // But that instead the string format is expected and that the max score is 1.0 - assertTrue(queryString.matches("DocAndScoreQuery\\[\\d+,...]\\[\\d+.\\d+,...],1.0")); + assertTrue( + queryString, queryString.matches("DocAndScoreQuery\\[\\d+ doc\\(s\\), maxScore=1.0]")); } /**