From 04e96763d5a37a733564ffd9545c436b1c148fbd Mon Sep 17 00:00:00 2001 From: Jakub Slowinski <32519034+slow-J@users.noreply.github.com> Date: Thu, 18 Jun 2026 17:42:45 +0100 Subject: [PATCH 1/4] Add detail to KNN no-match explanations Recompute the explained doc's score in explain() so "not in top N docs" says why, e.g. below cutoff, excluded by filter, no vector value, or a tie-break/recall miss. --- lucene/CHANGES.txt | 2 + .../lucene/search/AbstractKnnVectorQuery.java | 59 ++++++++++++++++++- .../lucene/search/DocAndScoreQuery.java | 44 ++++++++++++-- .../search/BaseKnnVectorQueryTestCase.java | 57 +++++++++++++++--- 4 files changed, 148 insertions(+), 14 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a6e8073576..f7857f33ea31 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -439,6 +439,8 @@ Improvements * GITHUB#16264: Check if merge is aborted before executing file integrity checks to avoid costly full-file checksums on segments when the merge has already been cancelled. (Tanguy Leroux) +* GITHUB#16271: Add detail to KNN no-match explanations. (Jakub Slowinski) + Optimizations --------------------- * GITHUB#16111: Optimize FieldExistsQuery.count() when all docs have the field. 
(Prithvi S) diff --git a/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java index 4bdd821ccba0..2218a4bb3953 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java @@ -149,9 +149,64 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { topK = runSearchTasks(tasks, taskExecutor, perLeafResults, leafReaderContexts); } if (topK.scoreDocs.length == 0) { - return MatchNoDocsQuery.INSTANCE; + return new MatchNoDocsQuery("No documents matched the nearest-neighbor search"); } - return DocAndScoreQuery.createDocAndScoreQuery(reader, topK, reentryCount); + return DocAndScoreQuery.createDocAndScoreQuery( + reader, topK, reentryCount, noMatchExplainer(topK, filterWeight)); + } + + /** Builds the explainer for documents this query did not collect, capturing minTopKScore. */ + private DocAndScoreQuery.NoMatchExplainer noMatchExplainer(TopDocs topK, Weight filterWeight) { + // topK is score-descending, so the lowest collected score is the last entry. + final float minTopKScore = topK.scoreDocs[topK.scoreDocs.length - 1].score; + return (context, doc, topN) -> + explainNotCollected(context, doc, topN, filterWeight, minTopKScore); + } + + /** Explains why a doc was not collected, by recomputing its score. Returns null when the field has no vectors. 
*/ + private Explanation explainNotCollected( + LeafReaderContext context, int doc, int topN, Weight filterWeight, float minTopKScore) + throws IOException { + String prefix = "Not in top " + topN + " doc(s): "; + FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field); + if (fi == null || fi.getVectorDimension() == 0) { + return null; + } + VectorScorer vectorScorer = createVectorScorer(context, fi); + if (vectorScorer == null) { + return null; + } + if (vectorScorer.iterator().advance(doc) != doc) { + return Explanation.noMatch(prefix + "no vector value in field \"" + field + "\""); + } + if (filterWeight != null && docPassesFilter(filterWeight, context, doc) == false) { + return Explanation.noMatch(prefix + "excluded by filter"); + } + float score = vectorScorer.score(); + if (score < minTopKScore) { + return Explanation.noMatch(prefix + "score " + score + " < minTopKScore " + minTopKScore); + } + // Score meets the cutoff but the doc was not collected (tie-break, recall miss, or rescoring). 
+ return Explanation.noMatch( + prefix + + "score " + + score + + " >= minTopKScore " + + minTopKScore + + " (tie-break or approximate-search miss)"); + } + + private static boolean docPassesFilter(Weight filterWeight, LeafReaderContext context, int doc) + throws IOException { + Scorer scorer = filterWeight.scorer(context); + if (scorer == null) { + return false; + } + TwoPhaseIterator twoPhase = scorer.twoPhaseIterator(); + if (twoPhase != null) { + return twoPhase.approximation().advance(doc) == doc && twoPhase.matches(); + } + return scorer.iterator().advance(doc) == doc; } private TopDocs runSearchTasks( diff --git a/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java index cf71ca8d8c20..3dbd9d86bb17 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java @@ -29,6 +29,12 @@ /** A query that wraps precomputed documents and scores */ class DocAndScoreQuery extends Query { + /** Optional hook to explain why a doc is missing. Returns null for the generic message. */ + @FunctionalInterface + interface NoMatchExplainer { + Explanation explain(LeafReaderContext context, int doc, int topN) throws IOException; + } + private final int[] docs; private final float[] scores; private final float maxScore; @@ -36,6 +42,8 @@ class DocAndScoreQuery extends Query { private final long visited; private final Object contextIdentity; private final int reentryCount; + // Only used in explain(). Omitted from equals()/hashCode(). 
+ private final NoMatchExplainer noMatchExplainer; /** * Constructor @@ -60,6 +68,18 @@ class DocAndScoreQuery extends Query { long visited, Object contextIdentity, int reentryCount) { + this(docs, scores, maxScore, segmentStarts, visited, contextIdentity, reentryCount, null); + } + + DocAndScoreQuery( + int[] docs, + float[] scores, + float maxScore, + int[] segmentStarts, + long visited, + Object contextIdentity, + int reentryCount, + NoMatchExplainer noMatchExplainer) { this.docs = docs; this.scores = scores; this.maxScore = maxScore; @@ -67,9 +87,15 @@ class DocAndScoreQuery extends Query { this.visited = visited; this.contextIdentity = contextIdentity; this.reentryCount = reentryCount; + this.noMatchExplainer = noMatchExplainer; } static Query createDocAndScoreQuery(IndexReader reader, TopDocs topK, int reentryCount) { + return createDocAndScoreQuery(reader, topK, reentryCount, null); + } + + static Query createDocAndScoreQuery( + IndexReader reader, TopDocs topK, int reentryCount, NoMatchExplainer noMatchExplainer) { int len = topK.scoreDocs.length; assert len > 0; float maxScore = topK.scoreDocs[0].score; @@ -88,7 +114,8 @@ static Query createDocAndScoreQuery(IndexReader reader, TopDocs topK, int reentr segmentStarts, topK.totalHits.value(), reader.getContext().id(), - reentryCount); + reentryCount, + noMatchExplainer); } static int[] findSegmentStarts(List<LeafReaderContext> leaves, int[] docs) { @@ -121,12 +148,19 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo } return new Weight(this) { @Override - public Explanation explain(LeafReaderContext context, int doc) { + public Explanation explain(LeafReaderContext context, int doc) throws IOException { int found = Arrays.binarySearch(docs, doc + context.docBase); if (found < 0) { - return Explanation.noMatch("not in top " + docs.length + " docs"); + // Defer to the originating query for a richer reason, if available. 
+ if (noMatchExplainer != null) { + Explanation enriched = noMatchExplainer.explain(context, doc, docs.length); + if (enriched != null) { + return enriched; + } + } + return Explanation.noMatch("Not in top " + docs.length + " doc(s)"); } - return Explanation.match(scores[found] * boost, "within top " + docs.length + " docs"); + return Explanation.match(scores[found] * boost, "Within top " + docs.length + " doc(s)"); } @Override @@ -218,7 +252,7 @@ public boolean isCacheable(LeafReaderContext ctx) { @Override public String toString(String field) { - return "DocAndScoreQuery[" + docs[0] + ",...][" + scores[0] + ",...]," + maxScore; + return "DocAndScoreQuery[" + docs.length + " doc(s), maxScore=" + maxScore + "]"; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index f2c7acd3f6d3..e98cdd51be4a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -169,6 +169,8 @@ public void testEmptyIndex() throws IOException { assertMatches(searcher, kvq, 0); Query q = searcher.rewrite(kvq); assertTrue(q instanceof MatchNoDocsQuery); + assertEquals( + "MatchNoDocsQuery(\"No documents matched the nearest-neighbor search\")", q.toString()); } } @@ -433,13 +435,16 @@ public void testExplain() throws IOException { // scores vary widely due to quantization assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5); assertEquals(0, matched.getDetails().length); - assertEquals("within top 3 docs", matched.getDescription()); + assertEquals("Within top 3 doc(s)", matched.getDescription()); - Explanation nomatch = searcher.explain(query, 5); + // Doc 0 ({0,0}) is farthest from the query {2,3}, so it is reliably ranked out. 
+ Explanation nomatch = searcher.explain(query, 0); assertFalse(nomatch.isMatch()); assertEquals(0f, nomatch.getValue()); assertEquals(0, matched.getDetails().length); - assertEquals("not in top 3 docs", nomatch.getDescription()); + assertTrue( + nomatch.getDescription(), + nomatch.getDescription().startsWith("Not in top 3 doc(s): score ")); } } } @@ -462,13 +467,50 @@ public void testExplainMultipleSegments() throws IOException { // scores vary widely due to quantization assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5); assertEquals(0, matched.getDetails().length); - assertEquals("within top 3 docs", matched.getDescription()); + assertEquals("Within top 3 doc(s)", matched.getDescription()); - Explanation nomatch = searcher.explain(query, 4); + // Doc 0 ({0,0}) is farthest from the query {2,3}, so it is reliably ranked out. + Explanation nomatch = searcher.explain(query, 0); assertFalse(nomatch.isMatch()); assertEquals(0f, nomatch.getValue()); assertEquals(0, matched.getDetails().length); - assertEquals("not in top 3 docs", nomatch.getDescription()); + assertTrue( + nomatch.getDescription(), + nomatch.getDescription().startsWith("Not in top 3 doc(s): score ")); + } + } + } + + public void testExplainFiltered() throws IOException { + try (Directory d = newDirectoryForTest()) { + try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { + for (int j = 0; j < 5; j++) { + Document doc = new Document(); + doc.add(getKnnVectorField("field", new float[] {j, j})); + doc.add(new IntPoint("tag", j)); + w.addDocument(doc); + } + // Doc 5 passes the filter (tag in range) but has no vector. + Document noVector = new Document(); + noVector.add(new IntPoint("tag", 1)); + w.addDocument(noVector); + } + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = new IndexSearcher(reader); + // Filter to docs 0-2, so docs 3 and 4 cannot be collected. 
+ Query filter = IntPoint.newRangeQuery("tag", 0, 2); + AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3, filter); + + // Doc 4 has a vector but fails the filter. + Explanation filtered = searcher.explain(query, 4); + assertFalse(filtered.isMatch()); + assertEquals("Not in top 3 doc(s): excluded by filter", filtered.getDescription()); + + // Doc 5 passes the filter but has no vector, so blame the missing vector, not the filter. + Explanation noVector = searcher.explain(query, 5); + assertFalse(noVector.isMatch()); + assertEquals( + "Not in top 3 doc(s): no vector value in field \"field\"", noVector.getDescription()); } } } @@ -1072,7 +1114,8 @@ void assertDocScoreQueryToString(Query query) { // Since a forceMerge could occur in this test, we must not assert that a specific doc_id is // matched // But that instead the string format is expected and that the max score is 1.0 - assertTrue(queryString.matches("DocAndScoreQuery\\[\\d+,...]\\[\\d+.\\d+,...],1.0")); + assertTrue( + queryString, queryString.matches("DocAndScoreQuery\\[\\d+ doc\\(s\\), maxScore=1.0]")); } /** From dd1a52efb0a76a7865b8cf79dd9fe390f29af43a Mon Sep 17 00:00:00 2001 From: Jakub Slowinski <32519034+slow-J@users.noreply.github.com> Date: Fri, 19 Jun 2026 12:16:26 +0100 Subject: [PATCH 2/4] Add testExplainTieBreak --- .../search/BaseKnnVectorQueryTestCase.java | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index e98cdd51be4a..7fcf44afebdd 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -515,6 +515,42 @@ public void testExplainFiltered() throws IOException { } } + public void testExplainTieBreak() throws IOException { + try (Directory d = newDirectoryForTest()) { 
+ // All five docs share one vector, so they all score equally. With top k = 3, two are dropped + // due to tie-break. + try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { + for (int j = 0; j < 5; j++) { + Document doc = new Document(); + doc.add(getKnnVectorField("field", new float[] {1, 1})); + w.addDocument(doc); + } + w.forceMerge(1); + } + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = new IndexSearcher(reader); + AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {1, 1}, 3); + + Set<Integer> collected = new HashSet<>(); + for (ScoreDoc sd : searcher.search(query, 3).scoreDocs) { + collected.add(sd.doc); + } + int dropped = -1; + for (int doc = 0; doc < 5; doc++) { + if (collected.contains(doc) == false) { + dropped = doc; + break; + } + } + Explanation nomatch = searcher.explain(query, dropped); + assertFalse(nomatch.isMatch()); + String description = nomatch.getDescription(); + assertTrue(description, description.startsWith("Not in top 3 doc(s): score ")); + assertTrue(description, description.endsWith(" (tie-break or approximate-search miss)")); + } + } + } + /** Test that when vectors are abnormally distributed among segments, we still find the top K */ public void testSkewedIndex() throws IOException { /* We have to choose the numbers carefully here so that some segment has more than the expected From 5e03bf0193db38eba41b69f2cc0969049ea60b7f Mon Sep 17 00:00:00 2001 From: Jakub Slowinski <32519034+slow-J@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:07:51 +0100 Subject: [PATCH 3/4] Only allow noMatchExplainer.explain to be called for the reader the query was built against --- .../java/org/apache/lucene/search/DocAndScoreQuery.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java index 3dbd9d86bb17..d4dcea8a2d41 100644 --- 
a/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreQuery.java @@ -25,6 +25,7 @@ import java.util.Objects; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.ReaderUtil; /** A query that wraps precomputed documents and scores */ class DocAndScoreQuery extends Query { @@ -151,8 +152,10 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo public Explanation explain(LeafReaderContext context, int doc) throws IOException { int found = Arrays.binarySearch(docs, doc + context.docBase); if (found < 0) { - // Defer to the originating query for a richer reason, if available. - if (noMatchExplainer != null) { + // Defer to the originating query for a richer reason, but only when this leaf belongs + // to the reader this query was built against. + if (noMatchExplainer != null + && ReaderUtil.getTopLevelContext(context).id() == contextIdentity) { Explanation enriched = noMatchExplainer.explain(context, doc, docs.length); if (enriched != null) { return enriched; From 68c75ab7beead5bb703ed1f3f0b4c40c0eff1f9b Mon Sep 17 00:00:00 2001 From: Jakub Slowinski <32519034+slow-J@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:01:38 +0100 Subject: [PATCH 4/4] Move GITHUB#16271 changelog entry to Lucene 10.6.0 --- lucene/CHANGES.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f7857f33ea31..58d4df5c7574 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -287,7 +287,7 @@ New Features Improvements --------------------- -(No changes) +* GITHUB#16271: Add detail to KNN no-match explanations. 
(Jakub Slowinski) Optimizations --------------------- @@ -439,8 +439,6 @@ Improvements * GITHUB#16264: Check if merge is aborted before executing file integrity checks to avoid costly full-file checksums on segments when the merge has already been cancelled. (Tanguy Leroux) -* GITHUB#16271: Add detail to KNN no-match explanations. (Jakub Slowinski) - Optimizations --------------------- * GITHUB#16111: Optimize FieldExistsQuery.count() when all docs have the field. (Prithvi S)