Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ New Features

Improvements
---------------------
(No changes)
* GITHUB#16271: Add detail to KNN no-match explanations. (Jakub Slowinski)

Optimizations
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,64 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException {
topK = runSearchTasks(tasks, taskExecutor, perLeafResults, leafReaderContexts);
}
if (topK.scoreDocs.length == 0) {
return MatchNoDocsQuery.INSTANCE;
return new MatchNoDocsQuery("No documents matched the nearest-neighbor search");
}
return DocAndScoreQuery.createDocAndScoreQuery(reader, topK, reentryCount);
return DocAndScoreQuery.createDocAndScoreQuery(
reader, topK, reentryCount, noMatchExplainer(topK, filterWeight));
}

/** Builds the explainer for documents this query did not collect, capturing minTopKScore. */
private DocAndScoreQuery.NoMatchExplainer noMatchExplainer(TopDocs topK, Weight filterWeight) {
// topK is score-descending, so the lowest collected score is the last entry.
final float minTopKScore = topK.scoreDocs[topK.scoreDocs.length - 1].score;
return (context, doc, topN) ->
explainNotCollected(context, doc, topN, filterWeight, minTopKScore);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

filterWeight here was created during rewrite() against a specific reader. But explain() can be called later with a different reader. when that happens, filterWeight.scorer(context) gets a LeafReaderContext it wasn't built for, which could blow up.
match side is fine since scores/docs are already materialized in the array. but this no-match path re-evaluates the filter live, so it has this stale-reader risk.

can you add a try/catch around the filter check.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, good catch!

Looking into this a bit, I don't think it would always throw an exception when called with a different reader, but it would evaluate the filter against the other reader's segment, giving the wrong answer.

Instead of the try/catch, I think we should gate the explain in DocAndScoreQuery#createWeight on contextIdentity, like so:

 if (noMatchExplainer != null
             && ReaderUtil.getTopLevelContext(context).id() == contextIdentity)

This is similar to the existing check in createWeight.

What do you think?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the guard as proposed above and confirmed that it works as intended.

Steps taken to confirmed it:

  1. Opened two indexes with readerA and readerB.
  2. Created a KNN query and rewrote it with searcherA, creating the DocAndScoreQuery, and then called createWeight to get weightA.
  3. Called the explain(Weight, int) with searcherB.explain(weightA, 6);

Without the guard, the IndexSearcher#explain(Weight, int) gives an incorrect explanation, with the check it gives the generic Not in top X doc(s). message.

}

/** Explains why a doc was not collected, by recomputing its score. null when no vectors. */
private Explanation explainNotCollected(
LeafReaderContext context, int doc, int topN, Weight filterWeight, float minTopKScore)
throws IOException {
String prefix = "Not in top " + topN + " doc(s): ";
FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field);
if (fi == null || fi.getVectorDimension() == 0) {
return null;
}
VectorScorer vectorScorer = createVectorScorer(context, fi);
if (vectorScorer == null) {
return null;
}
if (vectorScorer.iterator().advance(doc) != doc) {
return Explanation.noMatch(prefix + "no vector value in field \"" + field + "\"");
}
if (filterWeight != null && docPassesFilter(filterWeight, context, doc) == false) {
return Explanation.noMatch(prefix + "excluded by filter");
}
float score = vectorScorer.score();
if (score < minTopKScore) {
return Explanation.noMatch(prefix + "score " + score + " < minTopKScore " + minTopKScore);
}
// Score meets the cutoff but the doc was not collected (tie-break, recall miss, or rescoring).
return Explanation.noMatch(
prefix
+ "score "
+ score
+ " >= minTopKScore "
+ minTopKScore
+ " (tie-break or approximate-search miss)");
}

private static boolean docPassesFilter(Weight filterWeight, LeafReaderContext context, int doc)
throws IOException {
Scorer scorer = filterWeight.scorer(context);
if (scorer == null) {
return false;
}
TwoPhaseIterator twoPhase = scorer.twoPhaseIterator();
if (twoPhase != null) {
return twoPhase.approximation().advance(doc) == doc && twoPhase.matches();
}
return scorer.iterator().advance(doc) == doc;
}

private TopDocs runSearchTasks(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,26 @@
import java.util.Objects;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;

/** A query that wraps precomputed documents and scores */
class DocAndScoreQuery extends Query {

/** Optional hook to explain why a doc is missing. Returns null for the generic message. */
@FunctionalInterface
interface NoMatchExplainer {
Explanation explain(LeafReaderContext context, int doc, int topN) throws IOException;
}

private final int[] docs;
private final float[] scores;
private final float maxScore;
private final int[] segmentStarts;
private final long visited;
private final Object contextIdentity;
private final int reentryCount;
// Only used in explain(). Omitted from equals()/hashCode().
private final NoMatchExplainer noMatchExplainer;

/**
* Constructor
Expand All @@ -60,16 +69,34 @@ class DocAndScoreQuery extends Query {
long visited,
Object contextIdentity,
int reentryCount) {
this(docs, scores, maxScore, segmentStarts, visited, contextIdentity, reentryCount, null);
}

DocAndScoreQuery(
int[] docs,
float[] scores,
float maxScore,
int[] segmentStarts,
long visited,
Object contextIdentity,
int reentryCount,
NoMatchExplainer noMatchExplainer) {
this.docs = docs;
this.scores = scores;
this.maxScore = maxScore;
this.segmentStarts = segmentStarts;
this.visited = visited;
this.contextIdentity = contextIdentity;
this.reentryCount = reentryCount;
this.noMatchExplainer = noMatchExplainer;
}

static Query createDocAndScoreQuery(IndexReader reader, TopDocs topK, int reentryCount) {
return createDocAndScoreQuery(reader, topK, reentryCount, null);
}

static Query createDocAndScoreQuery(
IndexReader reader, TopDocs topK, int reentryCount, NoMatchExplainer noMatchExplainer) {
int len = topK.scoreDocs.length;
assert len > 0;
float maxScore = topK.scoreDocs[0].score;
Expand All @@ -88,7 +115,8 @@ static Query createDocAndScoreQuery(IndexReader reader, TopDocs topK, int reentr
segmentStarts,
topK.totalHits.value(),
reader.getContext().id(),
reentryCount);
reentryCount,
noMatchExplainer);
}

static int[] findSegmentStarts(List<LeafReaderContext> leaves, int[] docs) {
Expand Down Expand Up @@ -121,12 +149,21 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo
}
return new Weight(this) {
@Override
public Explanation explain(LeafReaderContext context, int doc) {
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
int found = Arrays.binarySearch(docs, doc + context.docBase);
if (found < 0) {
return Explanation.noMatch("not in top " + docs.length + " docs");
// Defer to the originating query for a richer reason, but only when this leaf belongs
// to the reader this query was built against.
if (noMatchExplainer != null
&& ReaderUtil.getTopLevelContext(context).id() == contextIdentity) {
Explanation enriched = noMatchExplainer.explain(context, doc, docs.length);
if (enriched != null) {
return enriched;
}
}
return Explanation.noMatch("Not in top " + docs.length + " doc(s)");
}
return Explanation.match(scores[found] * boost, "within top " + docs.length + " docs");
return Explanation.match(scores[found] * boost, "Within top " + docs.length + " doc(s)");
}

@Override
Expand Down Expand Up @@ -218,7 +255,7 @@ public boolean isCacheable(LeafReaderContext ctx) {

@Override
public String toString(String field) {
return "DocAndScoreQuery[" + docs[0] + ",...][" + scores[0] + ",...]," + maxScore;
return "DocAndScoreQuery[" + docs.length + " doc(s), maxScore=" + maxScore + "]";
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ public void testEmptyIndex() throws IOException {
assertMatches(searcher, kvq, 0);
Query q = searcher.rewrite(kvq);
assertTrue(q instanceof MatchNoDocsQuery);
assertEquals(
"MatchNoDocsQuery(\"No documents matched the nearest-neighbor search\")", q.toString());
}
}

Expand Down Expand Up @@ -433,13 +435,16 @@ public void testExplain() throws IOException {
// scores vary widely due to quantization
assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5);
assertEquals(0, matched.getDetails().length);
assertEquals("within top 3 docs", matched.getDescription());
assertEquals("Within top 3 doc(s)", matched.getDescription());

Explanation nomatch = searcher.explain(query, 5);
// Doc 0 ({0,0}) is farthest from the query {2,3}, so it is reliably ranked out.
Explanation nomatch = searcher.explain(query, 0);
assertFalse(nomatch.isMatch());
assertEquals(0f, nomatch.getValue());
assertEquals(0, matched.getDetails().length);
assertEquals("not in top 3 docs", nomatch.getDescription());
assertTrue(
nomatch.getDescription(),
nomatch.getDescription().startsWith("Not in top 3 doc(s): score "));
}
}
}
Expand All @@ -462,13 +467,86 @@ public void testExplainMultipleSegments() throws IOException {
// scores vary widely due to quantization
assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5);
assertEquals(0, matched.getDetails().length);
assertEquals("within top 3 docs", matched.getDescription());
assertEquals("Within top 3 doc(s)", matched.getDescription());

Explanation nomatch = searcher.explain(query, 4);
// Doc 0 ({0,0}) is farthest from the query {2,3}, so it is reliably ranked out.
Explanation nomatch = searcher.explain(query, 0);
assertFalse(nomatch.isMatch());
assertEquals(0f, nomatch.getValue());
assertEquals(0, matched.getDetails().length);
assertEquals("not in top 3 docs", nomatch.getDescription());
assertTrue(
nomatch.getDescription(),
nomatch.getDescription().startsWith("Not in top 3 doc(s): score "));
}
}
}

public void testExplainFiltered() throws IOException {
try (Directory d = newDirectoryForTest()) {
try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
for (int j = 0; j < 5; j++) {
Document doc = new Document();
doc.add(getKnnVectorField("field", new float[] {j, j}));
doc.add(new IntPoint("tag", j));
w.addDocument(doc);
}
// Doc 5 passes the filter (tag in range) but has no vector.
Document noVector = new Document();
noVector.add(new IntPoint("tag", 1));
w.addDocument(noVector);
}
try (IndexReader reader = DirectoryReader.open(d)) {
IndexSearcher searcher = new IndexSearcher(reader);
// Filter to docs 0-2, so docs 3 and 4 cannot be collected.
Query filter = IntPoint.newRangeQuery("tag", 0, 2);
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3, filter);

// Doc 4 has a vector but fails the filter.
Explanation filtered = searcher.explain(query, 4);
assertFalse(filtered.isMatch());
assertEquals("Not in top 3 doc(s): excluded by filter", filtered.getDescription());

// Doc 5 passes the filter but has no vector, so blame the missing vector, not the filter.
Explanation noVector = searcher.explain(query, 5);
assertFalse(noVector.isMatch());
assertEquals(
"Not in top 3 doc(s): no vector value in field \"field\"", noVector.getDescription());
}
}
}

public void testExplainTieBreak() throws IOException {
try (Directory d = newDirectoryForTest()) {
// All five docs share one vector, so they all score equally. With top k = 3, two are dropped
// due to tie-break.
try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
for (int j = 0; j < 5; j++) {
Document doc = new Document();
doc.add(getKnnVectorField("field", new float[] {1, 1}));
w.addDocument(doc);
}
w.forceMerge(1);
}
try (IndexReader reader = DirectoryReader.open(d)) {
IndexSearcher searcher = new IndexSearcher(reader);
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {1, 1}, 3);

Set<Integer> collected = new HashSet<>();
for (ScoreDoc sd : searcher.search(query, 3).scoreDocs) {
collected.add(sd.doc);
}
int dropped = -1;
for (int doc = 0; doc < 5; doc++) {
if (collected.contains(doc) == false) {
dropped = doc;
break;
}
}
Explanation nomatch = searcher.explain(query, dropped);
assertFalse(nomatch.isMatch());
String description = nomatch.getDescription();
assertTrue(description, description.startsWith("Not in top 3 doc(s): score "));
assertTrue(description, description.endsWith(" (tie-break or approximate-search miss)"));
}
}
}
Expand Down Expand Up @@ -1072,7 +1150,8 @@ void assertDocScoreQueryToString(Query query) {
// Since a forceMerge could occur in this test, we must not assert that a specific doc_id is
// matched
// But that instead the string format is expected and that the max score is 1.0
assertTrue(queryString.matches("DocAndScoreQuery\\[\\d+,...]\\[\\d+.\\d+,...],1.0"));
assertTrue(
queryString, queryString.matches("DocAndScoreQuery\\[\\d+ doc\\(s\\), maxScore=1.0]"));
}

/**
Expand Down
Loading