Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ API Changes

New Features
---------------------
* GITHUB#15794: Add DocValuesSkipper metadata for the maximum number of values
on any document in a field. (Prithvi S)

* GITHUB#15505: Upgrade snowball to 2d2e312df56f2ede014a4ffb3e91e6dea43c24be. New stemmer: PolishStemmer (and
PolishSnowballAnalyzer in the stempel package) (Justas Sakalauskas, Dawid Weiss)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.LENGTH;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MAXLENGTH;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MAXVALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MAXVALUECOUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MINVALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.NUMVALUES;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.ORDPATTERN;
Expand Down Expand Up @@ -72,6 +73,7 @@ static class OneField {
long origin;
long minValue;
long maxValue;
int maxValueCount;
long numValues;
}

Expand Down Expand Up @@ -123,8 +125,19 @@ assert startsWith(DOCCOUNT)
: "got " + scratch.get().utf8ToString() + " field=" + fieldName + " ext=" + ext;
field.docCount = Integer.parseInt(stripPrefix(DOCCOUNT));

if (dvType == DocValuesType.NUMERIC) {
readLine();
if (startsWith(MAXVALUECOUNT)) {
field.maxValueCount = Integer.parseInt(stripPrefix(MAXVALUECOUNT));
readLine();
} else if (field.docCount == 0) {
field.maxValueCount = 0;
} else if (dvType == DocValuesType.NUMERIC || dvType == DocValuesType.SORTED) {
field.maxValueCount = 1;
} else {
field.maxValueCount = -1;
}

if (dvType == DocValuesType.NUMERIC) {
assert startsWith(ORIGIN)
: "got " + scratch.get().utf8ToString() + " field=" + fieldName + " ext=" + ext;
field.origin = Long.parseLong(stripPrefix(ORIGIN));
Expand All @@ -134,7 +147,6 @@ assert startsWith(ORIGIN)
field.dataStartFilePointer = data.getFilePointer();
data.seek(data.getFilePointer() + (1 + field.pattern.length() + 2) * (long) maxDoc);
} else if (dvType == DocValuesType.BINARY || dvType == DocValuesType.SORTED_NUMERIC) {
readLine();
assert startsWith(MAXLENGTH);
field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
readLine();
Expand All @@ -145,7 +157,6 @@ assert startsWith(ORIGIN)
data.getFilePointer()
+ (9 + field.pattern.length() + field.maxLength + 2) * (long) maxDoc);
} else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
readLine();
assert startsWith(NUMVALUES);
field.numValues = Long.parseLong(stripPrefix(NUMVALUES));
readLine();
Expand Down Expand Up @@ -898,6 +909,11 @@ public int docCount() {
return field.docCount;
}

@Override
public int maxValueCount() {
  // The count is the one stored on this field's metadata when it was read: either parsed from the
  // "maxvaluecount" line or filled in by the reader's fallback when that line is absent.
  final int recordedMaxValueCount = field.maxValueCount;
  return recordedMaxValueCount;
}

@Override
public int minDocID(int level) {
if (doc == -1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {

static final BytesRef MINVALUE = new BytesRef(" minvalue ");
static final BytesRef MAXVALUE = new BytesRef(" maxvalue ");
static final BytesRef MAXVALUECOUNT = new BytesRef(" maxvaluecount ");

static final BytesRef PATTERN = new BytesRef(" pattern ");
// used for bytes
Expand Down Expand Up @@ -115,6 +116,10 @@ public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer)
SimpleTextUtil.write(data, Integer.toString(numValues), scratch);
SimpleTextUtil.writeNewline(data);

SimpleTextUtil.write(data, MAXVALUECOUNT);
SimpleTextUtil.write(data, Integer.toString(numValues == 0 ? 0 : 1), scratch);
SimpleTextUtil.writeNewline(data);

if (numValues != numDocs) {
minValue = Math.min(minValue, 0);
maxValue = Math.max(maxValue, 0);
Expand Down Expand Up @@ -185,6 +190,11 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th

/**
 * Writes a binary field without an explicit maximum value count.
 *
 * <p>Delegates to the three-argument overload with {@code -1}, which makes that overload derive
 * the written count from the number of documents ({@code 0} for an empty field, {@code 1}
 * otherwise).
 *
 * @param field the field being written
 * @param valuesProducer source of the binary doc values for {@code field}
 * @throws IOException if the delegate fails while writing
 */
private void doAddBinaryField(FieldInfo field, DocValuesProducer valuesProducer)
    throws IOException {
  final int unspecifiedMaxValueCount = -1;
  doAddBinaryField(field, valuesProducer, unspecifiedMaxValueCount);
}

private void doAddBinaryField(
FieldInfo field, DocValuesProducer valuesProducer, int maxValueCount) throws IOException {
int maxLength = 0;
BinaryDocValues values = valuesProducer.getBinary(field);
int docCount = 0;
Expand All @@ -197,6 +207,13 @@ private void doAddBinaryField(FieldInfo field, DocValuesProducer valuesProducer)
SimpleTextUtil.write(data, Integer.toString(docCount), scratch);
SimpleTextUtil.writeNewline(data);

SimpleTextUtil.write(data, MAXVALUECOUNT);
SimpleTextUtil.write(
data,
Integer.toString(maxValueCount == -1 ? (docCount == 0 ? 0 : 1) : maxValueCount),
scratch);
SimpleTextUtil.writeNewline(data);

// write maxLength
SimpleTextUtil.write(data, MAXLENGTH);
SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
Expand Down Expand Up @@ -265,6 +282,10 @@ public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) th
SimpleTextUtil.write(data, Integer.toString(docCount), scratch);
SimpleTextUtil.writeNewline(data);

SimpleTextUtil.write(data, MAXVALUECOUNT);
SimpleTextUtil.write(data, Integer.toString(docCount == 0 ? 0 : 1), scratch);
SimpleTextUtil.writeNewline(data);

int valueCount = 0;
int maxLength = -1;
TermsEnum terms = valuesProducer.getSorted(field).termsEnum();
Expand Down Expand Up @@ -358,9 +379,12 @@ public void addSortedNumericField(FieldInfo field, final DocValuesProducer value

long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
int maxValueCount = 0;
SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
for (int i = 0; i < values.docValueCount(); ++i) {
int valueCount = values.docValueCount();
maxValueCount = Math.max(maxValueCount, valueCount);
for (int i = 0; i < valueCount; ++i) {
long v = values.nextValue();
minValue = Math.min(minValue, v);
maxValue = Math.max(maxValue, v);
Expand Down Expand Up @@ -440,7 +464,8 @@ public BytesRef binaryValue() throws IOException {
}
};
}
});
},
maxValueCount);
}

@Override
Expand All @@ -451,14 +476,20 @@ public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer)
writeFieldEntry(field, DocValuesType.SORTED_SET);

int docCount = 0;
int maxValueCount = 0;
SortedSetDocValues values = valuesProducer.getSortedSet(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
++docCount;
maxValueCount = Math.max(maxValueCount, values.docValueCount());
}
SimpleTextUtil.write(data, DOCCOUNT);
SimpleTextUtil.write(data, Integer.toString(docCount), scratch);
SimpleTextUtil.writeNewline(data);

SimpleTextUtil.write(data, MAXVALUECOUNT);
SimpleTextUtil.write(data, Integer.toString(maxValueCount), scratch);
SimpleTextUtil.writeNewline(data);

long valueCount = 0;
int maxLength = 0;
TermsEnum terms = valuesProducer.getSortedSet(field).termsEnum();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,14 +266,17 @@ private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
long globalMaxValue = Long.MIN_VALUE;
long globalMinValue = Long.MAX_VALUE;
int globalDocCount = 0;
int globalMaxValueCount = 0;
int maxDocId = -1;
final List<SkipAccumulator> accumulators = new ArrayList<>();
SkipAccumulator accumulator = null;
final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * (SKIP_INDEX_MAX_LEVEL - 1));
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
final int valueCount = values.docValueCount();
final long firstValue = values.nextValue();
globalMaxValueCount = Math.max(globalMaxValueCount, valueCount);
if (accumulator != null
&& accumulator.isDone(skipIndexIntervalSize, values.docValueCount(), firstValue, doc)) {
&& accumulator.isDone(skipIndexIntervalSize, valueCount, firstValue, doc)) {
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
globalDocCount += accumulator.docCount;
Expand All @@ -290,7 +293,7 @@ private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
}
accumulator.nextDoc(doc);
accumulator.accumulate(firstValue);
for (int i = 1, end = values.docValueCount(); i < end; ++i) {
for (int i = 1; i < valueCount; ++i) {
accumulator.accumulate(values.nextValue());
}
}
Expand All @@ -310,6 +313,7 @@ private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
assert globalDocCount <= maxDocId + 1;
meta.writeInt(globalDocCount);
meta.writeInt(maxDocId);
meta.writeInt(globalMaxValueCount);
}

private void writeLevels(List<SkipAccumulator> accumulators) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti
static final String SKIP_INDEX_EXTENSION = "dvs";
static final int VERSION_START = 0;
static final int VERSION_SKIPPER_SEPARATE_FILE = 1;
static final int VERSION_CURRENT = VERSION_SKIPPER_SEPARATE_FILE;
static final int VERSION_SKIPPER_MAX_VALUE_COUNT = 2;
static final int VERSION_CURRENT = VERSION_SKIPPER_MAX_VALUE_COUNT;

// indicates docvalues type
static final byte NUMERIC = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {

readFields(in, state.fieldInfos);

if (version < Lucene90DocValuesFormat.VERSION_SKIPPER_MAX_VALUE_COUNT) {
inferMaxValueCounts(state.fieldInfos);
}

} catch (Throwable exception) {
priorE = exception;
} finally {
Expand Down Expand Up @@ -216,6 +220,55 @@ public DocValuesProducer getMergeInstance() {
true);
}

/**
 * Back-fills {@code maxValueCount} on skipper entries that still carry the {@code -1} "unknown"
 * marker, using per-field metadata that has already been read.
 *
 * <p>An entry is upgraded to {@code 1} only when the field can be shown to be single-valued:
 *
 * <ul>
 *   <li>{@code NUMERIC} and {@code SORTED} fields are always treated as single-valued;
 *   <li>{@code SORTED_NUMERIC} fields qualify when their number of values equals their number of
 *       documents with the field;
 *   <li>{@code SORTED_SET} fields qualify when they have a single-value entry, or when their
 *       ords entry has as many values as documents with the field.
 * </ul>
 *
 * <p>Entries with no documents, entries whose field info cannot be found, and entries for which
 * nothing can be shown are left untouched.
 *
 * @param fieldInfos used to look up the doc values type of each skipper's field
 */
private void inferMaxValueCounts(FieldInfos fieldInfos) {
  for (var cursor : skippers) {
    final DocValuesSkipperEntry current = cursor.value;
    if (current.maxValueCount != -1 || current.docCount == 0) {
      // Already known, or an empty field: nothing to infer.
      continue;
    }

    final int fieldNumber = cursor.key;
    final FieldInfo info = fieldInfos.fieldInfo(fieldNumber);
    if (info == null) {
      continue;
    }

    final boolean singleValued =
        switch (info.getDocValuesType()) {
          case NUMERIC, SORTED -> true;
          case SORTED_NUMERIC -> {
            final SortedNumericEntry numericEntry = sortedNumerics.get(fieldNumber);
            yield numericEntry != null
                && numericEntry.numValues == numericEntry.numDocsWithField;
          }
          case SORTED_SET -> {
            final SortedSetEntry setEntry = sortedSets.get(fieldNumber);
            if (setEntry == null) {
              yield false;
            }
            if (setEntry.singleValueEntry != null) {
              yield true;
            }
            yield setEntry.ordsEntry != null
                && setEntry.ordsEntry.numValues == setEntry.ordsEntry.numDocsWithField;
          }
          // $CASES-OMITTED$
          default -> false;
        };
    if (!singleValued) {
      // Leave the entry at -1.
      continue;
    }

    // Replace the entry with a copy that carries the inferred count.
    final DocValuesSkipperEntry upgraded =
        new DocValuesSkipperEntry(
            current.offset,
            current.length,
            current.minValue,
            current.maxValue,
            current.docCount,
            current.maxDocId,
            1);
    skippers.put(fieldNumber, upgraded);
  }
}

private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
FieldInfo info = infos.fieldInfo(fieldNumber);
Expand Down Expand Up @@ -255,8 +308,15 @@ private DocValuesSkipperEntry readDocValueSkipperMeta(IndexInput meta) throws IO
long minValue = meta.readLong();
int docCount = meta.readInt();
int maxDocID = meta.readInt();
final int maxValueCount;
if (version >= Lucene90DocValuesFormat.VERSION_SKIPPER_MAX_VALUE_COUNT) {
maxValueCount = meta.readInt();
} else {
maxValueCount = docCount == 0 ? 0 : -1;
}

return new DocValuesSkipperEntry(offset, length, minValue, maxValue, docCount, maxDocID);
return new DocValuesSkipperEntry(
offset, length, minValue, maxValue, docCount, maxDocID, maxValueCount);
}

private void readNumeric(IndexInput meta, NumericEntry entry) throws IOException {
Expand Down Expand Up @@ -389,7 +449,13 @@ public void close() throws IOException {
}

private record DocValuesSkipperEntry(
long offset, long length, long minValue, long maxValue, int docCount, int maxDocId) {}
long offset,
long length,
long minValue,
long maxValue,
int docCount,
int maxDocId,
int maxValueCount) {}

private static class NumericEntry {
long[] table;
Expand Down Expand Up @@ -2004,6 +2070,11 @@ public long maxValue() {
public int docCount() {
return entry.docCount;
}

@Override
public int maxValueCount() {
  // Taken from the skipper entry: read from the metadata for newer format versions, otherwise
  // inferred where possible; it may still be -1 when nothing could be inferred.
  final int storedMaxValueCount = entry.maxValueCount;
  return storedMaxValueCount;
}
};
}
}
14 changes: 14 additions & 0 deletions lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -3624,6 +3624,20 @@ private static void checkDocValueSkipper(FieldInfo fi, DocValuesSkipper skipper)
+ " > "
+ skipper.maxValue());
}
if (skipper.maxValueCount() < -1) {
throw new CheckIndexException(
"skipper dv iterator for field: "
+ fieldName
+ " reports invalid maxValueCount, got "
+ skipper.maxValueCount());
}
if (skipper.docCount() == 0 && skipper.maxValueCount() != 0) {
throw new CheckIndexException(
"skipper dv iterator for field: "
+ fieldName
+ " reports maxValueCount for an empty field, got "
+ skipper.maxValueCount());
}
int docCount = 0;
int doc;
while (true) {
Expand Down
12 changes: 12 additions & 0 deletions lucene/core/src/java/org/apache/lucene/index/DocValuesSkipper.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,18 @@ public abstract class DocValuesSkipper {
/** Return the global number of documents with a value for the field. */
public abstract int docCount();

/**
 * Returns the global maximum number of values that any single document has for the field.
 *
 * <p>A result of {@code 1} means the field is known to be single-valued. A result of {@code -1}
 * means the exact value is unavailable (e.g., the segment was written by an older codec that did
 * not persist this metadata and it could not be inferred from other metadata). The result is
 * {@code 0} whenever {@link #docCount()} is {@code 0}.
 *
 * <p>This default implementation only knows the answer for empty fields and reports {@code -1}
 * in every other case.
 *
 * @return the maximum per-document value count, {@code 0} for an empty field, or {@code -1} if
 *     the count is unknown
 */
public int maxValueCount() {
  if (docCount() == 0) {
    return 0;
  }
  return -1;
}

/**
* Advance this skipper so that all levels intersect the range given by {@code minValue} and
* {@code maxValue}. If there are no intersecting levels, the skipper is exhausted.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ public long maxValue() {
public int docCount() {
return 1024 + 1024 / 2;
}

@Override
public int maxValueCount() {
  // Reports the field as single-valued (a max value count of 1).
  final int singleValuedCount = 1;
  return singleValuedCount;
}
};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1405,6 +1405,12 @@ public int minDocID(int level) {
return minDocID;
}

@Override
public int maxValueCount() {
  // Run the thread check first, then forward to the wrapped skipper.
  assertThread("Doc values skipper", creationThread);
  final int delegateMaxValueCount = in.maxValueCount();
  return delegateMaxValueCount;
}

@Override
public int maxDocID(int level) {
assertThread("Doc values skipper", creationThread);
Expand Down
Loading
Loading