diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a6e8073576..13a36b28f99e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -279,7 +279,9 @@ Other API Changes --------------------- -(No changes) + +* GITHUB#16286: Introduce BinaryDocValues#binaryValues to help speed up the + retrieval of many binary doc values at once. (Costin Leau) New Features --------------------- diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/BinaryDocValuesBulkDecodeBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/BinaryDocValuesBulkDecodeBenchmark.java new file mode 100644 index 000000000000..7b98e6b1eccc --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/BinaryDocValuesBulkDecodeBenchmark.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.jmh; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Comparator; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.stream.Stream; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.util.BytesRef; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Benchmarks bulk retrieval of dense binary doc values via {@link BinaryDocValues#binaryValues}. + * Compares the per-doc default with the Lucene90 codec override that reads directly from the data + * slice. + */ +@State(Scope.Thread) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Warmup(iterations = 3, time = 2) +@Measurement(iterations = 5, time = 2) +public class BinaryDocValuesBulkDecodeBenchmark { + + private Directory dir; + private DirectoryReader reader; + private BinaryDocValues values; + private Path path; + private int[] docs; + private BytesRef[] valueBuffer; + private int nextStart; + + @Param({"1000000"}) + public int docCount; + + @Param({"8", "32", "128"}) + public int valueLength; + + @Param({"128", "1024"}) + public int batchSize; + + @Param({"fixed", "variable"}) + public String encoding; + + @Setup(Level.Trial) + public void setup() throws Exception { + path = Files.createTempDirectory("binaryDocValuesBulkDecode"); + dir = MMapDirectory.open(path); + + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig()); + Random random = new Random(0); + for (int i = 0; i < docCount; i++) { + Document doc = new Document(); + int len; + if (encoding.equals("fixed")) { + len = valueLength; + } else { + len = 1 + random.nextInt(valueLength); + } + byte[] bytes = new byte[len]; + random.nextBytes(bytes); + doc.add(new BinaryDocValuesField("field", new BytesRef(bytes))); + writer.addDocument(doc); + } + writer.forceMerge(1); + reader = DirectoryReader.open(writer); + writer.close(); + + values = reader.leaves().get(0).reader().getBinaryDocValues("field"); + docs = new int[batchSize]; + valueBuffer = new BytesRef[batchSize]; + } + + @TearDown(Level.Trial) + public void tearDown() throws Exception { + reader.close(); + dir.close(); + if (Files.exists(path)) { + try (Stream walk = Files.walk(path)) { + walk.sorted(Comparator.reverseOrder()) + .forEach( + p -> { + try { + Files.delete(p); + } catch (IOException _) { + } + }); + } + } + } + + @Benchmark + @Fork( + value = 1, + jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"}) + public int binaryValuesBulk() throws IOException { + return readBatchBulk(); + } + + @Benchmark + @Fork( + value = 1, + jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"}) + public int binaryValuesPerDoc() throws IOException { + return readBatchPerDoc(); + } + + private int readBatchBulk() throws IOException { + final int maxStart = docCount - batchSize; + if (nextStart > maxStart) { + nextStart = 0; + } + for (int i = 0; i < batchSize; i++) { + docs[i] = nextStart + i; + } + nextStart += batchSize; + + values.binaryValues(batchSize, docs, valueBuffer); + int checksum = 0; + for (int i = 0; i < batchSize; i++) { + checksum += valueBuffer[i].length; + } + return checksum; + } + + private int readBatchPerDoc() throws IOException { + final int maxStart = docCount - batchSize; + if (nextStart > maxStart) { + nextStart = 0; + } + + int checksum = 0; + for (int i = 0; i < batchSize; i++) { + int doc = nextStart + i; + values.advanceExact(doc); + BytesRef ref = BytesRef.deepCopyOf(values.binaryValue()); + checksum += ref.length; + } + nextStart += batchSize; + return checksum; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java index 61f4f2942428..4d360f0a5526 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java @@ -1230,6 +1230,27 @@ public BytesRef binaryValue() throws IOException { bytesSlice.readBytes((long) doc * length, bytes.bytes, 0, length); return bytes; } + + @Override + public void binaryValues( + int size, int[] docs, int docsOffset, BytesRef[] values, int valuesOffset) + throws IOException { + if (size == 0) { + return; + } + byte[] bulk = new byte[size * length]; + if (isContiguous(size, docs, docsOffset)) { + bytesSlice.readBytes((long) docs[docsOffset] * length, bulk, 0, bulk.length); + } else { + for (int di = docsOffset, bi = 0, end = docsOffset + size; di < end; di++, bi++) { + bytesSlice.readBytes((long) docs[di] * length, bulk, bi * length, length); + } + } + for (int i = 0; i < size; i++) { + values[valuesOffset + i] = new BytesRef(bulk, i * length, length); + } + doc = docs[docsOffset + size - 1]; + } }; } else { // variable length @@ -1252,6 +1273,41 @@ public BytesRef binaryValue() throws IOException { bytesSlice.readBytes(startOffset, bytes.bytes, 0, bytes.length); return bytes; } + + @Override + public void binaryValues( + int size, int[] docs, int docsOffset, BytesRef[] values, int valuesOffset) + throws IOException { + if (size == 0) { + return; + } + if (isContiguous(size, docs, docsOffset)) { + long firstStart = addresses.get(docs[docsOffset]); + long lastEnd = addresses.get(docs[docsOffset + size - 1] + 1L); + int totalBytes = (int) (lastEnd - firstStart); + byte[] bulk = new byte[totalBytes]; + bytesSlice.readBytes(firstStart, bulk, 0, totalBytes); + for (int di = docsOffset, vi = valuesOffset, end = docsOffset + size; + di < end; + di++, vi++) { + int offset = (int) (addresses.get(docs[di]) - firstStart); + int len = (int) (addresses.get(docs[di] + 1L) - addresses.get(docs[di])); + values[vi] = new BytesRef(bulk, offset, len); + } + } else { + for (int di = docsOffset, vi = valuesOffset, end = docsOffset + size; + di < end; + di++, vi++) { + int d = docs[di]; + long startOffset = addresses.get(d); + int len = (int) (addresses.get(d + 1L) - startOffset); + byte[] b = new byte[len]; + bytesSlice.readBytes(startOffset, b, 0, len); + values[vi] = new BytesRef(b, 0, len); + } + } + doc = docs[docsOffset + size - 1]; + } }; } } else { diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValues.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValues.java index 468ebb398832..a7a33a4df531 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValues.java @@ -18,9 +18,11 @@ package org.apache.lucene.index; import java.io.IOException; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.util.BytesRef; -/** A per-document numeric value. */ +/** A per-document binary value. */ public abstract class BinaryDocValues extends DocValuesIterator { /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ @@ -33,4 +35,76 @@ protected BinaryDocValues() {} * @return binary value */ public abstract BytesRef binaryValue() throws IOException; + + /** + * Bulk retrieval of binary doc values. This API helps reduce the performance impact of virtual + * function calls. + * + *

This API behaves as if implemented as below, which is the default implementation: + * + *


+   * public void binaryValues(int size, int[] docs, BytesRef[] values) throws IOException {
+   *   for (int i = 0; i < size; ++i) {
+   *     int doc = docs[i];
+   *     if (advanceExact(doc)) {
+   *       values[i] = BytesRef.deepCopyOf(binaryValue());
+   *     } else {
+   *       values[i] = null;
+   *     }
+   *   }
+   * }
+   * 
+ * + *

NOTE: The {@code docs} array is required to be sorted in ascending order with no + * duplicates. + * + *

NOTE: Documents that don't have a value for this field will have their corresponding + * entry set to {@code null}. If you need to exclude documents that don't have a value, then you + * could apply a {@link FieldExistsQuery} as a {@link Occur#FILTER} clause. Another option is to + * fall back to using {@link #advanceExact} and {@link #binaryValue()} on ranges of doc IDs that + * may not be dense, e.g. + * + *


+   * if (size > 0 && values.advanceExact(docs[0]) && values.docIDRunEnd() > docs[size - 1]) {
+   *   // use values#binaryValues to retrieve values
+   * } else {
+   *   // some docs may not have a value, use #advanceExact and #binaryValue
+   * }
+   * 
+ * + *

NOTE: Each returned {@link BytesRef} is a deep copy owned by the caller and remains + * valid after subsequent calls. + * + * @param size the number of values to retrieve + * @param docs the buffer of doc IDs whose values should be looked up + * @param values the buffer of values to fill; entries are set to {@code null} when a document + * doesn't have a value + */ + public void binaryValues(int size, int[] docs, BytesRef[] values) throws IOException { + binaryValues(size, docs, 0, values, 0); + } + + /** + * Offset-aware variant of {@link #binaryValues(int, int[], BytesRef[])}. Reads {@code size} doc + * IDs starting at {@code docs[docsOffset]} and writes the corresponding values starting at {@code + * values[valuesOffset]}. This follows the same convention as {@link System#arraycopy}. + * + * @param size the number of values to retrieve + * @param docs the buffer of doc IDs whose values should be looked up + * @param docsOffset first position in {@code docs} to read + * @param values the buffer of values to fill; entries are set to {@code null} when a document + * doesn't have a value + * @param valuesOffset first position in {@code values} to write + */ + public void binaryValues( + int size, int[] docs, int docsOffset, BytesRef[] values, int valuesOffset) + throws IOException { + for (int di = docsOffset, vi = valuesOffset, end = docsOffset + size; di < end; di++, vi++) { + if (advanceExact(docs[di])) { + values[vi] = BytesRef.deepCopyOf(binaryValue()); + } else { + values[vi] = null; + } + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 570adb9738da..303698149b83 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -4113,6 +4113,49 @@ private static void checkBulkFetchNumericDocValues( } } + private static void checkBulkFetchBinaryDocValues( + String fieldName, BinaryDocValues bdv, BinaryDocValues bdv2, int maxDoc) throws IOException { + + int[] docs = new int[16]; + BytesRef[] values = new BytesRef[16]; + + for (int doc = -1; doc < maxDoc; ) { + int size = 0; + for (int j = 0; j < docs.length; ++j) { + doc += 1 + (j & 0x03); + if (doc >= maxDoc) { + break; + } + docs[size++] = doc; + } + + bdv.binaryValues(size, docs, values); + + for (int j = 0; j < size; ++j) { + if (bdv2.advanceExact(docs[j])) { + BytesRef expected = BytesRef.deepCopyOf(bdv2.binaryValue()); + if (values[j] == null || values[j].equals(expected) == false) { + throw new CheckIndexException( + "field " + + fieldName + + " #binaryValues reports different value: " + + values[j] + + " != " + + expected); + } + } else { + if (values[j] != null) { + throw new CheckIndexException( + "field " + + fieldName + + " #binaryValues reports non-null for missing doc: " + + values[j]); + } + } + } + } + } + private static void checkDocValues( FieldInfo fi, int maxDoc, DocValuesProducer dvReader, DocValuesStatus status) throws Exception { @@ -4141,6 +4184,8 @@ private static void checkDocValues( status.totalBinaryFields++; checkDVIterator(fi, dvReader::getBinary); checkBinaryDocValues(fi.name, dvReader.getBinary(fi), dvReader.getBinary(fi)); + checkBulkFetchBinaryDocValues( + fi.name, dvReader.getBinary(fi), dvReader.getBinary(fi), maxDoc); break; case NUMERIC: status.totalNumericFields++; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java index d816c250d1e7..ad5f61089171 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java @@ -279,6 +279,162 @@ private static void assertBulkLongValues( } } + public void testDenseBinaryValuesBulkFetch() throws Exception { + doTestDenseBinaryValuesBulkFetch(true); + doTestDenseBinaryValuesBulkFetch(false); + } + + private void doTestDenseBinaryValuesBulkFetch(boolean fixedLength) throws Exception { + final int numDocs = 512; + final BytesRef[] expected = new BytesRef[numDocs]; + for (int i = 0; i < numDocs; i++) { + int length = fixedLength ? 8 : 1 + random().nextInt(32); + byte[] bytes = new byte[length]; + random().nextBytes(bytes); + expected[i] = new BytesRef(bytes); + } + + Directory dir = newDirectory(); + IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random())); + conf.setMergeScheduler(new SerialMergeScheduler()); + IndexWriter writer = new IndexWriter(dir, conf); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + doc.add(new BinaryDocValuesField("binary", expected[i])); + writer.addDocument(doc); + } + writer.forceMerge(1); + + try (DirectoryReader reader = DirectoryReader.open(writer)) { + BinaryDocValues values = reader.leaves().get(0).reader().getBinaryDocValues("binary"); + BinaryDocValues singleValues = reader.leaves().get(0).reader().getBinaryDocValues("binary"); + assertBulkBinaryValues(values, singleValues, expected, 0, 128, 1, fixedLength); + assertBulkBinaryValues(values, singleValues, expected, 17, 128, 1, fixedLength); + assertBulkBinaryValues(values, singleValues, expected, 0, 128, 2, fixedLength); + } + + writer.close(); + dir.close(); + } + + private static void assertBulkBinaryValues( + BinaryDocValues values, + BinaryDocValues singleValues, + BytesRef[] expected, + int startDoc, + int size, + int docStep, + boolean fixedLength) + throws IOException { + int[] docs = new int[size]; + BytesRef[] actual = new BytesRef[size]; + for (int i = 0; i < size; i++) { + docs[i] = startDoc + i * docStep; + } + + values.binaryValues(size, docs, actual); + if (size != 0) { + assertEquals(docs[size - 1], values.docID()); + } + for (int i = 0; i < size; i++) { + assertTrue(singleValues.advanceExact(docs[i])); + BytesRef singleValue = BytesRef.deepCopyOf(singleValues.binaryValue()); + assertEquals( + "fixedLength=" + + fixedLength + + " doc=" + + docs[i] + + " startDoc=" + + startDoc + + " docStep=" + + docStep, + singleValue, + actual[i]); + assertEquals( + "fixedLength=" + fixedLength + " doc=" + docs[i], expected[docs[i]], singleValue); + } + + // Verify offset-aware variant produces identical results + if (size > 2) { + int docsOffset = 1; + int sliceSize = size - 2; + BytesRef sentinel = new BytesRef("SENTINEL"); + BytesRef[] offsetActual = new BytesRef[sliceSize + 4]; + int valuesOffset = 2; + Arrays.fill(offsetActual, sentinel); + values.binaryValues(sliceSize, docs, docsOffset, offsetActual, valuesOffset); + for (int i = 0; i < sliceSize; i++) { + assertEquals( + "offset variant mismatch at i=" + i + " fixedLength=" + fixedLength, + actual[docsOffset + i], + offsetActual[valuesOffset + i]); + } + // sentinel positions should be untouched + assertSame(sentinel, offsetActual[0]); + assertSame(sentinel, offsetActual[1]); + assertSame(sentinel, offsetActual[valuesOffset + sliceSize]); + } + } + + public void testSparseBinaryValuesBulkFetch() throws Exception { + final int numDocs = 512; + final BytesRef[] expected = new BytesRef[numDocs]; + // Only even-numbered docs have binary values + for (int i = 0; i < numDocs; i++) { + if (i % 2 == 0) { + byte[] bytes = new byte[8]; + random().nextBytes(bytes); + expected[i] = new BytesRef(bytes); + } + } + + Directory dir = newDirectory(); + IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random())); + conf.setMergeScheduler(new SerialMergeScheduler()); + IndexWriter writer = new IndexWriter(dir, conf); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (expected[i] != null) { + doc.add(new BinaryDocValuesField("binary", expected[i])); + } + writer.addDocument(doc); + } + writer.forceMerge(1); + + try (DirectoryReader reader = DirectoryReader.open(writer)) { + BinaryDocValues values = reader.leaves().get(0).reader().getBinaryDocValues("binary"); + BinaryDocValues singleValues = reader.leaves().get(0).reader().getBinaryDocValues("binary"); + + // Request a mix of docs with and without values + int size = 64; + int[] docs = new int[size]; + BytesRef[] actual = new BytesRef[size]; + for (int i = 0; i < size; i++) { + docs[i] = i * 4; // every 4th doc: even (has value) and odd (no value) mix + } + + values.binaryValues(size, docs, actual); + for (int i = 0; i < size; i++) { + if (singleValues.advanceExact(docs[i])) { + BytesRef singleValue = BytesRef.deepCopyOf(singleValues.binaryValue()); + assertNotNull("doc " + docs[i] + " should have a value", actual[i]); + assertEquals("doc " + docs[i], singleValue, actual[i]); + } else { + assertNull("doc " + docs[i] + " should be null", actual[i]); + } + } + + // Test size == 0 edge case + BinaryDocValues zeroValues = reader.leaves().get(0).reader().getBinaryDocValues("binary"); + int docIdBefore = zeroValues.docID(); + zeroValues.binaryValues(0, new int[0], new BytesRef[0]); + assertEquals(docIdBefore, zeroValues.docID()); + } + + writer.close(); + dir.close(); + } + private void doTestSparseDocValuesVsStoredFields() throws Exception { final long[] values = new long[TestUtil.nextInt(random(), 1, 500)]; for (int i = 0; i < values.length; ++i) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java index 50984ed10304..37c6b7de5c0c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java @@ -967,6 +967,22 @@ public BytesRef binaryValue() throws IOException { return in.binaryValue(); } + @Override + public void binaryValues(int size, int[] docs, BytesRef[] values) throws IOException { + assertThread("Binary doc values", creationThread); + assert size >= 0; + assert size == 0 || docs[0] >= docID(); + assert size == 0 || docs[0] >= 0; + for (int i = 1; i < size; ++i) { + assert docs[i] > docs[i - 1]; + } + assert size == 0 || docs[size - 1] < maxDoc; + int expectedDocIdOnReturn = size == 0 ? docID() : docs[size - 1]; + super.binaryValues(size, docs, values); + lastDocID = in.docID(); + assert lastDocID == expectedDocIdOnReturn; + } + @Override public String toString() { return "AssertingBinaryDocValues(" + in + ")";