diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a6e8073576..962fbe68e700 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -291,7 +291,10 @@ Improvements Optimizations --------------------- -(No changes) +* GITHUB#16282: Add fast paths to FixedBitSet.copyOf() for SparseLiveDocs and DenseLiveDocs, + avoiding the O(maxDoc) generic fallback. SparseLiveDocs now copies in O(deletedDocs) by + iterating only deleted doc IDs; DenseLiveDocs copies in O(maxDoc/64) by cloning the backing + FixedBitSet directly. (salvatorecampagna) Bug Fixes --------------------- diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java new file mode 100644 index 000000000000..9dd8967a5ec2 --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.jmh; + +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.util.DenseLiveDocs; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.SparseFixedBitSet; +import org.apache.lucene.util.SparseLiveDocs; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Benchmarks {@link FixedBitSet#copyOf(org.apache.lucene.util.Bits)} for {@link SparseLiveDocs} and + * {@link DenseLiveDocs} inputs. + * + *

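<p>This benchmark measures the speedup from the fast paths added to {@code copyOf()} for the
+ * {@link SparseLiveDocs} and {@link DenseLiveDocs} types introduced by GITHUB#15413. Without these
+ * fast paths, both types fall through to the generic O(maxDoc) loop. With them:
+ *
+ * <ul>
+ *   <li>{@link SparseLiveDocs} copies in O(deletedDocs) by iterating only the deleted doc IDs.
+ *   <li>{@link DenseLiveDocs} copies in O(maxDoc/64) by cloning the backing {@link FixedBitSet}
+ *       directly.
+ * </ul>
+ *
+ * <h2>Usage</h2>
+ *
+ * <p>Run all benchmarks:
+ *
+ * <pre>
+ * java -jar lucene-benchmark-jmh.jar "LiveDocsCopyOfBenchmark"
+ * </pre>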
+ *
+ * @see SparseLiveDocs
+ * @see DenseLiveDocs
+ * @see LiveDocsBenchmark
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 3)
+@Fork(
+    value = 1,
+    jvmArgsAppend = {"-Xmx2g", "-Xms2g"})
+public class LiveDocsCopyOfBenchmark {
+
+  /** Number of documents in the segment. */
+  @Param({"1000000", "10000000", "100000000"})
+  int maxDoc;
+
+  /**
+   * Fraction of documents to delete ({@code 0.01} requests deletion of about 1% of the docs).
+   *
+   * <p>Kept low to stay in the SparseLiveDocs regime ({@literal <=}1%). At these rates the
+   * O(deletedDocs) vs O(maxDoc) difference is most pronounced.
+   */
+  @Param({"0.001", "0.01"})
+  double deletionRate;
+
+  private SparseLiveDocs sparseLiveDocs;
+  private DenseLiveDocs denseLiveDocs;
+
+  @Setup(Level.Trial)
+  public void setup() {
+    Random random = new Random(42); // fixed seed: every trial picks the same doc IDs
+    int numDeleted = Math.max(1, (int) (maxDoc * deletionRate)); // always attempt at least one
+
+    SparseFixedBitSet sparseSet = new SparseFixedBitSet(maxDoc); // set bit = deleted doc
+    FixedBitSet fixedSet = new FixedBitSet(maxDoc); // set bit = live doc
+    fixedSet.set(0, maxDoc); // every doc starts live; the loop below clears the deleted ones
+
+    for (int i = 0; i < numDeleted; i++) {
+      int doc = random.nextInt(maxDoc); // repeats possible: distinct deletions may be < numDeleted
+      sparseSet.set(doc);
+      fixedSet.clear(doc);
+    }
+
+    sparseLiveDocs = SparseLiveDocs.builder(sparseSet, maxDoc).build();
+    denseLiveDocs = DenseLiveDocs.builder(fixedSet, maxDoc).build();
+  }
+
+  @Benchmark
+  public FixedBitSet copyOfSparseLiveDocs() {
+    return FixedBitSet.copyOf(sparseLiveDocs);
+  }
+
+  @Benchmark
+  public FixedBitSet copyOfDenseLiveDocs() {
+    return FixedBitSet.copyOf(denseLiveDocs);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/DenseLiveDocs.java b/lucene/core/src/java/org/apache/lucene/util/DenseLiveDocs.java
index 133469fc8bb7..150cc7c6eb8c 100644
--- a/lucene/core/src/java/org/apache/lucene/util/DenseLiveDocs.java
+++ b/lucene/core/src/java/org/apache/lucene/util/DenseLiveDocs.java
@@ -144,6 +144,15 @@ public int deletedCount() {
     return deletedCount;
   }
 
+  /**
+   * Returns a copy of the backing live-docs bit set, obtained via {@code liveDocs.clone()}.
+   *
+   * <p>Package-private: used by {@link FixedBitSet#copyOf(Bits)} as its fast path for this type.
+   */
+  FixedBitSet toFixedBitSet() {
+    return liveDocs.clone();
+  }
+
   /**
    * Returns the memory usage in bytes.
*
diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
index 071098edf03a..c3946aca6db8 100644
--- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
+++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
@@ -867,6 +867,10 @@ public static FixedBitSet copyOf(Bits bits) {
 
     if (bits instanceof FixedBitSet fbs) {
       return fbs.clone();
+    } else if (bits instanceof DenseLiveDocs denseLiveDocs) {
+      return denseLiveDocs.toFixedBitSet();
+    } else if (bits instanceof SparseLiveDocs sparseLiveDocs) {
+      return sparseLiveDocs.toFixedBitSet();
     } else {
       int length = bits.length();
       FixedBitSet bitSet = new FixedBitSet(length);
diff --git a/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java b/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java
index 4220de9d51bd..8d1052b491bf 100644
--- a/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java
+++ b/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java
@@ -144,6 +144,29 @@ public int deletedCount() {
     return deletedCount;
   }
 
+  /**
+   * Returns a new {@link FixedBitSet} of {@code maxDoc} bits in which every bit is set except the
+   * ones set in {@code deletedDocs}.
+   *
+   * <p>Only the set bits of {@code deletedDocs} are visited when clearing.
+   */
+  FixedBitSet toFixedBitSet() {
+    FixedBitSet result = new FixedBitSet(maxDoc);
+    result.set(0, maxDoc);
+    final int numBits = deletedDocs.length();
+    int doc = deletedDocs.nextSetBit(0);
+    while (doc != DocIdSetIterator.NO_MORE_DOCS) {
+      result.clear(doc);
+      if (doc + 1 >= numBits) {
+        // The last bit was just handled. nextSetBit expects a start index below the bit set's
+        // length (Lucene bit sets assert this), so stop here instead of probing past the end.
+        break;
+      }
+      doc = deletedDocs.nextSetBit(doc + 1);
+    }
+    return result;
+  }
+
   /**
    * Returns the memory usage in bytes.
*
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java
index 4d5c89c97032..60f629bd9ca4 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java
@@ -905,4 +905,53 @@ public void testOrMaskStraddling() {
       }
     }
   }
+
+  /**
+   * Wraps a randomly cleared {@link FixedBitSet} in a {@link DenseLiveDocs} and checks that
+   * {@link FixedBitSet#copyOf(Bits)} returns a bit set of length {@code maxDoc} whose bits match
+   * {@code dense.get(doc)} for every doc.
+   */
+  public void testCopyOfDenseLiveDocs() {
+    final int maxDoc = atLeast(1000);
+    final int numDeleted = random().nextInt(maxDoc / 2) + 1;
+
+    FixedBitSet liveDocsBitSet = new FixedBitSet(maxDoc);
+    liveDocsBitSet.set(0, maxDoc);
+    // Random picks can repeat, so fewer than numDeleted distinct docs may end up cleared.
+    for (int i = 0; i < numDeleted; i++) {
+      liveDocsBitSet.clear(random().nextInt(maxDoc));
+    }
+    DenseLiveDocs dense = DenseLiveDocs.builder(liveDocsBitSet, maxDoc).build();
+
+    FixedBitSet result = FixedBitSet.copyOf(dense);
+
+    assertEquals(maxDoc, result.length());
+    for (int doc = 0; doc < maxDoc; doc++) {
+      assertEquals("mismatch at doc " + doc, dense.get(doc), result.get(doc));
+    }
+  }
+
+  /**
+   * Wraps randomly set deleted-doc bits in a {@link SparseLiveDocs} and checks that
+   * {@link FixedBitSet#copyOf(Bits)} returns a bit set of length {@code maxDoc} whose bits match
+   * {@code sparse.get(doc)} for every doc.
+   */
+  public void testCopyOfSparseLiveDocs() {
+    final int maxDoc = atLeast(1000);
+    // Draws between 1 and max(1, maxDoc / 100) deletion attempts.
+    final int numDeleted = random().nextInt(Math.max(1, maxDoc / 100)) + 1;
+
+    SparseFixedBitSet deletedDocsBitSet = new SparseFixedBitSet(maxDoc);
+    for (int i = 0; i < numDeleted; i++) {
+      deletedDocsBitSet.set(random().nextInt(maxDoc));
+    }
+    SparseLiveDocs sparse = SparseLiveDocs.builder(deletedDocsBitSet, maxDoc).build();
+
+    FixedBitSet result = FixedBitSet.copyOf(sparse);
+
+    assertEquals(maxDoc, result.length());
+    for (int doc = 0; doc < maxDoc; doc++) {
+      assertEquals("mismatch at doc " + doc, sparse.get(doc), result.get(doc));
+    }
+  }
 }