From 6067606e8809a2dfef6ea14625e530352c6b7efb Mon Sep 17 00:00:00 2001 From: Salvatore Campagna Date: Sat, 20 Jun 2026 19:11:40 +0200 Subject: [PATCH 1/4] perf(util): add fast paths to FixedBitSet.copyOf() for LiveDocs types FixedBitSet.copyOf(Bits) already has fast paths for FixedBitSet and FixedBits, but SparseLiveDocs and DenseLiveDocs (introduced in #15413) fell through to the O(maxDoc) generic loop. Each type now exposes a package-private toFixedBitSet() method that FixedBitSet.copyOf() delegates to: - DenseLiveDocs stores live docs in a FixedBitSet: clone it directly, O(maxDoc/64). - SparseLiveDocs stores deleted docs in a SparseFixedBitSet: pre-fill the backing long[] with -1L and clear only deleted positions using nextSetBit, O(deletedDocs + maxDoc/64). The hot caller is PendingDeletes.getMutableBits(), which invokes copyOf(liveDocs) on the first delete after a snapshot. --- lucene/CHANGES.txt | 5 +- .../jmh/LiveDocsCopyOfBenchmark.java | 118 ++++++++++++++++++ .../org/apache/lucene/util/DenseLiveDocs.java | 4 + .../org/apache/lucene/util/FixedBitSet.java | 4 + .../apache/lucene/util/SparseLiveDocs.java | 17 +++ .../apache/lucene/util/TestFixedBitSet.java | 37 ++++++ 6 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a6e8073576..cf0373fe62c0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -291,7 +291,10 @@ Improvements Optimizations --------------------- -(No changes) +* GITHUB#16XXX: Add fast paths to FixedBitSet.copyOf() for SparseLiveDocs and DenseLiveDocs, + avoiding the O(maxDoc) generic fallback. SparseLiveDocs now copies in O(deletedDocs) by + iterating only deleted doc IDs; DenseLiveDocs copies in O(maxDoc/64) by cloning the backing + FixedBitSet directly. 
(salvatorecampagna) Bug Fixes --------------------- diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java new file mode 100644 index 000000000000..7d6ed2c7968e --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.jmh; + +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.util.DenseLiveDocs; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.SparseFixedBitSet; +import org.apache.lucene.util.SparseLiveDocs; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Benchmarks {@link FixedBitSet#copyOf(org.apache.lucene.util.Bits)} for {@link SparseLiveDocs} + * and {@link DenseLiveDocs} inputs. + * + *

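<p>This benchmark measures the speedup from the fast paths added to {@code copyOf()} for the
+ * {@link SparseLiveDocs} and {@link DenseLiveDocs} types introduced by GITHUB#15413. Without these
+ * fast paths, both types fall through to the generic O(maxDoc) loop. With them:
+ *
+ * <ul>
+ *   <li>{@link SparseLiveDocs} copies in O(deletedDocs + maxDoc/64): the result is pre-filled and
+ *       only the deleted doc IDs are cleared.
+ *   <li>{@link DenseLiveDocs} copies in O(maxDoc/64) by cloning the backing {@link FixedBitSet}.
+ * </ul>
+ *
+ * <h2>Usage</h2>
+ *
+ * <p>Run all benchmarks:
+ *
+ * <pre>
+ * java -jar lucene-benchmark-jmh.jar "LiveDocsCopyOfBenchmark"
+ * </pre>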
+ * + * @see SparseLiveDocs + * @see DenseLiveDocs + * @see LiveDocsBenchmark + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 3, time = 2) +@Measurement(iterations = 5, time = 3) +@Fork( + value = 1, + jvmArgsAppend = {"-Xmx2g", "-Xms2g"}) +public class LiveDocsCopyOfBenchmark { + + /** Number of documents in the segment. */ + @Param({"1000000", "10000000", "100000000"}) + int maxDoc; + + /** + * Percentage of documents to delete. + * + *

Kept low to stay in the SparseLiveDocs regime ({@literal <=}1%). At these rates the + * O(deletedDocs) vs O(maxDoc) difference is most pronounced. + */ + @Param({"0.001", "0.01"}) + double deletionRate; + + private SparseLiveDocs sparseLiveDocs; + private DenseLiveDocs denseLiveDocs; + + @Setup(Level.Trial) + public void setup() { + Random random = new Random(42); + int numDeleted = Math.max(1, (int) (maxDoc * deletionRate)); + + SparseFixedBitSet sparseSet = new SparseFixedBitSet(maxDoc); + FixedBitSet fixedSet = new FixedBitSet(maxDoc); + fixedSet.set(0, maxDoc); + + for (int i = 0; i < numDeleted; i++) { + int doc = random.nextInt(maxDoc); + sparseSet.set(doc); + fixedSet.clear(doc); + } + + sparseLiveDocs = SparseLiveDocs.builder(sparseSet, maxDoc).build(); + denseLiveDocs = DenseLiveDocs.builder(fixedSet, maxDoc).build(); + } + + @Benchmark + public FixedBitSet copyOfSparseLiveDocs() { + return FixedBitSet.copyOf(sparseLiveDocs); + } + + @Benchmark + public FixedBitSet copyOfDenseLiveDocs() { + return FixedBitSet.copyOf(denseLiveDocs); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/DenseLiveDocs.java b/lucene/core/src/java/org/apache/lucene/util/DenseLiveDocs.java index 133469fc8bb7..150cc7c6eb8c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/DenseLiveDocs.java +++ b/lucene/core/src/java/org/apache/lucene/util/DenseLiveDocs.java @@ -144,6 +144,10 @@ public int deletedCount() { return deletedCount; } + FixedBitSet toFixedBitSet() { + return liveDocs.clone(); + } + /** * Returns the memory usage in bytes. 
* diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java index 071098edf03a..c3946aca6db8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java @@ -867,6 +867,10 @@ public static FixedBitSet copyOf(Bits bits) { if (bits instanceof FixedBitSet fbs) { return fbs.clone(); + } else if (bits instanceof DenseLiveDocs denseLiveDocs) { + return denseLiveDocs.toFixedBitSet(); + } else if (bits instanceof SparseLiveDocs sparseLiveDocs) { + return sparseLiveDocs.toFixedBitSet(); } else { int length = bits.length(); FixedBitSet bitSet = new FixedBitSet(length); diff --git a/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java b/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java index 4220de9d51bd..8b679dfa145e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java +++ b/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.util; +import java.util.Arrays; import java.util.Locale; import org.apache.lucene.search.DocIdSetIterator; @@ -144,6 +145,22 @@ public int deletedCount() { return deletedCount; } + FixedBitSet toFixedBitSet() { + int numWords = FixedBitSet.bits2words(maxDoc); + long[] rawBits = new long[numWords]; + Arrays.fill(rawBits, -1L); + int ghostBits = maxDoc & 63; + if (ghostBits != 0) { + rawBits[numWords - 1] = (1L << ghostBits) - 1; + } + for (int doc = deletedDocs.nextSetBit(0); + doc != DocIdSetIterator.NO_MORE_DOCS; + doc = deletedDocs.nextSetBit(doc + 1)) { + rawBits[doc >> 6] &= ~(1L << doc); + } + return new FixedBitSet(rawBits, maxDoc); + } + /** * Returns the memory usage in bytes. 
* diff --git a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java index 4d5c89c97032..60f629bd9ca4 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java @@ -905,4 +905,41 @@ public void testOrMaskStraddling() { } } } + + public void testCopyOfDenseLiveDocs() { + final int maxDoc = atLeast(1000); + final int numDeleted = random().nextInt(maxDoc / 2) + 1; + + FixedBitSet liveDocsBitSet = new FixedBitSet(maxDoc); + liveDocsBitSet.set(0, maxDoc); + for (int i = 0; i < numDeleted; i++) { + liveDocsBitSet.clear(random().nextInt(maxDoc)); + } + DenseLiveDocs dense = DenseLiveDocs.builder(liveDocsBitSet, maxDoc).build(); + + FixedBitSet result = FixedBitSet.copyOf(dense); + + assertEquals(maxDoc, result.length()); + for (int doc = 0; doc < maxDoc; doc++) { + assertEquals("mismatch at doc " + doc, dense.get(doc), result.get(doc)); + } + } + + public void testCopyOfSparseLiveDocs() { + final int maxDoc = atLeast(1000); + final int numDeleted = random().nextInt(Math.max(1, maxDoc / 100)) + 1; + + SparseFixedBitSet deletedDocsBitSet = new SparseFixedBitSet(maxDoc); + for (int i = 0; i < numDeleted; i++) { + deletedDocsBitSet.set(random().nextInt(maxDoc)); + } + SparseLiveDocs sparse = SparseLiveDocs.builder(deletedDocsBitSet, maxDoc).build(); + + FixedBitSet result = FixedBitSet.copyOf(sparse); + + assertEquals(maxDoc, result.length()); + for (int doc = 0; doc < maxDoc; doc++) { + assertEquals("mismatch at doc " + doc, sparse.get(doc), result.get(doc)); + } + } } From 64020969fc8134a8fb8e8f7971394eaf12627c84 Mon Sep 17 00:00:00 2001 From: Salvatore Campagna Date: Mon, 22 Jun 2026 12:39:06 +0200 Subject: [PATCH 2/4] CHANGES: update issue number to GITHUB#16282 --- lucene/CHANGES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 
cf0373fe62c0..962fbe68e700 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -291,7 +291,7 @@ Improvements Optimizations --------------------- -* GITHUB#16XXX: Add fast paths to FixedBitSet.copyOf() for SparseLiveDocs and DenseLiveDocs, +* GITHUB#16282: Add fast paths to FixedBitSet.copyOf() for SparseLiveDocs and DenseLiveDocs, avoiding the O(maxDoc) generic fallback. SparseLiveDocs now copies in O(deletedDocs) by iterating only deleted doc IDs; DenseLiveDocs copies in O(maxDoc/64) by cloning the backing FixedBitSet directly. (salvatorecampagna) From 4913df06782a8fc22634292eb568d02610675f6f Mon Sep 17 00:00:00 2001 From: Salvatore Campagna Date: Mon, 22 Jun 2026 13:35:50 +0200 Subject: [PATCH 3/4] tidy: fix google-java-format violation in LiveDocsCopyOfBenchmark --- .../apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java index 7d6ed2c7968e..9dd8967a5ec2 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/LiveDocsCopyOfBenchmark.java @@ -36,8 +36,8 @@ import org.openjdk.jmh.annotations.Warmup; /** - * Benchmarks {@link FixedBitSet#copyOf(org.apache.lucene.util.Bits)} for {@link SparseLiveDocs} - * and {@link DenseLiveDocs} inputs. + * Benchmarks {@link FixedBitSet#copyOf(org.apache.lucene.util.Bits)} for {@link SparseLiveDocs} and + * {@link DenseLiveDocs} inputs. * *

This benchmark measures the speedup from the fast paths added to {@code copyOf()} for the * {@link SparseLiveDocs} and {@link DenseLiveDocs} types introduced by GITHUB#15413. Without these
From fef04b71b46932f2645313c06c5674571898d5a1 Mon Sep 17 00:00:00 2001
From: Salvatore Campagna
Date: Tue, 23 Jun 2026 12:22:52 +0200
Subject: [PATCH 4/4] simplify SparseLiveDocs.toFixedBitSet() to use FixedBitSet API

Replace the raw long[] pre-fill approach with FixedBitSet.set(0, maxDoc)
followed by result.clear(doc) in the deletion loop. The two approaches are
semantically identical: set(0, maxDoc) fills the backing array with -1L and
masks off the ghost bits in the last word in one call.

Also end the deletion loop explicitly once the last document has been
visited. The previous loop advanced with deletedDocs.nextSetBit(doc + 1),
which passes maxDoc as the start index when doc maxDoc - 1 is deleted;
nextSetBit only accepts start indexes below the length of the bit set.
---
 .../org/apache/lucene/util/SparseLiveDocs.java | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java b/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java
index 8b679dfa145e..8d1052b491bf 100644
--- a/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java
+++ b/lucene/core/src/java/org/apache/lucene/util/SparseLiveDocs.java
@@ -16,7 +16,6 @@
  */
 package org.apache.lucene.util;
 
-import java.util.Arrays;
 import java.util.Locale;
 import org.apache.lucene.search.DocIdSetIterator;
 
@@ -146,19 +145,19 @@ public int deletedCount() {
   }
 
+  /**
+   * Returns a new {@link FixedBitSet} of length {@code maxDoc} in which every live doc is set and
+   * every deleted doc is cleared.
+   */
   FixedBitSet toFixedBitSet() {
-    int numWords = FixedBitSet.bits2words(maxDoc);
-    long[] rawBits = new long[numWords];
-    Arrays.fill(rawBits, -1L);
-    int ghostBits = maxDoc & 63;
-    if (ghostBits != 0) {
-      rawBits[numWords - 1] = (1L << ghostBits) - 1;
-    }
-    for (int doc = deletedDocs.nextSetBit(0);
-        doc != DocIdSetIterator.NO_MORE_DOCS;
-        doc = deletedDocs.nextSetBit(doc + 1)) {
-      rawBits[doc >> 6] &= ~(1L << doc);
+    FixedBitSet result = new FixedBitSet(maxDoc);
+    result.set(0, maxDoc);
+    int doc = deletedDocs.nextSetBit(0);
+    while (doc != DocIdSetIterator.NO_MORE_DOCS) {
+      result.clear(doc);
+      // nextSetBit requires an index < maxDoc, so end the scan after the last doc.
+      doc = doc + 1 < maxDoc ? deletedDocs.nextSetBit(doc + 1) : DocIdSetIterator.NO_MORE_DOCS;
     }
-    return new FixedBitSet(rawBits, maxDoc);
+    return result;
   }
 
   /**