Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,9 @@ Improvements

Optimizations
---------------------
(No changes)

* GITHUB#16285: Apply GCD bound transform to sorted numeric rangeIntoBitSet, comparing raw
encoded values directly instead of decoding every packed value. (Costin Leau)

Bug Fixes
---------------------
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;

/**
 * Benchmarks range queries over GCD/delta-encoded sorted numeric doc values with multiple values
 * per doc.
 *
 * <p>Each trial builds a single-segment index in a temporary directory, then repeatedly counts the
 * hits of a doc-values range query that is combined with a {@link MatchAllDocsQuery} filter. The
 * two benchmark methods run the same query and differ only in the JVM arguments of their fork.
 */
@State(Scope.Thread)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 5)
public class SortedNumericGcdRangeIntoBitSetBenchmark {

  /** Name of the doc-values field that is indexed and queried. */
  private static final String FIELD = "val";

  /** Exclusive upper bound of the pre-encoding values drawn for each document. */
  private static final long DOMAIN = 10_000_000L;

  /** Constant offset added to every value by the delta-based encodings. */
  private static final long DELTA = 1_700_000_000_000L;

  private Directory dir;
  private DirectoryReader reader;
  private IndexSearcher searcher;
  private Path path;
  private Query query;

  @Param({"1000000"})
  public int numDocs;

  @Param({"delta_only", "gcd_1000", "gcd_100_delta"})
  public String encoding;

  @Param({"1", "3", "5"})
  public int cardinality;

  @Param({"0.01", "0.1", "0.5"})
  public double selectivity;

  /**
   * Builds the index for the current parameter combination, force-merges it to one segment, and
   * prepares the searcher and query used by the benchmark methods.
   */
  @Setup(Level.Trial)
  public void setup() throws Exception {
    path = Files.createTempDirectory("sortedNumericGcdRange");
    dir = MMapDirectory.open(path);

    // Fixed seed so that every trial indexes the same values.
    Random random = new Random(0);
    // The spacing between the values of one document only depends on the encoding parameter.
    final long stride = step();
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        long base = valueForDoc(random);
        for (int c = 0; c < cardinality; c++) {
          doc.add(SortedNumericDocValuesField.indexedField(FIELD, base + c * stride));
        }
        writer.addDocument(doc);
      }
      writer.forceMerge(1);
    }

    reader = DirectoryReader.open(dir);
    searcher = new IndexSearcher(reader);
    query = rangeQuery();
  }

  /** Draws the next pseudo-random value in {@code [0, DOMAIN)} and encodes it. */
  private long valueForDoc(Random random) {
    return actualValue(random.nextLong(0, DOMAIN));
  }

  /**
   * Returns the distance between consecutive values of the same document, chosen per encoding so
   * that the extra values stay multiples of the encoding's factor.
   */
  private long step() {
    return switch (encoding) {
      case "delta_only" -> 1;
      case "gcd_1000" -> 1_000L;
      case "gcd_100_delta" -> 100L;
      default -> throw new IllegalArgumentException("Unknown encoding: " + encoding);
    };
  }

  /**
   * Builds the benchmarked query: a range centered in the value domain whose width is {@code
   * selectivity * DOMAIN} (at least 1), expressed in encoded values.
   */
  private Query rangeQuery() {
    long range = Math.max(1, (long) (DOMAIN * selectivity));
    long min = (DOMAIN - range) / 2;
    long max = min + range;
    Query rangeQuery =
        SortedNumericDocValuesField.newSlowRangeQuery(FIELD, actualValue(min), actualValue(max));
    return new BooleanQuery.Builder()
        .add(new MatchAllDocsQuery(), Occur.FILTER)
        .add(rangeQuery, Occur.FILTER)
        .build();
  }

  /** Maps a value from {@code [0, DOMAIN]} to the value indexed for the selected encoding. */
  private long actualValue(long value) {
    return switch (encoding) {
      case "delta_only" -> DELTA + value;
      case "gcd_1000" -> value * 1_000L;
      case "gcd_100_delta" -> DELTA + value * 100L;
      default -> throw new IllegalArgumentException("Unknown encoding: " + encoding);
    };
  }

  /**
   * Releases the reader and directory and removes the temporary index directory. Each step runs
   * even if an earlier one throws, so a failing close does not leave the index on disk.
   */
  @TearDown(Level.Trial)
  public void tearDown() throws Exception {
    try {
      reader.close();
    } finally {
      try {
        dir.close();
      } finally {
        deleteTempDirectory();
      }
    }
  }

  /** Deletes the temporary index directory and its contents, children before parents. */
  private void deleteTempDirectory() throws IOException {
    if (Files.exists(path) == false) {
      return;
    }
    try (Stream<Path> walk = Files.walk(path)) {
      // Reverse order visits files before the directories that contain them.
      walk.sorted(Comparator.reverseOrder())
          .forEach(
              p -> {
                try {
                  Files.delete(p);
                } catch (IOException _) {
                  // Best-effort cleanup of a temp directory: a leftover file is not an error.
                }
              });
    }
  }

  /** Counts the matches of the range query in a fork without the incubating vector module. */
  @Benchmark
  @Fork(
      value = 1,
      jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
  public int rangeQueryDefaultProvider() throws IOException {
    return searcher.count(query);
  }

  /** Counts the matches of the range query in a fork that adds {@code jdk.incubator.vector}. */
  @Benchmark
  @Fork(
      value = 1,
      jvmArgsAppend = {
        "--add-modules",
        "jdk.incubator.vector",
        "-Xmx2g",
        "-Xms2g",
        "-XX:+AlwaysPreTouch"
      })
  public int rangeQueryPanamaProvider() throws IOException {
    return searcher.count(query);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,59 @@ static void rangeIntoBitSet(
values, fromDoc, toDoc, minValue, maxValue, bitSet, offset);
}

/**
 * Maps an inclusive query range on decoded values onto the raw (packed) value domain, where
 * {@code decoded = raw * mul + delta} and raw values are expected to be non-negative.
 *
 * @param minValue inclusive lower query bound, in decoded space
 * @param maxValue inclusive upper query bound, in decoded space
 * @param mul multiplier applied when decoding; must be positive
 * @param delta offset applied when decoding
 * @return {@code {rawMin, rawMax}}, both inclusive, or {@code null} when no raw value can fall
 *     inside the range
 */
private static long[] transformGcdBounds(long minValue, long maxValue, long mul, long delta) {
  assert mul > 0;
  // Undo the offset first; the helpers clamp to the long range instead of wrapping around.
  final long shiftedLo = saturatingShiftLower(minValue, delta);
  final long shiftedHi = saturatingShiftUpper(maxValue, delta);
  // Undo the multiplier, rounding both bounds inward so that only raw values whose decoded form
  // lies inside the original range remain.
  final long scaledLo = mul == 1 ? shiftedLo : Math.ceilDiv(shiftedLo, mul);
  final long rawHi = mul == 1 ? shiftedHi : Math.floorDiv(shiftedHi, mul);
  // The lower bound never needs to go below zero.
  final long rawLo = scaledLo < 0 ? 0 : scaledLo;
  return rawLo <= rawHi ? new long[] {rawLo, rawHi} : null;
}

/**
 * Computes {@code minValue - delta} for use as a lower bound, clamping instead of wrapping when
 * the exact difference does not fit in a {@code long}: the result is {@code Long.MIN_VALUE} when
 * the difference is too small (the bound then excludes nothing) and {@code Long.MAX_VALUE} when
 * it is too large (the bound is then out of reach for the values it is compared against). The
 * caller can compare against these sentinels like any other bound.
 */
private static long saturatingShiftLower(long minValue, long delta) {
  final long shifted = minValue - delta;
  // A subtraction wraps exactly when the operands have different signs and the sign of the
  // result differs from the sign of the minuend.
  final boolean wrapped = ((minValue ^ delta) & (minValue ^ shifted)) < 0;
  if (wrapped == false) {
    return shifted;
  }
  // Subtracting a positive delta can only wrap downwards, a negative one only upwards.
  if (delta > 0) {
    return Long.MIN_VALUE;
  }
  return Long.MAX_VALUE;
}

/**
 * Computes {@code maxValue - delta} for use as an upper bound, clamping instead of wrapping when
 * the exact difference does not fit in a {@code long}: the result is {@code Long.MAX_VALUE} when
 * the difference is too large (the bound then excludes nothing) and {@code Long.MIN_VALUE} when
 * it is too small (no value can satisfy the bound). Mirrors {@link #saturatingShiftLower}.
 */
private static long saturatingShiftUpper(long maxValue, long delta) {
  final long shifted = maxValue - delta;
  // A subtraction wraps exactly when the operands have different signs and the sign of the
  // result differs from the sign of the minuend.
  final boolean wrapped = ((maxValue ^ delta) & (maxValue ^ shifted)) < 0;
  if (wrapped == false) {
    return shifted;
  }
  // Subtracting a negative delta can only wrap upwards, a positive one only downwards.
  if (delta < 0) {
    return Long.MAX_VALUE;
  }
  return Long.MIN_VALUE;
}

private static int fixedCardinality(
SortedNumericEntry entry, DocValuesSkipperEntry skipperEntry) {
if (skipperEntry == null
Expand Down Expand Up @@ -1838,6 +1891,27 @@ private SortedNumericDocValues getSortedNumeric(
final LongValues values = getNumericValues(entry);
final int denseFixedCardinality = fixedCardinality(entry, skipperEntry);

// For GCD/delta encoded entries, capture raw packed values for rangeIntoBitSet optimization.
// The decoded `values` wrapper applies mul*get+delta per call; using raw values with
// transformed bounds avoids this per-value decode cost.
final boolean hasGcdEncoding =
entry.bitsPerValue > 0
&& entry.blockShift < 0
&& entry.table == null
&& (entry.gcd != 1 || entry.minValue != 0);
final LongValues rawValues;
final long mul, delta;
if (hasGcdEncoding) {
RandomAccessInput rawSlice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
rawValues = getDirectReaderInstance(rawSlice, entry.bitsPerValue, 0L, entry.numValues);
mul = entry.gcd;
delta = entry.minValue;
} else {
rawValues = null;
mul = 1;
delta = 0;
}

if (entry.docsWithFieldOffset == -1) {
// dense
return new SortedNumericDocValues() {
Expand Down Expand Up @@ -1904,6 +1978,27 @@ public void rangeIntoBitSet(
}
return;
}
if (rawValues != null) {
long[] bounds = transformGcdBounds(minValue, maxValue, mul, delta);
if (bounds == null) {
return;
}
int cardinality = denseFixedCardinality;
if (cardinality > 1) {
sortedNumericScalarRangeIntoBitSet(
rawValues, fromDoc, endDoc, cardinality, bounds[0], bounds[1], bitSet, offset);
return;
}
for (int currentDoc = fromDoc; currentDoc < endDoc; currentDoc++) {
long startOffset = addresses.get(currentDoc);
long endOffset = addresses.get(currentDoc + 1L);
if (sortedNumericMatchesRange(
rawValues, startOffset, endOffset, bounds[0], bounds[1])) {
bitSet.set(currentDoc - offset);
}
}
return;
}
int cardinality = denseFixedCardinality;
if (cardinality > 1) {
sortedNumericScalarRangeIntoBitSet(
Expand Down Expand Up @@ -2013,6 +2108,24 @@ public void rangeIntoBitSet(
set = false;
return;
}
if (rawValues != null) {
long[] bounds = transformGcdBounds(minValue, maxValue, mul, delta);
if (bounds == null) {
set = false;
return;
}
for (; currentDoc < endDoc; currentDoc = disi.nextDoc()) {
int index = disi.index();
long startOffset = addresses.get(index);
long endOffset = addresses.get(index + 1L);
if (sortedNumericMatchesRange(
rawValues, startOffset, endOffset, bounds[0], bounds[1])) {
bitSet.set(currentDoc - offset);
}
}
set = false;
return;
}
for (; currentDoc < endDoc; currentDoc = disi.nextDoc()) {
int index = disi.index();
long startOffset = addresses.get(index);
Expand Down
Loading
Loading