Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ Improvements

Optimizations
---------------------
(No changes)
* GITHUB#16280: Single-pass writeString fast path for short strings in ByteBuffersDataOutput (neoremind)

Bug Fixes
---------------------
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,336 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;

import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

/**
 * JMH benchmark for {@link ByteBuffersDataOutput#writeString(String)}.
 *
 * <p>Each invocation resets a reusable output and writes strings from a pre-generated pool until
 * roughly {@link #targetBytes} bytes have been produced. The pool content is selected by {@link
 * #stringType}, which is either {@code "mixed"} or {@code "<family>_<size>"} where family is one
 * of {@code ascii}, {@code cjk}, {@code latin_ext} and size is one of {@code 1}, {@code 10},
 * {@code 20}, {@code 30}, {@code 40}, {@code medium}, {@code long}, {@code vlarge}.
 */
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 3)
@Measurement(iterations = 5, time = 3)
@Fork(
    value = 3,
    jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class ByteBuffersDataOutputWriteStringBenchmark {

  /** Number of pre-generated strings; the benchmark loop cycles through them. */
  private static final int STRING_POOL_SIZE = 8192;

  @Param({
    "ascii_1",
    "ascii_10",
    "ascii_20",
    "ascii_30",
    "ascii_40",
    "ascii_medium",
    "ascii_long",
    "ascii_vlarge",
    "cjk_1",
    "cjk_10",
    "cjk_20",
    "cjk_30",
    "cjk_40",
    "cjk_medium",
    "cjk_long",
    "cjk_vlarge",
    "latin_ext_1",
    "latin_ext_10",
    "latin_ext_20",
    "latin_ext_30",
    "latin_ext_40",
    "latin_ext_medium",
    "latin_ext_long",
    "latin_ext_vlarge",
    "mixed"
  })
  public String stringType;

  /** Target bytes to write per invocation. */
  @Param({"81920", "491520", "2097152"})
  public int targetBytes;

  /** Pre-generated strings to write. */
  private String[] testStrings;

  /** Number of strings to write per invocation to reach targetBytes total output. */
  private int stringsPerInvocation;

  private ByteBuffersDataOutput reusableOutput;

  /**
   * Builds the string pool for the selected {@link #stringType} from a fixed seed and derives how
   * many strings one invocation must write to reach {@link #targetBytes}.
   *
   * @throws IllegalArgumentException if {@link #stringType} is not a recognized value
   */
  @Setup(Level.Trial)
  public void setup() {
    Random random = new Random(42);
    testStrings = new String[STRING_POOL_SIZE];

    if ("mixed".equals(stringType)) {
      for (int i = 0; i < STRING_POOL_SIZE; i++) {
        testStrings[i] = randomMixed(random);
      }
    } else {
      // "<family>_<size>": the family itself may contain '_' (latin_ext), so split on the last one.
      int sep = stringType.lastIndexOf('_');
      String family = sep < 0 ? "" : stringType.substring(0, sep);
      String size = stringType.substring(sep + 1);
      int[] range = lengthRange(family, size);
      if (range == null) {
        throw new IllegalArgumentException("Unknown stringType: " + stringType);
      }
      int base = range[0];
      int spread = range[1];
      for (int i = 0; i < STRING_POOL_SIZE; i++) {
        // Draw the length before generating the content so the random sequence stays stable.
        int length = spread == 0 ? base : base + random.nextInt(spread);
        testStrings[i] = randomString(random, family, length);
      }
    }

    stringsPerInvocation = countStringsForBudget(targetBytes);

    reusableOutput = ByteBuffersDataOutput.newResettableInstance();
  }

  /**
   * Returns {@code {base, spread}} for the requested family and size, meaning string lengths are
   * {@code base + random.nextInt(spread)} chars (or exactly {@code base} when spread is 0).
   *
   * @return the length range, or {@code null} if the family or size is unknown
   */
  private static int[] lengthRange(String family, String size) {
    boolean ascii = "ascii".equals(family);
    if (!ascii && !"cjk".equals(family) && !"latin_ext".equals(family)) {
      return null;
    }
    switch (size) {
      case "1":
        return new int[] {1, 0};
      case "10":
      case "20":
      case "30":
      case "40":
        // Lengths are spread over [N - 2, N + 2].
        return new int[] {Integer.parseInt(size) - 2, 5};
      case "medium":
        return new int[] {50, 100};
      case "long":
        // Multi-byte families use fewer chars than ASCII for the "long" bucket.
        return ascii ? new int[] {900, 250} : new int[] {400, 200};
      case "vlarge":
        return ascii ? new int[] {7000, 2400} : new int[] {5500, 1000};
      default:
        return null;
    }
  }

  /** Dispatches to the generator that belongs to {@code family}. */
  private static String randomString(Random random, String family, int length) {
    switch (family) {
      case "ascii":
        return randomAscii(random, length);
      case "cjk":
        return randomCjk(random, length);
      case "latin_ext":
        return randomLatinExtended(random, length);
      default:
        throw new IllegalArgumentException("Unknown string family: " + family);
    }
  }

  /** Generates one string for the "mixed" workload: varying lengths and character families. */
  private static String randomMixed(Random random) {
    int roll = random.nextInt(100);
    if (roll < 50) {
      return randomAscii(random, 3 + random.nextInt(30));
    } else if (roll < 65) {
      return randomAscii(random, 50 + random.nextInt(100));
    } else if (roll < 75) {
      return randomAscii(random, 500 + random.nextInt(500));
    } else if (roll < 85) {
      return randomCjk(random, 5 + random.nextInt(20));
    } else if (roll < 95) {
      return randomLatinExtended(random, 20 + random.nextInt(60));
    } else {
      return randomCjk(random, 200 + random.nextInt(300));
    }
  }

  /**
   * Counts how many pool strings, taken in the same cyclic order the benchmark loop uses, are
   * needed for their encoded sizes to add up to at least {@code byteBudget}.
   *
   * <p>Sizes are measured from the generated strings instead of being estimated per string type,
   * so every parameter combination writes a comparable number of bytes.
   */
  private int countStringsForBudget(int byteBudget) {
    int[] encoded = new int[STRING_POOL_SIZE];
    for (int i = 0; i < STRING_POOL_SIZE; i++) {
      encoded[i] = encodedLength(testStrings[i]);
    }
    long written = 0;
    int count = 0;
    while (written < byteBudget) {
      written += encoded[count % STRING_POOL_SIZE];
      count++;
    }
    return count;
  }

  /**
   * Computes the size of {@code s} when written as a VInt byte-length prefix followed by its UTF-8
   * bytes. An unpaired surrogate is counted as 3 bytes; the generators in this class never
   * produce one, so that case does not affect the result here.
   */
  private static int encodedLength(String s) {
    int utf8 = 0;
    for (int i = 0, n = s.length(); i < n; i++) {
      char c = s.charAt(i);
      if (c < 0x80) {
        utf8 += 1;
      } else if (c < 0x800) {
        utf8 += 2;
      } else if (Character.isHighSurrogate(c)
          && i + 1 < n
          && Character.isLowSurrogate(s.charAt(i + 1))) {
        utf8 += 4;
        i++; // the low surrogate is part of the same code point
      } else {
        utf8 += 3;
      }
    }
    // A VInt stores 7 payload bits per byte.
    int prefix = 1;
    for (int v = utf8 >>> 7; v != 0; v >>>= 7) {
      prefix++;
    }
    return prefix + utf8;
  }

  /** Resets and returns the shared output so that invocations do not accumulate data. */
  private ByteBuffersDataOutput getOutput() {
    reusableOutput.reset();
    return reusableOutput;
  }

  /** Writes {@link #stringsPerInvocation} pool strings, cycling through the pool as needed. */
  @Benchmark
  public void writeString(Blackhole bh) {
    ByteBuffersDataOutput output = getOutput();
    for (int i = 0; i < stringsPerInvocation; i++) {
      output.writeString(testStrings[i % STRING_POOL_SIZE]);
    }
    bh.consume(output.size());
  }

  /** Generates {@code length} printable ASCII chars (U+0020 to U+007E), 1 byte each in UTF-8. */
  private static String randomAscii(Random random, int length) {
    char[] chars = new char[length];
    for (int i = 0; i < length; i++) {
      chars[i] = (char) (32 + random.nextInt(95));
    }
    return new String(chars);
  }

  /**
   * Generates realistic CJK text: ~90% CJK Unified Ideographs (3-byte UTF-8), ~9% ASCII digits
   * (1-byte), ~1% surrogate pairs (4-byte UTF-8).
   *
   * <p>The result has {@code length} chars, or {@code length + 1} when the last slot is filled by
   * a surrogate pair.
   */
  private static String randomCjk(Random random, int length) {
    char[] chars = new char[length + 1]; // +1 room for a surrogate pair starting in the last slot
    int pos = 0;
    // pos advances by at least one per iteration, so "pos < length" alone bounds the loop and
    // guarantees two free slots whenever a surrogate pair is drawn.
    while (pos < length) {
      int roll = random.nextInt(100);
      if (roll < 90) {
        // CJK Unified Ideographs: U+4E00 to U+9FFE (3 bytes in UTF-8); nextInt's bound is
        // exclusive, so U+9FFF itself is never drawn.
        chars[pos++] = (char) (0x4E00 + random.nextInt(0x9FFF - 0x4E00));
      } else if (roll < 99) {
        // ASCII digits
        chars[pos++] = (char) (0x30 + random.nextInt(10)); // 0-9
      } else {
        // Surrogate pair, 4 bytes UTF-8
        chars[pos++] = (char) (0xD800 + random.nextInt(0x400)); // high surrogate
        chars[pos++] = (char) (0xDC00 + random.nextInt(0x400)); // low surrogate
      }
    }
    return new String(chars, 0, pos);
  }

  /**
   * Generates {@code length} chars of extended Latin-like text: ~80% 2-byte UTF-8 chars, ~15%
   * ASCII upper-case letters, ~5% 3-byte UTF-8 chars.
   */
  private static String randomLatinExtended(Random random, int length) {
    char[] chars = new char[length];
    for (int i = 0; i < length; i++) {
      int roll = random.nextInt(100);
      if (roll < 80) {
        // 2-byte UTF-8
        chars[i] = (char) (0x0080 + random.nextInt(0x0700));
      } else if (roll < 95) {
        // ASCII
        chars[i] = (char) (0x41 + random.nextInt(26)); // A-Z
      } else {
        // 3-byte UTF-8
        chars[i] = (char) (0x2000 + random.nextInt(0xBFF));
      }
    }
    return new String(chars);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -415,9 +415,28 @@ public void writeLong(long v) {
public void writeString(String v) {
try {
final int charCount = v.length();
ByteBuffer currentBlock = this.currentBlock;

// Fast path for short strings (charCount <= 42): the VInt length prefix is guaranteed to be 1
// byte, so we can encode directly and backfill the length without computing the UTF-8 byte
// count upfront.
if (charCount <= UnicodeUtil.MAX_CHARS_FOR_1_BYTE_VINT
&& currentBlock.hasArray()
&& currentBlock.remaining() >= 1 + charCount * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR) {
byte[] array = currentBlock.array();
int startingPos = currentBlock.position();
int off = currentBlock.arrayOffset() + startingPos;
int encodedEnd = UnicodeUtil.UTF16toUTF8(v, 0, charCount, array, off + 1);
int byteLen = encodedEnd - (off + 1);
array[off] = (byte) byteLen;
currentBlock.position(startingPos + 1 + byteLen);
return;
}

final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount);
writeVInt(byteLen);
ByteBuffer currentBlock = this.currentBlock;
currentBlock = this.currentBlock;
if (currentBlock.hasArray() && currentBlock.remaining() >= byteLen) {
int startingPos = currentBlock.position();
UnicodeUtil.UTF16toUTF8(
Expand Down
3 changes: 3 additions & 0 deletions lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@ private UnicodeUtil() {} // no instance
/** Maximum number of UTF8 bytes per UTF16 character. */
public static final int MAX_UTF8_BYTES_PER_CHAR = 3;

/**
 * Maximum number of UTF16 characters for which a string's UTF-8 byte count is guaranteed to need
 * only a 1-byte VInt. Computed as 127 divided by {@link #MAX_UTF8_BYTES_PER_CHAR}, which
 * evaluates to 42.
 */
public static final int MAX_CHARS_FOR_1_BYTE_VINT = 127 / MAX_UTF8_BYTES_PER_CHAR;

/**
* Encode characters from a char[] source, starting at offset for length chars. It is the
* responsibility of the caller to make sure that the destination array is large enough.
Expand Down
Loading