diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 42a6e8073576..f7e36fdcddba 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -291,7 +291,7 @@ Improvements
 
 Optimizations
 ---------------------
-(No changes)
+* GITHUB#16280: Single-pass writeString fast path for short strings in ByteBuffersDataOutput (neoremind)
 
 Bug Fixes
 ---------------------
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/ByteBuffersDataOutputWriteStringBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/ByteBuffersDataOutputWriteStringBenchmark.java
new file mode 100644
index 000000000000..cba7b2a85124
--- /dev/null
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/ByteBuffersDataOutputWriteStringBenchmark.java
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.lucene.store.ByteBuffersDataOutput;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Benchmarks {@link ByteBuffersDataOutput#writeString(String)} on pools of random ASCII, CJK and
+ * Latin-extended strings of varying lengths.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 5, time = 3)
+@Measurement(iterations = 5, time = 3)
+@Fork(
+    value = 3,
+    jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
+public class ByteBuffersDataOutputWriteStringBenchmark {
+
+  private static final int STRING_POOL_SIZE = 8192;
+
+  @Param({
+    "ascii_1",
+    "ascii_10",
+    "ascii_20",
+    "ascii_30",
+    "ascii_40",
+    "ascii_medium",
+    "ascii_long",
+    "ascii_vlarge",
+    "cjk_1",
+    "cjk_10",
+    "cjk_20",
+    "cjk_30",
+    "cjk_40",
+    "cjk_medium",
+    "cjk_long",
+    "cjk_vlarge",
+    "latin_ext_1",
+    "latin_ext_10",
+    "latin_ext_20",
+    "latin_ext_30",
+    "latin_ext_40",
+    "latin_ext_medium",
+    "latin_ext_long",
+    "latin_ext_vlarge",
+    "mixed"
+  })
+  public String stringType;
+
+  /** Target bytes to write per invocation. */
+  @Param({"81920", "491520", "2097152"})
+  public int targetBytes;
+
+  /** Pre-generated strings to write. */
+  private String[] testStrings;
+
+  /** Number of strings to write per invocation to reach targetBytes total output. */
+  private int stringsPerInvocation;
+
+  private ByteBuffersDataOutput reusableOutput;
+
+  /** Fills the string pool for {@code stringType} and sizes the per-invocation workload. */
+  @Setup(Level.Trial)
+  public void setup() {
+    Random random = new Random(42);
+    testStrings = new String[STRING_POOL_SIZE];
+
+    int avgBytesPerString;
+    switch (stringType) {
+      case "ascii_1":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomAscii(random, 1);
+        }
+        avgBytesPerString = 2;
+        break;
+      case "ascii_10":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomAscii(random, 8 + random.nextInt(5));
+        }
+        avgBytesPerString = 11;
+        break;
+      case "ascii_20":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomAscii(random, 18 + random.nextInt(5));
+        }
+        avgBytesPerString = 21;
+        break;
+      case "ascii_30":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomAscii(random, 28 + random.nextInt(5));
+        }
+        avgBytesPerString = 31;
+        break;
+      case "ascii_40":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomAscii(random, 38 + random.nextInt(5));
+        }
+        avgBytesPerString = 41;
+        break;
+      case "ascii_medium":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomAscii(random, 50 + random.nextInt(100));
+        }
+        avgBytesPerString = 100;
+        break;
+      case "ascii_long":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomAscii(random, 900 + random.nextInt(250));
+        }
+        avgBytesPerString = 1024;
+        break;
+      case "ascii_vlarge":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomAscii(random, 7000 + random.nextInt(2400));
+        }
+        avgBytesPerString = 8192;
+        break;
+      case "cjk_1":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomCjk(random, 1);
+        }
+        avgBytesPerString = 4;
+        break;
+      case "cjk_10":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomCjk(random, 8 + random.nextInt(5));
+        }
+        avgBytesPerString = 32;
+        break;
+      case "cjk_20":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomCjk(random, 18 + random.nextInt(5));
+        }
+        avgBytesPerString = 62;
+        break;
+      case "cjk_30":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomCjk(random, 28 + random.nextInt(5));
+        }
+        avgBytesPerString = 92;
+        break;
+      case "cjk_40":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomCjk(random, 38 + random.nextInt(5));
+        }
+        avgBytesPerString = 122;
+        break;
+      case "cjk_medium":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomCjk(random, 50 + random.nextInt(100));
+        }
+        avgBytesPerString = 300;
+        break;
+      case "cjk_long":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomCjk(random, 400 + random.nextInt(200));
+        }
+        avgBytesPerString = 1500;
+        break;
+      case "cjk_vlarge":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomCjk(random, 5500 + random.nextInt(1000));
+        }
+        avgBytesPerString = 18000;
+        break;
+      case "latin_ext_1":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomLatinExtended(random, 1);
+        }
+        avgBytesPerString = 3;
+        break;
+      case "latin_ext_10":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomLatinExtended(random, 8 + random.nextInt(5));
+        }
+        avgBytesPerString = 21;
+        break;
+      case "latin_ext_20":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomLatinExtended(random, 18 + random.nextInt(5));
+        }
+        avgBytesPerString = 41;
+        break;
+      case "latin_ext_30":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomLatinExtended(random, 28 + random.nextInt(5));
+        }
+        avgBytesPerString = 61;
+        break;
+      case "latin_ext_40":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomLatinExtended(random, 38 + random.nextInt(5));
+        }
+        avgBytesPerString = 81;
+        break;
+      case "latin_ext_medium":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomLatinExtended(random, 50 + random.nextInt(100));
+        }
+        avgBytesPerString = 200;
+        break;
+      case "latin_ext_long":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomLatinExtended(random, 400 + random.nextInt(200));
+        }
+        avgBytesPerString = 1000;
+        break;
+      case "latin_ext_vlarge":
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          testStrings[i] = randomLatinExtended(random, 5500 + random.nextInt(1000));
+        }
+        avgBytesPerString = 12000;
+        break;
+      case "mixed":
+        // Varying lengths
+        for (int i = 0; i < STRING_POOL_SIZE; i++) {
+          int roll = random.nextInt(100);
+          if (roll < 50) {
+            testStrings[i] = randomAscii(random, 3 + random.nextInt(30));
+          } else if (roll < 65) {
+            testStrings[i] = randomAscii(random, 50 + random.nextInt(100));
+          } else if (roll < 75) {
+            testStrings[i] = randomAscii(random, 500 + random.nextInt(500));
+          } else if (roll < 85) {
+            testStrings[i] = randomCjk(random, 5 + random.nextInt(20));
+          } else if (roll < 95) {
+            testStrings[i] = randomLatinExtended(random, 20 + random.nextInt(60));
+          } else {
+            testStrings[i] = randomCjk(random, 200 + random.nextInt(300));
+          }
+        }
+        // ~162 bytes: weighted mean of the six buckets above, length prefix included.
+        avgBytesPerString = 162;
+        break;
+      default:
+        throw new IllegalArgumentException("Unknown stringType: " + stringType);
+    }
+
+    stringsPerInvocation = targetBytes / avgBytesPerString;
+
+    reusableOutput = ByteBuffersDataOutput.newResettableInstance();
+  }
+
+  private ByteBuffersDataOutput getOutput() {
+    reusableOutput.reset();
+    return reusableOutput;
+  }
+
+  /** Writes {@code stringsPerInvocation} strings, cycling through the pool, into one output. */
+  @Benchmark
+  public void writeString(Blackhole bh) {
+    ByteBuffersDataOutput output = getOutput();
+    for (int i = 0; i < stringsPerInvocation; i++) {
+      output.writeString(testStrings[i % STRING_POOL_SIZE]);
+    }
+    bh.consume(output.size());
+  }
+
+  /** Returns {@code length} random printable ASCII chars (U+0020 to U+007E). */
+  private String randomAscii(Random random, int length) {
+    char[] chars = new char[length];
+    for (int i = 0; i < length; i++) {
+      chars[i] = (char) (32 + random.nextInt(95));
+    }
+    return new String(chars);
+  }
+
+  /**
+   * Generates realistic CJK text: ~90% CJK Unified Ideographs (3-byte UTF-8), ~9% ASCII digits
+   * (1-byte), ~1% surrogate pairs (emoji, rare CJK-B characters, 4-byte UTF-8).
+   *
+   * <p>The result has {@code length} chars, or {@code length + 1} when a surrogate pair starts in
+   * the last slot.
+   */
+  private String randomCjk(Random random, int length) {
+    // One spare slot so that a surrogate pair starting in the last slot still fits.
+    char[] chars = new char[length + 1];
+    int pos = 0;
+    while (pos < length) {
+      int roll = random.nextInt(100);
+      if (roll < 90) {
+        // CJK Unified Ideographs: U+4E00 up to, but excluding, U+9FFF (3 bytes in UTF-8)
+        chars[pos++] = (char) (0x4E00 + random.nextInt(0x9FFF - 0x4E00));
+      } else if (roll < 99) {
+        // ASCII digits
+        chars[pos++] = (char) (0x30 + random.nextInt(10)); // 0-9
+      } else {
+        // Surrogate pair, 4 bytes UTF-8
+        chars[pos++] = (char) (0xD800 + random.nextInt(0x400)); // high surrogate
+        chars[pos++] = (char) (0xDC00 + random.nextInt(0x400)); // low surrogate
+      }
+    }
+    return new String(chars, 0, pos);
+  }
+
+  /** Mixes ~80% 2-byte, ~15% ASCII (A-Z) and ~5% 3-byte UTF-8 characters. */
+  private String randomLatinExtended(Random random, int length) {
+    char[] chars = new char[length];
+    for (int i = 0; i < length; i++) {
+      int roll = random.nextInt(100);
+      if (roll < 80) {
+        // 2-byte UTF-8
+        chars[i] = (char) (0x0080 + random.nextInt(0x0700));
+      } else if (roll < 95) {
+        // ASCII
+        chars[i] = (char) (0x41 + random.nextInt(26)); // A-Z
+      } else {
+        // 3-byte UTF-8
+        chars[i] = (char) (0x2000 + random.nextInt(0xBFF));
+      }
+    }
+    return new String(chars);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java
index 5e93a3986611..52ede7baf6cd 100644
--- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java
+++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java
@@ -415,9 +415,28 @@ public void writeLong(long v) {
   public void writeString(String v) {
     try {
       final int charCount = v.length();
+      ByteBuffer currentBlock = this.currentBlock;
+
+      // Fast path for short strings (charCount <= 42): at most 126 UTF-8 bytes, so the VInt
+      // length prefix is guaranteed to be 1 byte. We can therefore encode directly into the
+      // block and backfill the length, without computing the UTF-8 byte count upfront. The block
+      // must have room for the worst case (1 + 3 bytes per char), else we take the general path.
+      if (charCount <= UnicodeUtil.MAX_CHARS_FOR_1_BYTE_VINT
+          && currentBlock.hasArray()
+          && currentBlock.remaining() >= 1 + charCount * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR) {
+        byte[] array = currentBlock.array();
+        int startingPos = currentBlock.position();
+        int off = currentBlock.arrayOffset() + startingPos;
+        int encodedEnd = UnicodeUtil.UTF16toUTF8(v, 0, charCount, array, off + 1);
+        int byteLen = encodedEnd - (off + 1);
+        array[off] = (byte) byteLen; // backfill the 1-byte VInt length prefix
+        currentBlock.position(startingPos + 1 + byteLen);
+        return;
+      }
+
       final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount);
       writeVInt(byteLen);
-      ByteBuffer currentBlock = this.currentBlock;
+      currentBlock = this.currentBlock;
       if (currentBlock.hasArray() && currentBlock.remaining() >= byteLen) {
         int startingPos = currentBlock.position();
         UnicodeUtil.UTF16toUTF8(
diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
index bf719377b184..6f493c0556d8 100644
--- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -127,6 +127,12 @@ private UnicodeUtil() {} // no instance
   /** Maximum number of UTF8 bytes per UTF16 character. */
   public static final int MAX_UTF8_BYTES_PER_CHAR = 3;
 
+  /**
+   * Largest UTF-16 char count whose UTF-8 encoding is guaranteed to fit in 127 bytes, i.e. whose
+   * byte length can always be written as a 1-byte VInt.
+   */
+  public static final int MAX_CHARS_FOR_1_BYTE_VINT = 127 / MAX_UTF8_BYTES_PER_CHAR;
+
   /**
    * Encode characters from a char[] source, starting at offset for length chars. It is the
    * responsibility of the caller to make sure that the destination array is large enough.