From 18d4625b5d8db86767b10cf34cbf8260b313eb00 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Fri, 19 Jun 2026 09:08:20 +0200 Subject: [PATCH] Lower FST heap utilization for Nori/Korean analyzers --- .../lucene/analysis/morph/MorphFSTLoader.java | 137 ++++++++++++++++ .../analysis/morph/TestMorphFSTLoader.java | 97 +++++++++++ .../analysis/ja/dict/TokenInfoDictionary.java | 42 +++-- .../analysis/ja/dict/UserDictionary.java | 4 +- .../analysis/ko/KoreanTokenizerFactory.java | 9 +- .../analysis/ko/dict/TokenInfoDictionary.java | 49 ++++-- .../lucene/analysis/ko/dict/TokenInfoFST.java | 14 +- .../analysis/ko/dict/UserDictionary.java | 16 +- .../dict/TestKoreanDictionaryHeapUsage.java | 86 ++++++++++ .../dict/TestKoreanDictionarySmallHeap.java | 53 ++++++ lucene/benchmark-jmh/build.gradle | 1 + .../benchmark-jmh/src/java/module-info.java | 1 + .../jmh/KoreanTokenizerBenchmark.java | 97 +++++++++++ .../lucene/util/fst/DirectBufferFSTStore.java | 116 +++++++++++++ .../java/org/apache/lucene/util/fst/FST.java | 7 + .../util/fst/ReverseDirectBufferReader.java | 56 +++++++ .../util/fst/TestDirectBufferFSTStore.java | 155 ++++++++++++++++++ 17 files changed, 901 insertions(+), 39 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphFSTLoader.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/morph/TestMorphFSTLoader.java create mode 100644 lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionaryHeapUsage.java create mode 100644 lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionarySmallHeap.java create mode 100644 lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/KoreanTokenizerBenchmark.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/DirectBufferFSTStore.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/ReverseDirectBufferReader.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/fst/TestDirectBufferFSTStore.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphFSTLoader.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphFSTLoader.java new file mode 100644 index 000000000000..a67ae6170122 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphFSTLoader.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.morph; + +import java.io.BufferedInputStream; +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.file.Path; +import java.nio.file.Paths; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.fst.DirectBufferFSTStore; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTReader; +import org.apache.lucene.util.fst.OffHeapFSTStore; +import org.apache.lucene.util.fst.PositiveIntOutputs; + +/** + * Loads morphological dictionary {@link FST}s with storage appropriate to the byte source. + * + * + */ +public final class MorphFSTLoader { + + private static final PositiveIntOutputs OUTPUTS = PositiveIntOutputs.getSingleton(); + + private MorphFSTLoader() {} + + /** + * An FST plus an optional resource that must remain open for mmap-backed FSTs. Callers loading + * from a {@link Path} must retain this holder for the lifetime of the returned {@link FST}. + */ + public static final class LoadedFST implements Closeable { + private final FST fst; + private final Closeable resource; + + LoadedFST(FST fst, Closeable resource) { + this.fst = fst; + this.resource = resource; + } + + public FST fst() { + return fst; + } + + /** Returns the mmap resource, or {@code null} when the FST does not require one. */ + public Closeable resource() { + return resource; + } + + @Override + public void close() throws IOException { + if (resource != null) { + resource.close(); + } + } + } + + /** + * Load an FST from a file using mmap ({@link OffHeapFSTStore}). The returned {@link LoadedFST} + * must be kept alive while the FST is in use. + */ + public static LoadedFST loadFromPath(Path fstFile) throws IOException { + Directory dir = new MMapDirectory(fstFile.getParent()); + IndexInput in = dir.openInput(fstFile.getFileName().toString(), IOContext.READONCE); + FST.FSTMetadata metadata = FST.readMetadata(in, OUTPUTS); + OffHeapFSTStore store = new OffHeapFSTStore(in, in.getFilePointer(), metadata); + FST fst = FST.fromFSTReader(metadata, store); + Closeable resource = () -> IOUtils.close(in, dir); + return new LoadedFST(fst, resource); + } + + /** + * Load an FST from a stream by copying bytes into a direct buffer ({@link + * DirectBufferFSTStore}). + */ + public static FST loadFromStream(InputStream in) throws IOException { + DataInput dataIn = new InputStreamDataInput(new BufferedInputStream(in)); + FST.FSTMetadata metadata = FST.readMetadata(dataIn, OUTPUTS); + return FST.fromFSTReader( + metadata, new DirectBufferFSTStore(dataIn, metadata.getNumBytes())); + } + + /** + * Load an FST from a URL. {@code file:} URLs are mmap'd; other schemes are read as streams. + */ + public static LoadedFST loadFromUrl(URL fstUrl) throws IOException { + if ("file".equalsIgnoreCase(fstUrl.getProtocol())) { + try { + URI uri = fstUrl.toURI(); + return loadFromPath(Paths.get(uri)); + } catch (URISyntaxException e) { + throw new IOException("Bad file URL: " + fstUrl, e); + } + } + try (InputStream is = new BufferedInputStream(fstUrl.openStream())) { + return new LoadedFST(loadFromStream(is), null); + } + } + + /** Relocate a freshly compiled on-heap FST into a direct buffer. */ + public static FST loadCompiled( + FST.FSTMetadata metadata, FSTReader compilerReader) throws IOException { + return FST.fromFSTReader( + metadata, new DirectBufferFSTStore(compilerReader, metadata.getNumBytes())); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/morph/TestMorphFSTLoader.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/morph/TestMorphFSTLoader.java new file mode 100644 index 000000000000..4f8c90d83835 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/morph/TestMorphFSTLoader.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.morph; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.file.Path; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.IntsRefFSTEnum; +import org.apache.lucene.util.fst.PositiveIntOutputs; + +public class TestMorphFSTLoader extends LuceneTestCase { + + private static final long MAX_SHALLOW_FST_RAM = 10_000L; + + public void testLoadFromPathMmapsWithoutHeapCopy() throws Exception { + Path fstFile = writeSampleFstToPath(); + try (MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromPath(fstFile)) { + assertNotNull(loaded.resource()); + assertTrue(loaded.fst().ramBytesUsed() < MAX_SHALLOW_FST_RAM); + assertEnumEquals(buildSampleFst(), loaded.fst()); + } + } + + public void testLoadFromStreamCopiesOffHeap() throws Exception { + FST expected = buildSampleFst(); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + expected.save(new OutputStreamDataOutput(bos), new OutputStreamDataOutput(bos)); + FST loaded = + MorphFSTLoader.loadFromStream(new java.io.ByteArrayInputStream(bos.toByteArray())); + assertTrue(loaded.ramBytesUsed() < MAX_SHALLOW_FST_RAM); + assertEnumEquals(expected, loaded); + } + + public void testLoadCompiledCopiesOffHeap() throws Exception { + PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); + FSTCompiler compiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); + IntsRefBuilder scratch = new IntsRefBuilder(); + scratch.append(42); + compiler.add(scratch.get(), 7L); + FST.FSTMetadata metadata = compiler.compile(); + FST loaded = MorphFSTLoader.loadCompiled(metadata, compiler.getFSTReader()); + assertTrue(loaded.ramBytesUsed() < MAX_SHALLOW_FST_RAM); + } + + private static Path writeSampleFstToPath() throws IOException { + FST fst = buildSampleFst(); + Path path = createTempFile("morph-fst", ".dat"); + fst.save(path); + return path; + } + + private static FST buildSampleFst() throws IOException { + PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); + FSTCompiler compiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); + IntsRefBuilder scratch = new IntsRefBuilder(); + for (int i = 0; i < 64; i++) { + scratch.clear(); + scratch.append(i); + compiler.add(scratch.get(), (long) i); + } + return FST.fromFSTReader(compiler.compile(), compiler.getFSTReader()); + } + + private static void assertEnumEquals(FST expected, FST actual) throws IOException { + IntsRefFSTEnum expectedEnum = new IntsRefFSTEnum<>(expected); + IntsRefFSTEnum actualEnum = new IntsRefFSTEnum<>(actual); + IntsRefFSTEnum.InputOutput e; + while ((e = expectedEnum.next()) != null) { + IntsRefFSTEnum.InputOutput a = actualEnum.next(); + assertNotNull(a); + assertEquals(e.input, a.input); + assertEquals(e.output, a.output); + } + assertNull(actualEnum.next()); + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java index ac2520f5b62c..045e07b78e3c 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.analysis.ja.dict; -import static org.apache.lucene.util.fst.FST.readMetadata; - import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; @@ -25,12 +23,9 @@ import java.nio.file.Files; import java.nio.file.Path; import org.apache.lucene.analysis.morph.BinaryDictionary; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.analysis.morph.MorphFSTLoader; import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.PositiveIntOutputs; /** * Binary dictionary implementation for a known-word dictionary model: Words are encoded into an FST @@ -42,6 +37,9 @@ public final class TokenInfoDictionary extends BinaryDictionary Files.newInputStream(targetMapFile), - () -> Files.newInputStream(posDictFile), () -> Files.newInputStream(dictFile), - () -> Files.newInputStream(fstFile)); + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + this.morphAtts = + new TokenInfoMorphData(buffer, () -> Files.newInputStream(posDictFile)); + MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromPath(fstFile); + this.fstHolder = loaded; + this.fst = new TokenInfoFST(loaded.fst(), true); } /** @@ -73,11 +77,16 @@ public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, */ public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl) throws IOException { - this( + super( () -> targetMapUrl.openStream(), - () -> posDictUrl.openStream(), () -> dictUrl.openStream(), - () -> fstUrl.openStream()); + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + this.morphAtts = new TokenInfoMorphData(buffer, () -> posDictUrl.openStream()); + MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromUrl(fstUrl); + this.fstHolder = loaded.resource() != null ? loaded : null; + this.fst = new TokenInfoFST(loaded.fst(), true); } private TokenInfoDictionary() throws IOException { @@ -101,14 +110,11 @@ private TokenInfoDictionary( DictionaryConstants.DICT_HEADER, DictionaryConstants.VERSION); this.morphAtts = new TokenInfoMorphData(buffer, posResource); - - FST fst; try (InputStream is = new BufferedInputStream(fstResource.get())) { - DataInput in = new InputStreamDataInput(is); - fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in); + // TODO: some way to configure? + this.fst = new TokenInfoFST(MorphFSTLoader.loadFromStream(is), true); } - // TODO: some way to configure? - this.fst = new TokenInfoFST(fst, true); + this.fstHolder = null; } static InputStream getClassResource(String suffix) throws IOException { diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index 9a8ee3a8f8ce..5cf8b8f6a22c 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.regex.Pattern; import org.apache.lucene.analysis.morph.Dictionary; +import org.apache.lucene.analysis.morph.MorphFSTLoader; import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; @@ -141,9 +142,10 @@ private UserDictionary(List featureEntries) throws IOException { segmentations.add(wordIdAndLength); ord++; } + FST.FSTMetadata metadata = fstCompiler.compile(); this.fst = new TokenInfoFST( - FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false); + MorphFSTLoader.loadCompiled(metadata, fstCompiler.getFSTReader()), false); this.morphAtts = new UserMorphData(data.toArray(String[]::new)); this.segmentations = segmentations.toArray(int[][]::new); } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java index 399012fb52b8..e2e78fe88b1f 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java @@ -59,6 +59,10 @@ * {@link DecompoundMode} *
  • outputUnknownUnigrams: If true outputs unigrams for unknown words. *
  • discardPunctuation: true if punctuation tokens should be dropped from the output. + *
  • cacheHangulRootArcs: if true caches Hangul syllable root arcs in user dictionary FST for + * faster tokenization at the cost of additional heap. Default is true. The system dictionary + * honors system property {@link + * org.apache.lucene.analysis.ko.dict.TokenInfoDictionary#CACHE_HANGUL_ROOT_ARCS_PROPERTY}. * * * @lucene.experimental @@ -75,6 +79,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements Resource private static final String DECOMPOUND_MODE = "decompoundMode"; private static final String OUTPUT_UNKNOWN_UNIGRAMS = "outputUnknownUnigrams"; private static final String DISCARD_PUNCTUATION = "discardPunctuation"; + private static final String CACHE_HANGUL_ROOT_ARCS = "cacheHangulRootArcs"; private final String userDictionaryPath; private final String userDictionaryEncoding; @@ -83,6 +88,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements Resource private final KoreanTokenizer.DecompoundMode mode; private final boolean outputUnknownUnigrams; private final boolean discardPunctuation; + private final boolean cacheHangulRootArcs; /** Creates a new KoreanTokenizerFactory */ public KoreanTokenizerFactory(Map args) { @@ -95,6 +101,7 @@ public KoreanTokenizerFactory(Map args) { .toUpperCase(Locale.ROOT)); outputUnknownUnigrams = getBoolean(args, OUTPUT_UNKNOWN_UNIGRAMS, false); discardPunctuation = getBoolean(args, DISCARD_PUNCTUATION, true); + cacheHangulRootArcs = getBoolean(args, CACHE_HANGUL_ROOT_ARCS, true); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); @@ -120,7 +127,7 @@ public void inform(ResourceLoader loader) throws IOException { .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); Reader reader = new InputStreamReader(stream, decoder); - userDictionary = UserDictionary.open(reader); + userDictionary = UserDictionary.open(reader, cacheHangulRootArcs); } } else { userDictionary = null; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java index 679fb8739882..c9e34b1bd39e 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.analysis.ko.dict; -import static org.apache.lucene.util.fst.FST.readMetadata; - import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; @@ -25,12 +23,10 @@ import java.nio.file.Files; import java.nio.file.Path; import org.apache.lucene.analysis.morph.BinaryDictionary; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.analysis.morph.MorphFSTLoader; import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.PositiveIntOutputs; /** * Binary dictionary implementation for a known-word dictionary model: Words are encoded into an FST @@ -42,6 +38,9 @@ public final class TokenInfoDictionary extends BinaryDictionary Files.newInputStream(targetMapFile), - () -> Files.newInputStream(posDictFile), () -> Files.newInputStream(dictFile), - () -> Files.newInputStream(fstFile)); + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + this.morphAtts = + new TokenInfoMorphData(buffer, () -> Files.newInputStream(posDictFile)); + MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromPath(fstFile); + this.fstHolder = loaded; + this.fst = new TokenInfoFST(loaded.fst(), cacheHangulRootArcs()); } /** @@ -81,11 +86,16 @@ public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, */ public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl) throws IOException { - this( + super( () -> targetMapUrl.openStream(), - () -> posDictUrl.openStream(), () -> dictUrl.openStream(), - () -> fstUrl.openStream()); + DictionaryConstants.TARGETMAP_HEADER, + DictionaryConstants.DICT_HEADER, + DictionaryConstants.VERSION); + this.morphAtts = new TokenInfoMorphData(buffer, () -> posDictUrl.openStream()); + MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromUrl(fstUrl); + this.fstHolder = loaded.resource() != null ? loaded : null; + this.fst = new TokenInfoFST(loaded.fst(), cacheHangulRootArcs()); } private TokenInfoDictionary( @@ -101,12 +111,10 @@ private TokenInfoDictionary( DictionaryConstants.DICT_HEADER, DictionaryConstants.VERSION); this.morphAtts = new TokenInfoMorphData(buffer, posResource); - FST fst; try (InputStream is = new BufferedInputStream(fstResource.get())) { - DataInput in = new InputStreamDataInput(is); - fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in); + this.fst = new TokenInfoFST(MorphFSTLoader.loadFromStream(is), cacheHangulRootArcs()); } - this.fst = new TokenInfoFST(fst); + this.fstHolder = null; } static InputStream getClassResource(String suffix) throws IOException { @@ -119,6 +127,17 @@ public TokenInfoFST getFST() { return fst; } + /** + * Whether to cache Hangul syllable root arcs for the system dictionary. Controlled by system + * property {@value #CACHE_HANGUL_ROOT_ARCS_PROPERTY}, default {@code true}. + */ + static final String CACHE_HANGUL_ROOT_ARCS_PROPERTY = + "org.apache.lucene.analysis.ko.cacheHangulRootArcs"; + + private static boolean cacheHangulRootArcs() { + return Boolean.parseBoolean(System.getProperty(CACHE_HANGUL_ROOT_ARCS_PROPERTY, "true")); + } + public static TokenInfoDictionary getInstance() { return SingletonHolder.INSTANCE; } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java index fa591eb00439..c318321ec6bb 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java @@ -19,11 +19,21 @@ import java.io.IOException; import org.apache.lucene.util.fst.FST; -/** Thin wrapper around an FST with root-arc caching for Hangul syllables (11,172 arcs). */ +/** + * Thin wrapper around an FST with root-arc caching for Hangul syllables (11,172 arcs). + * + *

    When {@code cacheHangulSyllables} is {@code true}, root arcs for all Hangul syllables + * ({@code 0xAC00}–{@code 0xD7A3}) are cached for faster lookup at the cost of additional heap. When + * {@code false}, only a single unused root arc is cached. + */ public final class TokenInfoFST extends org.apache.lucene.analysis.morph.TokenInfoFST { public TokenInfoFST(FST fst) throws IOException { - super(fst, 0xD7A3, 0xAC00); + this(fst, true); + } + + public TokenInfoFST(FST fst, boolean cacheHangulSyllables) throws IOException { + super(fst, cacheHangulSyllables ? 0xD7A3 : 0, cacheHangulSyllables ? 0xAC00 : 0); } /** diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java index f2e54ad5ecfb..1810c537aed5 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java @@ -23,6 +23,7 @@ import java.util.Comparator; import java.util.List; import org.apache.lucene.analysis.morph.Dictionary; +import org.apache.lucene.analysis.morph.MorphFSTLoader; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; @@ -47,6 +48,10 @@ public final class UserDictionary implements Dictionary { private UserMorphData morphAtts; public static UserDictionary open(Reader reader) throws IOException { + return open(reader, true); + } + + public static UserDictionary open(Reader reader, boolean cacheHangulRootArcs) throws IOException { BufferedReader br = new BufferedReader(reader); String line; @@ -67,11 +72,15 @@ public static UserDictionary open(Reader reader) throws IOException { if (entries.isEmpty()) { return null; } else { - return new UserDictionary(entries); + return new UserDictionary(entries, cacheHangulRootArcs); } } private UserDictionary(List entries) throws IOException { + this(entries, true); + } + + private UserDictionary(List entries, boolean cacheHangulRootArcs) throws IOException { final CharacterDefinition charDef = CharacterDefinition.getInstance(); entries.sort(Comparator.comparing(e -> e.split("\\s+")[0])); @@ -135,8 +144,11 @@ private UserDictionary(List entries) throws IOException { if (entryIndex < rightIds.length) { rightIds = ArrayUtil.copyOfSubArray(rightIds, 0, entryIndex); } + FST.FSTMetadata metadata = fstCompiler.compile(); this.fst = - new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader())); + new TokenInfoFST( + MorphFSTLoader.loadCompiled(metadata, fstCompiler.getFSTReader()), + cacheHangulRootArcs); int[][] segmentations = _segmentations.toArray(int[][]::new); this.morphAtts = new UserMorphData(segmentations, rightIds); } diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionaryHeapUsage.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionaryHeapUsage.java new file mode 100644 index 000000000000..add499777641 --- /dev/null +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionaryHeapUsage.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ko.dict; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.ko.KoreanAnalyzer; +import org.apache.lucene.analysis.ko.KoreanTokenizer; +import org.apache.lucene.analysis.ko.TestKoreanTokenizer; +import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.RamUsageTester; +import org.apache.lucene.util.fst.FST; + +public class TestKoreanDictionaryHeapUsage extends BaseTokenStreamTestCase { + + private static final int TOKENIZER_COUNT = 50; + private static final long MAX_FST_RAM_BYTES = 10_000L; + + public void testSystemDictionaryFstIsOffHeap() throws IOException { + FST fst = TokenInfoDictionary.getInstance().getFST().getInternalFST(); + assertFstUsesDirectBuffer(fst); + } + + public void testUserDictionaryFstIsOffHeap() throws IOException { + UserDictionary userDictionary = TestKoreanTokenizer.readDict(); + FST fst = userDictionary.getFST().getInternalFST(); + assertFstUsesDirectBuffer(fst); + } + + public void testManyTokenizersDoNotDuplicateFstBytes() throws IOException { + UserDictionary userDictionary = TestKoreanTokenizer.readDict(); + List tokenizers = new ArrayList<>(TOKENIZER_COUNT); + for (int i = 0; i < TOKENIZER_COUNT; i++) { + tokenizers.add( + new KoreanTokenizer( + newAttributeFactory(), userDictionary, KoreanTokenizer.DecompoundMode.NONE, false)); + } + long tokenizerHeap = RamUsageTester.ramUsed(tokenizers); + assertTrue( + "expected small per-tokenizer heap footprint, got " + RamUsageTester.humanSizeOf(tokenizers), + tokenizerHeap < 5_000_000L); + assertFstUsesDirectBuffer(TokenInfoDictionary.getInstance().getFST().getInternalFST()); + } + + public void testReducedRootCacheUsesLessHeap() throws Exception { + FST fst = TokenInfoDictionary.getInstance().getFST().getInternalFST(); + TokenInfoFST cached = new TokenInfoFST(fst, true); + TokenInfoFST uncached = new TokenInfoFST(fst, false); + long cachedHeap = RamUsageTester.ramUsed(cached); + long uncachedHeap = RamUsageTester.ramUsed(uncached); + assertTrue(cachedHeap > uncachedHeap); + assertTrue(cachedHeap - uncachedHeap >= 500_000L); + } + + public void testSegmentationParityWithFullAnalyzer() throws Exception { + try (Analyzer analyzer = new KoreanAnalyzer()) { + assertAnalyzesTo( + analyzer, + "한국은 대단한 나라입니다.", + new String[] {"한국", "대단", "나라", "이"}, + new int[] {0, 4, 8, 10}, + new int[] {2, 6, 10, 13}, + new int[] {1, 2, 3, 1}); + } + } + + private static void assertFstUsesDirectBuffer(FST fst) { + assertTrue(fst.ramBytesUsed() < MAX_FST_RAM_BYTES); + } +} diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionarySmallHeap.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionarySmallHeap.java new file mode 100644 index 000000000000..5b946ad12d7f --- /dev/null +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionarySmallHeap.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ko.dict; + +import java.io.IOException; +import java.io.StringReader; +import org.apache.lucene.analysis.ko.KoreanAnalyzer; +import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter; +import org.apache.lucene.analysis.ko.KoreanTokenizer; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.junit.Ignore; + +/** + * Manual smoke test for Nori dictionary loading under a very small heap. + * + *

    Run with: + * + *

    + * ./gradlew :lucene:analysis:nori:test --tests TestKoreanDictionarySmallHeap \
    + *   -Dtests.jvm.args=-Xmx32m
    + * 
    + */ +@Ignore("Manual small-heap validation; enable locally when verifying off-heap FST loading") +public class TestKoreanDictionarySmallHeap extends LuceneTestCase { + + public void testLoadDictionaryAndTokenizeOnSmallHeap() throws IOException { + TokenInfoDictionary dictionary = TokenInfoDictionary.getInstance(); + assertNotNull(dictionary.getFST()); + + try (KoreanAnalyzer analyzer = + new KoreanAnalyzer( + null, + KoreanTokenizer.DecompoundMode.NONE, + KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, + false)) { + assertNotNull(analyzer.tokenStream("field", "한국어 형태소 분석")); + } + } +} diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle index 6d64bb7a4f72..d1542cf6a7fe 100644 --- a/lucene/benchmark-jmh/build.gradle +++ b/lucene/benchmark-jmh/build.gradle @@ -19,6 +19,7 @@ description = 'Lucene JMH micro-benchmarking module' dependencies { moduleImplementation project(':lucene:core') + moduleImplementation project(':lucene:analysis:nori') moduleImplementation project(':lucene:expressions') moduleImplementation project(':lucene:join') moduleImplementation project(':lucene:sandbox') diff --git a/lucene/benchmark-jmh/src/java/module-info.java b/lucene/benchmark-jmh/src/java/module-info.java index 8090c7554739..b9b16b76debf 100644 --- a/lucene/benchmark-jmh/src/java/module-info.java +++ b/lucene/benchmark-jmh/src/java/module-info.java @@ -23,6 +23,7 @@ requires jmh.core; requires jdk.unsupported; requires org.apache.lucene.core; + requires org.apache.lucene.analysis.nori; requires org.apache.lucene.expressions; requires org.apache.lucene.join; requires org.apache.lucene.sandbox; diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/KoreanTokenizerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/KoreanTokenizerBenchmark.java new file mode 100644 index 000000000000..c2f6915069be --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/KoreanTokenizerBenchmark.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.jmh; + +import java.io.IOException; +import java.io.StringReader; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ko.KoreanAnalyzer; +import org.apache.lucene.analysis.ko.KoreanTokenizer; +import org.apache.lucene.analysis.ko.dict.UserDictionary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; + +@State(Scope.Benchmark) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(value = 1, warmups = 1) +@Warmup(iterations = 2, time = 1) +@Measurement(iterations = 3, time = 2) +public class KoreanTokenizerBenchmark { + + @Param({"SHORT", "LONG"}) + public String textSize; + + @Param({"false", "true"}) + public boolean userDictionary; + + private Analyzer analyzer; + private String text; + + @Setup(Level.Trial) + public void setup() throws IOException { + UserDictionary userDict = userDictionary ? openUserDictionary() : null; + analyzer = new KoreanAnalyzer(userDict, KoreanTokenizer.DecompoundMode.DISCARD, null, false); + text = + switch (textSize) { + case "SHORT" -> "한국은 대단한 나라입니다."; + case "LONG" -> + """ + 서울특별시는 대한민국의 수도이자 최대 도시이다. 한강을 중심으로 발전해 왔으며 \ + 정치, 경제, 사회, 문화의 중심지 역할을 한다. 많은 기업과 연구 기관이 \ + 집중되어 있어 정보 기술과 금융 산업이 발달했다."""; + default -> throw new IllegalStateException("unknown textSize: " + textSize); + }; + } + + @TearDown(Level.Trial) + public void tearDown() throws IOException { + if (analyzer != null) { + analyzer.close(); + } + } + + @Benchmark + public int tokenizeDocument() throws IOException { + int tokenCount = 0; + try (TokenStream stream = analyzer.tokenStream("field", text)) { + stream.reset(); + while (stream.incrementToken()) { + tokenCount++; + } + stream.end(); + } + return tokenCount; + } + + private static UserDictionary openUserDictionary() throws IOException { + return UserDictionary.open(new StringReader("세종시\n세종 시")); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/DirectBufferFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/DirectBufferFSTStore.java new file mode 100644 index 000000000000..7afbca79b769 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/DirectBufferFSTStore.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Provides off-heap storage of finite state machine (FST) bytes in a direct {@link ByteBuffer}. + * + *

    Unlike {@link OnHeapFSTStore}, FST bytes are not counted in JVM heap by {@link + * #ramBytesUsed()}. Morphological analysis dictionaries use this store to reduce on-heap pressure. + * + * @lucene.experimental + */ +public final class DirectBufferFSTStore implements FSTReader { + + private static final long BASE_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(DirectBufferFSTStore.class); + + private final ByteBuffer buffer; + + /** + * Read FST bytes from {@code in} into a direct buffer. + * + * @param in input positioned at the first FST byte + * @param numBytes number of FST bytes to read + */ + public DirectBufferFSTStore(DataInput in, long numBytes) throws IOException { + this.buffer = readDirectBuffer(in, numBytes); + } + + /** + * Copy FST bytes from an existing {@link FSTReader} into a direct buffer. + * + * @param source reader containing FST bytes + * @param numBytes number of FST bytes to copy + */ + public DirectBufferFSTStore(FSTReader source, long numBytes) throws IOException { + if (numBytes > Integer.MAX_VALUE) { + throw new IllegalArgumentException("FST too large for direct buffer: " + numBytes); + } + ByteBuffer tmp = ByteBuffer.allocateDirect((int) numBytes); + source.writeTo( + new DataOutput() { + @Override + public void writeByte(byte b) { + tmp.put(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + tmp.put(b, offset, length); + } + }); + tmp.flip(); + this.buffer = tmp.asReadOnlyBuffer(); + } + + private static ByteBuffer readDirectBuffer(DataInput in, long numBytes) throws IOException { + if (numBytes > Integer.MAX_VALUE) { + throw new IllegalArgumentException("FST too large for direct buffer: " + numBytes); + } + int size = (int) numBytes; + ByteBuffer tmp = ByteBuffer.allocateDirect(size); + byte[] scratch = new byte[8192]; + int remaining = size; + while (remaining > 0) { + int toRead = Math.min(scratch.length, remaining); + in.readBytes(scratch, 0, toRead); + tmp.put(scratch, 0, toRead); + remaining -= toRead; + } + tmp.flip(); + return tmp.asReadOnlyBuffer(); + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED; + } + + @Override + public FST.BytesReader getReverseBytesReader() { + return new ReverseDirectBufferReader(buffer.duplicate()); + } + + @Override + public void writeTo(DataOutput out) throws IOException { + ByteBuffer dup = buffer.duplicate(); + dup.rewind(); + byte[] scratch = new byte[8192]; + while (dup.hasRemaining()) { + int len = Math.min(scratch.length, dup.remaining()); + dup.get(scratch, 0, len); + out.writeBytes(scratch, 0, len); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 30fd39805daf..96a5e941e58c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -420,6 +420,13 @@ public FST(FSTMetadata metadata, DataInput in) throws IOException { this(metadata, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS, in, metadata.numBytes)); } + /** + * Load a previously saved FST with a DataInput for metadata using a {@link DirectBufferFSTStore}. + */ + public static FST loadDirect(FSTMetadata metadata, DataInput in) throws IOException { + return fromFSTReader(metadata, new DirectBufferFSTStore(in, metadata.numBytes)); + } + /** Create the FST with a metadata object and a FSTReader. */ FST(FSTMetadata metadata, FSTReader fstReader) { assert fstReader != null; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReverseDirectBufferReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReverseDirectBufferReader.java new file mode 100644 index 000000000000..e5124bbd8ffa --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReverseDirectBufferReader.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.nio.ByteBuffer; + +/** Reads in reverse from a direct {@link ByteBuffer}. */ +final class ReverseDirectBufferReader extends FST.BytesReader { + private final ByteBuffer buffer; + private int pos; + + ReverseDirectBufferReader(ByteBuffer buffer) { + this.buffer = buffer; + } + + @Override + public byte readByte() { + return buffer.get(pos--); + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for (int i = 0; i < len; i++) { + b[offset + i] = buffer.get(pos--); + } + } + + @Override + public void skipBytes(long count) { + pos -= count; + } + + @Override + public long getPosition() { + return pos; + } + + @Override + public void setPosition(long pos) { + this.pos = (int) pos; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestDirectBufferFSTStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestDirectBufferFSTStore.java new file mode 100644 index 000000000000..4dafd99bc6ca --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestDirectBufferFSTStore.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.RamUsageTester; +import org.apache.lucene.util.IntsRefBuilder; + +public class TestDirectBufferFSTStore extends LuceneTestCase { + + private static final int MAX_BLOCK_BITS = 30; + + public void testEquivalenceWithOnHeapStore() throws IOException { + SavedFST saved = buildSavedFst(); + FST onHeapFst = loadOnHeap(saved); + FST directFst = loadDirect(saved); + + IntsRefFSTEnum onHeapEnum = new IntsRefFSTEnum<>(onHeapFst); + IntsRefFSTEnum directEnum = new IntsRefFSTEnum<>(directFst); + IntsRefFSTEnum.InputOutput onHeapEntry; + IntsRefFSTEnum.InputOutput directEntry; + while ((onHeapEntry = onHeapEnum.next()) != null) { + directEntry = directEnum.next(); + assertNotNull(directEntry); + assertEquals(onHeapEntry.input, directEntry.input); + assertEquals(onHeapEntry.output, directEntry.output); + } + assertNull(directEnum.next()); + } + + public void testRamBytesUsed() throws IOException { + SavedFST saved = buildSavedFst(); + DirectBufferFSTStore directStore = + new DirectBufferFSTStore(saved.dataInput(), saved.metadata.getNumBytes()); + OnHeapFSTStore onHeapStore = + new OnHeapFSTStore(MAX_BLOCK_BITS, saved.dataInput(), saved.metadata.getNumBytes()); + + assertTrue(directStore.ramBytesUsed() < 1_000); + assertTrue(onHeapStore.ramBytesUsed() > directStore.ramBytesUsed() + 100); + assertTrue(RamUsageTester.ramUsed(directStore) < RamUsageTester.ramUsed(onHeapStore)); + assertNoLargeByteArray(directStore); + } + + public void testBytesReader() throws IOException { + SavedFST saved = buildSavedFst(); + FST fst = loadDirect(saved); + FST.BytesReader reader = fst.getBytesReader(); + byte[] data = saved.dataBytes; + reader.setPosition(data.length - 1); + assertEquals(data[data.length - 1], reader.readByte()); + byte[] scratch = new byte[4]; + reader.setPosition(3); + reader.readBytes(scratch, 0, 4); + for (int i = 0; i < 4; i++) { + assertEquals(data[3 - i], scratch[i]); + } + } + + public void testCopyFromFSTReader() throws IOException { + SavedFST saved = buildSavedFst(); + OnHeapFSTStore source = + new OnHeapFSTStore(MAX_BLOCK_BITS, saved.dataInput(), saved.metadata.getNumBytes()); + DirectBufferFSTStore copied = new DirectBufferFSTStore(source, saved.metadata.getNumBytes()); + + ByteArrayOutputStream out = new ByteArrayOutputStream(saved.dataBytes.length); + copied.writeTo(new OutputStreamDataOutput(out)); + assertArrayEquals(saved.dataBytes, out.toByteArray()); + assertNoLargeByteArray(copied); + } + + private static FST loadOnHeap(SavedFST saved) throws IOException { + return new FST<>(saved.metadata, saved.dataInput()); + } + + private static FST loadDirect(SavedFST saved) throws IOException { + return FST.loadDirect(saved.metadata, saved.dataInput()); + } + + private static SavedFST buildSavedFst() throws IOException { + PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); + FSTCompiler compiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); + IntsRefBuilder scratch = new IntsRefBuilder(); + for (int i = 0; i < 256; i++) { + scratch.clear(); + scratch.append(i); + compiler.add(scratch.get(), (long) i); + } + FST.FSTMetadata metadata = compiler.compile(); + FST fst = FST.fromFSTReader(metadata, compiler.getFSTReader()); + + ByteArrayOutputStream metaBytes = new ByteArrayOutputStream(); + ByteArrayOutputStream dataBytes = new ByteArrayOutputStream(); + fst.save(new OutputStreamDataOutput(metaBytes), new OutputStreamDataOutput(dataBytes)); + return new SavedFST( + metaBytes.toByteArray(), dataBytes.toByteArray(), outputs); + } + + private static void assertNoLargeByteArray(Object root) { + List largeArrays = new ArrayList<>(); + RamUsageTester.ramUsed( + root, + new RamUsageTester.Accumulator() { + @Override + public long accumulateArray( + Object array, long shallowSize, List values, Collection queue) { + if (array instanceof byte[] bytes && bytes.length > 1_000_000) { + largeArrays.add(bytes); + } + return super.accumulateArray(array, shallowSize, values, queue); + } + }); + assertEquals(0, largeArrays.size()); + } + + private static final class SavedFST { + final byte[] metaBytes; + final byte[] dataBytes; + final FST.FSTMetadata metadata; + + SavedFST(byte[] metaBytes, byte[] dataBytes, PositiveIntOutputs outputs) + throws IOException { + this.metaBytes = metaBytes; + this.dataBytes = dataBytes; + this.metadata = FST.readMetadata(new ByteArrayDataInput(metaBytes), outputs); + } + + DataInput dataInput() { + return new ByteArrayDataInput(dataBytes); + } + } +}