diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphFSTLoader.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphFSTLoader.java
new file mode 100644
index 000000000000..a67ae6170122
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/MorphFSTLoader.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.morph;
+
+import java.io.BufferedInputStream;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.fst.DirectBufferFSTStore;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTReader;
+import org.apache.lucene.util.fst.OffHeapFSTStore;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+
+/**
+ * Loads morphological dictionary {@link FST}s with storage appropriate to the byte source.
+ *
+ *
+ *
{@link #loadFromPath(Path)} — zero-copy mmap via {@link OffHeapFSTStore}
+ *
{@link #loadFromStream(InputStream)} — one copy into a direct buffer for classpath/JAR
+ * streams
+ *
+ */
+public final class MorphFSTLoader {
+
+ private static final PositiveIntOutputs OUTPUTS = PositiveIntOutputs.getSingleton();
+
+ private MorphFSTLoader() {}
+
+ /**
+ * An FST plus an optional resource that must remain open for mmap-backed FSTs. Callers loading
+ * from a {@link Path} must retain this holder for the lifetime of the returned {@link FST}.
+ */
+ public static final class LoadedFST implements Closeable {
+ private final FST fst;
+ private final Closeable resource;
+
+ LoadedFST(FST fst, Closeable resource) {
+ this.fst = fst;
+ this.resource = resource;
+ }
+
+ public FST fst() {
+ return fst;
+ }
+
+ /** Returns the mmap resource, or {@code null} when the FST does not require one. */
+ public Closeable resource() {
+ return resource;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (resource != null) {
+ resource.close();
+ }
+ }
+ }
+
+ /**
+ * Load an FST from a file using mmap ({@link OffHeapFSTStore}). The returned {@link LoadedFST}
+ * must be kept alive while the FST is in use.
+ */
+ public static LoadedFST loadFromPath(Path fstFile) throws IOException {
+ Directory dir = new MMapDirectory(fstFile.getParent());
+ IndexInput in = dir.openInput(fstFile.getFileName().toString(), IOContext.READONCE);
+ FST.FSTMetadata metadata = FST.readMetadata(in, OUTPUTS);
+ OffHeapFSTStore store = new OffHeapFSTStore(in, in.getFilePointer(), metadata);
+ FST fst = FST.fromFSTReader(metadata, store);
+ Closeable resource = () -> IOUtils.close(in, dir);
+ return new LoadedFST(fst, resource);
+ }
+
+ /**
+ * Load an FST from a stream by copying bytes into a direct buffer ({@link
+ * DirectBufferFSTStore}).
+ */
+ public static FST loadFromStream(InputStream in) throws IOException {
+ DataInput dataIn = new InputStreamDataInput(new BufferedInputStream(in));
+ FST.FSTMetadata metadata = FST.readMetadata(dataIn, OUTPUTS);
+ return FST.fromFSTReader(
+ metadata, new DirectBufferFSTStore(dataIn, metadata.getNumBytes()));
+ }
+
+ /**
+ * Load an FST from a URL. {@code file:} URLs are mmap'd; other schemes are read as streams.
+ */
+ public static LoadedFST loadFromUrl(URL fstUrl) throws IOException {
+ if ("file".equalsIgnoreCase(fstUrl.getProtocol())) {
+ try {
+ URI uri = fstUrl.toURI();
+ return loadFromPath(Paths.get(uri));
+ } catch (URISyntaxException e) {
+ throw new IOException("Bad file URL: " + fstUrl, e);
+ }
+ }
+ try (InputStream is = new BufferedInputStream(fstUrl.openStream())) {
+ return new LoadedFST(loadFromStream(is), null);
+ }
+ }
+
+ /** Relocate a freshly compiled on-heap FST into a direct buffer. */
+ public static FST loadCompiled(
+ FST.FSTMetadata metadata, FSTReader compilerReader) throws IOException {
+ return FST.fromFSTReader(
+ metadata, new DirectBufferFSTStore(compilerReader, metadata.getNumBytes()));
+ }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/morph/TestMorphFSTLoader.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/morph/TestMorphFSTLoader.java
new file mode 100644
index 000000000000..4f8c90d83835
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/morph/TestMorphFSTLoader.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.morph;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.file.Path;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
+import org.apache.lucene.util.fst.IntsRefFSTEnum;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+
+public class TestMorphFSTLoader extends LuceneTestCase {
+
+ private static final long MAX_SHALLOW_FST_RAM = 10_000L;
+
+ public void testLoadFromPathMmapsWithoutHeapCopy() throws Exception {
+ Path fstFile = writeSampleFstToPath();
+ try (MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromPath(fstFile)) {
+ assertNotNull(loaded.resource());
+ assertTrue(loaded.fst().ramBytesUsed() < MAX_SHALLOW_FST_RAM);
+ assertEnumEquals(buildSampleFst(), loaded.fst());
+ }
+ }
+
+ public void testLoadFromStreamCopiesOffHeap() throws Exception {
+ FST expected = buildSampleFst();
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ expected.save(new OutputStreamDataOutput(bos), new OutputStreamDataOutput(bos));
+ FST loaded =
+ MorphFSTLoader.loadFromStream(new java.io.ByteArrayInputStream(bos.toByteArray()));
+ assertTrue(loaded.ramBytesUsed() < MAX_SHALLOW_FST_RAM);
+ assertEnumEquals(expected, loaded);
+ }
+
+ public void testLoadCompiledCopiesOffHeap() throws Exception {
+ PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
+ FSTCompiler compiler =
+ new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
+ IntsRefBuilder scratch = new IntsRefBuilder();
+ scratch.append(42);
+ compiler.add(scratch.get(), 7L);
+ FST.FSTMetadata metadata = compiler.compile();
+ FST loaded = MorphFSTLoader.loadCompiled(metadata, compiler.getFSTReader());
+ assertTrue(loaded.ramBytesUsed() < MAX_SHALLOW_FST_RAM);
+ }
+
+ private static Path writeSampleFstToPath() throws IOException {
+ FST fst = buildSampleFst();
+ Path path = createTempFile("morph-fst", ".dat");
+ fst.save(path);
+ return path;
+ }
+
+ private static FST buildSampleFst() throws IOException {
+ PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
+ FSTCompiler compiler =
+ new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
+ IntsRefBuilder scratch = new IntsRefBuilder();
+ for (int i = 0; i < 64; i++) {
+ scratch.clear();
+ scratch.append(i);
+ compiler.add(scratch.get(), (long) i);
+ }
+ return FST.fromFSTReader(compiler.compile(), compiler.getFSTReader());
+ }
+
+ private static void assertEnumEquals(FST expected, FST actual) throws IOException {
+ IntsRefFSTEnum expectedEnum = new IntsRefFSTEnum<>(expected);
+ IntsRefFSTEnum actualEnum = new IntsRefFSTEnum<>(actual);
+ IntsRefFSTEnum.InputOutput e;
+ while ((e = expectedEnum.next()) != null) {
+ IntsRefFSTEnum.InputOutput a = actualEnum.next();
+ assertNotNull(a);
+ assertEquals(e.input, a.input);
+ assertEquals(e.output, a.output);
+ }
+ assertNull(actualEnum.next());
+ }
+}
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
index ac2520f5b62c..045e07b78e3c 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
@@ -16,8 +16,6 @@
*/
package org.apache.lucene.analysis.ja.dict;
-import static org.apache.lucene.util.fst.FST.readMetadata;
-
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -25,12 +23,9 @@
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.morph.BinaryDictionary;
-import org.apache.lucene.store.DataInput;
-import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.analysis.morph.MorphFSTLoader;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.PositiveIntOutputs;
/**
* Binary dictionary implementation for a known-word dictionary model: Words are encoded into an FST
@@ -42,6 +37,9 @@ public final class TokenInfoDictionary extends BinaryDictionary Files.newInputStream(targetMapFile),
- () -> Files.newInputStream(posDictFile),
() -> Files.newInputStream(dictFile),
- () -> Files.newInputStream(fstFile));
+ DictionaryConstants.TARGETMAP_HEADER,
+ DictionaryConstants.DICT_HEADER,
+ DictionaryConstants.VERSION);
+ this.morphAtts =
+ new TokenInfoMorphData(buffer, () -> Files.newInputStream(posDictFile));
+ MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromPath(fstFile);
+ this.fstHolder = loaded;
+ this.fst = new TokenInfoFST(loaded.fst(), true);
}
/**
@@ -73,11 +77,16 @@ public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile,
*/
public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl)
throws IOException {
- this(
+ super(
() -> targetMapUrl.openStream(),
- () -> posDictUrl.openStream(),
() -> dictUrl.openStream(),
- () -> fstUrl.openStream());
+ DictionaryConstants.TARGETMAP_HEADER,
+ DictionaryConstants.DICT_HEADER,
+ DictionaryConstants.VERSION);
+ this.morphAtts = new TokenInfoMorphData(buffer, () -> posDictUrl.openStream());
+ MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromUrl(fstUrl);
+ this.fstHolder = loaded.resource() != null ? loaded : null;
+ this.fst = new TokenInfoFST(loaded.fst(), true);
}
private TokenInfoDictionary() throws IOException {
@@ -101,14 +110,11 @@ private TokenInfoDictionary(
DictionaryConstants.DICT_HEADER,
DictionaryConstants.VERSION);
this.morphAtts = new TokenInfoMorphData(buffer, posResource);
-
- FST fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) {
- DataInput in = new InputStreamDataInput(is);
- fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
+ // TODO: some way to configure?
+ this.fst = new TokenInfoFST(MorphFSTLoader.loadFromStream(is), true);
}
- // TODO: some way to configure?
- this.fst = new TokenInfoFST(fst, true);
+ this.fstHolder = null;
}
static InputStream getClassResource(String suffix) throws IOException {
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
index 9a8ee3a8f8ce..5cf8b8f6a22c 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -23,6 +23,7 @@
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.morph.Dictionary;
+import org.apache.lucene.analysis.morph.MorphFSTLoader;
import org.apache.lucene.analysis.util.CSVUtil;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
@@ -141,9 +142,10 @@ private UserDictionary(List featureEntries) throws IOException {
segmentations.add(wordIdAndLength);
ord++;
}
+ FST.FSTMetadata metadata = fstCompiler.compile();
this.fst =
new TokenInfoFST(
- FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false);
+ MorphFSTLoader.loadCompiled(metadata, fstCompiler.getFSTReader()), false);
this.morphAtts = new UserMorphData(data.toArray(String[]::new));
this.segmentations = segmentations.toArray(int[][]::new);
}
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java
index 399012fb52b8..e2e78fe88b1f 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java
@@ -59,6 +59,10 @@
* {@link DecompoundMode}
*
outputUnknownUnigrams: If true outputs unigrams for unknown words.
*
discardPunctuation: true if punctuation tokens should be dropped from the output.
+ *
cacheHangulRootArcs: if true caches Hangul syllable root arcs in user dictionary FST for
+ * faster tokenization at the cost of additional heap. Default is true. The system dictionary
+ * honors system property {@link
+ * org.apache.lucene.analysis.ko.dict.TokenInfoDictionary#CACHE_HANGUL_ROOT_ARCS_PROPERTY}.
*
*
* @lucene.experimental
@@ -75,6 +79,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements Resource
private static final String DECOMPOUND_MODE = "decompoundMode";
private static final String OUTPUT_UNKNOWN_UNIGRAMS = "outputUnknownUnigrams";
private static final String DISCARD_PUNCTUATION = "discardPunctuation";
+ private static final String CACHE_HANGUL_ROOT_ARCS = "cacheHangulRootArcs";
private final String userDictionaryPath;
private final String userDictionaryEncoding;
@@ -83,6 +88,7 @@ public class KoreanTokenizerFactory extends TokenizerFactory implements Resource
private final KoreanTokenizer.DecompoundMode mode;
private final boolean outputUnknownUnigrams;
private final boolean discardPunctuation;
+ private final boolean cacheHangulRootArcs;
/** Creates a new KoreanTokenizerFactory */
public KoreanTokenizerFactory(Map args) {
@@ -95,6 +101,7 @@ public KoreanTokenizerFactory(Map args) {
.toUpperCase(Locale.ROOT));
outputUnknownUnigrams = getBoolean(args, OUTPUT_UNKNOWN_UNIGRAMS, false);
discardPunctuation = getBoolean(args, DISCARD_PUNCTUATION, true);
+ cacheHangulRootArcs = getBoolean(args, CACHE_HANGUL_ROOT_ARCS, true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -120,7 +127,7 @@ public void inform(ResourceLoader loader) throws IOException {
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
Reader reader = new InputStreamReader(stream, decoder);
- userDictionary = UserDictionary.open(reader);
+ userDictionary = UserDictionary.open(reader, cacheHangulRootArcs);
}
} else {
userDictionary = null;
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
index 679fb8739882..c9e34b1bd39e 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
@@ -16,8 +16,6 @@
*/
package org.apache.lucene.analysis.ko.dict;
-import static org.apache.lucene.util.fst.FST.readMetadata;
-
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -25,12 +23,10 @@
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.morph.BinaryDictionary;
-import org.apache.lucene.store.DataInput;
-import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.analysis.morph.MorphFSTLoader;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.PositiveIntOutputs;
/**
* Binary dictionary implementation for a known-word dictionary model: Words are encoded into an FST
@@ -42,6 +38,9 @@ public final class TokenInfoDictionary extends BinaryDictionary Files.newInputStream(targetMapFile),
- () -> Files.newInputStream(posDictFile),
() -> Files.newInputStream(dictFile),
- () -> Files.newInputStream(fstFile));
+ DictionaryConstants.TARGETMAP_HEADER,
+ DictionaryConstants.DICT_HEADER,
+ DictionaryConstants.VERSION);
+ this.morphAtts =
+ new TokenInfoMorphData(buffer, () -> Files.newInputStream(posDictFile));
+ MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromPath(fstFile);
+ this.fstHolder = loaded;
+ this.fst = new TokenInfoFST(loaded.fst(), cacheHangulRootArcs());
}
/**
@@ -81,11 +86,16 @@ public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile,
*/
public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl)
throws IOException {
- this(
+ super(
() -> targetMapUrl.openStream(),
- () -> posDictUrl.openStream(),
() -> dictUrl.openStream(),
- () -> fstUrl.openStream());
+ DictionaryConstants.TARGETMAP_HEADER,
+ DictionaryConstants.DICT_HEADER,
+ DictionaryConstants.VERSION);
+ this.morphAtts = new TokenInfoMorphData(buffer, () -> posDictUrl.openStream());
+ MorphFSTLoader.LoadedFST loaded = MorphFSTLoader.loadFromUrl(fstUrl);
+ this.fstHolder = loaded.resource() != null ? loaded : null;
+ this.fst = new TokenInfoFST(loaded.fst(), cacheHangulRootArcs());
}
private TokenInfoDictionary(
@@ -101,12 +111,10 @@ private TokenInfoDictionary(
DictionaryConstants.DICT_HEADER,
DictionaryConstants.VERSION);
this.morphAtts = new TokenInfoMorphData(buffer, posResource);
- FST fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) {
- DataInput in = new InputStreamDataInput(is);
- fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
+ this.fst = new TokenInfoFST(MorphFSTLoader.loadFromStream(is), cacheHangulRootArcs());
}
- this.fst = new TokenInfoFST(fst);
+ this.fstHolder = null;
}
static InputStream getClassResource(String suffix) throws IOException {
@@ -119,6 +127,17 @@ public TokenInfoFST getFST() {
return fst;
}
+ /**
+ * Whether to cache Hangul syllable root arcs for the system dictionary. Controlled by system
+ * property {@value #CACHE_HANGUL_ROOT_ARCS_PROPERTY}, default {@code true}.
+ */
+ static final String CACHE_HANGUL_ROOT_ARCS_PROPERTY =
+ "org.apache.lucene.analysis.ko.cacheHangulRootArcs";
+
+ private static boolean cacheHangulRootArcs() {
+ return Boolean.parseBoolean(System.getProperty(CACHE_HANGUL_ROOT_ARCS_PROPERTY, "true"));
+ }
+
public static TokenInfoDictionary getInstance() {
return SingletonHolder.INSTANCE;
}
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
index fa591eb00439..c318321ec6bb 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
@@ -19,11 +19,21 @@
import java.io.IOException;
import org.apache.lucene.util.fst.FST;
-/** Thin wrapper around an FST with root-arc caching for Hangul syllables (11,172 arcs). */
+/**
+ * Thin wrapper around an FST with root-arc caching for Hangul syllables (11,172 arcs).
+ *
+ *
When {@code cacheHangulSyllables} is {@code true}, root arcs for all Hangul syllables
+ * ({@code 0xAC00}–{@code 0xD7A3}) are cached for faster lookup at the cost of additional heap. When
+ * {@code false}, only a single unused root arc is cached.
+ */
public final class TokenInfoFST extends org.apache.lucene.analysis.morph.TokenInfoFST {
public TokenInfoFST(FST fst) throws IOException {
- super(fst, 0xD7A3, 0xAC00);
+ this(fst, true);
+ }
+
+ public TokenInfoFST(FST fst, boolean cacheHangulSyllables) throws IOException {
+ super(fst, cacheHangulSyllables ? 0xD7A3 : 0, cacheHangulSyllables ? 0xAC00 : 0);
}
/**
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
index f2e54ad5ecfb..1810c537aed5 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
@@ -23,6 +23,7 @@
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.morph.Dictionary;
+import org.apache.lucene.analysis.morph.MorphFSTLoader;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
@@ -47,6 +48,10 @@ public final class UserDictionary implements Dictionary {
private UserMorphData morphAtts;
public static UserDictionary open(Reader reader) throws IOException {
+ return open(reader, true);
+ }
+
+ public static UserDictionary open(Reader reader, boolean cacheHangulRootArcs) throws IOException {
BufferedReader br = new BufferedReader(reader);
String line;
@@ -67,11 +72,15 @@ public static UserDictionary open(Reader reader) throws IOException {
if (entries.isEmpty()) {
return null;
} else {
- return new UserDictionary(entries);
+ return new UserDictionary(entries, cacheHangulRootArcs);
}
}
private UserDictionary(List entries) throws IOException {
+ this(entries, true);
+ }
+
+ private UserDictionary(List entries, boolean cacheHangulRootArcs) throws IOException {
final CharacterDefinition charDef = CharacterDefinition.getInstance();
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
@@ -135,8 +144,11 @@ private UserDictionary(List entries) throws IOException {
if (entryIndex < rightIds.length) {
rightIds = ArrayUtil.copyOfSubArray(rightIds, 0, entryIndex);
}
+ FST.FSTMetadata metadata = fstCompiler.compile();
this.fst =
- new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
+ new TokenInfoFST(
+ MorphFSTLoader.loadCompiled(metadata, fstCompiler.getFSTReader()),
+ cacheHangulRootArcs);
int[][] segmentations = _segmentations.toArray(int[][]::new);
this.morphAtts = new UserMorphData(segmentations, rightIds);
}
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionaryHeapUsage.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionaryHeapUsage.java
new file mode 100644
index 000000000000..add499777641
--- /dev/null
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionaryHeapUsage.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko.dict;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ko.KoreanAnalyzer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.ko.TestKoreanTokenizer;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.RamUsageTester;
+import org.apache.lucene.util.fst.FST;
+
+public class TestKoreanDictionaryHeapUsage extends BaseTokenStreamTestCase {
+
+ private static final int TOKENIZER_COUNT = 50;
+ private static final long MAX_FST_RAM_BYTES = 10_000L;
+
+ public void testSystemDictionaryFstIsOffHeap() throws IOException {
+ FST fst = TokenInfoDictionary.getInstance().getFST().getInternalFST();
+ assertFstUsesDirectBuffer(fst);
+ }
+
+ public void testUserDictionaryFstIsOffHeap() throws IOException {
+ UserDictionary userDictionary = TestKoreanTokenizer.readDict();
+ FST fst = userDictionary.getFST().getInternalFST();
+ assertFstUsesDirectBuffer(fst);
+ }
+
+ public void testManyTokenizersDoNotDuplicateFstBytes() throws IOException {
+ UserDictionary userDictionary = TestKoreanTokenizer.readDict();
+ List tokenizers = new ArrayList<>(TOKENIZER_COUNT);
+ for (int i = 0; i < TOKENIZER_COUNT; i++) {
+ tokenizers.add(
+ new KoreanTokenizer(
+ newAttributeFactory(), userDictionary, KoreanTokenizer.DecompoundMode.NONE, false));
+ }
+ long tokenizerHeap = RamUsageTester.ramUsed(tokenizers);
+ assertTrue(
+ "expected small per-tokenizer heap footprint, got " + RamUsageTester.humanSizeOf(tokenizers),
+ tokenizerHeap < 5_000_000L);
+ assertFstUsesDirectBuffer(TokenInfoDictionary.getInstance().getFST().getInternalFST());
+ }
+
+ public void testReducedRootCacheUsesLessHeap() throws Exception {
+ FST fst = TokenInfoDictionary.getInstance().getFST().getInternalFST();
+ TokenInfoFST cached = new TokenInfoFST(fst, true);
+ TokenInfoFST uncached = new TokenInfoFST(fst, false);
+ long cachedHeap = RamUsageTester.ramUsed(cached);
+ long uncachedHeap = RamUsageTester.ramUsed(uncached);
+ assertTrue(cachedHeap > uncachedHeap);
+ assertTrue(cachedHeap - uncachedHeap >= 500_000L);
+ }
+
+ public void testSegmentationParityWithFullAnalyzer() throws Exception {
+ try (Analyzer analyzer = new KoreanAnalyzer()) {
+ assertAnalyzesTo(
+ analyzer,
+ "한국은 대단한 나라입니다.",
+ new String[] {"한국", "대단", "나라", "이"},
+ new int[] {0, 4, 8, 10},
+ new int[] {2, 6, 10, 13},
+ new int[] {1, 2, 3, 1});
+ }
+ }
+
+ private static void assertFstUsesDirectBuffer(FST fst) {
+ assertTrue(fst.ramBytesUsed() < MAX_FST_RAM_BYTES);
+ }
+}
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionarySmallHeap.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionarySmallHeap.java
new file mode 100644
index 000000000000..5b946ad12d7f
--- /dev/null
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestKoreanDictionarySmallHeap.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko.dict;
+
+import java.io.IOException;
+import java.io.StringReader;
+import org.apache.lucene.analysis.ko.KoreanAnalyzer;
+import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.junit.Ignore;
+
+/**
+ * Manual smoke test for Nori dictionary loading under a very small heap.
+ *
+ *
+ */
+@Ignore("Manual small-heap validation; enable locally when verifying off-heap FST loading")
+public class TestKoreanDictionarySmallHeap extends LuceneTestCase {
+
+ public void testLoadDictionaryAndTokenizeOnSmallHeap() throws IOException {
+ TokenInfoDictionary dictionary = TokenInfoDictionary.getInstance();
+ assertNotNull(dictionary.getFST());
+
+ try (KoreanAnalyzer analyzer =
+ new KoreanAnalyzer(
+ null,
+ KoreanTokenizer.DecompoundMode.NONE,
+ KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS,
+ false)) {
+ assertNotNull(analyzer.tokenStream("field", "한국어 형태소 분석"));
+ }
+ }
+}
diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle
index 6d64bb7a4f72..d1542cf6a7fe 100644
--- a/lucene/benchmark-jmh/build.gradle
+++ b/lucene/benchmark-jmh/build.gradle
@@ -19,6 +19,7 @@ description = 'Lucene JMH micro-benchmarking module'
dependencies {
moduleImplementation project(':lucene:core')
+ moduleImplementation project(':lucene:analysis:nori')
moduleImplementation project(':lucene:expressions')
moduleImplementation project(':lucene:join')
moduleImplementation project(':lucene:sandbox')
diff --git a/lucene/benchmark-jmh/src/java/module-info.java b/lucene/benchmark-jmh/src/java/module-info.java
index 8090c7554739..b9b16b76debf 100644
--- a/lucene/benchmark-jmh/src/java/module-info.java
+++ b/lucene/benchmark-jmh/src/java/module-info.java
@@ -23,6 +23,7 @@
requires jmh.core;
requires jdk.unsupported;
requires org.apache.lucene.core;
+ requires org.apache.lucene.analysis.nori;
requires org.apache.lucene.expressions;
requires org.apache.lucene.join;
requires org.apache.lucene.sandbox;
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/KoreanTokenizerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/KoreanTokenizerBenchmark.java
new file mode 100644
index 000000000000..c2f6915069be
--- /dev/null
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/KoreanTokenizerBenchmark.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.concurrent.TimeUnit;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.KoreanAnalyzer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+
+@State(Scope.Benchmark)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Fork(value = 1, warmups = 1)
+@Warmup(iterations = 2, time = 1)
+@Measurement(iterations = 3, time = 2)
+public class KoreanTokenizerBenchmark {
+
+ @Param({"SHORT", "LONG"})
+ public String textSize;
+
+ @Param({"false", "true"})
+ public boolean userDictionary;
+
+ private Analyzer analyzer;
+ private String text;
+
+ @Setup(Level.Trial)
+ public void setup() throws IOException {
+ UserDictionary userDict = userDictionary ? openUserDictionary() : null;
+ analyzer = new KoreanAnalyzer(userDict, KoreanTokenizer.DecompoundMode.DISCARD, null, false);
+ text =
+ switch (textSize) {
+ case "SHORT" -> "한국은 대단한 나라입니다.";
+ case "LONG" ->
+ """
+ 서울특별시는 대한민국의 수도이자 최대 도시이다. 한강을 중심으로 발전해 왔으며 \
+ 정치, 경제, 사회, 문화의 중심지 역할을 한다. 많은 기업과 연구 기관이 \
+ 집중되어 있어 정보 기술과 금융 산업이 발달했다.""";
+ default -> throw new IllegalStateException("unknown textSize: " + textSize);
+ };
+ }
+
+ @TearDown(Level.Trial)
+ public void tearDown() throws IOException {
+ if (analyzer != null) {
+ analyzer.close();
+ }
+ }
+
+ @Benchmark
+ public int tokenizeDocument() throws IOException {
+ int tokenCount = 0;
+ try (TokenStream stream = analyzer.tokenStream("field", text)) {
+ stream.reset();
+ while (stream.incrementToken()) {
+ tokenCount++;
+ }
+ stream.end();
+ }
+ return tokenCount;
+ }
+
+ private static UserDictionary openUserDictionary() throws IOException {
+ return UserDictionary.open(new StringReader("세종시\n세종 시"));
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/DirectBufferFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/DirectBufferFSTStore.java
new file mode 100644
index 000000000000..7afbca79b769
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/DirectBufferFSTStore.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Provides off-heap storage of finite state machine (FST) bytes in a direct {@link ByteBuffer}.
+ *
+ *
Unlike {@link OnHeapFSTStore}, FST bytes are not counted in JVM heap by {@link
+ * #ramBytesUsed()}. Morphological analysis dictionaries use this store to reduce on-heap pressure.
+ *
+ * @lucene.experimental
+ */
+public final class DirectBufferFSTStore implements FSTReader {
+
+ private static final long BASE_RAM_BYTES_USED =
+ RamUsageEstimator.shallowSizeOfInstance(DirectBufferFSTStore.class);
+
+ private final ByteBuffer buffer;
+
+ /**
+ * Read FST bytes from {@code in} into a direct buffer.
+ *
+ * @param in input positioned at the first FST byte
+ * @param numBytes number of FST bytes to read
+ */
+ public DirectBufferFSTStore(DataInput in, long numBytes) throws IOException {
+ this.buffer = readDirectBuffer(in, numBytes);
+ }
+
+ /**
+ * Copy FST bytes from an existing {@link FSTReader} into a direct buffer.
+ *
+ * @param source reader containing FST bytes
+ * @param numBytes number of FST bytes to copy
+ */
+ public DirectBufferFSTStore(FSTReader source, long numBytes) throws IOException {
+ if (numBytes > Integer.MAX_VALUE) {
+ throw new IllegalArgumentException("FST too large for direct buffer: " + numBytes);
+ }
+ ByteBuffer tmp = ByteBuffer.allocateDirect((int) numBytes);
+ source.writeTo(
+ new DataOutput() {
+ @Override
+ public void writeByte(byte b) {
+ tmp.put(b);
+ }
+
+ @Override
+ public void writeBytes(byte[] b, int offset, int length) {
+ tmp.put(b, offset, length);
+ }
+ });
+ tmp.flip();
+ this.buffer = tmp.asReadOnlyBuffer();
+ }
+
+ private static ByteBuffer readDirectBuffer(DataInput in, long numBytes) throws IOException {
+ if (numBytes > Integer.MAX_VALUE) {
+ throw new IllegalArgumentException("FST too large for direct buffer: " + numBytes);
+ }
+ int size = (int) numBytes;
+ ByteBuffer tmp = ByteBuffer.allocateDirect(size);
+ byte[] scratch = new byte[8192];
+ int remaining = size;
+ while (remaining > 0) {
+ int toRead = Math.min(scratch.length, remaining);
+ in.readBytes(scratch, 0, toRead);
+ tmp.put(scratch, 0, toRead);
+ remaining -= toRead;
+ }
+ tmp.flip();
+ return tmp.asReadOnlyBuffer();
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return BASE_RAM_BYTES_USED;
+ }
+
+ @Override
+ public FST.BytesReader getReverseBytesReader() {
+ return new ReverseDirectBufferReader(buffer.duplicate());
+ }
+
+ @Override
+ public void writeTo(DataOutput out) throws IOException {
+ ByteBuffer dup = buffer.duplicate();
+ dup.rewind();
+ byte[] scratch = new byte[8192];
+ while (dup.hasRemaining()) {
+ int len = Math.min(scratch.length, dup.remaining());
+ dup.get(scratch, 0, len);
+ out.writeBytes(scratch, 0, len);
+ }
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 30fd39805daf..96a5e941e58c 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -420,6 +420,13 @@ public FST(FSTMetadata metadata, DataInput in) throws IOException {
this(metadata, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS, in, metadata.numBytes));
}
+ /**
+ * Load a previously saved FST with a DataInput for metadata using a {@link DirectBufferFSTStore}.
+ */
+ public static FST loadDirect(FSTMetadata metadata, DataInput in) throws IOException {
+ return fromFSTReader(metadata, new DirectBufferFSTStore(in, metadata.numBytes));
+ }
+
/** Create the FST with a metadata object and a FSTReader. */
FST(FSTMetadata metadata, FSTReader fstReader) {
assert fstReader != null;
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReverseDirectBufferReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReverseDirectBufferReader.java
new file mode 100644
index 000000000000..e5124bbd8ffa
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReverseDirectBufferReader.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import java.nio.ByteBuffer;
+
+/** Reads in reverse from a direct {@link ByteBuffer}. */
+final class ReverseDirectBufferReader extends FST.BytesReader {
+ private final ByteBuffer buffer;
+ private int pos;
+
+ ReverseDirectBufferReader(ByteBuffer buffer) {
+ this.buffer = buffer;
+ }
+
+ @Override
+ public byte readByte() {
+ return buffer.get(pos--);
+ }
+
+ @Override
+ public void readBytes(byte[] b, int offset, int len) {
+ for (int i = 0; i < len; i++) {
+ b[offset + i] = buffer.get(pos--);
+ }
+ }
+
+ @Override
+ public void skipBytes(long count) {
+ pos -= count;
+ }
+
+ @Override
+ public long getPosition() {
+ return pos;
+ }
+
+ @Override
+ public void setPosition(long pos) {
+ this.pos = (int) pos;
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestDirectBufferFSTStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestDirectBufferFSTStore.java
new file mode 100644
index 000000000000..4dafd99bc6ca
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestDirectBufferFSTStore.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.RamUsageTester;
+import org.apache.lucene.util.IntsRefBuilder;
+
+public class TestDirectBufferFSTStore extends LuceneTestCase {
+
+ private static final int MAX_BLOCK_BITS = 30;
+
+ public void testEquivalenceWithOnHeapStore() throws IOException {
+ SavedFST saved = buildSavedFst();
+ FST onHeapFst = loadOnHeap(saved);
+ FST directFst = loadDirect(saved);
+
+ IntsRefFSTEnum onHeapEnum = new IntsRefFSTEnum<>(onHeapFst);
+ IntsRefFSTEnum directEnum = new IntsRefFSTEnum<>(directFst);
+ IntsRefFSTEnum.InputOutput onHeapEntry;
+ IntsRefFSTEnum.InputOutput directEntry;
+ while ((onHeapEntry = onHeapEnum.next()) != null) {
+ directEntry = directEnum.next();
+ assertNotNull(directEntry);
+ assertEquals(onHeapEntry.input, directEntry.input);
+ assertEquals(onHeapEntry.output, directEntry.output);
+ }
+ assertNull(directEnum.next());
+ }
+
+ public void testRamBytesUsed() throws IOException {
+ SavedFST saved = buildSavedFst();
+ DirectBufferFSTStore directStore =
+ new DirectBufferFSTStore(saved.dataInput(), saved.metadata.getNumBytes());
+ OnHeapFSTStore onHeapStore =
+ new OnHeapFSTStore(MAX_BLOCK_BITS, saved.dataInput(), saved.metadata.getNumBytes());
+
+ assertTrue(directStore.ramBytesUsed() < 1_000);
+ assertTrue(onHeapStore.ramBytesUsed() > directStore.ramBytesUsed() + 100);
+ assertTrue(RamUsageTester.ramUsed(directStore) < RamUsageTester.ramUsed(onHeapStore));
+ assertNoLargeByteArray(directStore);
+ }
+
+ public void testBytesReader() throws IOException {
+ SavedFST saved = buildSavedFst();
+ FST fst = loadDirect(saved);
+ FST.BytesReader reader = fst.getBytesReader();
+ byte[] data = saved.dataBytes;
+ reader.setPosition(data.length - 1);
+ assertEquals(data[data.length - 1], reader.readByte());
+ byte[] scratch = new byte[4];
+ reader.setPosition(3);
+ reader.readBytes(scratch, 0, 4);
+ for (int i = 0; i < 4; i++) {
+ assertEquals(data[3 - i], scratch[i]);
+ }
+ }
+
+ public void testCopyFromFSTReader() throws IOException {
+ SavedFST saved = buildSavedFst();
+ OnHeapFSTStore source =
+ new OnHeapFSTStore(MAX_BLOCK_BITS, saved.dataInput(), saved.metadata.getNumBytes());
+ DirectBufferFSTStore copied = new DirectBufferFSTStore(source, saved.metadata.getNumBytes());
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream(saved.dataBytes.length);
+ copied.writeTo(new OutputStreamDataOutput(out));
+ assertArrayEquals(saved.dataBytes, out.toByteArray());
+ assertNoLargeByteArray(copied);
+ }
+
+ private static FST loadOnHeap(SavedFST saved) throws IOException {
+ return new FST<>(saved.metadata, saved.dataInput());
+ }
+
+ private static FST loadDirect(SavedFST saved) throws IOException {
+ return FST.loadDirect(saved.metadata, saved.dataInput());
+ }
+
+ private static SavedFST buildSavedFst() throws IOException {
+ PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
+ FSTCompiler compiler =
+ new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
+ IntsRefBuilder scratch = new IntsRefBuilder();
+ for (int i = 0; i < 256; i++) {
+ scratch.clear();
+ scratch.append(i);
+ compiler.add(scratch.get(), (long) i);
+ }
+ FST.FSTMetadata metadata = compiler.compile();
+ FST fst = FST.fromFSTReader(metadata, compiler.getFSTReader());
+
+ ByteArrayOutputStream metaBytes = new ByteArrayOutputStream();
+ ByteArrayOutputStream dataBytes = new ByteArrayOutputStream();
+ fst.save(new OutputStreamDataOutput(metaBytes), new OutputStreamDataOutput(dataBytes));
+ return new SavedFST(
+ metaBytes.toByteArray(), dataBytes.toByteArray(), outputs);
+ }
+
+ private static void assertNoLargeByteArray(Object root) {
+ List largeArrays = new ArrayList<>();
+ RamUsageTester.ramUsed(
+ root,
+ new RamUsageTester.Accumulator() {
+ @Override
+ public long accumulateArray(
+ Object array, long shallowSize, List