diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a6e8073576..f2d2b727041b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -287,7 +287,8 @@ New Features Improvements --------------------- -(No changes) + +* GITHUB#16269: Lazily build the term-vectors per-field. (Tim Brooks) Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index be8016e359ef..14e789ead81e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -42,12 +42,18 @@ final class FreqProxTermsWriter extends TermsHash { + // The term vectors consumer is the (optional) downstream consumer of the terms interned here: + // it reuses our term byte pool and only buffers fields that store term vectors. This is the same + // instance the base class tracks as nextTermsHash, kept here with its concrete type. + private final TermVectorsConsumer termVectors; + FreqProxTermsWriter( final IntBlockPool.Allocator intBlockAllocator, final ByteBlockPool.Allocator byteBlockAllocator, Counter bytesUsed, - TermsHash termVectors) { + TermVectorsConsumer termVectors) { super(intBlockAllocator, byteBlockAllocator, bytesUsed, termVectors); + this.termVectors = termVectors; } private void applyDeletes(SegmentWriteState state, Fields fields) throws IOException { @@ -79,14 +85,15 @@ private void applyDeletes(SegmentWriteState state, Fields fields) throws IOExcep } } - @Override public void flush( Map fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap, NormsProducer norms) throws IOException { - super.flush(fieldsToFlush, state, sortMap, norms); + // Flush the per-document term vectors first (they were buffered as each document finished), + // then write the postings gathered per-field below. + termVectors.flush(state, sortMap); // Gather all fields that saw any postings: List allFields = new ArrayList<>(); @@ -136,8 +143,11 @@ public Terms terms(final String field) { @Override public TermsHashPerField addField(FieldInvertState invertState, FieldInfo fieldInfo) { - return new FreqProxTermsWriterPerField( - invertState, this, fieldInfo, nextTermsHash.addField(invertState, fieldInfo)); + // Only build the downstream term-vectors per-field when the field actually stores term vectors. + // hasTermVectors() is fixed at field-init time and is immutable for the segment. + TermsHashPerField termVectorsPerField = + fieldInfo.hasTermVectors() ? termVectors.addField(invertState, fieldInfo) : null; + return new FreqProxTermsWriterPerField(invertState, this, fieldInfo, termVectorsPerField); } static class SortingTerms extends FilterLeafReader.FilterTerms { diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index 4235ff4e9421..da59d5730db8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -46,14 +46,14 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField { FieldInvertState invertState, TermsHash termsHash, FieldInfo fieldInfo, - TermsHashPerField nextPerField) { + TermsHashPerField termVectorsPerField) { super( fieldInfo.getIndexOptions().subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) ? 2 : 1, termsHash.intPool, termsHash.bytePool, termsHash.termBytePool, termsHash.bytesUsed, - nextPerField, + termVectorsPerField, fieldInfo.name, fieldInfo.getIndexOptions()); this.fieldState = invertState; @@ -62,6 +62,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField { hasProx = indexOptions.subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); hasOffsets = indexOptions.subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); isTermDoc = fieldInfo.isTermDocField(); + // The downstream term-vectors per-field exists iff the field stores term vectors. + assert (getTermVectorsPerField() != null) == fieldInfo.hasTermVectors(); } @Override @@ -73,12 +75,11 @@ void finish() throws IOException { } @Override - boolean start(IndexableField f, boolean first) { + void start(IndexableField f, boolean first) { super.start(f, first); termFreqAtt = fieldState.termFreqAttribute; payloadAttribute = fieldState.payloadAttribute; offsetAttribute = fieldState.offsetAttribute; - return true; } void writeProx(int termID, int proxCode) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 5e81ddb2b0a7..d3a08111ce4a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -84,8 +84,8 @@ final class IndexingChain implements Accountable { final Counter bytesUsed = Counter.newCounter(); final FieldInfos.Builder fieldInfos; - // Writes postings and term vectors: - final TermsHash termsHash; + // Writes postings, and drives the (optional) downstream term-vectors consumer: + final FreqProxTermsWriter termsHash; // Shared pool for doc-value terms final ByteBlockPool docValuesBytePool; // Shared scratch buffers for dense points encoding @@ -592,7 +592,7 @@ void processDocument( // analyzer is free to reuse TokenStream across fields // (i.e., we cannot have more than one TokenStream // running "at once"): - termsHash.startDocument(); + termVectorsWriter.startDocument(); startStoredFields(docID); try { // Handle the parent field first (before document fields). Its schema was already @@ -663,9 +663,9 @@ void processDocument( fields[i].finish(docID); } finishStoredFields(); - // TODO: for broken docs, optimize termsHash.finishDocument + // TODO: for broken docs, optimize termVectorsWriter.finishDocument try { - termsHash.finishDocument(docID); + termVectorsWriter.finishDocument(docID); } catch (Throwable th) { // Must abort, on the possibility that on-disk term // vectors are now corrupt: @@ -886,7 +886,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable colu int indexedFieldCount = 0; if (hasInverted) { - termsHash.startDocument(); + termVectorsWriter.startDocument(); } if (hasStored) { startStoredFields(segDocID); @@ -925,7 +925,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable colu } if (hasInverted) { try { - termsHash.finishDocument(segDocID); + termVectorsWriter.finishDocument(segDocID); } catch (Throwable th) { abortingExceptionConsumer.accept(th); throw th; @@ -1478,6 +1478,9 @@ private static void updateDocFieldSchema( if (fieldType.indexOptions() != IndexOptions.NONE) { schema.setIndexOptions( fieldType.indexOptions(), fieldType.omitNorms(), fieldType.storeTermVectors()); + if (fieldType.storeTermVectors() == false) { + verifyNoTermVectorOptionsWithoutVectors(fieldName, fieldType); + } } else { // TODO: should this be checked when a fieldType is created? verifyUnIndexedFieldType(fieldName, fieldType); @@ -1540,6 +1543,31 @@ private static void verifyUnIndexedFieldType(String name, IndexableFieldType ft) } } + /** + * Verifies that an indexed field which does not store term vectors does not request any + * term-vector sub-options. + */ + private static void verifyNoTermVectorOptionsWithoutVectors(String name, IndexableFieldType ft) { + if (ft.storeTermVectorOffsets()) { + throw new IllegalArgumentException( + "cannot index term vector offsets when term vectors are not indexed (field=\"" + + name + + "\")"); + } + if (ft.storeTermVectorPositions()) { + throw new IllegalArgumentException( + "cannot index term vector positions when term vectors are not indexed (field=\"" + + name + + "\")"); + } + if (ft.storeTermVectorPayloads()) { + throw new IllegalArgumentException( + "cannot index term vector payloads when term vectors are not indexed (field=\"" + + name + + "\")"); + } + } + private static void validateMaxVectorDimension( String fieldName, int vectorDim, int maxVectorDim) { if (vectorDim > maxVectorDim) { diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java index d5d4aceae851..1804d4bffc3d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java @@ -18,9 +18,7 @@ import java.io.IOException; import java.util.Iterator; -import java.util.Map; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; @@ -51,13 +49,8 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer { } @Override - void flush( - Map fieldsToFlush, - final SegmentWriteState state, - Sorter.DocMap sortMap, - NormsProducer norms) - throws IOException { - super.flush(fieldsToFlush, state, sortMap, norms); + void flush(final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException { + super.flush(state, sortMap); if (tmpDirectory != null) { TermVectorsReader reader = TEMP_TERM_VECTORS_FORMAT.vectorsReader( diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java index 832f0275fa9b..a75d877753ad 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java @@ -18,9 +18,7 @@ import java.io.IOException; import java.util.Arrays; -import java.util.Map; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FlushInfo; @@ -68,13 +66,7 @@ class TermVectorsConsumer extends TermsHash { this.codec = codec; } - @Override - void flush( - Map fieldsToFlush, - final SegmentWriteState state, - Sorter.DocMap sortMap, - NormsProducer norms) - throws IOException { + void flush(final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException { if (writer != null) { int numDocs = state.segmentInfo.maxDoc(); assert numDocs > 0; @@ -113,7 +105,7 @@ void setHasVectors() { hasVectors = true; } - @Override + /** Writes this document's term vectors. Called per document by {@link IndexingChain}. */ void finishDocument(int docID) throws IOException { if (!hasVectors) { @@ -173,7 +165,7 @@ void addFieldToFlush(TermVectorsConsumerPerField fieldToFlush) { perFields[numVectorFields++] = fieldToFlush; } - @Override + /** Resets per-document state. Called per document by {@link IndexingChain}. */ void startDocument() { resetFields(); numVectorFields = 0; diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java index d6f2d9c7847c..6c17a350ca28 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java @@ -32,7 +32,9 @@ final class TermVectorsConsumerPerField extends TermsHashPerField { private final FieldInvertState fieldState; private final FieldInfo fieldInfo; - private boolean doVectors; + // True once this field has buffered vectors for the current document that still need flushing in + // finishDocument(); reset to false there. + private boolean pendingDoc; private boolean doVectorPositions; private boolean doVectorOffsets; private boolean doVectorPayloads; @@ -67,18 +69,18 @@ final class TermVectorsConsumerPerField extends TermsHashPerField { */ @Override void finish() { - if (!doVectors || getNumTerms() == 0) { + if (!pendingDoc || getNumTerms() == 0) { return; } termsWriter.addFieldToFlush(this); } void finishDocument() throws IOException { - if (doVectors == false) { + if (pendingDoc == false) { return; } - doVectors = false; + pendingDoc = false; final int numPostings = getNumTerms(); @@ -128,7 +130,7 @@ void finishDocument() throws IOException { } @Override - boolean start(IndexableField field, boolean first) { + void start(IndexableField field, boolean first) { super.start(field, first); termFreqAtt = fieldState.termFreqAttribute; assert field.fieldType().indexOptions() != IndexOptions.NONE; @@ -146,55 +148,34 @@ boolean start(IndexableField field, boolean first) { hasPayloads = false; - doVectors = field.fieldType().storeTermVectors(); - - if (doVectors) { - doVectorPositions = field.fieldType().storeTermVectorPositions(); - - // Somewhat confusingly, unlike postings, you are - // allowed to index TV offsets without TV positions: - doVectorOffsets = field.fieldType().storeTermVectorOffsets(); - - if (doVectorPositions) { - doVectorPayloads = field.fieldType().storeTermVectorPayloads(); - } else { - doVectorPayloads = false; - if (field.fieldType().storeTermVectorPayloads()) { - // TODO: move this check somewhere else, and impl the other missing ones - throw new IllegalArgumentException( - "cannot index term vector payloads without term vector positions (field=\"" - + field.name() - + "\")"); - } - } + // This per-field is only built for fields that store term vectors (see + // FreqProxTermsWriter#addField), and FieldSchema locks storeTermVectors for the segment, so + // this field always stores term vectors. + assert field.fieldType().storeTermVectors() + : "term-vectors per-field created for a field without term vectors"; + pendingDoc = true; + + doVectorPositions = field.fieldType().storeTermVectorPositions(); + + // Somewhat confusingly, unlike postings, you are + // allowed to index TV offsets without TV positions: + doVectorOffsets = field.fieldType().storeTermVectorOffsets(); + if (doVectorPositions) { + doVectorPayloads = field.fieldType().storeTermVectorPayloads(); } else { - if (field.fieldType().storeTermVectorOffsets()) { - throw new IllegalArgumentException( - "cannot index term vector offsets when term vectors are not indexed (field=\"" - + field.name() - + "\")"); - } - if (field.fieldType().storeTermVectorPositions()) { - throw new IllegalArgumentException( - "cannot index term vector positions when term vectors are not indexed (field=\"" - + field.name() - + "\")"); - } + doVectorPayloads = false; if (field.fieldType().storeTermVectorPayloads()) { + // TODO: move this check somewhere else, and impl the other missing ones throw new IllegalArgumentException( - "cannot index term vector payloads when term vectors are not indexed (field=\"" + "cannot index term vector payloads without term vector positions (field=\"" + field.name() + "\")"); } } } else { - if (doVectors != field.fieldType().storeTermVectors()) { - throw new IllegalArgumentException( - "all instances of a given field name must have the same term vectors settings (storeTermVectors changed for field=\"" - + field.name() - + "\")"); - } + // storeTermVectors itself is locked for the segment by FieldSchema, but the sub-options are + // not, so they are validated here for consistency across instances within the document. if (doVectorPositions != field.fieldType().storeTermVectorPositions()) { throw new IllegalArgumentException( "all instances of a given field name must have the same term vectors settings (storeTermVectorPositions changed for field=\"" @@ -215,21 +196,17 @@ boolean start(IndexableField field, boolean first) { } } - if (doVectors) { - if (doVectorOffsets) { - offsetAttribute = fieldState.offsetAttribute; - assert offsetAttribute != null; - } - - if (doVectorPayloads) { - // Can be null: - payloadAttribute = fieldState.payloadAttribute; - } else { - payloadAttribute = null; - } + if (doVectorOffsets) { + offsetAttribute = fieldState.offsetAttribute; + assert offsetAttribute != null; } - return doVectors; + if (doVectorPayloads) { + // Can be null: + payloadAttribute = fieldState.payloadAttribute; + } else { + payloadAttribute = null; + } } void writeProx(TermVectorsPostingsArray postings, int termID) { diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java index 7d438df9b909..ca9ca83a8523 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java @@ -16,10 +16,6 @@ */ package org.apache.lucene.index; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.Counter; import org.apache.lucene.util.IntBlockPool; @@ -73,32 +69,5 @@ void reset() { bytePool.reset(false, false); } - void flush( - Map fieldsToFlush, - final SegmentWriteState state, - Sorter.DocMap sortMap, - NormsProducer norms) - throws IOException { - if (nextTermsHash != null) { - Map nextChildFields = new HashMap<>(); - for (final Map.Entry entry : fieldsToFlush.entrySet()) { - nextChildFields.put(entry.getKey(), entry.getValue().getNextPerField()); - } - nextTermsHash.flush(nextChildFields, state, sortMap, norms); - } - } - abstract TermsHashPerField addField(FieldInvertState fieldInvertState, FieldInfo fieldInfo); - - void finishDocument(int docID) throws IOException { - if (nextTermsHash != null) { - nextTermsHash.finishDocument(docID); - } - } - - void startDocument() throws IOException { - if (nextTermsHash != null) { - nextTermsHash.startDocument(); - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java index 3f7be53606ce..8481975a1f1a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java @@ -35,7 +35,7 @@ abstract class TermsHashPerField implements Comparable { private static final int HASH_INIT_SIZE = 4; - private final TermsHashPerField nextPerField; + private final TermsHashPerField termVectorsPerField; private final IntBlockPool intPool; final ByteBlockPool bytePool; private final ByteSlicePool slicePool; @@ -68,7 +68,7 @@ abstract class TermsHashPerField implements Comparable { ByteBlockPool bytePool, ByteBlockPool termBytePool, Counter bytesUsed, - TermsHashPerField nextPerField, + TermsHashPerField termVectorsPerField, String fieldName, IndexOptions indexOptions) { this.intPool = intPool; @@ -76,7 +76,7 @@ abstract class TermsHashPerField implements Comparable { this.slicePool = new ByteSlicePool(bytePool); this.streamCount = streamCount; this.fieldName = fieldName; - this.nextPerField = nextPerField; + this.termVectorsPerField = termVectorsPerField; assert indexOptions != IndexOptions.NONE; this.indexOptions = indexOptions; PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed); @@ -86,8 +86,8 @@ abstract class TermsHashPerField implements Comparable { void reset() { bytesHash.clear(false); sortedTermIDs = null; - if (nextPerField != null) { - nextPerField.reset(); + if (termVectorsPerField != null) { + termVectorsPerField.reset(); } } @@ -125,8 +125,6 @@ final void reinitHash() { bytesHash.reinit(); } - private boolean doNextCall; - // Secondary entry point (for 2nd & subsequent TermsHash), // because token text has already been "interned" into // textStart, so we hash by textStart. term vectors use @@ -203,8 +201,8 @@ void add(BytesRef termBytes, final int docID) throws IOException { throw new DuplicateTermException(e.getMessage() + " '" + termBytes.utf8ToString() + "'"); } } - if (doNextCall) { - nextPerField.add(postingsArray.textStarts[termID], docID); + if (termVectorsPerField != null) { + termVectorsPerField.add(postingsArray.textStarts[termID], docID); } } @@ -268,8 +266,8 @@ final void writeVInt(int stream, int i) { writeByte(stream, (byte) i); } - final TermsHashPerField getNextPerField() { - return nextPerField; + final TermsHashPerField getTermVectorsPerField() { + return termVectorsPerField; } final String getFieldName() { @@ -331,8 +329,8 @@ public final int compareTo(TermsHashPerField other) { /** Finish adding all instances of this field to the current document. */ void finish() throws IOException { - if (nextPerField != null) { - nextPerField.finish(); + if (termVectorsPerField != null) { + termVectorsPerField.finish(); } } @@ -344,11 +342,10 @@ final int getNumTerms() { * Start adding a new field instance; first is true if this is the first time this field name was * seen in the document. */ - boolean start(IndexableField field, boolean first) { - if (nextPerField != null) { - doNextCall = nextPerField.start(field, first); + void start(IndexableField field, boolean first) { + if (termVectorsPerField != null) { + termVectorsPerField.start(field, first); } - return true; } /** Called when a term is seen for the first time. */