From 975aa500ab9e5a321ff42bdd4296bce1a6713329 Mon Sep 17 00:00:00 2001 From: Tim Brooks Date: Wed, 10 Jun 2026 11:05:00 +0200 Subject: [PATCH 1/6] Initial --- .../lucene/index/FreqProxTermsWriter.java | 7 +- .../index/FreqProxTermsWriterPerField.java | 7 +- .../apache/lucene/index/IndexingChain.java | 51 +++++++++++-- .../apache/lucene/index/IndexingFeatures.java | 75 +++++++++++++++++++ .../lucene/index/TermVectorsConsumer.java | 4 +- .../index/TermVectorsConsumerPerField.java | 23 ++---- .../org/apache/lucene/index/TermsHash.java | 19 ++--- .../lucene/index/TermsHashPerField.java | 26 +++---- 8 files changed, 155 insertions(+), 57 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index be8016e359ef..ea18a895d952 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -136,8 +136,13 @@ public Terms terms(final String field) { @Override public TermsHashPerField addField(FieldInvertState invertState, FieldInfo fieldInfo) { + // Only build the downstream term-vectors per-field when the field actually stores term vectors. + // hasTermVectors() is fixed at field-init time (baked into the FieldInfo before this runs) and + // is immutable for the segment, so a null link is the structural signal "no row dependency". + TermsHashPerField termVectorsPerField = + fieldInfo.hasTermVectors() ? nextTermsHash.addField(invertState, fieldInfo) : null; return new FreqProxTermsWriterPerField( - invertState, this, fieldInfo, nextTermsHash.addField(invertState, fieldInfo)); + invertState, this, fieldInfo, termVectorsPerField); } static class SortingTerms extends FilterLeafReader.FilterTerms { diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index 4235ff4e9421..1c508a138ce3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -46,14 +46,14 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField { FieldInvertState invertState, TermsHash termsHash, FieldInfo fieldInfo, - TermsHashPerField nextPerField) { + TermsHashPerField termVectorsPerField) { super( fieldInfo.getIndexOptions().subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) ? 2 : 1, termsHash.intPool, termsHash.bytePool, termsHash.termBytePool, termsHash.bytesUsed, - nextPerField, + termVectorsPerField, fieldInfo.name, fieldInfo.getIndexOptions()); this.fieldState = invertState; @@ -62,6 +62,9 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField { hasProx = indexOptions.subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); hasOffsets = indexOptions.subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); isTermDoc = fieldInfo.isTermDocField(); + // The downstream term-vectors per-field exists iff the field stores term vectors. This makes + // "has a per-document row dependency" a structural fact (getTermVectorsPerField() != null). + assert (getTermVectorsPerField() != null) == fieldInfo.hasTermVectors(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 3e7517fb04b5..1fdb959b1c8c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -587,7 +587,7 @@ void processDocument( // analyzer is free to reuse TokenStream across fields // (i.e., we cannot have more than one TokenStream // running "at once"): - termsHash.startDocument(); + termVectorsWriter.startDocument(); startStoredFields(docID); try { // Handle the parent field first (before document fields). Its schema was already @@ -658,9 +658,9 @@ void processDocument( fields[i].finish(docID); } finishStoredFields(); - // TODO: for broken docs, optimize termsHash.finishDocument + // TODO: for broken docs, optimize termVectorsWriter.finishDocument try { - termsHash.finishDocument(docID); + termVectorsWriter.finishDocument(docID); } catch (Throwable th) { // Must abort, on the possibility that on-disk term // vectors are now corrupt: @@ -726,7 +726,7 @@ void processBatch(int baseDocID, ColumnBatch columnBatch) throws IOException { "Unknown column type: " + column.getClass().getName()); } - if (fieldType.stored() || fieldType.indexOptions() != IndexOptions.NONE) { + if (IndexingFeatures.rowEligible(fieldType)) { hasRowColumns = true; } @@ -826,7 +826,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable colu int originalIdx = 0; for (Column column : columns) { IndexableFieldType fieldType = column.fieldType(); - if (fieldType.stored() == false && fieldType.indexOptions() == IndexOptions.NONE) { + if (IndexingFeatures.rowEligible(fieldType) == false) { originalIdx++; continue; } @@ -839,7 +839,12 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable colu adapters[numRowCols] = adapter; rowPfIndices[numRowCols] = originalIdx; heads[numRowCols] = adapter.nextDoc(); - if (fieldType.indexOptions() != IndexOptions.NONE) { + // Defined as the seam for moving column-eligible inversion out of the row pass later; the + // partition assert documents that an indexed field is exactly one of the two inversion kinds. + assert (IndexingFeatures.rowBoundInversion(fieldType) + || IndexingFeatures.columnEligibleInversion(fieldType)) + == IndexingFeatures.indexed(fieldType); + if (IndexingFeatures.indexed(fieldType)) { hasInverted = true; } if (fieldType.stored()) { @@ -858,7 +863,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable colu int indexedFieldCount = 0; if (hasInverted) { - termsHash.startDocument(); + termVectorsWriter.startDocument(); } if (hasStored) { startStoredFields(segDocID); @@ -897,7 +902,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable colu } if (hasInverted) { try { - termsHash.finishDocument(segDocID); + termVectorsWriter.finishDocument(segDocID); } catch (Throwable th) { abortingExceptionConsumer.accept(th); throw th; @@ -1408,6 +1413,9 @@ private static void updateDocFieldSchema( if (fieldType.indexOptions() != IndexOptions.NONE) { schema.setIndexOptions( fieldType.indexOptions(), fieldType.omitNorms(), fieldType.storeTermVectors()); + if (fieldType.storeTermVectors() == false) { + verifyNoTermVectorOptionsWithoutVectors(fieldName, fieldType); + } } else { // TODO: should this be checked when a fieldType is created? verifyUnIndexedFieldType(fieldName, fieldType); @@ -1470,6 +1478,33 @@ private static void verifyUnIndexedFieldType(String name, IndexableFieldType ft) } } + /** + * Verifies that an indexed field which does not store term vectors does not request any + * term-vector sub-options. Previously enforced lazily in {@code + * TermVectorsConsumerPerField#start}; now that the term-vectors per-field is only built for fields + * that store term vectors, this runs in the schema layer for every indexed field. + */ + private static void verifyNoTermVectorOptionsWithoutVectors(String name, IndexableFieldType ft) { + if (ft.storeTermVectorOffsets()) { + throw new IllegalArgumentException( + "cannot index term vector offsets when term vectors are not indexed (field=\"" + + name + + "\")"); + } + if (ft.storeTermVectorPositions()) { + throw new IllegalArgumentException( + "cannot index term vector positions when term vectors are not indexed (field=\"" + + name + + "\")"); + } + if (ft.storeTermVectorPayloads()) { + throw new IllegalArgumentException( + "cannot index term vector payloads when term vectors are not indexed (field=\"" + + name + + "\")"); + } + } + private static void validateMaxVectorDimension( String fieldName, int vectorDim, int maxVectorDim) { if (vectorDim > maxVectorDim) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java b/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java new file mode 100644 index 000000000000..668412046faa --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +/** + * Classification predicates over an {@link IndexableFieldType} that name how a field participates in + * indexing. These exist so the row-vs-column routing in {@link IndexingChain} is declared once + * rather than re-derived inline at each call site. + * + *

The key distinction is between the two kinds of term inversion: + * + *

    + *
  • row-bound inversion — a field with term vectors. Term vectors are written per + * document inside a {@code startDocument}/{@code finishDocument} frame, so this inversion has + * a genuine per-document obligation and must run in the row pass. + *
  • column-eligible inversion — an indexed field without term vectors. Its only outputs + * are postings and norms, both of which accumulate in memory and flush at segment end with no + * within-document cross-field dependency, so this inversion could run column-major. + *
+ * + *

{@link #columnEligibleInversion} is the seam a follow-up change will switch on to move that + * inversion into the columnar pass; today nothing routes on it. + */ +final class IndexingFeatures { + + private IndexingFeatures() {} + + /** Whether the field is indexed (its terms are inverted into postings). */ + static boolean indexed(IndexableFieldType fieldType) { + return fieldType.indexOptions() != IndexOptions.NONE; + } + + /** Whether the field stores term vectors. */ + static boolean hasTermVectors(IndexableFieldType fieldType) { + return fieldType.storeTermVectors(); + } + + /** + * Inversion that has a per-document obligation and must run in the row pass: an indexed field with + * term vectors. + */ + static boolean rowBoundInversion(IndexableFieldType fieldType) { + return indexed(fieldType) && hasTermVectors(fieldType); + } + + /** + * Inversion that has no per-document obligation and could run column-major: an indexed field + * without term vectors. This is the seam for the follow-up columnar-inversion change. + */ + static boolean columnEligibleInversion(IndexableFieldType fieldType) { + return indexed(fieldType) && hasTermVectors(fieldType) == false; + } + + /** + * Whether the field has any row-oriented feature (stored fields or term inversion) and therefore + * participates in the row pass. + */ + static boolean rowEligible(IndexableFieldType fieldType) { + return fieldType.stored() || indexed(fieldType); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java index 832f0275fa9b..8787ee613688 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java @@ -113,7 +113,7 @@ void setHasVectors() { hasVectors = true; } - @Override + /** Writes this document's term vectors. Called per document by {@link IndexingChain}. */ void finishDocument(int docID) throws IOException { if (!hasVectors) { @@ -173,7 +173,7 @@ void addFieldToFlush(TermVectorsConsumerPerField fieldToFlush) { perFields[numVectorFields++] = fieldToFlush; } - @Override + /** Resets per-document state. Called per document by {@link IndexingChain}. */ void startDocument() { resetFields(); numVectorFields = 0; diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java index d6f2d9c7847c..67c7868986aa 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java @@ -169,24 +169,11 @@ boolean start(IndexableField field, boolean first) { } } else { - if (field.fieldType().storeTermVectorOffsets()) { - throw new IllegalArgumentException( - "cannot index term vector offsets when term vectors are not indexed (field=\"" - + field.name() - + "\")"); - } - if (field.fieldType().storeTermVectorPositions()) { - throw new IllegalArgumentException( - "cannot index term vector positions when term vectors are not indexed (field=\"" - + field.name() - + "\")"); - } - if (field.fieldType().storeTermVectorPayloads()) { - throw new IllegalArgumentException( - "cannot index term vector payloads when term vectors are not indexed (field=\"" - + field.name() - + "\")"); - } + // Unreachable: this per-field is only built for fields that store term vectors (see + // FreqProxTermsWriter#addField), and the schema layer enforces every instance agrees. The + // "term-vector sub-option without term vectors" validation lives in + // IndexingChain#verifyNoTermVectorOptionsWithoutVectors. + assert false : "term-vectors per-field created for a field without term vectors"; } } else { if (doVectors != field.fieldType().storeTermVectors()) { diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java index 7d438df9b909..4a695b6fc6c6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java @@ -82,23 +82,16 @@ void flush( if (nextTermsHash != null) { Map nextChildFields = new HashMap<>(); for (final Map.Entry entry : fieldsToFlush.entrySet()) { - nextChildFields.put(entry.getKey(), entry.getValue().getNextPerField()); + // A field only has a next per-field when it feeds a downstream consumer (term vectors). + // Fields without one are simply absent from the downstream flush map. + TermsHashPerField next = entry.getValue().getTermVectorsPerField(); + if (next != null) { + nextChildFields.put(entry.getKey(), next); + } } nextTermsHash.flush(nextChildFields, state, sortMap, norms); } } abstract TermsHashPerField addField(FieldInvertState fieldInvertState, FieldInfo fieldInfo); - - void finishDocument(int docID) throws IOException { - if (nextTermsHash != null) { - nextTermsHash.finishDocument(docID); - } - } - - void startDocument() throws IOException { - if (nextTermsHash != null) { - nextTermsHash.startDocument(); - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java index 3f7be53606ce..057dff17ed9b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java @@ -35,7 +35,7 @@ abstract class TermsHashPerField implements Comparable { private static final int HASH_INIT_SIZE = 4; - private final TermsHashPerField nextPerField; + private final TermsHashPerField termVectorsPerField; private final IntBlockPool intPool; final ByteBlockPool bytePool; private final ByteSlicePool slicePool; @@ -68,7 +68,7 @@ abstract class TermsHashPerField implements Comparable { ByteBlockPool bytePool, ByteBlockPool termBytePool, Counter bytesUsed, - TermsHashPerField nextPerField, + TermsHashPerField termVectorsPerField, String fieldName, IndexOptions indexOptions) { this.intPool = intPool; @@ -76,7 +76,7 @@ abstract class TermsHashPerField implements Comparable { this.slicePool = new ByteSlicePool(bytePool); this.streamCount = streamCount; this.fieldName = fieldName; - this.nextPerField = nextPerField; + this.termVectorsPerField = termVectorsPerField; assert indexOptions != IndexOptions.NONE; this.indexOptions = indexOptions; PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed); @@ -86,8 +86,8 @@ abstract class TermsHashPerField implements Comparable { void reset() { bytesHash.clear(false); sortedTermIDs = null; - if (nextPerField != null) { - nextPerField.reset(); + if (termVectorsPerField != null) { + termVectorsPerField.reset(); } } @@ -131,7 +131,7 @@ final void reinitHash() { // because token text has already been "interned" into // textStart, so we hash by textStart. term vectors use // this API. - private void add(int textStart, final int docID) throws IOException { + private void addInterned(int textStart, final int docID) throws IOException { int termID = bytesHash.addByPoolOffset(textStart); if (termID >= 0) { // New posting // First time we are seeing this token since we last @@ -204,7 +204,7 @@ void add(BytesRef termBytes, final int docID) throws IOException { } } if (doNextCall) { - nextPerField.add(postingsArray.textStarts[termID], docID); + termVectorsPerField.addInterned(postingsArray.textStarts[termID], docID); } } @@ -268,8 +268,8 @@ final void writeVInt(int stream, int i) { writeByte(stream, (byte) i); } - final TermsHashPerField getNextPerField() { - return nextPerField; + final TermsHashPerField getTermVectorsPerField() { + return termVectorsPerField; } final String getFieldName() { @@ -331,8 +331,8 @@ public final int compareTo(TermsHashPerField other) { /** Finish adding all instances of this field to the current document. */ void finish() throws IOException { - if (nextPerField != null) { - nextPerField.finish(); + if (termVectorsPerField != null) { + termVectorsPerField.finish(); } } @@ -345,8 +345,8 @@ final int getNumTerms() { * seen in the document. */ boolean start(IndexableField field, boolean first) { - if (nextPerField != null) { - doNextCall = nextPerField.start(field, first); + if (termVectorsPerField != null) { + doNextCall = termVectorsPerField.start(field, first); } return true; } From f12a2b7477a4456772783bd96b0a71e9f970286f Mon Sep 17 00:00:00 2001 From: Tim Brooks Date: Wed, 17 Jun 2026 21:07:01 -0600 Subject: [PATCH 2/6] Change --- .../java/org/apache/lucene/index/FreqProxTermsWriter.java | 5 ++--- .../apache/lucene/index/FreqProxTermsWriterPerField.java | 3 +-- .../src/java/org/apache/lucene/index/IndexingChain.java | 4 ++-- .../java/org/apache/lucene/index/IndexingFeatures.java | 8 ++++---- .../java/org/apache/lucene/index/TermsHashPerField.java | 4 ++-- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index ea18a895d952..ca8bfacb92f1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -138,11 +138,10 @@ public Terms terms(final String field) { public TermsHashPerField addField(FieldInvertState invertState, FieldInfo fieldInfo) { // Only build the downstream term-vectors per-field when the field actually stores term vectors. // hasTermVectors() is fixed at field-init time (baked into the FieldInfo before this runs) and - // is immutable for the segment, so a null link is the structural signal "no row dependency". + // is immutable for the segment. TermsHashPerField termVectorsPerField = fieldInfo.hasTermVectors() ? nextTermsHash.addField(invertState, fieldInfo) : null; - return new FreqProxTermsWriterPerField( - invertState, this, fieldInfo, termVectorsPerField); + return new FreqProxTermsWriterPerField(invertState, this, fieldInfo, termVectorsPerField); } static class SortingTerms extends FilterLeafReader.FilterTerms { diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index 1c508a138ce3..bce761a75c20 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -62,8 +62,7 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField { hasProx = indexOptions.subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); hasOffsets = indexOptions.subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); isTermDoc = fieldInfo.isTermDocField(); - // The downstream term-vectors per-field exists iff the field stores term vectors. This makes - // "has a per-document row dependency" a structural fact (getTermVectorsPerField() != null). + // The downstream term-vectors per-field exists iff the field stores term vectors. assert (getTermVectorsPerField() != null) == fieldInfo.hasTermVectors(); } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 9e8d470cb6e3..8d347a0acf95 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -1551,8 +1551,8 @@ private static void verifyUnIndexedFieldType(String name, IndexableFieldType ft) /** * Verifies that an indexed field which does not store term vectors does not request any * term-vector sub-options. Previously enforced lazily in {@code - * TermVectorsConsumerPerField#start}; now that the term-vectors per-field is only built for fields - * that store term vectors, this runs in the schema layer for every indexed field. + * TermVectorsConsumerPerField#start}; now that the term-vectors per-field is only built for + * fields that store term vectors, this runs in the schema layer for every indexed field. */ private static void verifyNoTermVectorOptionsWithoutVectors(String name, IndexableFieldType ft) { if (ft.storeTermVectorOffsets()) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java b/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java index 668412046faa..38335b679644 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java @@ -17,8 +17,8 @@ package org.apache.lucene.index; /** - * Classification predicates over an {@link IndexableFieldType} that name how a field participates in - * indexing. These exist so the row-vs-column routing in {@link IndexingChain} is declared once + * Classification predicates over an {@link IndexableFieldType} that name how a field participates + * in indexing. These exist so the row-vs-column routing in {@link IndexingChain} is declared once * rather than re-derived inline at each call site. * *

The key distinction is between the two kinds of term inversion: @@ -50,8 +50,8 @@ static boolean hasTermVectors(IndexableFieldType fieldType) { } /** - * Inversion that has a per-document obligation and must run in the row pass: an indexed field with - * term vectors. + * Inversion that has a per-document obligation and must run in the row pass: an indexed field + * with term vectors. */ static boolean rowBoundInversion(IndexableFieldType fieldType) { return indexed(fieldType) && hasTermVectors(fieldType); diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java index 057dff17ed9b..de4a276692e7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java @@ -131,7 +131,7 @@ final void reinitHash() { // because token text has already been "interned" into // textStart, so we hash by textStart. term vectors use // this API. - private void addInterned(int textStart, final int docID) throws IOException { + private void add(int textStart, final int docID) throws IOException { int termID = bytesHash.addByPoolOffset(textStart); if (termID >= 0) { // New posting // First time we are seeing this token since we last @@ -204,7 +204,7 @@ void add(BytesRef termBytes, final int docID) throws IOException { } } if (doNextCall) { - termVectorsPerField.addInterned(postingsArray.textStarts[termID], docID); + termVectorsPerField.add(postingsArray.textStarts[termID], docID); } } From 8fdd493ac1aa8f7f981312411bd1b14fdc88e816 Mon Sep 17 00:00:00 2001 From: Tim Brooks Date: Wed, 17 Jun 2026 21:15:59 -0600 Subject: [PATCH 3/6] Change --- .../apache/lucene/index/IndexingChain.java | 11 +-- .../apache/lucene/index/IndexingFeatures.java | 75 ------------------- 2 files changed, 3 insertions(+), 83 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 8d347a0acf95..2833a8087781 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -734,7 +734,7 @@ void processBatch(int baseDocID, ColumnBatch columnBatch) throws IOException { "Unknown column type: " + column.getClass().getName()); } - if (IndexingFeatures.rowEligible(fieldType)) { + if (fieldType.stored() || fieldType.indexOptions() != IndexOptions.NONE) { hasRowColumns = true; } @@ -854,7 +854,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable colu int originalIdx = 0; for (Column column : columns) { IndexableFieldType fieldType = column.fieldType(); - if (IndexingFeatures.rowEligible(fieldType) == false) { + if (fieldType.stored() == false && fieldType.indexOptions() == IndexOptions.NONE) { originalIdx++; continue; } @@ -867,12 +867,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable colu adapters[numRowCols] = adapter; rowPfIndices[numRowCols] = originalIdx; heads[numRowCols] = adapter.nextDoc(); - // Defined as the seam for moving column-eligible inversion out of the row pass later; the - // partition assert documents that an indexed field is exactly one of the two inversion kinds. - assert (IndexingFeatures.rowBoundInversion(fieldType) - || IndexingFeatures.columnEligibleInversion(fieldType)) - == IndexingFeatures.indexed(fieldType); - if (IndexingFeatures.indexed(fieldType)) { + if (fieldType.indexOptions() != IndexOptions.NONE) { hasInverted = true; } if (fieldType.stored()) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java b/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java deleted file mode 100644 index 38335b679644..000000000000 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingFeatures.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.index; - -/** - * Classification predicates over an {@link IndexableFieldType} that name how a field participates - * in indexing. These exist so the row-vs-column routing in {@link IndexingChain} is declared once - * rather than re-derived inline at each call site. - * - *

The key distinction is between the two kinds of term inversion: - * - *

    - *
  • row-bound inversion — a field with term vectors. Term vectors are written per - * document inside a {@code startDocument}/{@code finishDocument} frame, so this inversion has - * a genuine per-document obligation and must run in the row pass. - *
  • column-eligible inversion — an indexed field without term vectors. Its only outputs - * are postings and norms, both of which accumulate in memory and flush at segment end with no - * within-document cross-field dependency, so this inversion could run column-major. - *
- * - *

{@link #columnEligibleInversion} is the seam a follow-up change will switch on to move that - * inversion into the columnar pass; today nothing routes on it. - */ -final class IndexingFeatures { - - private IndexingFeatures() {} - - /** Whether the field is indexed (its terms are inverted into postings). */ - static boolean indexed(IndexableFieldType fieldType) { - return fieldType.indexOptions() != IndexOptions.NONE; - } - - /** Whether the field stores term vectors. */ - static boolean hasTermVectors(IndexableFieldType fieldType) { - return fieldType.storeTermVectors(); - } - - /** - * Inversion that has a per-document obligation and must run in the row pass: an indexed field - * with term vectors. - */ - static boolean rowBoundInversion(IndexableFieldType fieldType) { - return indexed(fieldType) && hasTermVectors(fieldType); - } - - /** - * Inversion that has no per-document obligation and could run column-major: an indexed field - * without term vectors. This is the seam for the follow-up columnar-inversion change. - */ - static boolean columnEligibleInversion(IndexableFieldType fieldType) { - return indexed(fieldType) && hasTermVectors(fieldType) == false; - } - - /** - * Whether the field has any row-oriented feature (stored fields or term inversion) and therefore - * participates in the row pass. - */ - static boolean rowEligible(IndexableFieldType fieldType) { - return fieldType.stored() || indexed(fieldType); - } -} From 59eb9537a60aee4e1497531db2a1cb2eaef6a596 Mon Sep 17 00:00:00 2001 From: Tim Brooks Date: Wed, 17 Jun 2026 21:30:26 -0600 Subject: [PATCH 4/6] Change --- .../index/TermVectorsConsumerPerField.java | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java index 67c7868986aa..0f1dbcece764 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java @@ -146,34 +146,29 @@ boolean start(IndexableField field, boolean first) { hasPayloads = false; - doVectors = field.fieldType().storeTermVectors(); - - if (doVectors) { - doVectorPositions = field.fieldType().storeTermVectorPositions(); - - // Somewhat confusingly, unlike postings, you are - // allowed to index TV offsets without TV positions: - doVectorOffsets = field.fieldType().storeTermVectorOffsets(); - - if (doVectorPositions) { - doVectorPayloads = field.fieldType().storeTermVectorPayloads(); - } else { - doVectorPayloads = false; - if (field.fieldType().storeTermVectorPayloads()) { - // TODO: move this check somewhere else, and impl the other missing ones - throw new IllegalArgumentException( - "cannot index term vector payloads without term vector positions (field=\"" - + field.name() - + "\")"); - } - } + // This per-field is only built for fields that store term vectors (see + // FreqProxTermsWriter#addField), so doVectors is always true here. + assert field.fieldType().storeTermVectors() + : "term-vectors per-field created for a field without term vectors"; + doVectors = true; + + doVectorPositions = field.fieldType().storeTermVectorPositions(); + + // Somewhat confusingly, unlike postings, you are + // allowed to index TV offsets without TV positions: + doVectorOffsets = field.fieldType().storeTermVectorOffsets(); + if (doVectorPositions) { + doVectorPayloads = field.fieldType().storeTermVectorPayloads(); } else { - // Unreachable: this per-field is only built for fields that store term vectors (see - // FreqProxTermsWriter#addField), and the schema layer enforces every instance agrees. The - // "term-vector sub-option without term vectors" validation lives in - // IndexingChain#verifyNoTermVectorOptionsWithoutVectors. - assert false : "term-vectors per-field created for a field without term vectors"; + doVectorPayloads = false; + if (field.fieldType().storeTermVectorPayloads()) { + // TODO: move this check somewhere else, and impl the other missing ones + throw new IllegalArgumentException( + "cannot index term vector payloads without term vector positions (field=\"" + + field.name() + + "\")"); + } } } else { if (doVectors != field.fieldType().storeTermVectors()) { From 7af024d7980619969323355c11304ea07afbe733 Mon Sep 17 00:00:00 2001 From: Tim Brooks Date: Wed, 17 Jun 2026 21:34:47 -0600 Subject: [PATCH 5/6] Concise --- .../core/src/java/org/apache/lucene/index/IndexingChain.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 2833a8087781..9ca93cf06721 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -1545,9 +1545,7 @@ private static void verifyUnIndexedFieldType(String name, IndexableFieldType ft) /** * Verifies that an indexed field which does not store term vectors does not request any - * term-vector sub-options. Previously enforced lazily in {@code - * TermVectorsConsumerPerField#start}; now that the term-vectors per-field is only built for - * fields that store term vectors, this runs in the schema layer for every indexed field. + * term-vector sub-options. */ private static void verifyNoTermVectorOptionsWithoutVectors(String name, IndexableFieldType ft) { if (ft.storeTermVectorOffsets()) { From e426916d80ab9b3ea81591db6d528ad3f312d650 Mon Sep 17 00:00:00 2001 From: Tim Brooks Date: Sun, 21 Jun 2026 20:40:56 -0600 Subject: [PATCH 6/6] Change --- lucene/CHANGES.txt | 3 +- .../lucene/index/FreqProxTermsWriter.java | 18 ++++--- .../index/FreqProxTermsWriterPerField.java | 3 +- .../apache/lucene/index/IndexingChain.java | 4 +- .../index/SortingTermVectorsConsumer.java | 11 +---- .../lucene/index/TermVectorsConsumer.java | 10 +--- .../index/TermVectorsConsumerPerField.java | 47 +++++++++---------- .../org/apache/lucene/index/TermsHash.java | 24 ---------- .../lucene/index/TermsHashPerField.java | 9 ++-- 9 files changed, 44 insertions(+), 85 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a6e8073576..f2d2b727041b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -287,7 +287,8 @@ New Features Improvements --------------------- -(No changes) + +* GITHUB#16269: Lazily build the term-vectors per-field. (Tim Brooks) Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index ca8bfacb92f1..14e789ead81e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -42,12 +42,18 @@ final class FreqProxTermsWriter extends TermsHash { + // The term vectors consumer is the (optional) downstream consumer of the terms interned here: + // it reuses our term byte pool and only buffers fields that store term vectors. This is the same + // instance the base class tracks as nextTermsHash, kept here with its concrete type. + private final TermVectorsConsumer termVectors; + FreqProxTermsWriter( final IntBlockPool.Allocator intBlockAllocator, final ByteBlockPool.Allocator byteBlockAllocator, Counter bytesUsed, - TermsHash termVectors) { + TermVectorsConsumer termVectors) { super(intBlockAllocator, byteBlockAllocator, bytesUsed, termVectors); + this.termVectors = termVectors; } private void applyDeletes(SegmentWriteState state, Fields fields) throws IOException { @@ -79,14 +85,15 @@ private void applyDeletes(SegmentWriteState state, Fields fields) throws IOExcep } } - @Override public void flush( Map fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap, NormsProducer norms) throws IOException { - super.flush(fieldsToFlush, state, sortMap, norms); + // Flush the per-document term vectors first (they were buffered as each document finished), + // then write the postings gathered per-field below. + termVectors.flush(state, sortMap); // Gather all fields that saw any postings: List allFields = new ArrayList<>(); @@ -137,10 +144,9 @@ public Terms terms(final String field) { @Override public TermsHashPerField addField(FieldInvertState invertState, FieldInfo fieldInfo) { // Only build the downstream term-vectors per-field when the field actually stores term vectors. - // hasTermVectors() is fixed at field-init time (baked into the FieldInfo before this runs) and - // is immutable for the segment. + // hasTermVectors() is fixed at field-init time and is immutable for the segment. TermsHashPerField termVectorsPerField = - fieldInfo.hasTermVectors() ? nextTermsHash.addField(invertState, fieldInfo) : null; + fieldInfo.hasTermVectors() ? termVectors.addField(invertState, fieldInfo) : null; return new FreqProxTermsWriterPerField(invertState, this, fieldInfo, termVectorsPerField); } diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index bce761a75c20..da59d5730db8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -75,12 +75,11 @@ void finish() throws IOException { } @Override - boolean start(IndexableField f, boolean first) { + void start(IndexableField f, boolean first) { super.start(f, first); termFreqAtt = fieldState.termFreqAttribute; payloadAttribute = fieldState.payloadAttribute; offsetAttribute = fieldState.offsetAttribute; - return true; } void writeProx(int termID, int proxCode) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 9ca93cf06721..d3a08111ce4a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -84,8 +84,8 @@ final class IndexingChain implements Accountable { final Counter bytesUsed = Counter.newCounter(); final FieldInfos.Builder fieldInfos; - // Writes postings and term vectors: - final TermsHash termsHash; + // Writes postings, and drives the (optional) downstream term-vectors consumer: + final FreqProxTermsWriter termsHash; // Shared pool for doc-value terms final ByteBlockPool docValuesBytePool; // Shared scratch buffers for dense points encoding diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java index d5d4aceae851..1804d4bffc3d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java @@ -18,9 +18,7 @@ import java.io.IOException; import java.util.Iterator; -import java.util.Map; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; @@ -51,13 +49,8 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer { } @Override - void flush( - Map fieldsToFlush, - final SegmentWriteState state, - Sorter.DocMap sortMap, - NormsProducer norms) - throws IOException { - super.flush(fieldsToFlush, state, sortMap, norms); + void flush(final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException { + super.flush(state, sortMap); if (tmpDirectory != null) { TermVectorsReader reader = TEMP_TERM_VECTORS_FORMAT.vectorsReader( diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java index 8787ee613688..a75d877753ad 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java @@ -18,9 +18,7 @@ import java.io.IOException; import java.util.Arrays; -import java.util.Map; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FlushInfo; @@ -68,13 +66,7 @@ class TermVectorsConsumer extends TermsHash { this.codec = codec; } - @Override - void flush( - Map fieldsToFlush, - final SegmentWriteState state, - Sorter.DocMap sortMap, - NormsProducer norms) - throws IOException { + void flush(final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException { if (writer != null) { int numDocs = state.segmentInfo.maxDoc(); assert numDocs > 0; diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java index 0f1dbcece764..6c17a350ca28 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java @@ -32,7 +32,9 @@ final class TermVectorsConsumerPerField extends TermsHashPerField { private final FieldInvertState fieldState; private final FieldInfo fieldInfo; - private boolean doVectors; + // True once this field has buffered vectors for the current document that still need flushing in + // finishDocument(); reset to false there. + private boolean pendingDoc; private boolean doVectorPositions; private boolean doVectorOffsets; private boolean doVectorPayloads; @@ -67,18 +69,18 @@ final class TermVectorsConsumerPerField extends TermsHashPerField { */ @Override void finish() { - if (!doVectors || getNumTerms() == 0) { + if (!pendingDoc || getNumTerms() == 0) { return; } termsWriter.addFieldToFlush(this); } void finishDocument() throws IOException { - if (doVectors == false) { + if (pendingDoc == false) { return; } - doVectors = false; + pendingDoc = false; final int numPostings = getNumTerms(); @@ -128,7 +130,7 @@ void finishDocument() throws IOException { } @Override - boolean start(IndexableField field, boolean first) { + void start(IndexableField field, boolean first) { super.start(field, first); termFreqAtt = fieldState.termFreqAttribute; assert field.fieldType().indexOptions() != IndexOptions.NONE; @@ -147,10 +149,11 @@ boolean start(IndexableField field, boolean first) { hasPayloads = false; // This per-field is only built for fields that store term vectors (see - // FreqProxTermsWriter#addField), so doVectors is always true here. + // FreqProxTermsWriter#addField), and FieldSchema locks storeTermVectors for the segment, so + // this field always stores term vectors. assert field.fieldType().storeTermVectors() : "term-vectors per-field created for a field without term vectors"; - doVectors = true; + pendingDoc = true; doVectorPositions = field.fieldType().storeTermVectorPositions(); @@ -171,12 +174,8 @@ boolean start(IndexableField field, boolean first) { } } } else { - if (doVectors != field.fieldType().storeTermVectors()) { - throw new IllegalArgumentException( - "all instances of a given field name must have the same term vectors settings (storeTermVectors changed for field=\"" - + field.name() - + "\")"); - } + // storeTermVectors itself is locked for the segment by FieldSchema, but the sub-options are + // not, so they are validated here for consistency across instances within the document. if (doVectorPositions != field.fieldType().storeTermVectorPositions()) { throw new IllegalArgumentException( "all instances of a given field name must have the same term vectors settings (storeTermVectorPositions changed for field=\"" @@ -197,21 +196,17 @@ boolean start(IndexableField field, boolean first) { } } - if (doVectors) { - if (doVectorOffsets) { - offsetAttribute = fieldState.offsetAttribute; - assert offsetAttribute != null; - } - - if (doVectorPayloads) { - // Can be null: - payloadAttribute = fieldState.payloadAttribute; - } else { - payloadAttribute = null; - } + if (doVectorOffsets) { + offsetAttribute = fieldState.offsetAttribute; + assert offsetAttribute != null; } - return doVectors; + if (doVectorPayloads) { + // Can be null: + payloadAttribute = fieldState.payloadAttribute; + } else { + payloadAttribute = null; + } } void writeProx(TermVectorsPostingsArray postings, int termID) { diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java index 4a695b6fc6c6..ca9ca83a8523 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java @@ -16,10 +16,6 @@ */ package org.apache.lucene.index; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.Counter; import org.apache.lucene.util.IntBlockPool; @@ -73,25 +69,5 @@ void reset() { bytePool.reset(false, false); } - void flush( - Map fieldsToFlush, - final SegmentWriteState state, - Sorter.DocMap sortMap, - NormsProducer norms) - throws IOException { - if (nextTermsHash != null) { - Map nextChildFields = new HashMap<>(); - for (final Map.Entry entry : fieldsToFlush.entrySet()) { - // A field only has a next per-field when it feeds a downstream consumer (term vectors). - // Fields without one are simply absent from the downstream flush map. - TermsHashPerField next = entry.getValue().getTermVectorsPerField(); - if (next != null) { - nextChildFields.put(entry.getKey(), next); - } - } - nextTermsHash.flush(nextChildFields, state, sortMap, norms); - } - } - abstract TermsHashPerField addField(FieldInvertState fieldInvertState, FieldInfo fieldInfo); } diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java index de4a276692e7..8481975a1f1a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java @@ -125,8 +125,6 @@ final void reinitHash() { bytesHash.reinit(); } - private boolean doNextCall; - // Secondary entry point (for 2nd & subsequent TermsHash), // because token text has already been "interned" into // textStart, so we hash by textStart. term vectors use @@ -203,7 +201,7 @@ void add(BytesRef termBytes, final int docID) throws IOException { throw new DuplicateTermException(e.getMessage() + " '" + termBytes.utf8ToString() + "'"); } } - if (doNextCall) { + if (termVectorsPerField != null) { termVectorsPerField.add(postingsArray.textStarts[termID], docID); } } @@ -344,11 +342,10 @@ final int getNumTerms() { * Start adding a new field instance; first is true if this is the first time this field name was * seen in the document. */ - boolean start(IndexableField field, boolean first) { + void start(IndexableField field, boolean first) { if (termVectorsPerField != null) { - doNextCall = termVectorsPerField.start(field, first); + termVectorsPerField.start(field, first); } - return true; } /** Called when a term is seen for the first time. */