Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ New Features

Improvements
---------------------
(No changes)

* GITHUB#16269: Build the term-vectors per-field only for fields that store term vectors. (Tim Brooks)

Optimizations
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,18 @@

final class FreqProxTermsWriter extends TermsHash {

// The term vectors consumer is the (optional) downstream consumer of the terms interned here:
// it reuses our term byte pool and only buffers fields that store term vectors. This is the same
// instance the base class tracks as nextTermsHash, kept here with its concrete type.
private final TermVectorsConsumer termVectors;

FreqProxTermsWriter(
final IntBlockPool.Allocator intBlockAllocator,
final ByteBlockPool.Allocator byteBlockAllocator,
Counter bytesUsed,
TermsHash termVectors) {
TermVectorsConsumer termVectors) {
super(intBlockAllocator, byteBlockAllocator, bytesUsed, termVectors);
this.termVectors = termVectors;
}

private void applyDeletes(SegmentWriteState state, Fields fields) throws IOException {
Expand Down Expand Up @@ -79,14 +85,15 @@ private void applyDeletes(SegmentWriteState state, Fields fields) throws IOExcep
}
}

@Override
public void flush(
Map<String, TermsHashPerField> fieldsToFlush,
final SegmentWriteState state,
Sorter.DocMap sortMap,
NormsProducer norms)
throws IOException {
super.flush(fieldsToFlush, state, sortMap, norms);
// Flush the per-document term vectors first (they were buffered as each document finished),
// then write the postings gathered per-field below.
termVectors.flush(state, sortMap);

// Gather all fields that saw any postings:
List<FreqProxTermsWriterPerField> allFields = new ArrayList<>();
Expand Down Expand Up @@ -136,8 +143,11 @@ public Terms terms(final String field) {

@Override
public TermsHashPerField addField(FieldInvertState invertState, FieldInfo fieldInfo) {
return new FreqProxTermsWriterPerField(
invertState, this, fieldInfo, nextTermsHash.addField(invertState, fieldInfo));
// Only build the downstream term-vectors per-field when the field actually stores term vectors.
// hasTermVectors() is fixed at field-init time and is immutable for the segment.
TermsHashPerField termVectorsPerField =
fieldInfo.hasTermVectors() ? termVectors.addField(invertState, fieldInfo) : null;
return new FreqProxTermsWriterPerField(invertState, this, fieldInfo, termVectorsPerField);
}

static class SortingTerms extends FilterLeafReader.FilterTerms {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
FieldInvertState invertState,
TermsHash termsHash,
FieldInfo fieldInfo,
TermsHashPerField nextPerField) {
TermsHashPerField termVectorsPerField) {
super(
fieldInfo.getIndexOptions().subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) ? 2 : 1,
termsHash.intPool,
termsHash.bytePool,
termsHash.termBytePool,
termsHash.bytesUsed,
nextPerField,
termVectorsPerField,
fieldInfo.name,
fieldInfo.getIndexOptions());
this.fieldState = invertState;
Expand All @@ -62,6 +62,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
hasProx = indexOptions.subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
hasOffsets = indexOptions.subsumes(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
isTermDoc = fieldInfo.isTermDocField();
// The downstream term-vectors per-field exists iff the field stores term vectors.
assert (getTermVectorsPerField() != null) == fieldInfo.hasTermVectors();
}

@Override
Expand All @@ -73,12 +75,11 @@ void finish() throws IOException {
}

@Override
boolean start(IndexableField f, boolean first) {
void start(IndexableField f, boolean first) {
super.start(f, first);
termFreqAtt = fieldState.termFreqAttribute;
payloadAttribute = fieldState.payloadAttribute;
offsetAttribute = fieldState.offsetAttribute;
return true;
}

void writeProx(int termID, int proxCode) {
Expand Down
42 changes: 35 additions & 7 deletions lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ final class IndexingChain implements Accountable {
final Counter bytesUsed = Counter.newCounter();
final FieldInfos.Builder fieldInfos;

// Writes postings and term vectors:
final TermsHash termsHash;
// Writes postings, and drives the (optional) downstream term-vectors consumer:
final FreqProxTermsWriter termsHash;
// Shared pool for doc-value terms
final ByteBlockPool docValuesBytePool;
// Shared scratch buffers for dense points encoding
Expand Down Expand Up @@ -592,7 +592,7 @@ void processDocument(
// analyzer is free to reuse TokenStream across fields
// (i.e., we cannot have more than one TokenStream
// running "at once"):
termsHash.startDocument();
termVectorsWriter.startDocument();
startStoredFields(docID);
try {
// Handle the parent field first (before document fields). Its schema was already
Expand Down Expand Up @@ -663,9 +663,9 @@ void processDocument(
fields[i].finish(docID);
}
finishStoredFields();
// TODO: for broken docs, optimize termsHash.finishDocument
// TODO: for broken docs, optimize termVectorsWriter.finishDocument
try {
termsHash.finishDocument(docID);
termVectorsWriter.finishDocument(docID);
} catch (Throwable th) {
// Must abort, on the possibility that on-disk term
// vectors are now corrupt:
Expand Down Expand Up @@ -886,7 +886,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable<Column> colu
int indexedFieldCount = 0;

if (hasInverted) {
termsHash.startDocument();
termVectorsWriter.startDocument();
}
if (hasStored) {
startStoredFields(segDocID);
Expand Down Expand Up @@ -925,7 +925,7 @@ private void processRowColumns(int baseDocID, int numDocs, Iterable<Column> colu
}
if (hasInverted) {
try {
termsHash.finishDocument(segDocID);
termVectorsWriter.finishDocument(segDocID);
} catch (Throwable th) {
abortingExceptionConsumer.accept(th);
throw th;
Expand Down Expand Up @@ -1478,6 +1478,9 @@ private static void updateDocFieldSchema(
if (fieldType.indexOptions() != IndexOptions.NONE) {
schema.setIndexOptions(
fieldType.indexOptions(), fieldType.omitNorms(), fieldType.storeTermVectors());
if (fieldType.storeTermVectors() == false) {
verifyNoTermVectorOptionsWithoutVectors(fieldName, fieldType);
}
} else {
// TODO: should this be checked when a fieldType is created?
verifyUnIndexedFieldType(fieldName, fieldType);
Expand Down Expand Up @@ -1540,6 +1543,31 @@ private static void verifyUnIndexedFieldType(String name, IndexableFieldType ft)
}
}

/**
 * Rejects a field type that requests a term-vector sub-option (offsets, positions or payloads).
 *
 * <p>This method does not itself look at {@code storeTermVectors()}; the caller is expected to
 * invoke it only for indexed fields that do not store term vectors. The sub-options are examined
 * in the order offsets, positions, payloads, and the first one found to be enabled is the one
 * reported.
 *
 * @param name the field name, included in the exception message
 * @param ft the field type whose term-vector sub-options are inspected
 * @throws IllegalArgumentException if any term-vector sub-option is enabled
 */
private static void verifyNoTermVectorOptionsWithoutVectors(String name, IndexableFieldType ft) {
  // Pick the message prefix for the first offending sub-option; bail out when there is none.
  final String messagePrefix;
  if (ft.storeTermVectorOffsets()) {
    messagePrefix = "cannot index term vector offsets when term vectors are not indexed (field=\"";
  } else if (ft.storeTermVectorPositions()) {
    messagePrefix =
        "cannot index term vector positions when term vectors are not indexed (field=\"";
  } else if (ft.storeTermVectorPayloads()) {
    messagePrefix =
        "cannot index term vector payloads when term vectors are not indexed (field=\"";
  } else {
    return;
  }
  throw new IllegalArgumentException(messagePrefix + name + "\")");
}

private static void validateMaxVectorDimension(
String fieldName, int vectorDim, int maxVectorDim) {
if (vectorDim > maxVectorDim) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
Expand Down Expand Up @@ -51,13 +49,8 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer {
}

@Override
void flush(
Map<String, TermsHashPerField> fieldsToFlush,
final SegmentWriteState state,
Sorter.DocMap sortMap,
NormsProducer norms)
throws IOException {
super.flush(fieldsToFlush, state, sortMap, norms);
void flush(final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
super.flush(state, sortMap);
if (tmpDirectory != null) {
TermVectorsReader reader =
TEMP_TERM_VECTORS_FORMAT.vectorsReader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@

import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
Expand Down Expand Up @@ -68,13 +66,7 @@ class TermVectorsConsumer extends TermsHash {
this.codec = codec;
}

@Override
void flush(
Map<String, TermsHashPerField> fieldsToFlush,
final SegmentWriteState state,
Sorter.DocMap sortMap,
NormsProducer norms)
throws IOException {
void flush(final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
if (writer != null) {
int numDocs = state.segmentInfo.maxDoc();
assert numDocs > 0;
Expand Down Expand Up @@ -113,7 +105,7 @@ void setHasVectors() {
hasVectors = true;
}

@Override
/** Writes this document's term vectors. Called per document by {@link IndexingChain}. */
void finishDocument(int docID) throws IOException {

if (!hasVectors) {
Expand Down Expand Up @@ -173,7 +165,7 @@ void addFieldToFlush(TermVectorsConsumerPerField fieldToFlush) {
perFields[numVectorFields++] = fieldToFlush;
}

@Override
/** Resets per-document state. Called per document by {@link IndexingChain}. */
void startDocument() {
resetFields();
numVectorFields = 0;
Expand Down
Loading
Loading