From 6c48b404cd4c5a48435350007e1f5f41a0f3d01c Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 4 Dec 2024 15:19:41 +0100 Subject: [PATCH] Combine all postings enum impls of the default codec into a single class (#14033) Recent speedups by making call sites bimorphic made me want to play with combining all postings enums and impacts enums of the default codec into a single class, in order to reduce polymorphism. Unfortunately, it does not yield a speedup since the major polymorphic call sites we have that hurt performance (DefaultBulkScorer, ConjunctionDISI) are still 3-polymorphic or more. Yet, reduced polymorphism at little performance impact is a good trade-off as it would help make call sites bimorphic for users who don't have as much query diversity as nightly benchmarks, or in the future when we remove other causes of polymorphism. --- lucene/CHANGES.txt | 3 + .../lucene101/Lucene101PostingsReader.java | 1919 ++++++----------- .../org/apache/lucene/search/PhraseQuery.java | 7 +- .../intervals/TermIntervalsSource.java | 4 +- 4 files changed, 628 insertions(+), 1305 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 98580d3ea3da..afda0820239d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -61,6 +61,9 @@ Improvements segments for merges below the floor segment size, even if maxMergeAtOnce is bigger than segsPerTier. (Adrien Grand) +* GITHUB#14033: Combine all postings enum impls of the default codec into a + single class. (Adrien Grand) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java index d879a58b4ab7..010ed43135fe 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -44,7 +44,6 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SlowImpactsEnum; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; import org.apache.lucene.internal.vectorization.VectorizationProvider; import org.apache.lucene.store.ByteArrayDataInput; @@ -272,100 +271,222 @@ public void decodeTerm( public PostingsEnum postings( FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException { - if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0 - || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { - return (reuse instanceof BlockDocsEnum blockDocsEnum - && blockDocsEnum.canReuse(docIn, fieldInfo) - ? blockDocsEnum - : new BlockDocsEnum(fieldInfo)) - .reset((IntBlockTermState) termState, flags); - } else { - return (reuse instanceof EverythingEnum everythingEnum - && everythingEnum.canReuse(docIn, fieldInfo) - ? everythingEnum - : new EverythingEnum(fieldInfo)) - .reset((IntBlockTermState) termState, flags); - } + return (reuse instanceof BlockPostingsEnum everythingEnum + && everythingEnum.canReuse(docIn, fieldInfo, flags, false) + ? 
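+ /* Returning one concrete class here, rather than BlockDocsEnum vs.
+ EverythingEnum plus the separate impacts enums, keeps the receiver-type
+ profile small at consumers such as DefaultBulkScorer: HotSpot inlines
+ monomorphic and bimorphic virtual calls, while a call site that sees
+ three or more PostingsEnum implementations goes megamorphic and is
+ typically not inlined. */
+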
everythingEnum + : new BlockPostingsEnum(fieldInfo, flags, false)) + .reset((IntBlockTermState) termState, flags); } @Override public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags) throws IOException { - final IndexOptions options = fieldInfo.getIndexOptions(); - final boolean indexHasPositions = - options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - - if (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 - && (indexHasPositions == false - || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false)) { - return new BlockImpactsDocsEnum(indexHasPositions, (IntBlockTermState) state); - } - - if (indexHasPositions - && (options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0 - || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) - && (fieldInfo.hasPayloads() == false - || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { - return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); - } - - return new SlowImpactsEnum(postings(fieldInfo, state, null, flags)); + return new BlockPostingsEnum(fieldInfo, flags, true).reset((IntBlockTermState) state, flags); } - private static long sumOverRange(int[] arr, int start, int end) { - long res = 0L; + private static int sumOverRange(int[] arr, int start, int end) { + int res = 0; for (int i = start; i < end; i++) { res += arr[i]; } return res; } - private abstract class AbstractPostingsEnum extends PostingsEnum { + final class BlockPostingsEnum extends ImpactsEnum { - protected ForDeltaUtil forDeltaUtil; - protected PForUtil pforUtil; + private ForDeltaUtil forDeltaUtil; + private PForUtil pforUtil; - protected final int[] docBuffer = new int[BLOCK_SIZE + 1]; - protected final boolean indexHasFreq; + private final int[] docBuffer = new int[BLOCK_SIZE + 1]; - protected int doc; // doc we last read + private int doc; // doc we last read // level 0 skip data - protected int level0LastDocID; + private int level0LastDocID; + private long level0DocEndFP; // level 1 skip data - protected int level1LastDocID; - protected long level1DocEndFP; - protected int level1DocCountUpto; + private int level1LastDocID; + private long level1DocEndFP; + private int level1DocCountUpto; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + private int docCountUpto; // number of docs in or before the current block + private int prevDocID; // last doc ID of the previous block + + private int docBufferSize; + private int docBufferUpto; + + private IndexInput docIn; + private PostingDecodingUtil docInUtil; + + private final int[] freqBuffer = new int[BLOCK_SIZE]; + private final int[] posDeltaBuffer; + + private final int[] payloadLengthBuffer; + private final int[] offsetStartDeltaBuffer; + private final int[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset; + private int endOffset; + + private int posBufferUpto; + + final IndexInput posIn; + final PostingDecodingUtil posInUtil; + final IndexInput payIn; + final PostingDecodingUtil payInUtil; + final BytesRef payload; + + final IndexOptions options; + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + 
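// True when the index stores offsets and/or payloads for this field; the
+ // needs* flags below additionally require that the caller requested the
+ // feature via the flags argument.
+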
final boolean indexHasOffsetsOrPayloads; + + final int flags; + final boolean needsFreq; + final boolean needsPos; + final boolean needsOffsets; + final boolean needsPayloads; + final boolean needsOffsetsOrPayloads; + final boolean needsImpacts; + final boolean needsDocsAndFreqsOnly; + + private long freqFP; // offset of the freq block + + private int position; // current position + + // value of docBufferUpto on the last doc ID when positions have been read + private int posDocBufferUpto; + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // File pointer where the last (vInt encoded) pos delta + // block is. We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; - protected int docFreq; // number of docs in this posting list - protected long - totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + // level 0 skip data + private long level0PosEndFP; + private int level0BlockPosUpto; + private long level0PayEndFP; + private int level0BlockPayUpto; + private final BytesRef level0SerializedImpacts; + private final MutableImpactList level0Impacts; - protected int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + // level 1 skip data + private long level1PosEndFP; + private int level1BlockPosUpto; + private long level1PayEndFP; + private int level1BlockPayUpto; + private final BytesRef level1SerializedImpacts; + private final MutableImpactList level1Impacts; - protected int docCountUpto; // number of docs in or before the current block - protected int prevDocID; // last doc ID of the previous block + // true if we shallow-advanced to a new block that we have not decoded yet + private boolean needsRefilling; - protected int docBufferSize; - protected int docBufferUpto; + public BlockPostingsEnum(FieldInfo fieldInfo, int flags, boolean needsImpacts) + throws IOException { + options = fieldInfo.getIndexOptions(); + indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; - protected IndexInput docIn; - protected PostingDecodingUtil docInUtil; + this.flags = flags; + needsFreq = indexHasFreq && PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); + needsPos = indexHasPos && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS); + needsOffsets = indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + needsPayloads = + indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + needsOffsetsOrPayloads = needsOffsets || needsPayloads; + this.needsImpacts = needsImpacts; + needsDocsAndFreqsOnly = needsPos == false && needsImpacts == false; - protected AbstractPostingsEnum(FieldInfo fieldInfo) { - indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in // advance() docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; + + if (needsFreq == false) { + Arrays.fill(freqBuffer, 1); + } + + if (needsFreq && needsImpacts) { + level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); + level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); + 
level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); + level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); + } else { + level0SerializedImpacts = null; + level1SerializedImpacts = null; + level0Impacts = null; + level1Impacts = null; + } + + if (needsPos) { + this.posIn = Lucene101PostingsReader.this.posIn.clone(); + posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); + posDeltaBuffer = new int[BLOCK_SIZE]; + } else { + this.posIn = null; + this.posInUtil = null; + posDeltaBuffer = null; + } + + if (needsOffsets || needsPayloads) { + this.payIn = Lucene101PostingsReader.this.payIn.clone(); + payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn); + } else { + this.payIn = null; + payInUtil = null; + } + + if (needsOffsets) { + offsetStartDeltaBuffer = new int[BLOCK_SIZE]; + offsetLengthBuffer = new int[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + if (indexHasPayloads) { + payloadLengthBuffer = new int[BLOCK_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } } - @Override - public int docID() { - return doc; + public boolean canReuse( + IndexInput docIn, FieldInfo fieldInfo, int flags, boolean needsImpacts) { + return docIn == Lucene101PostingsReader.this.docIn + && options == fieldInfo.getIndexOptions() + && indexHasPayloads == fieldInfo.hasPayloads() + && this.flags == flags + && this.needsImpacts == needsImpacts; } - protected void resetIndexInput(IntBlockTermState termState) throws IOException { + public BlockPostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { docFreq = termState.docFreq; singletonDocID = termState.singletonDocID; if (docFreq > 1) { @@ -376,12 +497,50 @@ protected void resetIndexInput(IntBlockTermState termState) throws IOException { } prefetchPostings(docIn, termState); } - } - protected PostingsEnum resetIdsAndLevelParams(IntBlockTermState termState) throws IOException { + if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) { + forDeltaUtil = new ForDeltaUtil(); + } + totalTermFreq = indexHasFreq ? 
termState.totalTermFreq : termState.docFreq; + if (needsFreq && pforUtil == null && totalTermFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(); + } + + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; + // Where this term's payloads/offsets start in the .pay + // file: + final long payTermStartFP = termState.payStartFP; + if (posIn != null) { + posIn.seek(posTermStartFP); + if (payIn != null) { + payIn.seek(payTermStartFP); + } + } + level1PosEndFP = posTermStartFP; + level1PayEndFP = payTermStartFP; + level0PosEndFP = posTermStartFP; + level0PayEndFP = payTermStartFP; + posPendingCount = 0; + payloadByteUpto = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + level1BlockPosUpto = 0; + level1BlockPayUpto = 0; + level0BlockPosUpto = 0; + level0BlockPayUpto = 0; + posBufferUpto = BLOCK_SIZE; + doc = -1; prevDocID = -1; docCountUpto = 0; + freqFP = -1L; level0LastDocID = -1; if (docFreq < LEVEL1_NUM_DOCS) { level1LastDocID = NO_MORE_DOCS; @@ -395,43 +554,14 @@ protected PostingsEnum resetIdsAndLevelParams(IntBlockTermState termState) throw level1DocCountUpto = 0; docBufferSize = BLOCK_SIZE; docBufferUpto = BLOCK_SIZE; - return this; - } - } - - final class BlockDocsEnum extends AbstractPostingsEnum { - - private final int[] freqBuffer = new int[BLOCK_SIZE]; - - private boolean needsFreq; // true if the caller actually needs frequencies - private long freqFP; - - public BlockDocsEnum(FieldInfo fieldInfo) { - super(fieldInfo); - } + posDocBufferUpto = BLOCK_SIZE; - public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { - final IndexOptions options = fieldInfo.getIndexOptions(); - return docIn == Lucene101PostingsReader.this.docIn - && indexHasFreq == (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0); + return this; } - public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { - resetIndexInput(termState); - if (pforUtil == null && docFreq >= BLOCK_SIZE) { - pforUtil = new PForUtil(); - forDeltaUtil = new ForDeltaUtil(); - } - totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; - - this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); - if (indexHasFreq == false || needsFreq == false) { - // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to - // not fill more than `docFreq` entries. 
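- // (In the merged BlockPostingsEnum this pre-fill happens once in the
- // constructor via Arrays.fill(freqBuffer, 1), paid per enum instance
- // rather than on every reset.)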
- Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); - } - freqFP = -1; - return resetIdsAndLevelParams(termState); + @Override + public int docID() { + return doc; } @Override @@ -441,35 +571,11 @@ public int freq() throws IOException { pforUtil.decode(docInUtil, freqBuffer); freqFP = -1; } - return freqBuffer[docBufferUpto - 1]; } - @Override - public int nextPosition() { - return -1; - } - - @Override - public int startOffset() { - return -1; - } - - @Override - public int endOffset() { - return -1; - } - - @Override - public BytesRef getPayload() { - return null; - } - private void refillFullBlock() throws IOException { - assert docFreq - docCountUpto >= BLOCK_SIZE; - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - if (indexHasFreq) { if (needsFreq) { freqFP = docIn.getFilePointer(); @@ -479,29 +585,44 @@ private void refillFullBlock() throws IOException { docCountUpto += BLOCK_SIZE; prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; + posDocBufferUpto = 0; assert docBuffer[docBufferSize] == NO_MORE_DOCS; } private void refillRemainder() throws IOException { final int left = docFreq - docCountUpto; - assert left >= 0; - assert left < BLOCK_SIZE; - + assert left >= 0 && left < BLOCK_SIZE; if (docFreq == 1) { docBuffer[0] = singletonDocID; freqBuffer[0] = (int) totalTermFreq; docBuffer[1] = NO_MORE_DOCS; + assert freqFP == -1; docCountUpto++; + docBufferSize = 1; } else { // Read vInts: PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq); prefixSum(docBuffer, left, prevDocID); docBuffer[left] = NO_MORE_DOCS; + freqFP = -1L; docCountUpto += left; + docBufferSize = left; } + prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; - docBufferSize = left; - freqFP = -1; + posDocBufferUpto = 0; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docCountUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + refillFullBlock(); + } else { + refillRemainder(); + } } private void skipLevel1To(int target) throws IOException { @@ -509,6 +630,10 @@ private void skipLevel1To(int target) throws IOException { prevDocID = level1LastDocID; level0LastDocID = level1LastDocID; docIn.seek(level1DocEndFP); + level0PosEndFP = level1PosEndFP; + level0BlockPosUpto = level1BlockPosUpto; + level0PayEndFP = level1PayEndFP; + level0BlockPayUpto = level1BlockPayUpto; docCountUpto = level1DocCountUpto; level1DocCountUpto += LEVEL1_NUM_DOCS; @@ -518,1168 +643,178 @@ private void skipLevel1To(int target) throws IOException { } level1LastDocID += docIn.readVInt(); - level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); + long delta = docIn.readVLong(); + level1DocEndFP = delta + docIn.getFilePointer(); - if (level1LastDocID >= target) { - if (indexHasFreq) { - // skip impacts and pos skip data - docIn.skipBytes(docIn.readShort()); + if (indexHasFreq) { + long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); + int numImpactBytes = docIn.readShort(); + if (needsImpacts && level1LastDocID >= target) { + docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); + level1SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); } + if (indexHasPos) { + level1PosEndFP += docIn.readVLong(); + level1BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level1PayEndFP += docIn.readVLong(); + level1BlockPayUpto = docIn.readVInt(); + } + } + assert docIn.getFilePointer() == skip1EndFP; + } + + if 
(level1LastDocID >= target) { break; } } } - private void skipLevel0To(int target) throws IOException { - while (true) { - prevDocID = level0LastDocID; - if (docFreq - docCountUpto >= BLOCK_SIZE) { - long skip0NumBytes = docIn.readVLong(); - // end offset of skip data (before the actual data starts) - long skip0EndFP = docIn.getFilePointer() + skip0NumBytes; - int docDelta = readVInt15(docIn); - level0LastDocID += docDelta; + private void doMoveToNextLevel0Block() throws IOException { + assert docBufferUpto == BLOCK_SIZE; + if (posIn != null) { + if (level0PosEndFP >= posIn.getFilePointer()) { + posIn.seek(level0PosEndFP); + posPendingCount = level0BlockPosUpto; + if (payIn != null) { + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(level0PayEndFP); + payloadByteUpto = level0BlockPayUpto; + } + posBufferUpto = BLOCK_SIZE; + } else { + assert freqFP == -1L; + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); + } + } - if (target <= level0LastDocID) { - docIn.seek(skip0EndFP); - break; + if (docFreq - docCountUpto >= BLOCK_SIZE) { + docIn.readVLong(); // level0NumBytes + int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + long blockLength = readVLong15(docIn); + level0DocEndFP = docIn.getFilePointer() + blockLength; + if (indexHasFreq) { + int numImpactBytes = docIn.readVInt(); + if (needsImpacts) { + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); } - // skip block - docIn.skipBytes(readVLong15(docIn)); - docCountUpto += BLOCK_SIZE; - } else { - level0LastDocID = NO_MORE_DOCS; - break; + if (indexHasPos) { + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level0PayEndFP += docIn.readVLong(); + level0BlockPayUpto = docIn.readVInt(); + } + } } + refillFullBlock(); + } else { + level0LastDocID = NO_MORE_DOCS; + refillRemainder(); } } private void moveToNextLevel0Block() throws IOException { - if (doc == level1LastDocID) { // advance skip data on level 1 + if (doc == level1LastDocID) { // advance level 1 skip data skipLevel1To(doc + 1); } + // Now advance level 0 skip data prevDocID = level0LastDocID; - if (docFreq - docCountUpto >= BLOCK_SIZE) { - docIn.skipBytes(docIn.readVLong()); + + if (needsDocsAndFreqsOnly && docFreq - docCountUpto >= BLOCK_SIZE) { + // Optimize the common path for exhaustive evaluation + long level0NumBytes = docIn.readVLong(); + docIn.skipBytes(level0NumBytes); refillFullBlock(); level0LastDocID = docBuffer[BLOCK_SIZE - 1]; } else { - level0LastDocID = NO_MORE_DOCS; - refillRemainder(); + doMoveToNextLevel0Block(); } } - @Override - public int nextDoc() throws IOException { - if (docBufferUpto == BLOCK_SIZE) { // advance skip data on level 0 - moveToNextLevel0Block(); - } - - return this.doc = docBuffer[docBufferUpto++]; - } - - @Override - public int advance(int target) throws IOException { - if (target > level0LastDocID) { // advance skip data on level 0 - - if (target > level1LastDocID) { // advance skip data on level 1 - skipLevel1To(target); - } - - skipLevel0To(target); - - if (docFreq - docCountUpto >= BLOCK_SIZE) { - refillFullBlock(); - } else { - refillRemainder(); - } - } - - int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - this.doc = docBuffer[next]; - docBufferUpto = next + 1; - return doc; - } - - @Override - public long cost() { - return docFreq; - } - } - - final class EverythingEnum 
extends AbstractPostingsEnum { - - private final int[] freqBuffer = new int[BLOCK_SIZE + 1]; - private final int[] posDeltaBuffer = new int[BLOCK_SIZE]; - - private final int[] payloadLengthBuffer; - private final int[] offsetStartDeltaBuffer; - private final int[] offsetLengthBuffer; - - private byte[] payloadBytes; - private int payloadByteUpto; - private int payloadLength; - - private int lastStartOffset; - private int startOffset; - private int endOffset; - - private int posBufferUpto; - - final IndexInput posIn; - final PostingDecodingUtil posInUtil; - final IndexInput payIn; - final PostingDecodingUtil payInUtil; - final BytesRef payload; - - final boolean indexHasOffsets; - final boolean indexHasPayloads; - final boolean indexHasOffsetsOrPayloads; - - private long freqFP; // offset of the freq block - - private int position; // current position - - // value of docBufferUpto on the last doc ID when positions have been read - private int posDocBufferUpto; - - // how many positions "behind" we are; nextPosition must - // skip these to "catch up": - private int posPendingCount; - - // File pointer where the last (vInt encoded) pos delta - // block is. We need this to know whether to bulk - // decode vs vInt decode the block: - private long lastPosBlockFP; - - private long level0PosEndFP; - private int level0BlockPosUpto; - private long level0PayEndFP; - private int level0BlockPayUpto; - - private long level1PosEndFP; - private int level1BlockPosUpto; - private long level1PayEndFP; - private int level1BlockPayUpto; - - private boolean needsOffsets; // true if we actually need offsets - private boolean needsPayloads; // true if we actually need payloads - private boolean needsPayloadsOrOffsets; - - public EverythingEnum(FieldInfo fieldInfo) throws IOException { - super(fieldInfo); - indexHasOffsets = - fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0; - indexHasPayloads = fieldInfo.hasPayloads(); - indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; - - this.posIn = Lucene101PostingsReader.this.posIn.clone(); - posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); - if (indexHasOffsetsOrPayloads) { - this.payIn = Lucene101PostingsReader.this.payIn.clone(); - payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn); - } else { - this.payIn = null; - payInUtil = null; - } - if (indexHasOffsets) { - offsetStartDeltaBuffer = new int[BLOCK_SIZE]; - offsetLengthBuffer = new int[BLOCK_SIZE]; - } else { - offsetStartDeltaBuffer = null; - offsetLengthBuffer = null; - startOffset = -1; - endOffset = -1; - } - - if (indexHasPayloads) { - payloadLengthBuffer = new int[BLOCK_SIZE]; - payloadBytes = new byte[128]; - payload = new BytesRef(); - } else { - payloadLengthBuffer = null; - payloadBytes = null; - payload = null; - } - } - - public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { - return docIn == Lucene101PostingsReader.this.docIn - && indexHasOffsets - == (fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0) - && indexHasPayloads == fieldInfo.hasPayloads(); - } - - public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { - resetIndexInput(termState); - if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) { - forDeltaUtil = new ForDeltaUtil(); - } - totalTermFreq = termState.totalTermFreq; - if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) { - pforUtil = new PForUtil(); - } - // Where this term's postings start 
in the .pos file: - final long posTermStartFP = termState.posStartFP; - // Where this term's payloads/offsets start in the .pay - // file: - final long payTermStartFP = termState.payStartFP; - posIn.seek(posTermStartFP); - if (indexHasOffsetsOrPayloads) { - payIn.seek(payTermStartFP); - } - level1PosEndFP = posTermStartFP; - level1PayEndFP = payTermStartFP; - level0PosEndFP = posTermStartFP; - level0PayEndFP = payTermStartFP; - posPendingCount = 0; - payloadByteUpto = 0; - if (termState.totalTermFreq < BLOCK_SIZE) { - lastPosBlockFP = posTermStartFP; - } else if (termState.totalTermFreq == BLOCK_SIZE) { - lastPosBlockFP = -1; - } else { - lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; - } - - this.needsOffsets = - indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); - this.needsPayloads = - indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); - this.needsPayloadsOrOffsets = this.needsPayloads || this.needsOffsets; - - level1BlockPosUpto = 0; - level1BlockPayUpto = 0; - level0BlockPosUpto = 0; - level0BlockPayUpto = 0; - posBufferUpto = BLOCK_SIZE; - - return resetIdsAndLevelParams(termState); - } - - @Override - public int freq() throws IOException { - if (freqFP != -1) { - docIn.seek(freqFP); - pforUtil.decode(docInUtil, freqBuffer); - freqFP = -1; - } - return freqBuffer[docBufferUpto - 1]; - } - - private void refillDocs() throws IOException { - final int left = docFreq - docCountUpto; - assert left >= 0; - - if (left >= BLOCK_SIZE) { - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - freqFP = docIn.getFilePointer(); - PForUtil.skip(docIn); - docCountUpto += BLOCK_SIZE; - } else if (docFreq == 1) { - docBuffer[0] = singletonDocID; - freqBuffer[0] = (int) totalTermFreq; - freqFP = -1; - docBuffer[1] = NO_MORE_DOCS; - docCountUpto++; - docBufferSize = 1; - } else { - // Read vInts: - PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); - prefixSum(docBuffer, left, prevDocID); - docBuffer[left] = NO_MORE_DOCS; - freqFP = -1; - docCountUpto += left; - docBufferSize = left; - } - prevDocID = docBuffer[BLOCK_SIZE - 1]; - docBufferUpto = 0; - posDocBufferUpto = 0; - assert docBuffer[docBufferSize] == NO_MORE_DOCS; - } - - private void skipLevel1To(int target) throws IOException { - while (true) { - prevDocID = level1LastDocID; - level0LastDocID = level1LastDocID; - docIn.seek(level1DocEndFP); - level0PosEndFP = level1PosEndFP; - level0BlockPosUpto = level1BlockPosUpto; - if (indexHasOffsetsOrPayloads) { - level0PayEndFP = level1PayEndFP; - level0BlockPayUpto = level1BlockPayUpto; - } - docCountUpto = level1DocCountUpto; - level1DocCountUpto += LEVEL1_NUM_DOCS; - - if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - break; - } - - level1LastDocID += docIn.readVInt(); - long delta = docIn.readVLong(); - level1DocEndFP = delta + docIn.getFilePointer(); - - long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); - docIn.skipBytes(docIn.readShort()); // impacts - level1PosEndFP += docIn.readVLong(); - level1BlockPosUpto = docIn.readByte(); - if (indexHasOffsetsOrPayloads) { - level1PayEndFP += docIn.readVLong(); - level1BlockPayUpto = docIn.readVInt(); - } - assert docIn.getFilePointer() == skip1EndFP; - - if (level1LastDocID >= target) { - break; - } - } - } - - private void moveToNextLevel0Block() throws IOException { - if (doc == level1LastDocID) { // advance level 1 skip data - skipLevel1To(doc + 1); - } - - // Now advance level 0 skip 
data - prevDocID = level0LastDocID; - - assert docBufferUpto == BLOCK_SIZE; - if (level0PosEndFP >= posIn.getFilePointer()) { - posIn.seek(level0PosEndFP); - posPendingCount = level0BlockPosUpto; - if (indexHasOffsetsOrPayloads) { - assert level0PayEndFP >= payIn.getFilePointer(); - payIn.seek(level0PayEndFP); - payloadByteUpto = level0BlockPayUpto; - } - posBufferUpto = BLOCK_SIZE; - } else { - posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); - } - - if (docFreq - docCountUpto >= BLOCK_SIZE) { - docIn.readVLong(); // skip0 num bytes - int docDelta = readVInt15(docIn); - level0LastDocID += docDelta; - readVLong15(docIn); // block length - docIn.skipBytes(docIn.readVLong()); // impacts - - level0PosEndFP += docIn.readVLong(); - level0BlockPosUpto = docIn.readByte(); - if (indexHasOffsetsOrPayloads) { - level0PayEndFP += docIn.readVLong(); - level0BlockPayUpto = docIn.readVInt(); - } - } else { - level0LastDocID = NO_MORE_DOCS; - } - - refillDocs(); - } - - @Override - public int nextDoc() throws IOException { - if (docBufferUpto == BLOCK_SIZE) { // advance level 0 skip data - moveToNextLevel0Block(); - } - - this.doc = docBuffer[docBufferUpto]; - docBufferUpto++; - return doc; - } - - private void skipLevel0To(int target) throws IOException { - long posFP; - int posUpto; - long payFP; - int payUpto; - - while (true) { - prevDocID = level0LastDocID; - - posFP = level0PosEndFP; - posUpto = level0BlockPosUpto; - payFP = level0PayEndFP; - payUpto = level0BlockPayUpto; - - if (docFreq - docCountUpto >= BLOCK_SIZE) { - docIn.readVLong(); // skip0 num bytes - int docDelta = readVInt15(docIn); - level0LastDocID += docDelta; - - long blockLength = readVLong15(docIn); - long blockEndFP = docIn.getFilePointer() + blockLength; - docIn.skipBytes(docIn.readVLong()); // impacts - - level0PosEndFP += docIn.readVLong(); - level0BlockPosUpto = docIn.readByte(); - if (indexHasOffsetsOrPayloads) { - level0PayEndFP += docIn.readVLong(); - level0BlockPayUpto = docIn.readVInt(); - } - - if (target <= level0LastDocID) { - break; - } - - docIn.seek(blockEndFP); - docCountUpto += BLOCK_SIZE; - } else { - level0LastDocID = NO_MORE_DOCS; - break; - } - } - - // If nextBlockPosFP is less than the current FP, it means that the block of positions for - // the first docs of the next block are already decoded. In this case we just accumulate - // frequencies into posPendingCount instead of seeking backwards and decoding the same pos - // block again. 
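- // (Worked example, assuming ForUtil.BLOCK_SIZE = 128: a position block
- // packs 128 positions and may straddle a doc-block boundary, so decoding
- // it can leave posIn past level0PosEndFP. The skipped docs' freqs are
- // then summed into posPendingCount and nextPosition() skips that many
- // positions lazily instead of seeking back and decoding the same pos
- // block twice.)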
- if (posFP >= posIn.getFilePointer()) { - posIn.seek(posFP); - posPendingCount = posUpto; - if (indexHasOffsetsOrPayloads) { - assert level0PayEndFP >= payIn.getFilePointer(); - payIn.seek(payFP); - payloadByteUpto = payUpto; - } - posBufferUpto = BLOCK_SIZE; - } else { - posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); - } - } - - @Override - public int advance(int target) throws IOException { - if (target > level0LastDocID) { // advance level 0 skip data - - if (target > level1LastDocID) { // advance level 1 skip data - skipLevel1To(target); - } - - skipLevel0To(target); - - refillDocs(); - } - - int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - this.docBufferUpto = next + 1; - - return this.doc = docBuffer[next]; - } - - private void skipPositions(int freq) throws IOException { - // Skip positions now: - int toSkip = posPendingCount - freq; - // if (DEBUG) { - // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); - // } - - final int leftInBlock = BLOCK_SIZE - posBufferUpto; - if (toSkip < leftInBlock) { - int end = posBufferUpto + toSkip; - if (indexHasPayloads) { - payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end); - } - posBufferUpto = end; - } else { - toSkip -= leftInBlock; - while (toSkip >= BLOCK_SIZE) { - assert posIn.getFilePointer() != lastPosBlockFP; - PForUtil.skip(posIn); - - if (indexHasPayloads) { - // Skip payloadLength block: - PForUtil.skip(payIn); - - // Skip payloadBytes block: - int numBytes = payIn.readVInt(); - payIn.seek(payIn.getFilePointer() + numBytes); - } - - if (indexHasOffsets) { - PForUtil.skip(payIn); - PForUtil.skip(payIn); - } - toSkip -= BLOCK_SIZE; - } - refillPositions(); - payloadByteUpto = 0; - if (indexHasPayloads) { - payloadByteUpto += sumOverRange(payloadLengthBuffer, 0, toSkip); - } - posBufferUpto = toSkip; - } - - position = 0; - lastStartOffset = 0; - } - - private void refillLastPositionBlock() throws IOException { - final int count = (int) (totalTermFreq % BLOCK_SIZE); - int payloadLength = 0; - int offsetLength = 0; - payloadByteUpto = 0; - for (int i = 0; i < count; i++) { - int code = posIn.readVInt(); - if (indexHasPayloads) { - if ((code & 1) != 0) { - payloadLength = posIn.readVInt(); - } - payloadLengthBuffer[i] = payloadLength; - posDeltaBuffer[i] = code >>> 1; - if (payloadLength != 0) { - if (payloadByteUpto + payloadLength > payloadBytes.length) { - payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); - } - posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); - payloadByteUpto += payloadLength; - } - } else { - posDeltaBuffer[i] = code; - } - - if (indexHasOffsets) { - int deltaCode = posIn.readVInt(); - if ((deltaCode & 1) != 0) { - offsetLength = posIn.readVInt(); - } - offsetStartDeltaBuffer[i] = deltaCode >>> 1; - offsetLengthBuffer[i] = offsetLength; - } - } - payloadByteUpto = 0; - } - - private void refillPositions() throws IOException { - if (posIn.getFilePointer() == lastPosBlockFP) { - refillLastPositionBlock(); - } else { - pforUtil.decode(posInUtil, posDeltaBuffer); - - if (indexHasPayloads) { - if (needsPayloads) { - pforUtil.decode(payInUtil, payloadLengthBuffer); - int numBytes = payIn.readVInt(); - - if (numBytes > payloadBytes.length) { - payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes); - } - payIn.readBytes(payloadBytes, 0, numBytes); - } else { - // this works, because when writing a vint block we always force the first length to be - // written - PForUtil.skip(payIn); 
// skip over lengths - payIn.skipBytes(payIn.readVInt()); // skip over payloadBytes - } - payloadByteUpto = 0; - } - - if (indexHasOffsets) { - if (needsOffsets) { - pforUtil.decode(payInUtil, offsetStartDeltaBuffer); - pforUtil.decode(payInUtil, offsetLengthBuffer); - } else { - // this works, because when writing a vint block we always force the first length to be - // written - PForUtil.skip(payIn); // skip over starts - PForUtil.skip(payIn); // skip over lengths - } - } - } - } - - private void accumulatePayloadAndOffsets() { - if (needsPayloads) { - payloadLength = payloadLengthBuffer[posBufferUpto]; - payload.bytes = payloadBytes; - payload.offset = payloadByteUpto; - payload.length = payloadLength; - payloadByteUpto += payloadLength; - } - - if (needsOffsets) { - startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; - endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; - lastStartOffset = startOffset; - } - } - - @Override - public int nextPosition() throws IOException { - if (posDocBufferUpto != docBufferUpto) { - int freq = freq(); // triggers lazy decoding of freqs - - // First position that is being read on this doc. - posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto); - posDocBufferUpto = docBufferUpto; - - assert posPendingCount > 0; - - if (posPendingCount > freq) { - skipPositions(freq); - posPendingCount = freq; - } - - position = 0; - lastStartOffset = 0; - } - - if (posBufferUpto == BLOCK_SIZE) { - refillPositions(); - posBufferUpto = 0; - } - position += posDeltaBuffer[posBufferUpto]; - - if (needsPayloadsOrOffsets) { - accumulatePayloadAndOffsets(); - } - - posBufferUpto++; - posPendingCount--; - return position; - } - - @Override - public int startOffset() { - if (needsOffsets == false) { - return -1; - } - return startOffset; - } - - @Override - public int endOffset() { - if (needsOffsets == false) { - return -1; - } - return endOffset; - } - - @Override - public BytesRef getPayload() { - if (needsPayloads == false || payloadLength == 0) { - return null; - } else { - return payload; - } - } - - @Override - public long cost() { - return docFreq; - } - } - - private abstract class BlockImpactsEnum extends ImpactsEnum { - - protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); - protected final PForUtil pforUtil = new PForUtil(); - - protected final int[] docBuffer = new int[BLOCK_SIZE + 1]; - protected final int[] freqBuffer = new int[BLOCK_SIZE]; - - protected final int docFreq; // number of docs in this posting list - // sum of freqBuffer in this posting list (or docFreq when omitted) - protected final long totalTermFreq; - protected final int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 - - protected final IndexInput docIn; - protected final PostingDecodingUtil docInUtil; - - protected int docCountUpto; // number of docs in or before the current block - protected int doc = -1; // doc we last read - protected int prevDocID = -1; // last doc ID of the previous block - protected int docBufferSize = BLOCK_SIZE; - protected int docBufferUpto = BLOCK_SIZE; - - // true if we shallow-advanced to a new block that we have not decoded yet - protected boolean needsRefilling; - - // level 0 skip data - protected int level0LastDocID = -1; - protected long level0DocEndFP; - protected final BytesRef level0SerializedImpacts; - protected final MutableImpactList level0Impacts; - // level 1 skip data - protected int level1LastDocID; - protected long level1DocEndFP; - protected int 
level1DocCountUpto = 0; - protected final BytesRef level1SerializedImpacts; - protected final MutableImpactList level1Impacts; - - private BlockImpactsEnum(IntBlockTermState termState) throws IOException { - this.docFreq = termState.docFreq; - this.docIn = Lucene101PostingsReader.this.docIn.clone(); - this.docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); - if (docFreq > 1) { - prefetchPostings(docIn, termState); - } - this.singletonDocID = termState.singletonDocID; - this.totalTermFreq = termState.totalTermFreq; - level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); - level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); - level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); - level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); - if (docFreq < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - if (docFreq > 1) { - docIn.seek(termState.docStartFP); - } - } else { - level1LastDocID = -1; - level1DocEndFP = termState.docStartFP; - } - // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in - // advance() - docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int startOffset() { - return -1; - } - - @Override - public int endOffset() { - return -1; - } - - @Override - public BytesRef getPayload() { - return null; - } - - @Override - public long cost() { - return docFreq; - } - - private final Impacts impacts = - new Impacts() { - - private final ByteArrayDataInput scratch = new ByteArrayDataInput(); - - @Override - public int numLevels() { - return level1LastDocID == NO_MORE_DOCS ? 1 : 2; - } - - @Override - public int getDocIdUpTo(int level) { - if (level == 0) { - return level0LastDocID; - } - return level == 1 ? 
level1LastDocID : NO_MORE_DOCS; - } - - @Override - public List getImpacts(int level) { - if (level == 0 && level0LastDocID != NO_MORE_DOCS) { - return readImpacts(level0SerializedImpacts, level0Impacts); - } - if (level == 1) { - return readImpacts(level1SerializedImpacts, level1Impacts); - } - return DUMMY_IMPACTS; - } - - private List readImpacts(BytesRef serialized, MutableImpactList impactsList) { - var scratch = this.scratch; - scratch.reset(serialized.bytes, 0, serialized.length); - Lucene101PostingsReader.readImpacts(scratch, impactsList); - return impactsList; - } - }; - - @Override - public Impacts getImpacts() { - return impacts; - } - } - - final class BlockImpactsDocsEnum extends BlockImpactsEnum { - final boolean indexHasPos; - - private long freqFP; - - public BlockImpactsDocsEnum(boolean indexHasPos, IntBlockTermState termState) - throws IOException { - super(termState); - this.indexHasPos = indexHasPos; - freqFP = -1; - } - - @Override - public int freq() throws IOException { - if (freqFP != -1) { - docIn.seek(freqFP); - pforUtil.decode(docInUtil, freqBuffer); - freqFP = -1; - } - return freqBuffer[docBufferUpto - 1]; - } - - @Override - public int nextPosition() { - return -1; - } - - private void refillDocs() throws IOException { - final int left = docFreq - docCountUpto; - assert left >= 0; - - if (left >= BLOCK_SIZE) { - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - freqFP = docIn.getFilePointer(); - PForUtil.skip(docIn); - docCountUpto += BLOCK_SIZE; - } else if (docFreq == 1) { - docBuffer[0] = singletonDocID; - freqBuffer[0] = (int) totalTermFreq; - freqFP = -1; - docBuffer[1] = NO_MORE_DOCS; - docCountUpto++; - docBufferSize = 1; - } else { - // Read vInts: - PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true); - prefixSum(docBuffer, left, prevDocID); - docBuffer[left] = NO_MORE_DOCS; - freqFP = -1; - docCountUpto += left; - docBufferSize = left; - } - prevDocID = docBuffer[BLOCK_SIZE - 1]; - docBufferUpto = 0; - assert docBuffer[docBufferSize] == NO_MORE_DOCS; - } - - private void skipLevel1To(int target) throws IOException { - while (true) { - prevDocID = level1LastDocID; - level0LastDocID = level1LastDocID; - docIn.seek(level1DocEndFP); - docCountUpto = level1DocCountUpto; - level1DocCountUpto += LEVEL1_NUM_DOCS; - - if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - break; - } - - level1LastDocID += docIn.readVInt(); - level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); - - if (level1LastDocID >= target) { - long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); - int numImpactBytes = docIn.readShort(); - docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); - level1SerializedImpacts.length = numImpactBytes; - assert indexHasPos || docIn.getFilePointer() == skip1EndFP; - docIn.seek(skip1EndFP); - break; - } - } - } - - private void skipLevel0To(int target) throws IOException { - while (true) { - prevDocID = level0LastDocID; - if (docFreq - docCountUpto >= BLOCK_SIZE) { - long skip0NumBytes = docIn.readVLong(); - // end offset of skip data (before the actual data starts) - long skip0End = docIn.getFilePointer() + skip0NumBytes; - int docDelta = readVInt15(docIn); - long blockLength = readVLong15(docIn); - - level0LastDocID += docDelta; - - if (target <= level0LastDocID) { - level0DocEndFP = docIn.getFilePointer() + blockLength; - int numImpactBytes = docIn.readVInt(); - docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); - 
level0SerializedImpacts.length = numImpactBytes; - docIn.seek(skip0End); - break; - } - - // skip block - docIn.skipBytes(blockLength); - docCountUpto += BLOCK_SIZE; - } else { - level0LastDocID = NO_MORE_DOCS; - break; - } - } - } - - @Override - public void advanceShallow(int target) throws IOException { - if (target > level0LastDocID) { // advance skip data on level 0 - if (target > level1LastDocID) { // advance skip data on level 1 - skipLevel1To(target); - } else if (needsRefilling) { - docIn.seek(level0DocEndFP); - docCountUpto += BLOCK_SIZE; - } - - skipLevel0To(target); - - needsRefilling = true; - } - } - - private void moveToNextLevel0Block() throws IOException { - if (doc == level1LastDocID) { - skipLevel1To(doc + 1); - } else if (needsRefilling) { - docIn.seek(level0DocEndFP); - docCountUpto += BLOCK_SIZE; - } - - prevDocID = level0LastDocID; - if (docFreq - docCountUpto >= BLOCK_SIZE) { - final long skip0Len = docIn.readVLong(); // skip len - final long skip0End = docIn.getFilePointer() + skip0Len; - final int docDelta = readVInt15(docIn); - final long blockLength = readVLong15(docIn); - level0LastDocID += docDelta; - level0DocEndFP = docIn.getFilePointer() + blockLength; - final int numImpactBytes = docIn.readVInt(); - docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); - level0SerializedImpacts.length = numImpactBytes; - docIn.seek(skip0End); - } else { - level0LastDocID = NO_MORE_DOCS; - } - - refillDocs(); - needsRefilling = false; - } - - @Override - public int nextDoc() throws IOException { - if (docBufferUpto == BLOCK_SIZE) { - if (needsRefilling) { - refillDocs(); - needsRefilling = false; - } else { - moveToNextLevel0Block(); - } - } - - return this.doc = docBuffer[docBufferUpto++]; - } - - @Override - public int advance(int target) throws IOException { - if (target > level0LastDocID || needsRefilling) { - advanceShallow(target); - refillDocs(); - needsRefilling = false; - } - - int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - this.doc = docBuffer[next]; - docBufferUpto = next + 1; - return doc; - } - } - - final class BlockImpactsPostingsEnum extends BlockImpactsEnum { - private final int[] posDeltaBuffer = new int[BLOCK_SIZE]; - - private int posBufferUpto; - final IndexInput posIn; - final PostingDecodingUtil posInUtil; - - final boolean indexHasFreq; - final boolean indexHasOffsets; - final boolean indexHasPayloads; - final boolean indexHasOffsetsOrPayloads; - - private long freqFP; // offset of the freq block - - private int position; // current position - - // value of docBufferUpto on the last doc ID when positions have been read - private int posDocBufferUpto; - - // how many positions "behind" we are; nextPosition must - // skip these to "catch up": - private int posPendingCount; - - // File pointer where the last (vInt encoded) pos delta - // block is. 
We need this to know whether to bulk - // decode vs vInt decode the block: - private final long lastPosBlockFP; - - // level 0 skip data - private long level0PosEndFP; - private int level0BlockPosUpto; - // level 1 skip data - private long level1PosEndFP; - private int level1BlockPosUpto; - - public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) - throws IOException { - super(termState); - final IndexOptions options = fieldInfo.getIndexOptions(); - indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasOffsets = - options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; - indexHasPayloads = fieldInfo.hasPayloads(); - indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; - - this.posIn = Lucene101PostingsReader.this.posIn.clone(); - posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); - - // Where this term's postings start in the .pos file: - final long posTermStartFP = termState.posStartFP; - posIn.seek(posTermStartFP); - level1PosEndFP = posTermStartFP; - level0PosEndFP = posTermStartFP; - posPendingCount = 0; - if (termState.totalTermFreq < BLOCK_SIZE) { - lastPosBlockFP = posTermStartFP; - } else if (termState.totalTermFreq == BLOCK_SIZE) { - lastPosBlockFP = -1; - } else { - lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; - } - level1BlockPosUpto = 0; - posBufferUpto = BLOCK_SIZE; - } - - @Override - public int freq() throws IOException { - if (freqFP != -1) { - docIn.seek(freqFP); - pforUtil.decode(docInUtil, freqBuffer); - freqFP = -1; - } - return freqBuffer[docBufferUpto - 1]; - } - - private void refillDocs() throws IOException { - final int left = docFreq - docCountUpto; - assert left >= 0; - - if (left >= BLOCK_SIZE) { - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - freqFP = docIn.getFilePointer(); - PForUtil.skip(docIn); - docCountUpto += BLOCK_SIZE; - } else if (docFreq == 1) { - docBuffer[0] = singletonDocID; - freqBuffer[0] = (int) totalTermFreq; - freqFP = -1; - docBuffer[1] = NO_MORE_DOCS; - docCountUpto++; - docBufferSize = 1; - - } else { - // Read vInts: - PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); - prefixSum(docBuffer, left, prevDocID); - docBuffer[left] = NO_MORE_DOCS; - freqFP = -1; - docCountUpto += left; - docBufferSize = left; - freqFP = -1; + private void readLevel0PosData() throws IOException { + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level0PayEndFP += docIn.readVLong(); + level0BlockPayUpto = docIn.readVInt(); } - prevDocID = docBuffer[BLOCK_SIZE - 1]; - docBufferUpto = 0; - posDocBufferUpto = 0; - assert docBuffer[docBufferSize] == NO_MORE_DOCS; } - private void skipLevel1To(int target) throws IOException { - while (true) { - prevDocID = level1LastDocID; - level0LastDocID = level1LastDocID; - docIn.seek(level1DocEndFP); - level0PosEndFP = level1PosEndFP; - level0BlockPosUpto = level1BlockPosUpto; - docCountUpto = level1DocCountUpto; - level1DocCountUpto += LEVEL1_NUM_DOCS; - - if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - break; - } - - level1LastDocID += docIn.readVInt(); - level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); - - long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); - int numImpactBytes = docIn.readShort(); - if (level1LastDocID >= target) { - docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); - 
level1SerializedImpacts.length = numImpactBytes; - } else { - docIn.skipBytes(numImpactBytes); - } - level1PosEndFP += docIn.readVLong(); - level1BlockPosUpto = docIn.readByte(); - assert indexHasOffsetsOrPayloads || docIn.getFilePointer() == skip1EndFP; - - if (level1LastDocID >= target) { - docIn.seek(skip1EndFP); - break; + private void seekPosData(long posFP, int posUpto, long payFP, int payUpto) throws IOException { + // If nextBlockPosFP is less than the current FP, it means that the block of positions for + // the first docs of the next block are already decoded. In this case we just accumulate + // frequencies into posPendingCount instead of seeking backwards and decoding the same pos + // block again. + if (posFP >= posIn.getFilePointer()) { + posIn.seek(posFP); + posPendingCount = posUpto; + if (payIn != null) { // needs payloads or offsets + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(payFP); + payloadByteUpto = payUpto; } + posBufferUpto = BLOCK_SIZE; + } else { + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); } } private void skipLevel0To(int target) throws IOException { long posFP; int posUpto; + long payFP; + int payUpto; while (true) { prevDocID = level0LastDocID; posFP = level0PosEndFP; posUpto = level0BlockPosUpto; + payFP = level0PayEndFP; + payUpto = level0BlockPayUpto; if (docFreq - docCountUpto >= BLOCK_SIZE) { - docIn.readVLong(); // skip0 num bytes + long numSkipBytes = docIn.readVLong(); + long skip0End = docIn.getFilePointer() + numSkipBytes; int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + boolean found = target <= level0LastDocID; long blockLength = readVLong15(docIn); level0DocEndFP = docIn.getFilePointer() + blockLength; - level0LastDocID += docDelta; - - if (target <= level0LastDocID) { - int numImpactBytes = docIn.readVInt(); - docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); - level0SerializedImpacts.length = numImpactBytes; - level0PosEndFP += docIn.readVLong(); - level0BlockPosUpto = docIn.readByte(); - if (indexHasOffsetsOrPayloads) { - docIn.readVLong(); // pay fp delta - docIn.readVInt(); // pay upto + if (indexHasFreq) { + if (found == false && needsPos == false) { + docIn.seek(skip0End); + } else { + int numImpactBytes = docIn.readVInt(); + if (needsImpacts && found) { + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); + } + + if (needsPos) { + readLevel0PosData(); + } else { + docIn.seek(skip0End); + } } + } + + if (found) { break; } - // skip block - docIn.skipBytes(docIn.readVLong()); // impacts - level0PosEndFP += docIn.readVLong(); - level0BlockPosUpto = docIn.readVInt(); + docIn.seek(level0DocEndFP); docCountUpto += BLOCK_SIZE; } else { @@ -1688,62 +823,62 @@ private void skipLevel0To(int target) throws IOException { } } - // If nextBlockPosFP is less than the current FP, it means that the block of positions for - // the first docs of the next block are already decoded. In this case we just accumulate - // frequencies into posPendingCount instead of seeking backwards and decoding the same pos - // block again. 
- if (posFP >= posIn.getFilePointer()) { - posIn.seek(posFP); - posPendingCount = posUpto; - posBufferUpto = BLOCK_SIZE; - } else { - posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); + if (posIn != null) { // needs positions + seekPosData(posFP, posUpto, payFP, payUpto); } } @Override public void advanceShallow(int target) throws IOException { if (target > level0LastDocID) { // advance level 0 skip data + doAdvanceShallow(target); - if (target > level1LastDocID) { // advance skip data on level 1 - skipLevel1To(target); - } else if (needsRefilling) { - docIn.seek(level0DocEndFP); - docCountUpto += BLOCK_SIZE; + // If we are on the last doc ID of a block and we are advancing on the doc ID just beyond + // this block, then we decode the block. This may not be necessary, but this helps avoid + // having to check whether we are in a block that is not decoded yet in #nextDoc(). + if (docBufferUpto == BLOCK_SIZE && target == doc + 1) { + refillDocs(); + needsRefilling = false; + } else { + needsRefilling = true; } + } + } - skipLevel0To(target); - - needsRefilling = true; + private void doAdvanceShallow(int target) throws IOException { + if (target > level1LastDocID) { // advance skip data on level 1 + skipLevel1To(target); + } else if (needsRefilling) { + docIn.seek(level0DocEndFP); + docCountUpto += BLOCK_SIZE; } + + skipLevel0To(target); } @Override public int nextDoc() throws IOException { if (docBufferUpto == BLOCK_SIZE) { - advanceShallow(doc + 1); - assert needsRefilling; - refillDocs(); - needsRefilling = false; + moveToNextLevel0Block(); } - doc = docBuffer[docBufferUpto]; - docBufferUpto++; - return this.doc; + return this.doc = docBuffer[docBufferUpto++]; } @Override public int advance(int target) throws IOException { if (target > level0LastDocID || needsRefilling) { - advanceShallow(target); - assert needsRefilling; + if (target > level0LastDocID) { + doAdvanceShallow(target); + } refillDocs(); needsRefilling = false; } int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + this.doc = docBuffer[next]; docBufferUpto = next + 1; - return this.doc = docBuffer[next]; + return doc; } private void skipPositions(int freq) throws IOException { @@ -1755,66 +890,170 @@ private void skipPositions(int freq) throws IOException { final int leftInBlock = BLOCK_SIZE - posBufferUpto; if (toSkip < leftInBlock) { - posBufferUpto += toSkip; + int end = posBufferUpto + toSkip; + if (needsPayloads) { + payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end); + } + posBufferUpto = end; } else { toSkip -= leftInBlock; while (toSkip >= BLOCK_SIZE) { assert posIn.getFilePointer() != lastPosBlockFP; PForUtil.skip(posIn); + + if (payIn != null) { + if (indexHasPayloads) { + // Skip payloadLength block: + PForUtil.skip(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + PForUtil.skip(payIn); + PForUtil.skip(payIn); + } + } toSkip -= BLOCK_SIZE; } refillPositions(); + if (needsPayloads) { + payloadByteUpto = sumOverRange(payloadLengthBuffer, 0, toSkip); + } posBufferUpto = toSkip; } } - private void refillPositions() throws IOException { - if (posIn.getFilePointer() == lastPosBlockFP) { - final int count = (int) (totalTermFreq % BLOCK_SIZE); - int payloadLength = 0; - for (int i = 0; i < count; i++) { - int code = posIn.readVInt(); - if (indexHasPayloads) { - if ((code & 1) != 0) { - payloadLength = posIn.readVInt(); - } + private void 

- private void refillPositions() throws IOException {
- if (posIn.getFilePointer() == lastPosBlockFP) {
- final int count = (int) (totalTermFreq % BLOCK_SIZE);
- int payloadLength = 0;
- for (int i = 0; i < count; i++) {
- int code = posIn.readVInt();
- if (indexHasPayloads) {
- if ((code & 1) != 0) {
- payloadLength = posIn.readVInt();
- }
+ private void refillLastPositionBlock() throws IOException {
+ final int count = (int) (totalTermFreq % BLOCK_SIZE);
+ int payloadLength = 0;
+ int offsetLength = 0;
+ payloadByteUpto = 0;
+ for (int i = 0; i < count; i++) {
+ int code = posIn.readVInt();
+ if (indexHasPayloads) {
+ if ((code & 1) != 0) {
+ payloadLength = posIn.readVInt();
+ }
+ if (payloadLengthBuffer != null) { // needs payloads
+ payloadLengthBuffer[i] = payloadLength;
 posDeltaBuffer[i] = code >>> 1;
 if (payloadLength != 0) {
- posIn.skipBytes(payloadLength);
+ if (payloadByteUpto + payloadLength > payloadBytes.length) {
+ payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength);
+ }
+ posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength);
+ payloadByteUpto += payloadLength;
 }
 } else {
- posDeltaBuffer[i] = code;
+ posIn.skipBytes(payloadLength);
+ }
+ } else {
+ posDeltaBuffer[i] = code;
+ }
+
+ if (indexHasOffsets) {
+ int deltaCode = posIn.readVInt();
+ if ((deltaCode & 1) != 0) {
+ offsetLength = posIn.readVInt();
+ }
+ if (offsetStartDeltaBuffer != null) { // needs offsets
+ offsetStartDeltaBuffer[i] = deltaCode >>> 1;
+ offsetLengthBuffer[i] = offsetLength;
 }
+ }
+ }
+ payloadByteUpto = 0;
+ }

- if (indexHasOffsets) {
- int deltaCode = posIn.readVInt();
- if ((deltaCode & 1) != 0) {
- posIn.readVInt(); // offset length
- }
+ private void refillOffsetsOrPayloads() throws IOException {
+ if (indexHasPayloads) {
+ if (needsPayloads) {
+ pforUtil.decode(payInUtil, payloadLengthBuffer);
+ int numBytes = payIn.readVInt();
+
+ if (numBytes > payloadBytes.length) {
+ payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes);
 }
+ payIn.readBytes(payloadBytes, 0, numBytes);
+ } else if (payIn != null) { // needs offsets
+ // this works, because when writing a vint block we always force the first length to be
+ // written
+ PForUtil.skip(payIn); // skip over lengths
+ int numBytes = payIn.readVInt(); // read length of payloadBytes
+ payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes
+ }
+ payloadByteUpto = 0;
+ }
+
+ if (indexHasOffsets) {
+ if (needsOffsets) {
+ pforUtil.decode(payInUtil, offsetStartDeltaBuffer);
+ pforUtil.decode(payInUtil, offsetLengthBuffer);
+ } else if (payIn != null) { // needs payloads
+ // this works, because when writing a vint block we always force the first length to be
+ // written
+ PForUtil.skip(payIn); // skip over starts
+ PForUtil.skip(payIn); // skip over lengths
 }
- } else {
- pforUtil.decode(posInUtil, posDeltaBuffer);
 }
 }
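
The tail of a posting list (totalTermFreq % 128 positions) is vInt-encoded rather than PFOR-encoded, which is what refillLastPositionBlock() decodes: each code carries a position delta, and when payloads are indexed its low bit flags that a new payload length follows. A self-contained sketch of that wire format, with offsets left out for brevity (the class name is illustrative; DataInput is Lucene's):

    import java.io.IOException;
    import org.apache.lucene.store.DataInput;

    final class TailBlockSketch {
      static int[] readPositionDeltas(DataInput posIn, int count, boolean indexHasPayloads)
          throws IOException {
        int[] posDeltas = new int[count];
        int payloadLength = 0; // sticky: only re-read when the low bit is set
        for (int i = 0; i < count; i++) {
          int code = posIn.readVInt();
          if (indexHasPayloads) {
            if ((code & 1) != 0) {
              payloadLength = posIn.readVInt();
            }
            posDeltas[i] = code >>> 1;
            posIn.skipBytes(payloadLength); // payload bytes are inlined here
          } else {
            posDeltas[i] = code;
          }
        }
        return posDeltas;
      }
    }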

- @Override
- public int nextPosition() throws IOException {
- if (posDocBufferUpto != docBufferUpto) {
- int freq = freq(); // triggers lazy decoding of freqs

+ private void refillPositions() throws IOException {
+ if (posIn.getFilePointer() == lastPosBlockFP) {
+ refillLastPositionBlock();
+ return;
+ }
+ pforUtil.decode(posInUtil, posDeltaBuffer);

- // First position that is being read on this doc.
- posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
- posDocBufferUpto = docBufferUpto;

+ if (indexHasOffsetsOrPayloads) {
+ refillOffsetsOrPayloads();
+ }
+ }

- assert posPendingCount > 0;
+ private void accumulatePendingPositions() throws IOException {
+ int freq = freq(); // trigger lazy decoding of freqs
+ posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
+ posDocBufferUpto = docBufferUpto;

- if (posPendingCount > freq) {
- skipPositions(freq);
- posPendingCount = freq;
- }
+ assert posPendingCount > 0;
+ if (posPendingCount > freq) {
+ skipPositions(freq);
+ posPendingCount = freq;
+ }
+ }
+
+ private void accumulatePayloadAndOffsets() {
+ if (needsPayloads) {
+ payloadLength = payloadLengthBuffer[posBufferUpto];
+ payload.bytes = payloadBytes;
+ payload.offset = payloadByteUpto;
+ payload.length = payloadLength;
+ payloadByteUpto += payloadLength;
+ }
+
+ if (needsOffsets) {
+ startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto];
+ endOffset = startOffset + offsetLengthBuffer[posBufferUpto];
+ lastStartOffset = startOffset;
+ }
+ }
+
+ @Override
+ public int nextPosition() throws IOException {
+ if (needsPos == false) {
+ return -1;
+ }
+
+ assert posDocBufferUpto <= docBufferUpto;
+ if (posDocBufferUpto != docBufferUpto) {
+ // First position we're reading on this doc
+ accumulatePendingPositions();
 position = 0;
+ lastStartOffset = 0;
 }

 if (posBufferUpto == BLOCK_SIZE) {
@@ -1823,10 +1062,92 @@ public int nextPosition() throws IOException {
 }

 position += posDeltaBuffer[posBufferUpto];
+ if (needsOffsetsOrPayloads) {
+ accumulatePayloadAndOffsets();
+ }
+
 posBufferUpto++;
 posPendingCount--;
 return position;
 }
+
+ @Override
+ public int startOffset() {
+ if (needsOffsets == false) {
+ return -1;
+ }
+ return startOffset;
+ }
+
+ @Override
+ public int endOffset() {
+ if (needsOffsets == false) {
+ return -1;
+ }
+ return endOffset;
+ }
+
+ @Override
+ public BytesRef getPayload() {
+ if (needsPayloads == false || payloadLength == 0) {
+ return null;
+ } else {
+ return payload;
+ }
+ }
+
+ @Override
+ public long cost() {
+ return docFreq;
+ }
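
nextPosition() stays lazy: while the caller iterates doc IDs only, the freqs of visited docs pile up in posPendingCount, and the first nextPosition() call on a doc discards all positions owed to earlier docs in one batch. A stripped-down sketch of that bookkeeping (hypothetical class; plain arrays stand in for the decoded freq block):

    final class PendingPositionsSketch {
      private int posPendingCount; // positions owed for docs up to the current one

      /** Doc iteration advanced from index from to index to within the block. */
      void accumulate(int[] freqBuffer, int from, int to) {
        for (int i = from; i < to; i++) {
          posPendingCount += freqBuffer[i];
        }
      }

      /** Positions of earlier docs to discard before reading the current doc's. */
      int toSkip(int freqOfCurrentDoc) {
        return Math.max(0, posPendingCount - freqOfCurrentDoc);
      }
    }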
+
+ private final Impacts impacts =
+ new Impacts() {
+
+ private final ByteArrayDataInput scratch = new ByteArrayDataInput();
+
+ @Override
+ public int numLevels() {
+ return indexHasFreq == false || level1LastDocID == NO_MORE_DOCS ? 1 : 2;
+ }
+
+ @Override
+ public int getDocIdUpTo(int level) {
+ if (indexHasFreq == false) {
+ return NO_MORE_DOCS;
+ }
+ if (level == 0) {
+ return level0LastDocID;
+ }
+ return level == 1 ? level1LastDocID : NO_MORE_DOCS;
+ }
+
+ @Override
+ public List<Impact> getImpacts(int level) {
+ if (indexHasFreq) {
+ if (level == 0 && level0LastDocID != NO_MORE_DOCS) {
+ return readImpacts(level0SerializedImpacts, level0Impacts);
+ }
+ if (level == 1) {
+ return readImpacts(level1SerializedImpacts, level1Impacts);
+ }
+ }
+ return DUMMY_IMPACTS;
+ }
+
+ private List<Impact> readImpacts(BytesRef serialized, MutableImpactList impactsList) {
+ var scratch = this.scratch;
+ scratch.reset(serialized.bytes, 0, serialized.length);
+ Lucene101PostingsReader.readImpacts(scratch, impactsList);
+ return impactsList;
+ }
+ };
+
+ @Override
+ public Impacts getImpacts() {
+ assert needsImpacts;
+ return impacts;
+ }
 }

 /**
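
For context on how the Impacts object above gets consumed: getDocIdUpTo(level) bounds the doc IDs that a level's impacts cover (level 0 roughly the current 128-doc block, level 1 the current group of blocks), and getImpacts(level) returns (freq, norm) pairs that upper-bound scores in that range. A hedged sketch of a consumer (Impact and Impacts are Lucene classes; the helper itself is illustrative):

    import org.apache.lucene.index.Impact;
    import org.apache.lucene.index.Impacts;

    final class ImpactsSketch {
      /** Max freq among impacts valid for all docs up to upTo, or -1 if none. */
      static int maxFreqUpTo(Impacts impacts, int upTo) {
        for (int level = 0; level < impacts.numLevels(); level++) {
          if (impacts.getDocIdUpTo(level) >= upTo) {
            int maxFreq = 0;
            for (Impact impact : impacts.getImpacts(level)) {
              maxFreq = Math.max(maxFreq, impact.freq);
            }
            return maxFreq;
          }
        }
        return -1;
      }
    }
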
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index 1c5436dcf0a5..b1d283cc3899 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -399,7 +399,7 @@ public boolean equals(Object obj) {
 /**
 * A guess of the average number of simple operations for the initial seek and buffer refill per
 * document for the positions of a term. See also {@link
- * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
+ * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}.
 *
 * <p>Aside: Instead of being constant this could depend among others on {@link
 * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
@@ -410,9 +410,8 @@ public boolean equals(Object obj) {
 private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

 /**
- * Number of simple operations in {@link
- * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
- * is done.
+ * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}
+ * when no seek or buffer refill is done.
 */
 private static final int TERM_OPS_PER_POS = 7;
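
Both constants feed the same per-document cost estimate (modeled on PhraseQuery#termPositionsCost): a fixed charge for the seek and buffer refill, plus a per-occurrence charge for the expected number of positions in a matching doc:

    // Restatement of the estimate these constants are used for; the signature
    // is slightly simplified compared to the actual method.
    static float termPositionsCost(int docFreq, long totalTermFreq) {
      float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
      return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
    }
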
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
index c0f2f61d7cc1..c1360a9d9a7e 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
@@ -261,7 +261,7 @@ public void visit(String field, QueryVisitor visitor) {
 /**
 * A guess of the average number of simple operations for the initial seek and buffer refill per
 * document for the positions of a term. See also {@link
- * Lucene101PostingsReader.EverythingEnum#nextPosition()}.
+ * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}.
 *
 * <p>Aside: Instead of being constant this could depend among others on {@link
 * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
@@ -272,7 +272,7 @@ public void visit(String field, QueryVisitor visitor) {
 private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

 /**
- * Number of simple operations in {@link Lucene101PostingsReader.EverythingEnum#nextPosition()}
+ * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}
 * when no seek or buffer refill is done.
 */
 private static final int TERM_OPS_PER_POS = 7;
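
Finally, a hedged usage sketch of the API this change sits behind: whatever flags are requested, the reader now hands back the same BlockPostingsEnum class, so a consumer like the one below observes a single concrete PostingsEnum type for docs-only and positional iteration alike (field and term are placeholders; everything else is the public Lucene API):

    import java.io.IOException;
    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.PostingsEnum;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.BytesRef;

    final class PostingsUsageSketch {
      static long countOccurrences(LeafReader reader, String field, BytesRef term)
          throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
          return 0;
        }
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(term) == false) {
          return 0;
        }
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
        long occurrences = 0;
        for (int doc = postings.nextDoc();
            doc != DocIdSetIterator.NO_MORE_DOCS;
            doc = postings.nextDoc()) {
          occurrences += postings.freq();
        }
        return occurrences;
      }
    }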