diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 98580d3ea3da..afda0820239d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -61,6 +61,9 @@ Improvements segments for merges below the floor segment size, even if maxMergeAtOnce is bigger than segsPerTier. (Adrien Grand) +* GITHUB#14033: Combine all postings enum impls of the default codec into a + single class. (Adrien Grand) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java index d879a58b4ab7..010ed43135fe 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -44,7 +44,6 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SlowImpactsEnum; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; import org.apache.lucene.internal.vectorization.VectorizationProvider; import org.apache.lucene.store.ByteArrayDataInput; @@ -272,100 +271,222 @@ public void decodeTerm( public PostingsEnum postings( FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException { - if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0 - || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { - return (reuse instanceof BlockDocsEnum blockDocsEnum - && blockDocsEnum.canReuse(docIn, fieldInfo) - ? blockDocsEnum - : new BlockDocsEnum(fieldInfo)) - .reset((IntBlockTermState) termState, flags); - } else { - return (reuse instanceof EverythingEnum everythingEnum - && everythingEnum.canReuse(docIn, fieldInfo) - ? everythingEnum - : new EverythingEnum(fieldInfo)) - .reset((IntBlockTermState) termState, flags); - } + return (reuse instanceof BlockPostingsEnum everythingEnum + && everythingEnum.canReuse(docIn, fieldInfo, flags, false) + ? 
everythingEnum + : new BlockPostingsEnum(fieldInfo, flags, false)) + .reset((IntBlockTermState) termState, flags); } @Override public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags) throws IOException { - final IndexOptions options = fieldInfo.getIndexOptions(); - final boolean indexHasPositions = - options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - - if (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 - && (indexHasPositions == false - || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false)) { - return new BlockImpactsDocsEnum(indexHasPositions, (IntBlockTermState) state); - } - - if (indexHasPositions - && (options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0 - || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) - && (fieldInfo.hasPayloads() == false - || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { - return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); - } - - return new SlowImpactsEnum(postings(fieldInfo, state, null, flags)); + return new BlockPostingsEnum(fieldInfo, flags, true).reset((IntBlockTermState) state, flags); } - private static long sumOverRange(int[] arr, int start, int end) { - long res = 0L; + private static int sumOverRange(int[] arr, int start, int end) { + int res = 0; for (int i = start; i < end; i++) { res += arr[i]; } return res; } - private abstract class AbstractPostingsEnum extends PostingsEnum { + final class BlockPostingsEnum extends ImpactsEnum { - protected ForDeltaUtil forDeltaUtil; - protected PForUtil pforUtil; + private ForDeltaUtil forDeltaUtil; + private PForUtil pforUtil; - protected final int[] docBuffer = new int[BLOCK_SIZE + 1]; - protected final boolean indexHasFreq; + private final int[] docBuffer = new int[BLOCK_SIZE + 1]; - protected int doc; // doc we last read + private int doc; // doc we last read // level 0 skip data - protected int level0LastDocID; + private int level0LastDocID; + private long level0DocEndFP; // level 1 skip data - protected int level1LastDocID; - protected long level1DocEndFP; - protected int level1DocCountUpto; + private int level1LastDocID; + private long level1DocEndFP; + private int level1DocCountUpto; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + private int docCountUpto; // number of docs in or before the current block + private int prevDocID; // last doc ID of the previous block + + private int docBufferSize; + private int docBufferUpto; + + private IndexInput docIn; + private PostingDecodingUtil docInUtil; + + private final int[] freqBuffer = new int[BLOCK_SIZE]; + private final int[] posDeltaBuffer; + + private final int[] payloadLengthBuffer; + private final int[] offsetStartDeltaBuffer; + private final int[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset; + private int endOffset; + + private int posBufferUpto; + + final IndexInput posIn; + final PostingDecodingUtil posInUtil; + final IndexInput payIn; + final PostingDecodingUtil payInUtil; + final BytesRef payload; + + final IndexOptions options; + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + 
final boolean indexHasOffsetsOrPayloads; + + final int flags; + final boolean needsFreq; + final boolean needsPos; + final boolean needsOffsets; + final boolean needsPayloads; + final boolean needsOffsetsOrPayloads; + final boolean needsImpacts; + final boolean needsDocsAndFreqsOnly; + + private long freqFP; // offset of the freq block + + private int position; // current position + + // value of docBufferUpto on the last doc ID when positions have been read + private int posDocBufferUpto; + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // File pointer where the last (vInt encoded) pos delta + // block is. We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; - protected int docFreq; // number of docs in this posting list - protected long - totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + // level 0 skip data + private long level0PosEndFP; + private int level0BlockPosUpto; + private long level0PayEndFP; + private int level0BlockPayUpto; + private final BytesRef level0SerializedImpacts; + private final MutableImpactList level0Impacts; - protected int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + // level 1 skip data + private long level1PosEndFP; + private int level1BlockPosUpto; + private long level1PayEndFP; + private int level1BlockPayUpto; + private final BytesRef level1SerializedImpacts; + private final MutableImpactList level1Impacts; - protected int docCountUpto; // number of docs in or before the current block - protected int prevDocID; // last doc ID of the previous block + // true if we shallow-advanced to a new block that we have not decoded yet + private boolean needsRefilling; - protected int docBufferSize; - protected int docBufferUpto; + public BlockPostingsEnum(FieldInfo fieldInfo, int flags, boolean needsImpacts) + throws IOException { + options = fieldInfo.getIndexOptions(); + indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; - protected IndexInput docIn; - protected PostingDecodingUtil docInUtil; + this.flags = flags; + needsFreq = indexHasFreq && PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); + needsPos = indexHasPos && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS); + needsOffsets = indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + needsPayloads = + indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + needsOffsetsOrPayloads = needsOffsets || needsPayloads; + this.needsImpacts = needsImpacts; + needsDocsAndFreqsOnly = needsPos == false && needsImpacts == false; - protected AbstractPostingsEnum(FieldInfo fieldInfo) { - indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in // advance() docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; + + if (needsFreq == false) { + Arrays.fill(freqBuffer, 1); + } + + if (needsFreq && needsImpacts) { + level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); + level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); + 
level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); + level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); + } else { + level0SerializedImpacts = null; + level1SerializedImpacts = null; + level0Impacts = null; + level1Impacts = null; + } + + if (needsPos) { + this.posIn = Lucene101PostingsReader.this.posIn.clone(); + posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); + posDeltaBuffer = new int[BLOCK_SIZE]; + } else { + this.posIn = null; + this.posInUtil = null; + posDeltaBuffer = null; + } + + if (needsOffsets || needsPayloads) { + this.payIn = Lucene101PostingsReader.this.payIn.clone(); + payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn); + } else { + this.payIn = null; + payInUtil = null; + } + + if (needsOffsets) { + offsetStartDeltaBuffer = new int[BLOCK_SIZE]; + offsetLengthBuffer = new int[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + if (indexHasPayloads) { + payloadLengthBuffer = new int[BLOCK_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } } - @Override - public int docID() { - return doc; + public boolean canReuse( + IndexInput docIn, FieldInfo fieldInfo, int flags, boolean needsImpacts) { + return docIn == Lucene101PostingsReader.this.docIn + && options == fieldInfo.getIndexOptions() + && indexHasPayloads == fieldInfo.hasPayloads() + && this.flags == flags + && this.needsImpacts == needsImpacts; } - protected void resetIndexInput(IntBlockTermState termState) throws IOException { + public BlockPostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { docFreq = termState.docFreq; singletonDocID = termState.singletonDocID; if (docFreq > 1) { @@ -376,12 +497,50 @@ protected void resetIndexInput(IntBlockTermState termState) throws IOException { } prefetchPostings(docIn, termState); } - } - protected PostingsEnum resetIdsAndLevelParams(IntBlockTermState termState) throws IOException { + if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) { + forDeltaUtil = new ForDeltaUtil(); + } + totalTermFreq = indexHasFreq ? 
termState.totalTermFreq : termState.docFreq; + if (needsFreq && pforUtil == null && totalTermFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(); + } + + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; + // Where this term's payloads/offsets start in the .pay + // file: + final long payTermStartFP = termState.payStartFP; + if (posIn != null) { + posIn.seek(posTermStartFP); + if (payIn != null) { + payIn.seek(payTermStartFP); + } + } + level1PosEndFP = posTermStartFP; + level1PayEndFP = payTermStartFP; + level0PosEndFP = posTermStartFP; + level0PayEndFP = payTermStartFP; + posPendingCount = 0; + payloadByteUpto = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + level1BlockPosUpto = 0; + level1BlockPayUpto = 0; + level0BlockPosUpto = 0; + level0BlockPayUpto = 0; + posBufferUpto = BLOCK_SIZE; + doc = -1; prevDocID = -1; docCountUpto = 0; + freqFP = -1L; level0LastDocID = -1; if (docFreq < LEVEL1_NUM_DOCS) { level1LastDocID = NO_MORE_DOCS; @@ -395,43 +554,14 @@ protected PostingsEnum resetIdsAndLevelParams(IntBlockTermState termState) throw level1DocCountUpto = 0; docBufferSize = BLOCK_SIZE; docBufferUpto = BLOCK_SIZE; - return this; - } - } - - final class BlockDocsEnum extends AbstractPostingsEnum { - - private final int[] freqBuffer = new int[BLOCK_SIZE]; - - private boolean needsFreq; // true if the caller actually needs frequencies - private long freqFP; - - public BlockDocsEnum(FieldInfo fieldInfo) { - super(fieldInfo); - } + posDocBufferUpto = BLOCK_SIZE; - public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { - final IndexOptions options = fieldInfo.getIndexOptions(); - return docIn == Lucene101PostingsReader.this.docIn - && indexHasFreq == (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0); + return this; } - public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { - resetIndexInput(termState); - if (pforUtil == null && docFreq >= BLOCK_SIZE) { - pforUtil = new PForUtil(); - forDeltaUtil = new ForDeltaUtil(); - } - totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; - - this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); - if (indexHasFreq == false || needsFreq == false) { - // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to - // not fill more than `docFreq` entries. 
- Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); - } - freqFP = -1; - return resetIdsAndLevelParams(termState); + @Override + public int docID() { + return doc; } @Override @@ -441,35 +571,11 @@ public int freq() throws IOException { pforUtil.decode(docInUtil, freqBuffer); freqFP = -1; } - return freqBuffer[docBufferUpto - 1]; } - @Override - public int nextPosition() { - return -1; - } - - @Override - public int startOffset() { - return -1; - } - - @Override - public int endOffset() { - return -1; - } - - @Override - public BytesRef getPayload() { - return null; - } - private void refillFullBlock() throws IOException { - assert docFreq - docCountUpto >= BLOCK_SIZE; - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - if (indexHasFreq) { if (needsFreq) { freqFP = docIn.getFilePointer(); @@ -479,29 +585,44 @@ private void refillFullBlock() throws IOException { docCountUpto += BLOCK_SIZE; prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; + posDocBufferUpto = 0; assert docBuffer[docBufferSize] == NO_MORE_DOCS; } private void refillRemainder() throws IOException { final int left = docFreq - docCountUpto; - assert left >= 0; - assert left < BLOCK_SIZE; - + assert left >= 0 && left < BLOCK_SIZE; if (docFreq == 1) { docBuffer[0] = singletonDocID; freqBuffer[0] = (int) totalTermFreq; docBuffer[1] = NO_MORE_DOCS; + assert freqFP == -1; docCountUpto++; + docBufferSize = 1; } else { // Read vInts: PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq); prefixSum(docBuffer, left, prevDocID); docBuffer[left] = NO_MORE_DOCS; + freqFP = -1L; docCountUpto += left; + docBufferSize = left; } + prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; - docBufferSize = left; - freqFP = -1; + posDocBufferUpto = 0; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docCountUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + refillFullBlock(); + } else { + refillRemainder(); + } } private void skipLevel1To(int target) throws IOException { @@ -509,6 +630,10 @@ private void skipLevel1To(int target) throws IOException { prevDocID = level1LastDocID; level0LastDocID = level1LastDocID; docIn.seek(level1DocEndFP); + level0PosEndFP = level1PosEndFP; + level0BlockPosUpto = level1BlockPosUpto; + level0PayEndFP = level1PayEndFP; + level0BlockPayUpto = level1BlockPayUpto; docCountUpto = level1DocCountUpto; level1DocCountUpto += LEVEL1_NUM_DOCS; @@ -518,1168 +643,178 @@ private void skipLevel1To(int target) throws IOException { } level1LastDocID += docIn.readVInt(); - level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); + long delta = docIn.readVLong(); + level1DocEndFP = delta + docIn.getFilePointer(); - if (level1LastDocID >= target) { - if (indexHasFreq) { - // skip impacts and pos skip data - docIn.skipBytes(docIn.readShort()); + if (indexHasFreq) { + long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); + int numImpactBytes = docIn.readShort(); + if (needsImpacts && level1LastDocID >= target) { + docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); + level1SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); } + if (indexHasPos) { + level1PosEndFP += docIn.readVLong(); + level1BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level1PayEndFP += docIn.readVLong(); + level1BlockPayUpto = docIn.readVInt(); + } + } + assert docIn.getFilePointer() == skip1EndFP; + } + + if 
(level1LastDocID >= target) { break; } } } - private void skipLevel0To(int target) throws IOException { - while (true) { - prevDocID = level0LastDocID; - if (docFreq - docCountUpto >= BLOCK_SIZE) { - long skip0NumBytes = docIn.readVLong(); - // end offset of skip data (before the actual data starts) - long skip0EndFP = docIn.getFilePointer() + skip0NumBytes; - int docDelta = readVInt15(docIn); - level0LastDocID += docDelta; + private void doMoveToNextLevel0Block() throws IOException { + assert docBufferUpto == BLOCK_SIZE; + if (posIn != null) { + if (level0PosEndFP >= posIn.getFilePointer()) { + posIn.seek(level0PosEndFP); + posPendingCount = level0BlockPosUpto; + if (payIn != null) { + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(level0PayEndFP); + payloadByteUpto = level0BlockPayUpto; + } + posBufferUpto = BLOCK_SIZE; + } else { + assert freqFP == -1L; + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); + } + } - if (target <= level0LastDocID) { - docIn.seek(skip0EndFP); - break; + if (docFreq - docCountUpto >= BLOCK_SIZE) { + docIn.readVLong(); // level0NumBytes + int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + long blockLength = readVLong15(docIn); + level0DocEndFP = docIn.getFilePointer() + blockLength; + if (indexHasFreq) { + int numImpactBytes = docIn.readVInt(); + if (needsImpacts) { + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); } - // skip block - docIn.skipBytes(readVLong15(docIn)); - docCountUpto += BLOCK_SIZE; - } else { - level0LastDocID = NO_MORE_DOCS; - break; + if (indexHasPos) { + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level0PayEndFP += docIn.readVLong(); + level0BlockPayUpto = docIn.readVInt(); + } + } } + refillFullBlock(); + } else { + level0LastDocID = NO_MORE_DOCS; + refillRemainder(); } } private void moveToNextLevel0Block() throws IOException { - if (doc == level1LastDocID) { // advance skip data on level 1 + if (doc == level1LastDocID) { // advance level 1 skip data skipLevel1To(doc + 1); } + // Now advance level 0 skip data prevDocID = level0LastDocID; - if (docFreq - docCountUpto >= BLOCK_SIZE) { - docIn.skipBytes(docIn.readVLong()); + + if (needsDocsAndFreqsOnly && docFreq - docCountUpto >= BLOCK_SIZE) { + // Optimize the common path for exhaustive evaluation + long level0NumBytes = docIn.readVLong(); + docIn.skipBytes(level0NumBytes); refillFullBlock(); level0LastDocID = docBuffer[BLOCK_SIZE - 1]; } else { - level0LastDocID = NO_MORE_DOCS; - refillRemainder(); + doMoveToNextLevel0Block(); } } - @Override - public int nextDoc() throws IOException { - if (docBufferUpto == BLOCK_SIZE) { // advance skip data on level 0 - moveToNextLevel0Block(); - } - - return this.doc = docBuffer[docBufferUpto++]; - } - - @Override - public int advance(int target) throws IOException { - if (target > level0LastDocID) { // advance skip data on level 0 - - if (target > level1LastDocID) { // advance skip data on level 1 - skipLevel1To(target); - } - - skipLevel0To(target); - - if (docFreq - docCountUpto >= BLOCK_SIZE) { - refillFullBlock(); - } else { - refillRemainder(); - } - } - - int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - this.doc = docBuffer[next]; - docBufferUpto = next + 1; - return doc; - } - - @Override - public long cost() { - return docFreq; - } - } - - final class EverythingEnum 
extends AbstractPostingsEnum { - - private final int[] freqBuffer = new int[BLOCK_SIZE + 1]; - private final int[] posDeltaBuffer = new int[BLOCK_SIZE]; - - private final int[] payloadLengthBuffer; - private final int[] offsetStartDeltaBuffer; - private final int[] offsetLengthBuffer; - - private byte[] payloadBytes; - private int payloadByteUpto; - private int payloadLength; - - private int lastStartOffset; - private int startOffset; - private int endOffset; - - private int posBufferUpto; - - final IndexInput posIn; - final PostingDecodingUtil posInUtil; - final IndexInput payIn; - final PostingDecodingUtil payInUtil; - final BytesRef payload; - - final boolean indexHasOffsets; - final boolean indexHasPayloads; - final boolean indexHasOffsetsOrPayloads; - - private long freqFP; // offset of the freq block - - private int position; // current position - - // value of docBufferUpto on the last doc ID when positions have been read - private int posDocBufferUpto; - - // how many positions "behind" we are; nextPosition must - // skip these to "catch up": - private int posPendingCount; - - // File pointer where the last (vInt encoded) pos delta - // block is. We need this to know whether to bulk - // decode vs vInt decode the block: - private long lastPosBlockFP; - - private long level0PosEndFP; - private int level0BlockPosUpto; - private long level0PayEndFP; - private int level0BlockPayUpto; - - private long level1PosEndFP; - private int level1BlockPosUpto; - private long level1PayEndFP; - private int level1BlockPayUpto; - - private boolean needsOffsets; // true if we actually need offsets - private boolean needsPayloads; // true if we actually need payloads - private boolean needsPayloadsOrOffsets; - - public EverythingEnum(FieldInfo fieldInfo) throws IOException { - super(fieldInfo); - indexHasOffsets = - fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0; - indexHasPayloads = fieldInfo.hasPayloads(); - indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; - - this.posIn = Lucene101PostingsReader.this.posIn.clone(); - posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); - if (indexHasOffsetsOrPayloads) { - this.payIn = Lucene101PostingsReader.this.payIn.clone(); - payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn); - } else { - this.payIn = null; - payInUtil = null; - } - if (indexHasOffsets) { - offsetStartDeltaBuffer = new int[BLOCK_SIZE]; - offsetLengthBuffer = new int[BLOCK_SIZE]; - } else { - offsetStartDeltaBuffer = null; - offsetLengthBuffer = null; - startOffset = -1; - endOffset = -1; - } - - if (indexHasPayloads) { - payloadLengthBuffer = new int[BLOCK_SIZE]; - payloadBytes = new byte[128]; - payload = new BytesRef(); - } else { - payloadLengthBuffer = null; - payloadBytes = null; - payload = null; - } - } - - public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { - return docIn == Lucene101PostingsReader.this.docIn - && indexHasOffsets - == (fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0) - && indexHasPayloads == fieldInfo.hasPayloads(); - } - - public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { - resetIndexInput(termState); - if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) { - forDeltaUtil = new ForDeltaUtil(); - } - totalTermFreq = termState.totalTermFreq; - if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) { - pforUtil = new PForUtil(); - } - // Where this term's postings start 
in the .pos file: - final long posTermStartFP = termState.posStartFP; - // Where this term's payloads/offsets start in the .pay - // file: - final long payTermStartFP = termState.payStartFP; - posIn.seek(posTermStartFP); - if (indexHasOffsetsOrPayloads) { - payIn.seek(payTermStartFP); - } - level1PosEndFP = posTermStartFP; - level1PayEndFP = payTermStartFP; - level0PosEndFP = posTermStartFP; - level0PayEndFP = payTermStartFP; - posPendingCount = 0; - payloadByteUpto = 0; - if (termState.totalTermFreq < BLOCK_SIZE) { - lastPosBlockFP = posTermStartFP; - } else if (termState.totalTermFreq == BLOCK_SIZE) { - lastPosBlockFP = -1; - } else { - lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; - } - - this.needsOffsets = - indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); - this.needsPayloads = - indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); - this.needsPayloadsOrOffsets = this.needsPayloads || this.needsOffsets; - - level1BlockPosUpto = 0; - level1BlockPayUpto = 0; - level0BlockPosUpto = 0; - level0BlockPayUpto = 0; - posBufferUpto = BLOCK_SIZE; - - return resetIdsAndLevelParams(termState); - } - - @Override - public int freq() throws IOException { - if (freqFP != -1) { - docIn.seek(freqFP); - pforUtil.decode(docInUtil, freqBuffer); - freqFP = -1; - } - return freqBuffer[docBufferUpto - 1]; - } - - private void refillDocs() throws IOException { - final int left = docFreq - docCountUpto; - assert left >= 0; - - if (left >= BLOCK_SIZE) { - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - freqFP = docIn.getFilePointer(); - PForUtil.skip(docIn); - docCountUpto += BLOCK_SIZE; - } else if (docFreq == 1) { - docBuffer[0] = singletonDocID; - freqBuffer[0] = (int) totalTermFreq; - freqFP = -1; - docBuffer[1] = NO_MORE_DOCS; - docCountUpto++; - docBufferSize = 1; - } else { - // Read vInts: - PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); - prefixSum(docBuffer, left, prevDocID); - docBuffer[left] = NO_MORE_DOCS; - freqFP = -1; - docCountUpto += left; - docBufferSize = left; - } - prevDocID = docBuffer[BLOCK_SIZE - 1]; - docBufferUpto = 0; - posDocBufferUpto = 0; - assert docBuffer[docBufferSize] == NO_MORE_DOCS; - } - - private void skipLevel1To(int target) throws IOException { - while (true) { - prevDocID = level1LastDocID; - level0LastDocID = level1LastDocID; - docIn.seek(level1DocEndFP); - level0PosEndFP = level1PosEndFP; - level0BlockPosUpto = level1BlockPosUpto; - if (indexHasOffsetsOrPayloads) { - level0PayEndFP = level1PayEndFP; - level0BlockPayUpto = level1BlockPayUpto; - } - docCountUpto = level1DocCountUpto; - level1DocCountUpto += LEVEL1_NUM_DOCS; - - if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - break; - } - - level1LastDocID += docIn.readVInt(); - long delta = docIn.readVLong(); - level1DocEndFP = delta + docIn.getFilePointer(); - - long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); - docIn.skipBytes(docIn.readShort()); // impacts - level1PosEndFP += docIn.readVLong(); - level1BlockPosUpto = docIn.readByte(); - if (indexHasOffsetsOrPayloads) { - level1PayEndFP += docIn.readVLong(); - level1BlockPayUpto = docIn.readVInt(); - } - assert docIn.getFilePointer() == skip1EndFP; - - if (level1LastDocID >= target) { - break; - } - } - } - - private void moveToNextLevel0Block() throws IOException { - if (doc == level1LastDocID) { // advance level 1 skip data - skipLevel1To(doc + 1); - } - - // Now advance level 0 skip 
data - prevDocID = level0LastDocID; - - assert docBufferUpto == BLOCK_SIZE; - if (level0PosEndFP >= posIn.getFilePointer()) { - posIn.seek(level0PosEndFP); - posPendingCount = level0BlockPosUpto; - if (indexHasOffsetsOrPayloads) { - assert level0PayEndFP >= payIn.getFilePointer(); - payIn.seek(level0PayEndFP); - payloadByteUpto = level0BlockPayUpto; - } - posBufferUpto = BLOCK_SIZE; - } else { - posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); - } - - if (docFreq - docCountUpto >= BLOCK_SIZE) { - docIn.readVLong(); // skip0 num bytes - int docDelta = readVInt15(docIn); - level0LastDocID += docDelta; - readVLong15(docIn); // block length - docIn.skipBytes(docIn.readVLong()); // impacts - - level0PosEndFP += docIn.readVLong(); - level0BlockPosUpto = docIn.readByte(); - if (indexHasOffsetsOrPayloads) { - level0PayEndFP += docIn.readVLong(); - level0BlockPayUpto = docIn.readVInt(); - } - } else { - level0LastDocID = NO_MORE_DOCS; - } - - refillDocs(); - } - - @Override - public int nextDoc() throws IOException { - if (docBufferUpto == BLOCK_SIZE) { // advance level 0 skip data - moveToNextLevel0Block(); - } - - this.doc = docBuffer[docBufferUpto]; - docBufferUpto++; - return doc; - } - - private void skipLevel0To(int target) throws IOException { - long posFP; - int posUpto; - long payFP; - int payUpto; - - while (true) { - prevDocID = level0LastDocID; - - posFP = level0PosEndFP; - posUpto = level0BlockPosUpto; - payFP = level0PayEndFP; - payUpto = level0BlockPayUpto; - - if (docFreq - docCountUpto >= BLOCK_SIZE) { - docIn.readVLong(); // skip0 num bytes - int docDelta = readVInt15(docIn); - level0LastDocID += docDelta; - - long blockLength = readVLong15(docIn); - long blockEndFP = docIn.getFilePointer() + blockLength; - docIn.skipBytes(docIn.readVLong()); // impacts - - level0PosEndFP += docIn.readVLong(); - level0BlockPosUpto = docIn.readByte(); - if (indexHasOffsetsOrPayloads) { - level0PayEndFP += docIn.readVLong(); - level0BlockPayUpto = docIn.readVInt(); - } - - if (target <= level0LastDocID) { - break; - } - - docIn.seek(blockEndFP); - docCountUpto += BLOCK_SIZE; - } else { - level0LastDocID = NO_MORE_DOCS; - break; - } - } - - // If nextBlockPosFP is less than the current FP, it means that the block of positions for - // the first docs of the next block are already decoded. In this case we just accumulate - // frequencies into posPendingCount instead of seeking backwards and decoding the same pos - // block again. 
- if (posFP >= posIn.getFilePointer()) { - posIn.seek(posFP); - posPendingCount = posUpto; - if (indexHasOffsetsOrPayloads) { - assert level0PayEndFP >= payIn.getFilePointer(); - payIn.seek(payFP); - payloadByteUpto = payUpto; - } - posBufferUpto = BLOCK_SIZE; - } else { - posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); - } - } - - @Override - public int advance(int target) throws IOException { - if (target > level0LastDocID) { // advance level 0 skip data - - if (target > level1LastDocID) { // advance level 1 skip data - skipLevel1To(target); - } - - skipLevel0To(target); - - refillDocs(); - } - - int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - this.docBufferUpto = next + 1; - - return this.doc = docBuffer[next]; - } - - private void skipPositions(int freq) throws IOException { - // Skip positions now: - int toSkip = posPendingCount - freq; - // if (DEBUG) { - // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); - // } - - final int leftInBlock = BLOCK_SIZE - posBufferUpto; - if (toSkip < leftInBlock) { - int end = posBufferUpto + toSkip; - if (indexHasPayloads) { - payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end); - } - posBufferUpto = end; - } else { - toSkip -= leftInBlock; - while (toSkip >= BLOCK_SIZE) { - assert posIn.getFilePointer() != lastPosBlockFP; - PForUtil.skip(posIn); - - if (indexHasPayloads) { - // Skip payloadLength block: - PForUtil.skip(payIn); - - // Skip payloadBytes block: - int numBytes = payIn.readVInt(); - payIn.seek(payIn.getFilePointer() + numBytes); - } - - if (indexHasOffsets) { - PForUtil.skip(payIn); - PForUtil.skip(payIn); - } - toSkip -= BLOCK_SIZE; - } - refillPositions(); - payloadByteUpto = 0; - if (indexHasPayloads) { - payloadByteUpto += sumOverRange(payloadLengthBuffer, 0, toSkip); - } - posBufferUpto = toSkip; - } - - position = 0; - lastStartOffset = 0; - } - - private void refillLastPositionBlock() throws IOException { - final int count = (int) (totalTermFreq % BLOCK_SIZE); - int payloadLength = 0; - int offsetLength = 0; - payloadByteUpto = 0; - for (int i = 0; i < count; i++) { - int code = posIn.readVInt(); - if (indexHasPayloads) { - if ((code & 1) != 0) { - payloadLength = posIn.readVInt(); - } - payloadLengthBuffer[i] = payloadLength; - posDeltaBuffer[i] = code >>> 1; - if (payloadLength != 0) { - if (payloadByteUpto + payloadLength > payloadBytes.length) { - payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); - } - posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); - payloadByteUpto += payloadLength; - } - } else { - posDeltaBuffer[i] = code; - } - - if (indexHasOffsets) { - int deltaCode = posIn.readVInt(); - if ((deltaCode & 1) != 0) { - offsetLength = posIn.readVInt(); - } - offsetStartDeltaBuffer[i] = deltaCode >>> 1; - offsetLengthBuffer[i] = offsetLength; - } - } - payloadByteUpto = 0; - } - - private void refillPositions() throws IOException { - if (posIn.getFilePointer() == lastPosBlockFP) { - refillLastPositionBlock(); - } else { - pforUtil.decode(posInUtil, posDeltaBuffer); - - if (indexHasPayloads) { - if (needsPayloads) { - pforUtil.decode(payInUtil, payloadLengthBuffer); - int numBytes = payIn.readVInt(); - - if (numBytes > payloadBytes.length) { - payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes); - } - payIn.readBytes(payloadBytes, 0, numBytes); - } else { - // this works, because when writing a vint block we always force the first length to be - // written - PForUtil.skip(payIn); 
// skip over lengths - payIn.skipBytes(payIn.readVInt()); // skip over payloadBytes - } - payloadByteUpto = 0; - } - - if (indexHasOffsets) { - if (needsOffsets) { - pforUtil.decode(payInUtil, offsetStartDeltaBuffer); - pforUtil.decode(payInUtil, offsetLengthBuffer); - } else { - // this works, because when writing a vint block we always force the first length to be - // written - PForUtil.skip(payIn); // skip over starts - PForUtil.skip(payIn); // skip over lengths - } - } - } - } - - private void accumulatePayloadAndOffsets() { - if (needsPayloads) { - payloadLength = payloadLengthBuffer[posBufferUpto]; - payload.bytes = payloadBytes; - payload.offset = payloadByteUpto; - payload.length = payloadLength; - payloadByteUpto += payloadLength; - } - - if (needsOffsets) { - startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; - endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; - lastStartOffset = startOffset; - } - } - - @Override - public int nextPosition() throws IOException { - if (posDocBufferUpto != docBufferUpto) { - int freq = freq(); // triggers lazy decoding of freqs - - // First position that is being read on this doc. - posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto); - posDocBufferUpto = docBufferUpto; - - assert posPendingCount > 0; - - if (posPendingCount > freq) { - skipPositions(freq); - posPendingCount = freq; - } - - position = 0; - lastStartOffset = 0; - } - - if (posBufferUpto == BLOCK_SIZE) { - refillPositions(); - posBufferUpto = 0; - } - position += posDeltaBuffer[posBufferUpto]; - - if (needsPayloadsOrOffsets) { - accumulatePayloadAndOffsets(); - } - - posBufferUpto++; - posPendingCount--; - return position; - } - - @Override - public int startOffset() { - if (needsOffsets == false) { - return -1; - } - return startOffset; - } - - @Override - public int endOffset() { - if (needsOffsets == false) { - return -1; - } - return endOffset; - } - - @Override - public BytesRef getPayload() { - if (needsPayloads == false || payloadLength == 0) { - return null; - } else { - return payload; - } - } - - @Override - public long cost() { - return docFreq; - } - } - - private abstract class BlockImpactsEnum extends ImpactsEnum { - - protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); - protected final PForUtil pforUtil = new PForUtil(); - - protected final int[] docBuffer = new int[BLOCK_SIZE + 1]; - protected final int[] freqBuffer = new int[BLOCK_SIZE]; - - protected final int docFreq; // number of docs in this posting list - // sum of freqBuffer in this posting list (or docFreq when omitted) - protected final long totalTermFreq; - protected final int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 - - protected final IndexInput docIn; - protected final PostingDecodingUtil docInUtil; - - protected int docCountUpto; // number of docs in or before the current block - protected int doc = -1; // doc we last read - protected int prevDocID = -1; // last doc ID of the previous block - protected int docBufferSize = BLOCK_SIZE; - protected int docBufferUpto = BLOCK_SIZE; - - // true if we shallow-advanced to a new block that we have not decoded yet - protected boolean needsRefilling; - - // level 0 skip data - protected int level0LastDocID = -1; - protected long level0DocEndFP; - protected final BytesRef level0SerializedImpacts; - protected final MutableImpactList level0Impacts; - // level 1 skip data - protected int level1LastDocID; - protected long level1DocEndFP; - protected int 
level1DocCountUpto = 0; - protected final BytesRef level1SerializedImpacts; - protected final MutableImpactList level1Impacts; - - private BlockImpactsEnum(IntBlockTermState termState) throws IOException { - this.docFreq = termState.docFreq; - this.docIn = Lucene101PostingsReader.this.docIn.clone(); - this.docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); - if (docFreq > 1) { - prefetchPostings(docIn, termState); - } - this.singletonDocID = termState.singletonDocID; - this.totalTermFreq = termState.totalTermFreq; - level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); - level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); - level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); - level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); - if (docFreq < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - if (docFreq > 1) { - docIn.seek(termState.docStartFP); - } - } else { - level1LastDocID = -1; - level1DocEndFP = termState.docStartFP; - } - // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in - // advance() - docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int startOffset() { - return -1; - } - - @Override - public int endOffset() { - return -1; - } - - @Override - public BytesRef getPayload() { - return null; - } - - @Override - public long cost() { - return docFreq; - } - - private final Impacts impacts = - new Impacts() { - - private final ByteArrayDataInput scratch = new ByteArrayDataInput(); - - @Override - public int numLevels() { - return level1LastDocID == NO_MORE_DOCS ? 1 : 2; - } - - @Override - public int getDocIdUpTo(int level) { - if (level == 0) { - return level0LastDocID; - } - return level == 1 ? 
level1LastDocID : NO_MORE_DOCS; - } - - @Override - public List getImpacts(int level) { - if (level == 0 && level0LastDocID != NO_MORE_DOCS) { - return readImpacts(level0SerializedImpacts, level0Impacts); - } - if (level == 1) { - return readImpacts(level1SerializedImpacts, level1Impacts); - } - return DUMMY_IMPACTS; - } - - private List readImpacts(BytesRef serialized, MutableImpactList impactsList) { - var scratch = this.scratch; - scratch.reset(serialized.bytes, 0, serialized.length); - Lucene101PostingsReader.readImpacts(scratch, impactsList); - return impactsList; - } - }; - - @Override - public Impacts getImpacts() { - return impacts; - } - } - - final class BlockImpactsDocsEnum extends BlockImpactsEnum { - final boolean indexHasPos; - - private long freqFP; - - public BlockImpactsDocsEnum(boolean indexHasPos, IntBlockTermState termState) - throws IOException { - super(termState); - this.indexHasPos = indexHasPos; - freqFP = -1; - } - - @Override - public int freq() throws IOException { - if (freqFP != -1) { - docIn.seek(freqFP); - pforUtil.decode(docInUtil, freqBuffer); - freqFP = -1; - } - return freqBuffer[docBufferUpto - 1]; - } - - @Override - public int nextPosition() { - return -1; - } - - private void refillDocs() throws IOException { - final int left = docFreq - docCountUpto; - assert left >= 0; - - if (left >= BLOCK_SIZE) { - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - freqFP = docIn.getFilePointer(); - PForUtil.skip(docIn); - docCountUpto += BLOCK_SIZE; - } else if (docFreq == 1) { - docBuffer[0] = singletonDocID; - freqBuffer[0] = (int) totalTermFreq; - freqFP = -1; - docBuffer[1] = NO_MORE_DOCS; - docCountUpto++; - docBufferSize = 1; - } else { - // Read vInts: - PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true); - prefixSum(docBuffer, left, prevDocID); - docBuffer[left] = NO_MORE_DOCS; - freqFP = -1; - docCountUpto += left; - docBufferSize = left; - } - prevDocID = docBuffer[BLOCK_SIZE - 1]; - docBufferUpto = 0; - assert docBuffer[docBufferSize] == NO_MORE_DOCS; - } - - private void skipLevel1To(int target) throws IOException { - while (true) { - prevDocID = level1LastDocID; - level0LastDocID = level1LastDocID; - docIn.seek(level1DocEndFP); - docCountUpto = level1DocCountUpto; - level1DocCountUpto += LEVEL1_NUM_DOCS; - - if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - break; - } - - level1LastDocID += docIn.readVInt(); - level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); - - if (level1LastDocID >= target) { - long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); - int numImpactBytes = docIn.readShort(); - docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); - level1SerializedImpacts.length = numImpactBytes; - assert indexHasPos || docIn.getFilePointer() == skip1EndFP; - docIn.seek(skip1EndFP); - break; - } - } - } - - private void skipLevel0To(int target) throws IOException { - while (true) { - prevDocID = level0LastDocID; - if (docFreq - docCountUpto >= BLOCK_SIZE) { - long skip0NumBytes = docIn.readVLong(); - // end offset of skip data (before the actual data starts) - long skip0End = docIn.getFilePointer() + skip0NumBytes; - int docDelta = readVInt15(docIn); - long blockLength = readVLong15(docIn); - - level0LastDocID += docDelta; - - if (target <= level0LastDocID) { - level0DocEndFP = docIn.getFilePointer() + blockLength; - int numImpactBytes = docIn.readVInt(); - docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); - 
level0SerializedImpacts.length = numImpactBytes; - docIn.seek(skip0End); - break; - } - - // skip block - docIn.skipBytes(blockLength); - docCountUpto += BLOCK_SIZE; - } else { - level0LastDocID = NO_MORE_DOCS; - break; - } - } - } - - @Override - public void advanceShallow(int target) throws IOException { - if (target > level0LastDocID) { // advance skip data on level 0 - if (target > level1LastDocID) { // advance skip data on level 1 - skipLevel1To(target); - } else if (needsRefilling) { - docIn.seek(level0DocEndFP); - docCountUpto += BLOCK_SIZE; - } - - skipLevel0To(target); - - needsRefilling = true; - } - } - - private void moveToNextLevel0Block() throws IOException { - if (doc == level1LastDocID) { - skipLevel1To(doc + 1); - } else if (needsRefilling) { - docIn.seek(level0DocEndFP); - docCountUpto += BLOCK_SIZE; - } - - prevDocID = level0LastDocID; - if (docFreq - docCountUpto >= BLOCK_SIZE) { - final long skip0Len = docIn.readVLong(); // skip len - final long skip0End = docIn.getFilePointer() + skip0Len; - final int docDelta = readVInt15(docIn); - final long blockLength = readVLong15(docIn); - level0LastDocID += docDelta; - level0DocEndFP = docIn.getFilePointer() + blockLength; - final int numImpactBytes = docIn.readVInt(); - docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); - level0SerializedImpacts.length = numImpactBytes; - docIn.seek(skip0End); - } else { - level0LastDocID = NO_MORE_DOCS; - } - - refillDocs(); - needsRefilling = false; - } - - @Override - public int nextDoc() throws IOException { - if (docBufferUpto == BLOCK_SIZE) { - if (needsRefilling) { - refillDocs(); - needsRefilling = false; - } else { - moveToNextLevel0Block(); - } - } - - return this.doc = docBuffer[docBufferUpto++]; - } - - @Override - public int advance(int target) throws IOException { - if (target > level0LastDocID || needsRefilling) { - advanceShallow(target); - refillDocs(); - needsRefilling = false; - } - - int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - this.doc = docBuffer[next]; - docBufferUpto = next + 1; - return doc; - } - } - - final class BlockImpactsPostingsEnum extends BlockImpactsEnum { - private final int[] posDeltaBuffer = new int[BLOCK_SIZE]; - - private int posBufferUpto; - final IndexInput posIn; - final PostingDecodingUtil posInUtil; - - final boolean indexHasFreq; - final boolean indexHasOffsets; - final boolean indexHasPayloads; - final boolean indexHasOffsetsOrPayloads; - - private long freqFP; // offset of the freq block - - private int position; // current position - - // value of docBufferUpto on the last doc ID when positions have been read - private int posDocBufferUpto; - - // how many positions "behind" we are; nextPosition must - // skip these to "catch up": - private int posPendingCount; - - // File pointer where the last (vInt encoded) pos delta - // block is. 
We need this to know whether to bulk - // decode vs vInt decode the block: - private final long lastPosBlockFP; - - // level 0 skip data - private long level0PosEndFP; - private int level0BlockPosUpto; - // level 1 skip data - private long level1PosEndFP; - private int level1BlockPosUpto; - - public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) - throws IOException { - super(termState); - final IndexOptions options = fieldInfo.getIndexOptions(); - indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasOffsets = - options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; - indexHasPayloads = fieldInfo.hasPayloads(); - indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; - - this.posIn = Lucene101PostingsReader.this.posIn.clone(); - posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); - - // Where this term's postings start in the .pos file: - final long posTermStartFP = termState.posStartFP; - posIn.seek(posTermStartFP); - level1PosEndFP = posTermStartFP; - level0PosEndFP = posTermStartFP; - posPendingCount = 0; - if (termState.totalTermFreq < BLOCK_SIZE) { - lastPosBlockFP = posTermStartFP; - } else if (termState.totalTermFreq == BLOCK_SIZE) { - lastPosBlockFP = -1; - } else { - lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; - } - level1BlockPosUpto = 0; - posBufferUpto = BLOCK_SIZE; - } - - @Override - public int freq() throws IOException { - if (freqFP != -1) { - docIn.seek(freqFP); - pforUtil.decode(docInUtil, freqBuffer); - freqFP = -1; - } - return freqBuffer[docBufferUpto - 1]; - } - - private void refillDocs() throws IOException { - final int left = docFreq - docCountUpto; - assert left >= 0; - - if (left >= BLOCK_SIZE) { - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - freqFP = docIn.getFilePointer(); - PForUtil.skip(docIn); - docCountUpto += BLOCK_SIZE; - } else if (docFreq == 1) { - docBuffer[0] = singletonDocID; - freqBuffer[0] = (int) totalTermFreq; - freqFP = -1; - docBuffer[1] = NO_MORE_DOCS; - docCountUpto++; - docBufferSize = 1; - - } else { - // Read vInts: - PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); - prefixSum(docBuffer, left, prevDocID); - docBuffer[left] = NO_MORE_DOCS; - freqFP = -1; - docCountUpto += left; - docBufferSize = left; - freqFP = -1; + private void readLevel0PosData() throws IOException { + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level0PayEndFP += docIn.readVLong(); + level0BlockPayUpto = docIn.readVInt(); } - prevDocID = docBuffer[BLOCK_SIZE - 1]; - docBufferUpto = 0; - posDocBufferUpto = 0; - assert docBuffer[docBufferSize] == NO_MORE_DOCS; } - private void skipLevel1To(int target) throws IOException { - while (true) { - prevDocID = level1LastDocID; - level0LastDocID = level1LastDocID; - docIn.seek(level1DocEndFP); - level0PosEndFP = level1PosEndFP; - level0BlockPosUpto = level1BlockPosUpto; - docCountUpto = level1DocCountUpto; - level1DocCountUpto += LEVEL1_NUM_DOCS; - - if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - break; - } - - level1LastDocID += docIn.readVInt(); - level1DocEndFP = docIn.readVLong() + docIn.getFilePointer(); - - long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); - int numImpactBytes = docIn.readShort(); - if (level1LastDocID >= target) { - docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); - 
level1SerializedImpacts.length = numImpactBytes; - } else { - docIn.skipBytes(numImpactBytes); - } - level1PosEndFP += docIn.readVLong(); - level1BlockPosUpto = docIn.readByte(); - assert indexHasOffsetsOrPayloads || docIn.getFilePointer() == skip1EndFP; - - if (level1LastDocID >= target) { - docIn.seek(skip1EndFP); - break; + private void seekPosData(long posFP, int posUpto, long payFP, int payUpto) throws IOException { + // If nextBlockPosFP is less than the current FP, it means that the block of positions for + // the first docs of the next block are already decoded. In this case we just accumulate + // frequencies into posPendingCount instead of seeking backwards and decoding the same pos + // block again. + if (posFP >= posIn.getFilePointer()) { + posIn.seek(posFP); + posPendingCount = posUpto; + if (payIn != null) { // needs payloads or offsets + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(payFP); + payloadByteUpto = payUpto; } + posBufferUpto = BLOCK_SIZE; + } else { + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); } } private void skipLevel0To(int target) throws IOException { long posFP; int posUpto; + long payFP; + int payUpto; while (true) { prevDocID = level0LastDocID; posFP = level0PosEndFP; posUpto = level0BlockPosUpto; + payFP = level0PayEndFP; + payUpto = level0BlockPayUpto; if (docFreq - docCountUpto >= BLOCK_SIZE) { - docIn.readVLong(); // skip0 num bytes + long numSkipBytes = docIn.readVLong(); + long skip0End = docIn.getFilePointer() + numSkipBytes; int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + boolean found = target <= level0LastDocID; long blockLength = readVLong15(docIn); level0DocEndFP = docIn.getFilePointer() + blockLength; - level0LastDocID += docDelta; - - if (target <= level0LastDocID) { - int numImpactBytes = docIn.readVInt(); - docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); - level0SerializedImpacts.length = numImpactBytes; - level0PosEndFP += docIn.readVLong(); - level0BlockPosUpto = docIn.readByte(); - if (indexHasOffsetsOrPayloads) { - docIn.readVLong(); // pay fp delta - docIn.readVInt(); // pay upto + if (indexHasFreq) { + if (found == false && needsPos == false) { + docIn.seek(skip0End); + } else { + int numImpactBytes = docIn.readVInt(); + if (needsImpacts && found) { + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); + } + + if (needsPos) { + readLevel0PosData(); + } else { + docIn.seek(skip0End); + } } + } + + if (found) { break; } - // skip block - docIn.skipBytes(docIn.readVLong()); // impacts - level0PosEndFP += docIn.readVLong(); - level0BlockPosUpto = docIn.readVInt(); + docIn.seek(level0DocEndFP); docCountUpto += BLOCK_SIZE; } else { @@ -1688,62 +823,62 @@ private void skipLevel0To(int target) throws IOException { } } - // If nextBlockPosFP is less than the current FP, it means that the block of positions for - // the first docs of the next block are already decoded. In this case we just accumulate - // frequencies into posPendingCount instead of seeking backwards and decoding the same pos - // block again. 
- if (posFP >= posIn.getFilePointer()) { - posIn.seek(posFP); - posPendingCount = posUpto; - posBufferUpto = BLOCK_SIZE; - } else { - posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); + if (posIn != null) { // needs positions + seekPosData(posFP, posUpto, payFP, payUpto); } } @Override public void advanceShallow(int target) throws IOException { if (target > level0LastDocID) { // advance level 0 skip data + doAdvanceShallow(target); - if (target > level1LastDocID) { // advance skip data on level 1 - skipLevel1To(target); - } else if (needsRefilling) { - docIn.seek(level0DocEndFP); - docCountUpto += BLOCK_SIZE; + // If we are on the last doc ID of a block and we are advancing on the doc ID just beyond + // this block, then we decode the block. This may not be necessary, but this helps avoid + // having to check whether we are in a block that is not decoded yet in #nextDoc(). + if (docBufferUpto == BLOCK_SIZE && target == doc + 1) { + refillDocs(); + needsRefilling = false; + } else { + needsRefilling = true; } + } + } - skipLevel0To(target); - - needsRefilling = true; + private void doAdvanceShallow(int target) throws IOException { + if (target > level1LastDocID) { // advance skip data on level 1 + skipLevel1To(target); + } else if (needsRefilling) { + docIn.seek(level0DocEndFP); + docCountUpto += BLOCK_SIZE; } + + skipLevel0To(target); } @Override public int nextDoc() throws IOException { if (docBufferUpto == BLOCK_SIZE) { - advanceShallow(doc + 1); - assert needsRefilling; - refillDocs(); - needsRefilling = false; + moveToNextLevel0Block(); } - doc = docBuffer[docBufferUpto]; - docBufferUpto++; - return this.doc; + return this.doc = docBuffer[docBufferUpto++]; } @Override public int advance(int target) throws IOException { if (target > level0LastDocID || needsRefilling) { - advanceShallow(target); - assert needsRefilling; + if (target > level0LastDocID) { + doAdvanceShallow(target); + } refillDocs(); needsRefilling = false; } int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + this.doc = docBuffer[next]; docBufferUpto = next + 1; - return this.doc = docBuffer[next]; + return doc; } private void skipPositions(int freq) throws IOException { @@ -1755,66 +890,170 @@ private void skipPositions(int freq) throws IOException { final int leftInBlock = BLOCK_SIZE - posBufferUpto; if (toSkip < leftInBlock) { - posBufferUpto += toSkip; + int end = posBufferUpto + toSkip; + if (needsPayloads) { + payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end); + } + posBufferUpto = end; } else { toSkip -= leftInBlock; while (toSkip >= BLOCK_SIZE) { assert posIn.getFilePointer() != lastPosBlockFP; PForUtil.skip(posIn); + + if (payIn != null) { + if (indexHasPayloads) { + // Skip payloadLength block: + PForUtil.skip(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + PForUtil.skip(payIn); + PForUtil.skip(payIn); + } + } toSkip -= BLOCK_SIZE; } refillPositions(); + if (needsPayloads) { + payloadByteUpto = sumOverRange(payloadLengthBuffer, 0, toSkip); + } posBufferUpto = toSkip; } } - private void refillPositions() throws IOException { - if (posIn.getFilePointer() == lastPosBlockFP) { - final int count = (int) (totalTermFreq % BLOCK_SIZE); - int payloadLength = 0; - for (int i = 0; i < count; i++) { - int code = posIn.readVInt(); - if (indexHasPayloads) { - if ((code & 1) != 0) { - payloadLength = posIn.readVInt(); - } + private void 
refillLastPositionBlock() throws IOException { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + if (payloadLengthBuffer != null) { // needs payloads + payloadLengthBuffer[i] = payloadLength; posDeltaBuffer[i] = code >>> 1; if (payloadLength != 0) { - posIn.skipBytes(payloadLength); + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); + } + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; } } else { - posDeltaBuffer[i] = code; + posIn.skipBytes(payloadLength); + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); + } + if (offsetStartDeltaBuffer != null) { // needs offsets + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; } + } + } + payloadByteUpto = 0; + } - if (indexHasOffsets) { - int deltaCode = posIn.readVInt(); - if ((deltaCode & 1) != 0) { - posIn.readVInt(); // offset length - } + private void refillOffsetsOrPayloads() throws IOException { + if (indexHasPayloads) { + if (needsPayloads) { + pforUtil.decode(payInUtil, payloadLengthBuffer); + int numBytes = payIn.readVInt(); + + if (numBytes > payloadBytes.length) { + payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes); } + payIn.readBytes(payloadBytes, 0, numBytes); + } else if (payIn != null) { // needs offsets + // this works, because when writing a vint block we always force the first length to be + // written + PForUtil.skip(payIn); // skip over lengths + int numBytes = payIn.readVInt(); // read length of payloadBytes + payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes + } + payloadByteUpto = 0; + } + + if (indexHasOffsets) { + if (needsOffsets) { + pforUtil.decode(payInUtil, offsetStartDeltaBuffer); + pforUtil.decode(payInUtil, offsetLengthBuffer); + } else if (payIn != null) { // needs payloads + // this works, because when writing a vint block we always force the first length to be + // written + PForUtil.skip(payIn); // skip over starts + PForUtil.skip(payIn); // skip over lengths } - } else { - pforUtil.decode(posInUtil, posDeltaBuffer); } } - @Override - public int nextPosition() throws IOException { - if (posDocBufferUpto != docBufferUpto) { - int freq = freq(); // triggers lazy decoding of freqs + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + refillLastPositionBlock(); + return; + } + pforUtil.decode(posInUtil, posDeltaBuffer); - // First position that is being read on this doc. 
 
-    @Override
-    public int nextPosition() throws IOException {
-      if (posDocBufferUpto != docBufferUpto) {
-        int freq = freq(); // triggers lazy decoding of freqs
-
-        // First position that is being read on this doc.
-        posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
-        posDocBufferUpto = docBufferUpto;
-
-        assert posPendingCount > 0;
-
-        if (posPendingCount > freq) {
-          skipPositions(freq);
-          posPendingCount = freq;
-        }
+    private void accumulatePendingPositions() throws IOException {
+      int freq = freq(); // trigger lazy decoding of freqs
+      posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto);
+      posDocBufferUpto = docBufferUpto;
 
+      assert posPendingCount > 0;
+      if (posPendingCount > freq) {
+        skipPositions(freq);
+        posPendingCount = freq;
+      }
+    }
+
+    private void accumulatePayloadAndOffsets() {
+      if (needsPayloads) {
+        payloadLength = payloadLengthBuffer[posBufferUpto];
+        payload.bytes = payloadBytes;
+        payload.offset = payloadByteUpto;
+        payload.length = payloadLength;
+        payloadByteUpto += payloadLength;
+      }
+
+      if (needsOffsets) {
+        startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto];
+        endOffset = startOffset + offsetLengthBuffer[posBufferUpto];
+        lastStartOffset = startOffset;
+      }
+    }
+
+    @Override
+    public int nextPosition() throws IOException {
+      if (needsPos == false) {
+        return -1;
+      }
+
+      assert posDocBufferUpto <= docBufferUpto;
+      if (posDocBufferUpto != docBufferUpto) {
+        // First position we're reading on this doc
+        accumulatePendingPositions();
         position = 0;
+        lastStartOffset = 0;
       }
 
       if (posBufferUpto == BLOCK_SIZE) {
@@ -1823,10 +1062,92 @@ public int nextPosition() throws IOException {
       }
 
       position += posDeltaBuffer[posBufferUpto];
+      if (needsOffsetsOrPayloads) {
+        accumulatePayloadAndOffsets();
+      }
+
       posBufferUpto++;
       posPendingCount--;
       return position;
     }
+
+    @Override
+    public int startOffset() {
+      if (needsOffsets == false) {
+        return -1;
+      }
+      return startOffset;
+    }
+
+    @Override
+    public int endOffset() {
+      if (needsOffsets == false) {
+        return -1;
+      }
+      return endOffset;
+    }
+
+    @Override
+    public BytesRef getPayload() {
+      if (needsPayloads == false || payloadLength == 0) {
+        return null;
+      } else {
+        return payload;
+      }
+    }
+
+    @Override
+    public long cost() {
+      return docFreq;
+    }
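The pending-positions bookkeeping above is the subtle part: when the consumer never asks for positions on some docs, their freqs accumulate in posPendingCount, and the next call to nextPosition() skips them in bulk before decoding the current doc's positions. A toy model of that flow (all names invented, plain arrays standing in for the codec's block streams):

    // Toy model of the posPendingCount bookkeeping: positions of skipped docs
    // accumulate as a pending count and are skipped in bulk the next time
    // positions are actually requested.
    class LazyPositionsSketch {
      static int[] freqs = {2, 3, 1}; // freq of docs 0..2 in the current block
      static int[] positions = {1, 4, 2, 5, 9, 7}; // concatenated positions of all docs
      static int posUpto = 0;      // read pointer into `positions`
      static int pendingCount = 0; // positions buffered but not yet consumed
      static int posDocUpto = 0;   // next doc whose freq must be accumulated

      // Called on the first nextPosition() for doc `docUpto`: catch up on the
      // freqs of all docs up to it, then drop positions of previous docs.
      static void accumulatePending(int docUpto) {
        for (; posDocUpto <= docUpto; posDocUpto++) {
          pendingCount += freqs[posDocUpto];
        }
        int freq = freqs[docUpto];
        posUpto += pendingCount - freq; // the skipPositions(freq) step
        pendingCount = freq;
      }

      public static void main(String[] args) {
        accumulatePending(2); // consumer skipped docs 0 and 1 entirely
        System.out.println(positions[posUpto]); // 7: first position of doc 2
      }
    }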
 
+    private final Impacts impacts =
+        new Impacts() {
+
+          private final ByteArrayDataInput scratch = new ByteArrayDataInput();
+
+          @Override
+          public int numLevels() {
+            return indexHasFreq == false || level1LastDocID == NO_MORE_DOCS ? 1 : 2;
+          }
+
+          @Override
+          public int getDocIdUpTo(int level) {
+            if (indexHasFreq == false) {
+              return NO_MORE_DOCS;
+            }
+            if (level == 0) {
+              return level0LastDocID;
+            }
+            return level == 1 ? level1LastDocID : NO_MORE_DOCS;
+          }
+
+          @Override
+          public List<Impact> getImpacts(int level) {
+            if (indexHasFreq) {
+              if (level == 0 && level0LastDocID != NO_MORE_DOCS) {
+                return readImpacts(level0SerializedImpacts, level0Impacts);
+              }
+              if (level == 1) {
+                return readImpacts(level1SerializedImpacts, level1Impacts);
+              }
+            }
+            return DUMMY_IMPACTS;
+          }
+
+          private List<Impact> readImpacts(BytesRef serialized, MutableImpactList impactsList) {
+            var scratch = this.scratch;
+            scratch.reset(serialized.bytes, 0, serialized.length);
+            Lucene101PostingsReader.readImpacts(scratch, impactsList);
+            return impactsList;
+          }
+        };
+
+    @Override
+    public Impacts getImpacts() {
+      assert needsImpacts;
+      return impacts;
+    }
   }
 
   /**
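Since the enum now implements ImpactsEnum directly, a scorer can combine advanceShallow with the Impacts accessors above to bound scores without decoding a doc block. A hypothetical consumer sketch using only public Lucene APIs (the helper class, method name, and fallback value are invented; impact lists are stored in increasing-freq order, so the last entry carries the maximum freq):

    import java.io.IOException;
    import java.util.List;
    import org.apache.lucene.index.Impact;
    import org.apache.lucene.index.Impacts;
    import org.apache.lucene.index.ImpactsEnum;

    // Pick the smallest skip level that covers `target` and return the largest
    // frequency an impact at that level allows, without decoding any block.
    final class MaxFreqUpTo {
      static int maxFreqUpTo(ImpactsEnum postings, int target) throws IOException {
        postings.advanceShallow(target); // position skip data only
        Impacts impacts = postings.getImpacts();
        for (int level = 0; level < impacts.numLevels(); level++) {
          if (impacts.getDocIdUpTo(level) >= target) {
            List<Impact> perLevel = impacts.getImpacts(level);
            return perLevel.get(perLevel.size() - 1).freq;
          }
        }
        return Integer.MAX_VALUE; // conservative fallback
      }
    }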
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index 1c5436dcf0a5..b1d283cc3899 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -399,7 +399,7 @@ public boolean equals(Object obj) {
   /**
    * A guess of the average number of simple operations for the initial seek and buffer refill per
    * document for the positions of a term. See also {@link
-   * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
+   * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}.
    *
    * <p>Aside: Instead of being constant this could depend among others on {@link
    * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
@@ -410,9 +410,8 @@ public boolean equals(Object obj) {
   private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
 
   /**
-   * Number of simple operations in {@link
-   * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
-   * is done.
+   * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}
+   * when no seek or buffer refill is done.
    */
   private static final int TERM_OPS_PER_POS = 7;
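To see how the two constants combine, PhraseQuery estimates the per-document cost of reading a term's positions as a seek/refill charge plus a per-position charge (see PhraseQuery#termPositionsCost). A worked sketch of that arithmetic, with the constants copied from the javadoc above and the standalone class invented for illustration:

    // Back-of-the-envelope positions cost: 128 ops for the initial seek and
    // buffer refill per doc, plus 7 ops per position actually read.
    class PositionsCostSketch {
      static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
      static final int TERM_OPS_PER_POS = 7;

      static float termPositionsCost(int docFreq, long totalTermFreq) {
        float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
        return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
      }

      public static void main(String[] args) {
        // A term in 1000 docs with 2500 total occurrences averages 2.5
        // positions per doc, so 128 + 2.5 * 7 = 145.5 estimated ops per doc.
        System.out.println(termPositionsCost(1000, 2500L));
      }
    }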
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
index c0f2f61d7cc1..c1360a9d9a7e 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java
@@ -261,7 +261,7 @@ public void visit(String field, QueryVisitor visitor) {
   /**
    * A guess of the average number of simple operations for the initial seek and buffer refill per
    * document for the positions of a term. See also {@link
-   * Lucene101PostingsReader.EverythingEnum#nextPosition()}.
+   * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}.
    *
    * <p>Aside: Instead of being constant this could depend among others on {@link
    * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
@@ -272,7 +272,7 @@ public void visit(String field, QueryVisitor visitor) {
   private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
 
   /**
-   * Number of simple operations in {@link Lucene101PostingsReader.EverythingEnum#nextPosition()}
+   * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}
    * when no seek or buffer refill is done.
    */
   private static final int TERM_OPS_PER_POS = 7;
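For completeness, this is how the flag-gated behavior of the unified enum looks from the public API: features that were not requested via the flags come back as -1 (offsets) or null (payloads). A minimal consumer sketch, assuming the field was indexed with positions and offsets (the helper class and method name are invented):

    import java.io.IOException;
    import org.apache.lucene.index.PostingsEnum;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.BytesRef;

    // The flags passed to TermsEnum#postings decide which per-position channels
    // the enum decodes; unrequested channels return sentinel values.
    final class PostingsFlagsSketch {
      static void dumpPositions(TermsEnum termsEnum) throws IOException {
        PostingsEnum postings =
            termsEnum.postings(null, PostingsEnum.POSITIONS | PostingsEnum.OFFSETS);
        for (int doc = postings.nextDoc();
            doc != DocIdSetIterator.NO_MORE_DOCS;
            doc = postings.nextDoc()) {
          for (int i = 0; i < postings.freq(); i++) {
            int position = postings.nextPosition();
            int start = postings.startOffset(); // real offset: OFFSETS was requested
            BytesRef payload = postings.getPayload(); // null: PAYLOADS not requested
            System.out.println(
                doc + ":" + position + " [" + start + "," + postings.endOffset() + ") " + payload);
          }
        }
      }
    }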