From a51241e4c94555a10a76cf01507b4d0a5db52542 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 9 Jun 2023 11:57:59 +0100 Subject: [PATCH] Better paging when random reads go backwards (#12357) When reading data from outside the buffer, BufferedIndexInput always resets its buffer to start at the new read position. If we are reading backwards (for example, using an OffHeapFSTStore for a terms dictionary) then this can have the effect of re-reading the same data over and over again. This commit changes BufferedIndexInput to use paging when reading backwards, so that if we ask for a byte that is before the current buffer, we read a block of data of bufferSize that ends at the previous buffer start. Fixes #12356 --- lucene/CHANGES.txt | 3 + .../lucene/store/BufferedIndexInput.java | 64 ++++++++-------- .../lucene/store/TestBufferedIndexInput.java | 75 ++++++++++++++++++- 3 files changed, 106 insertions(+), 36 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b0a5ee3ce76c..332e8eb4f608 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -177,6 +177,9 @@ Optimizations * GITHUB#12328: Optimize ConjunctionDISI.createConjunction (Armin Braun) +* GITHUB#12357: Better paging when doing backwards random reads. This speeds up + queries relying on terms in NIOFSDirectory and SimpleFSDirectory. (Alan Woodward) + * GITHUB#12339: Optimize part of duplicate calculation numDeletesToMerge in merge phase (fudongying) Bug Fixes diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 8db6ba3031cd..50dde6aa4c92 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -43,7 +43,7 @@ public abstract class BufferedIndexInput extends IndexInput implements RandomAcc /** A buffer size for merges set to {@value #MERGE_BUFFER_SIZE}. */ public static final int MERGE_BUFFER_SIZE = 4096; - private int bufferSize = BUFFER_SIZE; + private final int bufferSize; private ByteBuffer buffer = EMPTY_BYTEBUFFER; @@ -72,7 +72,7 @@ public BufferedIndexInput(String resourceDesc, int bufferSize) { this.bufferSize = bufferSize; } - /** Returns buffer size. @see #setBufferSize */ + /** Returns buffer size */ public final int getBufferSize() { return bufferSize; } @@ -159,55 +159,53 @@ public final long readLong() throws IOException { } } - @Override - public final byte readByte(long pos) throws IOException { + // Computes an offset into the current buffer from an absolute position to read + // `width` bytes from. If the buffer does not contain the position, then we + // readjust the bufferStart and refill. + private long resolvePositionInBuffer(long pos, int width) throws IOException { long index = pos - bufferStart; - if (index < 0 || index >= buffer.limit()) { + if (index >= 0 && index <= buffer.limit() - width) { + return index; + } + if (index < 0) { + // if we're moving backwards, then try and fill up the previous page rather than + // starting again at the current pos, to avoid successive backwards reads reloading + // the same data over and over again. We also check that we can read `width` + // bytes without going over the end of the buffer + bufferStart = Math.max(bufferStart - bufferSize, pos + width - bufferSize); + bufferStart = Math.max(bufferStart, 0); + bufferStart = Math.min(bufferStart, pos); + } else { + // we're moving forwards, reset the buffer to start at pos bufferStart = pos; - buffer.limit(0); // trigger refill() on read - seekInternal(pos); - refill(); - index = 0; } + buffer.limit(0); // trigger refill() on read + seekInternal(bufferStart); + refill(); + return pos - bufferStart; + } + + @Override + public final byte readByte(long pos) throws IOException { + long index = resolvePositionInBuffer(pos, Byte.BYTES); return buffer.get((int) index); } @Override public final short readShort(long pos) throws IOException { - long index = pos - bufferStart; - if (index < 0 || index >= buffer.limit() - 1) { - bufferStart = pos; - buffer.limit(0); // trigger refill() on read - seekInternal(pos); - refill(); - index = 0; - } + long index = resolvePositionInBuffer(pos, Short.BYTES); return buffer.getShort((int) index); } @Override public final int readInt(long pos) throws IOException { - long index = pos - bufferStart; - if (index < 0 || index >= buffer.limit() - 3) { - bufferStart = pos; - buffer.limit(0); // trigger refill() on read - seekInternal(pos); - refill(); - index = 0; - } + long index = resolvePositionInBuffer(pos, Integer.BYTES); return buffer.getInt((int) index); } @Override public final long readLong(long pos) throws IOException { - long index = pos - bufferStart; - if (index < 0 || index >= buffer.limit() - 7) { - bufferStart = pos; - buffer.limit(0); // trigger refill() on read - seekInternal(pos); - refill(); - index = 0; - } + long index = resolvePositionInBuffer(pos, Long.BYTES); return buffer.getLong((int) index); } diff --git a/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java b/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java index 546a22493975..d19b2058e07b 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Random; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.ArrayUtil; @@ -142,6 +143,72 @@ public void testEOF() throws Exception { }); } + // Test that when reading backwards, we page backwards rather than refilling + // on every call + public void testBackwardsByteReads() throws IOException { + MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); + for (int i = 2048; i > 0; i -= random().nextInt(16)) { + assertEquals(byten(i), input.readByte(i)); + } + assertEquals(3, input.readCount); + } + + public void testBackwardsShortReads() throws IOException { + MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); + ByteBuffer bb = ByteBuffer.allocate(2); + bb.order(ByteOrder.LITTLE_ENDIAN); + for (int i = 2048; i > 0; i -= (random().nextInt(16) + 1)) { + bb.clear(); + bb.put(byten(i)); + bb.put(byten(i + 1)); + assertEquals(bb.getShort(0), input.readShort(i)); + } + // readCount can be three or four, depending on whether or not we had to adjust the bufferStart + // to include a whole short + assertTrue( + "Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3); + } + + public void testBackwardsIntReads() throws IOException { + MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); + ByteBuffer bb = ByteBuffer.allocate(4); + bb.order(ByteOrder.LITTLE_ENDIAN); + for (int i = 2048; i > 0; i -= (random().nextInt(16) + 3)) { + bb.clear(); + bb.put(byten(i)); + bb.put(byten(i + 1)); + bb.put(byten(i + 2)); + bb.put(byten(i + 3)); + assertEquals(bb.getInt(0), input.readInt(i)); + } + // readCount can be three or four, depending on whether or not we had to adjust the bufferStart + // to include a whole int + assertTrue( + "Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3); + } + + public void testBackwardsLongReads() throws IOException { + MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); + ByteBuffer bb = ByteBuffer.allocate(8); + bb.order(ByteOrder.LITTLE_ENDIAN); + for (int i = 2048; i > 0; i -= (random().nextInt(16) + 7)) { + bb.clear(); + bb.put(byten(i)); + bb.put(byten(i + 1)); + bb.put(byten(i + 2)); + bb.put(byten(i + 3)); + bb.put(byten(i + 4)); + bb.put(byten(i + 5)); + bb.put(byten(i + 6)); + bb.put(byten(i + 7)); + assertEquals(bb.getLong(0), input.readLong(i)); + } + // readCount can be three or four, depending on whether or not we had to adjust the bufferStart + // to include a whole long + assertTrue( + "Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3); + } + // byten emulates a file - byten(n) returns the n'th byte in that file. // MyBufferedIndexInput reads this "file". private static byte byten(long n) { @@ -150,7 +217,8 @@ private static byte byten(long n) { private static class MyBufferedIndexInput extends BufferedIndexInput { private long pos; - private long len; + private final long len; + private long readCount = 0; public MyBufferedIndexInput(long len) { super("MyBufferedIndexInput(len=" + len + ")", BufferedIndexInput.BUFFER_SIZE); @@ -164,14 +232,15 @@ public MyBufferedIndexInput() { } @Override - protected void readInternal(ByteBuffer b) throws IOException { + protected void readInternal(ByteBuffer b) { + readCount++; while (b.hasRemaining()) { b.put(byten(pos++)); } } @Override - protected void seekInternal(long pos) throws IOException { + protected void seekInternal(long pos) { this.pos = pos; }