apache · bbeaudreault · Dec 27, 2022 · Jan 9, 2023 · Jan 9, 2023 · Jan 9, 2023
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HalfStoreFileReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HalfStoreFileReader.java
@@ -277,6 +277,16 @@ public void close() {
       public void shipped() throws IOException {
         this.delegate.shipped();
       }
+
+      @Override
+      public void checkpoint(State state) {
+        this.delegate.checkpoint(state); // passed straight through to the delegate
+      }
+
+      @Override
+      public void retainBlock() {
+        this.delegate.retainBlock(); // passed straight through to the delegate
+      }
     };
   }
 

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java
@@ -2070,7 +2070,8 @@ private static HFileBlock shallowClone(HFileBlock blk, ByteBuff newBuf) {
     return createBuilder(blk, newBuf).build();
   }
 
-  static HFileBlock deepCloneOnHeap(HFileBlock blk) {
+  // Widened from package-private to public solely so that tests can call it.
+  public static HFileBlock deepCloneOnHeap(HFileBlock blk) {
     ByteBuff deepCloned = ByteBuff.wrap(ByteBuffer.wrap(blk.buf.toBytes(0, blk.buf.limit())));
     return createBuilder(blk, deepCloned).build();
   }

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java
@@ -336,7 +336,18 @@ protected static class HFileScannerImpl implements HFileScanner {
     // RegionScannerImpl#handleException). Call the releaseIfNotCurBlock() to release the
     // unreferenced block please.
     protected HFileBlock curBlock;
-    // Previous blocks that were used in the course of the read
+
+    // Updated to the current prevBlocks size when checkpoint is called. Used to eagerly release
+    // any blocks accumulated in the fetching of a row, if that row is thrown away due to filterRow.
+    private int lastCheckpointIndex = -1; // stays negative until the first checkpoint call
+
+    // Updated by retainBlock, when a cell is included from the current block. Is reset whenever
+    // curBlock gets updated. Only honored when lastCheckpointIndex >= 0, meaning a checkpoint
+    // has occurred.
+    private boolean shouldRetainBlock = false;
+
+    // Previous blocks that were used in the course of the read, to be released at close,
+    // checkpoint, or shipped
     protected final ArrayList<HFileBlock> prevBlocks = new ArrayList<>();
 
     public HFileScannerImpl(final HFile.Reader reader, final boolean cacheBlocks,
@@ -351,23 +362,43 @@ void updateCurrBlockRef(HFileBlock block) {
       if (block != null && curBlock != null && block.getOffset() == curBlock.getOffset()) {
         return;
       }
-      if (this.curBlock != null && this.curBlock.isSharedMem()) {
-        prevBlocks.add(this.curBlock);
-      }
+      handlePrevBlock();
       this.curBlock = block;
     }
 
     void reset() {
+      handlePrevBlock(); // retain or release the outgoing block before dropping the reference
+      this.curBlock = null;
+    }
+
+    /**
+     * Hands off curBlock before it is replaced or cleared. A shared-memory block is appended to
+     * prevBlocks when no checkpoint has occurred yet (the original behavior) or when
+     * {@link #retainBlock()} was called for it; otherwise it is released immediately. The
+     * retain flag is always cleared afterwards, so it only ever applies to one block.
+     */
+    private void handlePrevBlock() {
       // We don't have to keep ref to heap block
       if (this.curBlock != null && this.curBlock.isSharedMem()) {
-        this.prevBlocks.add(this.curBlock);
+        if (shouldRetainBlock || lastCheckpointIndex < 0) {
+          prevBlocks.add(this.curBlock);
+        } else {
+          this.curBlock.release();
+        }
       }
-      this.curBlock = null;
+      shouldRetainBlock = false;
     }
 
     private void returnBlocks(boolean returnAll) {
-      this.prevBlocks.forEach(HFileBlock::release);
+      this.prevBlocks.forEach((block) -> {
+        if (block != null) {
+          block.release();
+        }
+      });
       this.prevBlocks.clear();
+      if (lastCheckpointIndex > 0) {
+        this.lastCheckpointIndex = 0;
+      }
       if (returnAll && this.curBlock != null) {
         this.curBlock.release();
         this.curBlock = null;
@@ -1047,6 +1078,28 @@ public int compareKey(CellComparator comparator, Cell key) {
     public void shipped() throws IOException {
       this.returnBlocks(false);
     }
+
+    /**
+     * Sets the last checkpoint index to the current prevBlocks size. If called with State.FILTERED,
+     * releases and nulls out any prevBlocks entries which were added since the last checkpoint.
+     * Nulls out instead of removing to avoid unnecessary resizing of the list. With no earlier
+     * checkpoint on this scanner there is no index to release from, so FILTERED releases nothing.
+     */
+    @Override
+    public void checkpoint(State state) {
+      if (state == State.FILTERED && lastCheckpointIndex >= 0) {
+        for (int i = lastCheckpointIndex; i < prevBlocks.size(); i++) {
+          prevBlocks.get(i).release();
+          prevBlocks.set(i, null);
+        }
+      }
+      lastCheckpointIndex = prevBlocks.size();
+    }
+
+    @Override
+    public void retainBlock() {
+      shouldRetainBlock = true; // read and then cleared by handlePrevBlock()
+    }
   }
 
   @Override

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java
@@ -420,4 +420,32 @@ public void shipped() throws IOException {
       }
     }
   }
+
+  @Override
+  public void checkpoint(State state) {
+    if (current != null) { // current is checkpointed first, then the heap's scanners
+      current.checkpoint(state);
+    }
+    if (this.heap != null) {
+      for (KeyValueScanner scanner : this.heap) {
+        scanner.checkpoint(state);
+      }
+    }
+    // Also checkpoint any scanners for delayed close. These would be exhausted scanners,
+    // which may contain blocks that were totally filtered during a request. If so, the checkpoint
+    // will release them.
+    if (scannersForDelayedClose != null) {
+      for (KeyValueScanner scanner : scannersForDelayedClose) {
+        scanner.checkpoint(state);
+      }
+    }
+  }
+
+  @Override
+  public void retainBlock() {
+    // Only current is notified; unlike checkpoint, the scanners in the heap are not touched.
+    if (current != null) {
+      current.retainBlock();
+    }
+  }
 }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java
@@ -78,4 +78,14 @@ public Cell getNextIndexedKey() {
   public void shipped() throws IOException {
     // do nothing
   }
+
+  @Override
+  public void checkpoint(State state) {
+    // Intentionally a no-op, same as shipped() in this class.
+  }
+
+  @Override
+  public void retainBlock() {
+    // Intentionally a no-op, same as shipped() in this class.
+  }
 }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionScannerImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionScannerImpl.java
@@ -426,6 +426,8 @@ private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
     // Used to check time limit
     LimitScope limitScope = LimitScope.BETWEEN_CELLS;
 
+    checkpoint(State.START);
+
     // The loop here is used only when at some point during the next we determine
     // that due to effects of filters or otherwise, we have an empty row in the result.
     // Then we loop and try again. Otherwise, we must get out on the first iteration via return,
@@ -501,6 +503,7 @@ private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
           }
           results.clear();
+          checkpoint(State.FILTERED);
 
           // Read nothing as the rowkey was filtered, but still need to check time limit
           if (scannerContext.checkTimeLimit(limitScope)) {
@@ -553,6 +556,7 @@ private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
         if (isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE || filterRow()) {
           incrementCountOfRowsFilteredMetric(scannerContext);
           results.clear();
+          checkpoint(State.FILTERED);
           boolean moreRows = nextRow(scannerContext, current);
           if (!moreRows) {
             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
@@ -602,6 +606,7 @@ private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
       // Double check to prevent empty rows from appearing in result. It could be
       // the case when SingleColumnValueExcludeFilter is used.
       if (results.isEmpty()) {
+        checkpoint(State.FILTERED);
         incrementCountOfRowsFilteredMetric(scannerContext);
         boolean moreRows = nextRow(scannerContext, current);
         if (!moreRows) {
@@ -783,6 +788,21 @@ public void shipped() throws IOException {
     }
   }
 
+  @Override
+  public void checkpoint(State state) {
+    if (storeHeap != null) { // each heap is checkpointed only if it is non-null
+      storeHeap.checkpoint(state);
+    }
+    if (joinedHeap != null) {
+      joinedHeap.checkpoint(state);
+    }
+  }
+
+  @Override
+  public void retainBlock() {
+    // Intentionally a no-op: this is really only called in StoreScanner.
+  }
+
   @Override
   public void run() throws IOException {
     // This is the RPC callback method executed. We do the close in of the scanner in this

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SegmentScanner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SegmentScanner.java
@@ -309,6 +309,16 @@ public void shipped() throws IOException {
     // do nothing
   }
 
+  @Override
+  public void checkpoint(State state) {
+    // Intentionally a no-op, same as shipped() in this class.
+  }
+
+  @Override
+  public void retainBlock() {
+    // Intentionally a no-op, same as shipped() in this class.
+  }
+
   // debug method
   @Override
   public String toString() {

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/Shipper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/Shipper.java
@@ -23,7 +23,11 @@
 /**
  * This interface denotes a scanner as one which can ship cells. Scan operation do many RPC requests
  * to server and fetch N rows/RPC. These are then shipped to client. At the end of every such batch
- * {@link #shipped()} will get called.
+ * {@link #shipped()} will get called. <br>
+ * Scans of large numbers of fully filtered blocks (due to Filter, or sparse columns, etc) can cause
+ * excess memory to be held while waiting for {@link #shipped()} to be called. Therefore, there's a
+ * checkpoint mechanism via {@link #checkpoint(State)}. This enables fully filtered blocks to be
+ * eagerly released, since they are not referenced by cells being returned to clients.
  */
 @InterfaceAudience.Private
 public interface Shipper {
@@ -33,4 +37,27 @@ public interface Shipper {
    * can be done here.
    */
   void shipped() throws IOException;
+
+  enum State {
+    START, // beginning of a request for a row; sets up the state FILTERED relies on
+    FILTERED // release blocks fully processed since the last checkpoint
+  }
+
+  /**
+   * Called during processing of a batch of scanned rows, before returning to the client. Allows
+   * releasing of blocks which have been totally skipped in the result set due to filters. <br>
+   * Call with {@link State#START} at the beginning of a request for a row; this sets (or, on a
+   * repeated call, resets) the state needed to handle {@link State#FILTERED}. Call with
+   * {@link State#FILTERED} to release blocks fully processed since the last checkpoint.
+   * @param state whether a row request is starting or the row just processed was filtered out
+   */
+  void checkpoint(State state);
+
+  /**
+   * Used by upstream callers to notify the shipper that the current block should be retained
+   * until {@link #shipped()} or {@link #checkpoint(State)} is called. Otherwise, the block
+   * will be released immediately once it's no longer needed. Only has an effect after
+   * {@link #checkpoint(State)} has been called at least once.
+   */
+  void retainBlock();
 }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java
@@ -558,4 +558,14 @@ public Cell getNextIndexedKey() {
   public void shipped() throws IOException {
     this.hfs.shipped();
   }
+
+  @Override
+  public void checkpoint(State state) {
+    this.hfs.checkpoint(state); // delegated to hfs, like shipped()
+  }
+
+  @Override
+  public void retainBlock() {
+    this.hfs.retainBlock(); // delegated to hfs, like shipped()
+  }
 }