diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 96b4bf5eab75..4bc5cd95e684 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -30,6 +30,10 @@ Improvements system property to increase the set of Java versions that Panama Vectorization will provide optimized implementations for. (Chris Hegarty) +* GITHUB#266: TieredMergePolicy now allows merging up to maxMergeAtOnce + segments for merges below the floor segment size, even if maxMergeAtOnce is + bigger than segsPerTier. (Adrien Grand) + Optimizations --------------------- @@ -93,6 +97,12 @@ Optimizations * GITHUB#14023: Make JVM inlining decisions more predictable in our main queries. (Adrien Grand) +* GITHUB#14032: Speed up PostingsEnum when positions are requested. + (Adrien Grand) + +* GITHUB#14031: Ensure Panama float vector distance impls inlinable. + (Robert Muir, Chris Hegarty) + * GITHUB#14011: Reduce allocation rate in HNSW concurrent merge. (Viliam Durina) Bug Fixes diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java index 9e79aaf71e15..d879a58b4ab7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -638,9 +638,13 @@ final class EverythingEnum extends AbstractPostingsEnum { final boolean indexHasPayloads; final boolean indexHasOffsetsOrPayloads; - private int freq; // freq we last read + private long freqFP; // offset of the freq block + private int position; // current position + // value of docBufferUpto on the last doc ID when positions have been read + private int posDocBufferUpto; + // how many positions "behind" we are; nextPosition must // skip these to "catch up": private int posPendingCount; @@ -662,6 +666,7 @@ final class EverythingEnum extends AbstractPostingsEnum { private boolean needsOffsets; // true if we actually need offsets private boolean needsPayloads; // true if we actually need payloads + private boolean needsPayloadsOrOffsets; public EverythingEnum(FieldInfo fieldInfo) throws IOException { super(fieldInfo); @@ -745,8 +750,11 @@ public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOExcep lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; } - this.needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); - this.needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + this.needsOffsets = + indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + this.needsPayloads = + indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + this.needsPayloadsOrOffsets = this.needsPayloads || this.needsOffsets; level1BlockPosUpto = 0; level1BlockPayUpto = 0; @@ -758,8 +766,13 @@ public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOExcep } @Override - public int freq() { - return freq; + public int freq() throws IOException { + if (freqFP != -1) { + docIn.seek(freqFP); + pforUtil.decode(docInUtil, freqBuffer); + freqFP = -1; + } + return freqBuffer[docBufferUpto - 1]; } private void refillDocs() throws IOException { @@ -768,11 +781,13 @@ private void refillDocs() throws IOException { if (left >= BLOCK_SIZE) { forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - pforUtil.decode(docInUtil, freqBuffer); + freqFP = docIn.getFilePointer(); + PForUtil.skip(docIn); docCountUpto += BLOCK_SIZE; 
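A minimal sketch of the lazy freq-block decoding introduced in the hunk above, using simplified, hypothetical names (FreqBlockReader, decodeBlockAt) rather than the real Lucene101PostingsReader internals: refillDocs() only records the file pointer of the freq block and skips over it, and freq() decodes the block the first time it is actually called.

    import java.io.IOException;

    // Toy model of the lazy-decoding pattern; decodeBlockAt is a hypothetical stand-in
    // for docIn.seek(freqFP) followed by pforUtil.decode(docInUtil, freqBuffer).
    final class FreqBlockReader {
      private final int[] freqBuffer = new int[128];
      private long freqFP = -1; // file pointer of the pending freq block, or -1 once decoded

      // Called while refilling a doc block: remember where the freqs start, but do not decode yet.
      void skipFreqBlock(long filePointer) {
        freqFP = filePointer;
      }

      // Only consumers that actually ask for freqs pay the decoding cost.
      int freq(int indexInBlock) throws IOException {
        if (freqFP != -1) {
          decodeBlockAt(freqFP, freqBuffer);
          freqFP = -1;
        }
        return freqBuffer[indexInBlock];
      }

      private void decodeBlockAt(long fp, int[] dst) throws IOException {
        // stand-in for decoding one block of 128 freqs starting at file pointer fp
      }
    }
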
} else if (docFreq == 1) { docBuffer[0] = singletonDocID; freqBuffer[0] = (int) totalTermFreq; + freqFP = -1; docBuffer[1] = NO_MORE_DOCS; docCountUpto++; docBufferSize = 1; @@ -781,11 +796,13 @@ private void refillDocs() throws IOException { PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); prefixSum(docBuffer, left, prevDocID); docBuffer[left] = NO_MORE_DOCS; + freqFP = -1; docCountUpto += left; docBufferSize = left; } prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; + posDocBufferUpto = 0; assert docBuffer[docBufferSize] == NO_MORE_DOCS; } @@ -846,6 +863,8 @@ private void moveToNextLevel0Block() throws IOException { payloadByteUpto = level0BlockPayUpto; } posBufferUpto = BLOCK_SIZE; + } else { + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); } if (docFreq - docCountUpto >= BLOCK_SIZE) { @@ -875,34 +894,23 @@ public int nextDoc() throws IOException { } this.doc = docBuffer[docBufferUpto]; - this.freq = freqBuffer[docBufferUpto]; docBufferUpto++; - posPendingCount += freq; - position = 0; - lastStartOffset = 0; return doc; } private void skipLevel0To(int target) throws IOException { + long posFP; + int posUpto; + long payFP; + int payUpto; + while (true) { prevDocID = level0LastDocID; - // If nextBlockPosFP is less than the current FP, it means that the block of positions for - // the first docs of the next block are already decoded. In this case we just accumulate - // frequencies into posPendingCount instead of seeking backwards and decoding the same pos - // block again. - if (level0PosEndFP >= posIn.getFilePointer()) { - posIn.seek(level0PosEndFP); - posPendingCount = level0BlockPosUpto; - if (indexHasOffsetsOrPayloads) { - assert level0PayEndFP >= payIn.getFilePointer(); - payIn.seek(level0PayEndFP); - payloadByteUpto = level0BlockPayUpto; - } - posBufferUpto = BLOCK_SIZE; - } else { - posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE); - } + posFP = level0PosEndFP; + posUpto = level0BlockPosUpto; + payFP = level0PayEndFP; + payUpto = level0BlockPayUpto; if (docFreq - docCountUpto >= BLOCK_SIZE) { docIn.readVLong(); // skip0 num bytes @@ -931,6 +939,23 @@ private void skipLevel0To(int target) throws IOException { break; } } + + // If nextBlockPosFP is less than the current FP, it means that the block of positions for + // the first docs of the next block are already decoded. In this case we just accumulate + // frequencies into posPendingCount instead of seeking backwards and decoding the same pos + // block again. 
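+    // For example, after a refill the positions for the first docs of the new doc block may
+    // already have been decoded together with the previous block; summing the skipped docs'
+    // freqs keeps posPendingCount in sync without seeking posIn backwards.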
+ if (posFP >= posIn.getFilePointer()) { + posIn.seek(posFP); + posPendingCount = posUpto; + if (indexHasOffsetsOrPayloads) { + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(payFP); + payloadByteUpto = payUpto; + } + posBufferUpto = BLOCK_SIZE; + } else { + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); + } } @Override @@ -947,16 +972,12 @@ public int advance(int target) throws IOException { } int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1); - this.freq = freqBuffer[next]; this.docBufferUpto = next + 1; - position = 0; - lastStartOffset = 0; return this.doc = docBuffer[next]; } - private void skipPositions() throws IOException { + private void skipPositions(int freq) throws IOException { // Skip positions now: int toSkip = posPendingCount - freq; // if (DEBUG) { @@ -1003,41 +1024,45 @@ private void skipPositions() throws IOException { lastStartOffset = 0; } - private void refillPositions() throws IOException { - if (posIn.getFilePointer() == lastPosBlockFP) { - final int count = (int) (totalTermFreq % BLOCK_SIZE); - int payloadLength = 0; - int offsetLength = 0; - payloadByteUpto = 0; - for (int i = 0; i < count; i++) { - int code = posIn.readVInt(); - if (indexHasPayloads) { - if ((code & 1) != 0) { - payloadLength = posIn.readVInt(); - } - payloadLengthBuffer[i] = payloadLength; - posDeltaBuffer[i] = code >>> 1; - if (payloadLength != 0) { - if (payloadByteUpto + payloadLength > payloadBytes.length) { - payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); - } - posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); - payloadByteUpto += payloadLength; + private void refillLastPositionBlock() throws IOException { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + payloadLengthBuffer[i] = payloadLength; + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); } - } else { - posDeltaBuffer[i] = code; + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; } + } else { + posDeltaBuffer[i] = code; + } - if (indexHasOffsets) { - int deltaCode = posIn.readVInt(); - if ((deltaCode & 1) != 0) { - offsetLength = posIn.readVInt(); - } - offsetStartDeltaBuffer[i] = deltaCode >>> 1; - offsetLengthBuffer[i] = offsetLength; + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); } + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; } - payloadByteUpto = 0; + } + payloadByteUpto = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + refillLastPositionBlock(); } else { pforUtil.decode(posInUtil, posDeltaBuffer); @@ -1054,8 +1079,7 @@ private void refillPositions() throws IOException { // this works, because when writing a vint block we always force the first length to be // written PForUtil.skip(payIn); // skip over lengths - int numBytes = payIn.readVInt(); // read length of payloadBytes - payIn.seek(payIn.getFilePointer() + numBytes); // 
skip over payloadBytes + payIn.skipBytes(payIn.readVInt()); // skip over payloadBytes } payloadByteUpto = 0; } @@ -1074,13 +1098,40 @@ private void refillPositions() throws IOException { } } + private void accumulatePayloadAndOffsets() { + if (needsPayloads) { + payloadLength = payloadLengthBuffer[posBufferUpto]; + payload.bytes = payloadBytes; + payload.offset = payloadByteUpto; + payload.length = payloadLength; + payloadByteUpto += payloadLength; + } + + if (needsOffsets) { + startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; + endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; + lastStartOffset = startOffset; + } + } + @Override public int nextPosition() throws IOException { - assert posPendingCount > 0; + if (posDocBufferUpto != docBufferUpto) { + int freq = freq(); // triggers lazy decoding of freqs + + // First position that is being read on this doc. + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto); + posDocBufferUpto = docBufferUpto; + + assert posPendingCount > 0; + + if (posPendingCount > freq) { + skipPositions(freq); + posPendingCount = freq; + } - if (posPendingCount > freq) { - skipPositions(); - posPendingCount = freq; + position = 0; + lastStartOffset = 0; } if (posBufferUpto == BLOCK_SIZE) { @@ -1089,18 +1140,8 @@ public int nextPosition() throws IOException { } position += posDeltaBuffer[posBufferUpto]; - if (indexHasPayloads) { - payloadLength = payloadLengthBuffer[posBufferUpto]; - payload.bytes = payloadBytes; - payload.offset = payloadByteUpto; - payload.length = payloadLength; - payloadByteUpto += payloadLength; - } - - if (indexHasOffsets) { - startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; - endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; - lastStartOffset = startOffset; + if (needsPayloadsOrOffsets) { + accumulatePayloadAndOffsets(); } posBufferUpto++; @@ -1110,17 +1151,23 @@ public int nextPosition() throws IOException { @Override public int startOffset() { + if (needsOffsets == false) { + return -1; + } return startOffset; } @Override public int endOffset() { + if (needsOffsets == false) { + return -1; + } return endOffset; } @Override public BytesRef getPayload() { - if (payloadLength == 0) { + if (needsPayloads == false || payloadLength == 0) { return null; } else { return payload; @@ -1466,9 +1513,13 @@ final class BlockImpactsPostingsEnum extends BlockImpactsEnum { final boolean indexHasPayloads; final boolean indexHasOffsetsOrPayloads; - private int freq; // freq we last read + private long freqFP; // offset of the freq block + private int position; // current position + // value of docBufferUpto on the last doc ID when positions have been read + private int posDocBufferUpto; + // how many positions "behind" we are; nextPosition must // skip these to "catch up": private int posPendingCount; @@ -1516,8 +1567,13 @@ public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState } @Override - public int freq() { - return freq; + public int freq() throws IOException { + if (freqFP != -1) { + docIn.seek(freqFP); + pforUtil.decode(docInUtil, freqBuffer); + freqFP = -1; + } + return freqBuffer[docBufferUpto - 1]; } private void refillDocs() throws IOException { @@ -1526,24 +1582,30 @@ private void refillDocs() throws IOException { if (left >= BLOCK_SIZE) { forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - pforUtil.decode(docInUtil, freqBuffer); + freqFP = docIn.getFilePointer(); + PForUtil.skip(docIn); docCountUpto += BLOCK_SIZE; } 
else if (docFreq == 1) { docBuffer[0] = singletonDocID; freqBuffer[0] = (int) totalTermFreq; + freqFP = -1; docBuffer[1] = NO_MORE_DOCS; docCountUpto++; docBufferSize = 1; + } else { // Read vInts: PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); prefixSum(docBuffer, left, prevDocID); docBuffer[left] = NO_MORE_DOCS; + freqFP = -1; docCountUpto += left; docBufferSize = left; + freqFP = -1; } prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; + posDocBufferUpto = 0; assert docBuffer[docBufferSize] == NO_MORE_DOCS; } @@ -1585,20 +1647,14 @@ private void skipLevel1To(int target) throws IOException { } private void skipLevel0To(int target) throws IOException { + long posFP; + int posUpto; + while (true) { prevDocID = level0LastDocID; - // If nextBlockPosFP is less than the current FP, it means that the block of positions for - // the first docs of the next block are already decoded. In this case we just accumulate - // frequencies into posPendingCount instead of seeking backwards and decoding the same pos - // block again. - if (level0PosEndFP >= posIn.getFilePointer()) { - posIn.seek(level0PosEndFP); - posPendingCount = level0BlockPosUpto; - posBufferUpto = BLOCK_SIZE; - } else { - posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE); - } + posFP = level0PosEndFP; + posUpto = level0BlockPosUpto; if (docFreq - docCountUpto >= BLOCK_SIZE) { docIn.readVLong(); // skip0 num bytes @@ -1631,6 +1687,18 @@ private void skipLevel0To(int target) throws IOException { break; } } + + // If nextBlockPosFP is less than the current FP, it means that the block of positions for + // the first docs of the next block are already decoded. In this case we just accumulate + // frequencies into posPendingCount instead of seeking backwards and decoding the same pos + // block again. + if (posFP >= posIn.getFilePointer()) { + posIn.seek(posFP); + posPendingCount = posUpto; + posBufferUpto = BLOCK_SIZE; + } else { + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); + } } @Override @@ -1660,30 +1728,25 @@ public int nextDoc() throws IOException { } doc = docBuffer[docBufferUpto]; - freq = freqBuffer[docBufferUpto]; - posPendingCount += freq; docBufferUpto++; - position = 0; return this.doc; } @Override public int advance(int target) throws IOException { - advanceShallow(target); - if (needsRefilling) { + if (target > level0LastDocID || needsRefilling) { + advanceShallow(target); + assert needsRefilling; refillDocs(); needsRefilling = false; } int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1); - freq = freqBuffer[next]; docBufferUpto = next + 1; - position = 0; return this.doc = docBuffer[next]; } - private void skipPositions() throws IOException { + private void skipPositions(int freq) throws IOException { // Skip positions now: int toSkip = posPendingCount - freq; // if (DEBUG) { @@ -1703,8 +1766,6 @@ private void skipPositions() throws IOException { refillPositions(); posBufferUpto = toSkip; } - - position = 0; } private void refillPositions() throws IOException { @@ -1739,11 +1800,21 @@ private void refillPositions() throws IOException { @Override public int nextPosition() throws IOException { - assert posPendingCount > 0; + if (posDocBufferUpto != docBufferUpto) { + int freq = freq(); // triggers lazy decoding of freqs + + // First position that is being read on this doc. 
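+      // Catch up on the freqs of the docs skipped since positions were last read (including
+      // the current doc); anything beyond the current doc's freq is skipped below.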
+ posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto); + posDocBufferUpto = docBufferUpto; + + assert posPendingCount > 0; + + if (posPendingCount > freq) { + skipPositions(freq); + posPendingCount = freq; + } - if (posPendingCount > freq) { - skipPositions(); - posPendingCount = freq; + position = 0; } if (posBufferUpto == BLOCK_SIZE) { diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index 2fb0c0783a2e..6447b7305ef2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -85,7 +85,7 @@ public class TieredMergePolicy extends MergePolicy { public static final double DEFAULT_NO_CFS_RATIO = 0.1; // User-specified maxMergeAtOnce. In practice we always take the min of its - // value and segsPerTier to avoid suboptimal merging. + // value and segsPerTier for segments above the floor size to avoid suboptimal merging. private int maxMergeAtOnce = 10; private long maxMergedSegmentBytes = 5 * 1024 * 1024 * 1024L; @@ -100,7 +100,13 @@ public TieredMergePolicy() { super(DEFAULT_NO_CFS_RATIO, MergePolicy.DEFAULT_MAX_CFS_SEGMENT_SIZE); } - /** Maximum number of segments to be merged at a time during "normal" merging. Default is 10. */ + /** + * Maximum number of segments to be merged at a time during "normal" merging. Default is 10. + * + *

NOTE: Merges above the {@link #setFloorSegmentMB(double) floor segment size} also + * bound the number of merged segments by {@link #setSegmentsPerTier(double) the number of + * segments per tier}. + */ public TieredMergePolicy setMaxMergeAtOnce(int v) { if (v < 2) { throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")"); @@ -557,46 +563,46 @@ private MergeSpecification doFindMerges( for (int startIdx = 0; startIdx < sortedEligible.size(); startIdx++) { - long totAfterMergeBytes = 0; - final List candidate = new ArrayList<>(); boolean hitTooLarge = false; long bytesThisMerge = 0; long docCountThisMerge = 0; for (int idx = startIdx; idx < sortedEligible.size() - && candidate.size() < mergeFactor + && candidate.size() < maxMergeAtOnce + // We allow merging more than mergeFactor segments together if the merged segment + // would be less than the floor segment size. This is important because segments + // below the floor segment size are more aggressively merged by this policy, so we + // need to grow them as quickly as possible. + && (candidate.size() < mergeFactor || bytesThisMerge < floorSegmentBytes) && bytesThisMerge < maxMergedSegmentBytes && (bytesThisMerge < floorSegmentBytes || docCountThisMerge <= allowedDocCount); idx++) { final SegmentSizeAndDocs segSizeDocs = sortedEligible.get(idx); final long segBytes = segSizeDocs.sizeInBytes; int segDocCount = segSizeDocs.maxDoc - segSizeDocs.delCount; - if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes - || (totAfterMergeBytes > floorSegmentBytes + if (bytesThisMerge + segBytes > maxMergedSegmentBytes + || (bytesThisMerge > floorSegmentBytes && docCountThisMerge + segDocCount > allowedDocCount)) { // Only set hitTooLarge when reaching the maximum byte size, as this will create // segments of the maximum size which will no longer be eligible for merging for a long // time (until they accumulate enough deletes). - hitTooLarge |= totAfterMergeBytes + segBytes > maxMergedSegmentBytes; - if (candidate.size() == 0) { - // We should never have something coming in that _cannot_ be merged, so handle - // singleton merges - candidate.add(segSizeDocs.segInfo); - bytesThisMerge += segBytes; + hitTooLarge |= bytesThisMerge + segBytes > maxMergedSegmentBytes; + // We should never have something coming in that _cannot_ be merged, so handle + // singleton merges + if (candidate.size() > 0) { + // NOTE: we continue, so that we can try + // "packing" smaller segments into this merge + // to see if we can get closer to the max + // size; this in general is not perfect since + // this is really "bin packing" and we'd have + // to try different permutations. + continue; } - // NOTE: we continue, so that we can try - // "packing" smaller segments into this merge - // to see if we can get closer to the max - // size; this in general is not perfect since - // this is really "bin packing" and we'd have - // to try different permutations. - continue; } candidate.add(segSizeDocs.segInfo); bytesThisMerge += segBytes; docCountThisMerge += segDocCount; - totAfterMergeBytes += segBytes; } // We should never see an empty candidate: we iterated over maxMergeAtOnce @@ -645,7 +651,7 @@ private MergeSpecification doFindMerges( + " tooLarge=" + hitTooLarge + " size=" - + String.format(Locale.ROOT, "%.3f MB", totAfterMergeBytes / 1024. / 1024.), + + String.format(Locale.ROOT, "%.3f MB", bytesThisMerge / 1024. 
/ 1024.), mergeContext); } @@ -654,7 +660,7 @@ private MergeSpecification doFindMerges( best = candidate; bestScore = score; bestTooLarge = hitTooLarge; - bestMergeBytes = totAfterMergeBytes; + bestMergeBytes = bytesThisMerge; } } diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java b/lucene/core/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java index e9a23921eceb..9ebc053a6605 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java @@ -31,8 +31,7 @@ */ public final class ScoreCachingWrappingScorer extends Scorable { - private int lastDoc = -1; - private int curDoc = -1; + private boolean scoreIsCached; private float curScore; private final Scorable in; @@ -64,7 +63,8 @@ public void setScorer(Scorable scorer) throws IOException { @Override public void collect(int doc) throws IOException { if (scorer != null) { - scorer.curDoc = doc; + // Invalidate cache when collecting a new doc + scorer.scoreIsCached = false; } super.collect(doc); } @@ -82,9 +82,9 @@ private ScoreCachingWrappingScorer(Scorable scorer) { @Override public float score() throws IOException { - if (lastDoc != curDoc) { + if (scoreIsCached == false) { curScore = in.score(); - curDoc = lastDoc; + scoreIsCached = true; } return curScore; diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index 9273f7c5a813..18ef76914bbf 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -75,6 +75,9 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { } } + // cached vector sizes for smaller method bodies + private static final int FLOAT_SPECIES_LENGTH = FLOAT_SPECIES.length(); + // the way FMA should work! 
if available use it, otherwise fall back to mul/add private static FloatVector fma(FloatVector a, FloatVector b, FloatVector c) { if (Constants.HAS_FAST_VECTOR_FMA) { @@ -99,7 +102,7 @@ public float dotProduct(float[] a, float[] b) { float res = 0; // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize - if (a.length > 2 * FLOAT_SPECIES.length()) { + if (a.length > 2 * FLOAT_SPECIES_LENGTH) { i += FLOAT_SPECIES.loopBound(a.length); res += dotProductBody(a, b, i); } @@ -120,30 +123,33 @@ private float dotProductBody(float[] a, float[] b, int limit) { FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES); FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES); FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES); - int unrolledLimit = limit - 3 * FLOAT_SPECIES.length(); - for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES.length()) { + final int unrolledLimit = limit - 3 * FLOAT_SPECIES_LENGTH; + for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES_LENGTH) { // one FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); acc1 = fma(va, vb, acc1); // two - FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length()); - FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length()); + final int i2 = i + FLOAT_SPECIES_LENGTH; + FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2); + FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2); acc2 = fma(vc, vd, acc2); // three - FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i + 2 * FLOAT_SPECIES.length()); - FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i + 2 * FLOAT_SPECIES.length()); + final int i3 = i2 + FLOAT_SPECIES_LENGTH; + FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i3); + FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i3); acc3 = fma(ve, vf, acc3); // four - FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i + 3 * FLOAT_SPECIES.length()); - FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i + 3 * FLOAT_SPECIES.length()); + final int i4 = i3 + FLOAT_SPECIES_LENGTH; + FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i4); + FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i4); acc4 = fma(vg, vh, acc4); } // vector tail: less scalar computations for unaligned sizes, esp with big vector sizes - for (; i < limit; i += FLOAT_SPECIES.length()) { + for (; i < limit; i += FLOAT_SPECIES_LENGTH) { FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); acc1 = fma(va, vb, acc1); @@ -162,7 +168,7 @@ public float cosine(float[] a, float[] b) { float norm2 = 0; // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize - if (a.length > 2 * FLOAT_SPECIES.length()) { + if (a.length > 2 * FLOAT_SPECIES_LENGTH) { i += FLOAT_SPECIES.loopBound(a.length); float[] ret = cosineBody(a, b, i); sum += ret[0]; @@ -190,8 +196,8 @@ private float[] cosineBody(float[] a, float[] b, int limit) { FloatVector norm1_2 = FloatVector.zero(FLOAT_SPECIES); FloatVector norm2_1 = FloatVector.zero(FLOAT_SPECIES); FloatVector norm2_2 = FloatVector.zero(FLOAT_SPECIES); - int unrolledLimit = limit - FLOAT_SPECIES.length(); - for (; i < unrolledLimit; i += 2 * FLOAT_SPECIES.length()) { + final int unrolledLimit = limit - FLOAT_SPECIES_LENGTH; + for (; i < unrolledLimit; i += 2 * FLOAT_SPECIES_LENGTH) { // one FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, 
i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); @@ -200,14 +206,15 @@ private float[] cosineBody(float[] a, float[] b, int limit) { norm2_1 = fma(vb, vb, norm2_1); // two - FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length()); - FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length()); + final int i2 = i + FLOAT_SPECIES_LENGTH; + FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2); + FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2); sum2 = fma(vc, vd, sum2); norm1_2 = fma(vc, vc, norm1_2); norm2_2 = fma(vd, vd, norm2_2); } // vector tail: less scalar computations for unaligned sizes, esp with big vector sizes - for (; i < limit; i += FLOAT_SPECIES.length()) { + for (; i < limit; i += FLOAT_SPECIES_LENGTH) { FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); sum1 = fma(va, vb, sum1); @@ -227,7 +234,7 @@ public float squareDistance(float[] a, float[] b) { float res = 0; // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize - if (a.length > 2 * FLOAT_SPECIES.length()) { + if (a.length > 2 * FLOAT_SPECIES_LENGTH) { i += FLOAT_SPECIES.loopBound(a.length); res += squareDistanceBody(a, b, i); } @@ -240,6 +247,12 @@ public float squareDistance(float[] a, float[] b) { return res; } + /** helper: returns fma(a.sub(b), a.sub(b), c) */ + private static FloatVector square(FloatVector a, FloatVector b, FloatVector c) { + FloatVector diff = a.sub(b); + return fma(diff, diff, c); + } + /** vectorized square distance body */ private float squareDistanceBody(float[] a, float[] b, int limit) { int i = 0; @@ -249,38 +262,36 @@ private float squareDistanceBody(float[] a, float[] b, int limit) { FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES); FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES); FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES); - int unrolledLimit = limit - 3 * FLOAT_SPECIES.length(); - for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES.length()) { + final int unrolledLimit = limit - 3 * FLOAT_SPECIES_LENGTH; + for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES_LENGTH) { // one FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); - FloatVector diff1 = va.sub(vb); - acc1 = fma(diff1, diff1, acc1); + acc1 = square(va, vb, acc1); // two - FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length()); - FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length()); - FloatVector diff2 = vc.sub(vd); - acc2 = fma(diff2, diff2, acc2); + final int i2 = i + FLOAT_SPECIES_LENGTH; + FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2); + FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2); + acc2 = square(vc, vd, acc2); // three - FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i + 2 * FLOAT_SPECIES.length()); - FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i + 2 * FLOAT_SPECIES.length()); - FloatVector diff3 = ve.sub(vf); - acc3 = fma(diff3, diff3, acc3); + final int i3 = i2 + FLOAT_SPECIES_LENGTH; + FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i3); + FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i3); + acc3 = square(ve, vf, acc3); // four - FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i + 3 * FLOAT_SPECIES.length()); - FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i + 3 * FLOAT_SPECIES.length()); - 
FloatVector diff4 = vg.sub(vh); - acc4 = fma(diff4, diff4, acc4); + final int i4 = i3 + FLOAT_SPECIES_LENGTH; + FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i4); + FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i4); + acc4 = square(vg, vh, acc4); } // vector tail: less scalar computations for unaligned sizes, esp with big vector sizes - for (; i < limit; i += FLOAT_SPECIES.length()) { + for (; i < limit; i += FLOAT_SPECIES_LENGTH) { FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); - FloatVector diff = va.sub(vb); - acc1 = fma(diff, diff, acc1); + acc1 = square(va, vb, acc1); } // reduce FloatVector res1 = acc1.add(acc2); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java index a2d678a3ec04..a27bdc5f92cd 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java @@ -163,9 +163,8 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws @Override protected void assertMerge(MergePolicy policy, MergeSpecification merges) { TieredMergePolicy tmp = (TieredMergePolicy) policy; - final int mergeFactor = (int) Math.min(tmp.getMaxMergeAtOnce(), tmp.getSegmentsPerTier()); for (OneMerge merge : merges.merges) { - assertTrue(merge.segments.size() <= mergeFactor); + assertTrue(merge.segments.size() <= tmp.getMaxMergeAtOnce()); } } @@ -943,6 +942,49 @@ public void testSimulateUpdates() throws IOException { doTestSimulateUpdates(mergePolicy, numDocs, 2500); } + public void testMergeSizeIsLessThanFloorSize() throws IOException { + MergeContext mergeContext = new MockMergeContext(SegmentCommitInfo::getDelCount); + + SegmentInfos infos = new SegmentInfos(Version.LATEST.major); + // 50 1MB segments + for (int i = 0; i < 50; ++i) { + infos.add(makeSegmentCommitInfo("_0", 1_000_000, 0, 1, IndexWriter.SOURCE_FLUSH)); + } + + TieredMergePolicy mergePolicy = new TieredMergePolicy(); + mergePolicy.setMaxMergeAtOnce(30); + mergePolicy.setFloorSegmentMB(0.1); + + // Segments are above the floor segment size, we get 4 merges of mergeFactor=10 segments each + MergeSpecification mergeSpec = + mergePolicy.findMerges(MergeTrigger.FULL_FLUSH, infos, mergeContext); + assertNotNull(mergeSpec); + assertEquals(4, mergeSpec.merges.size()); + for (OneMerge oneMerge : mergeSpec.merges) { + assertEquals(mergePolicy.getSegmentsPerTier(), oneMerge.segments.size(), 0d); + } + + // Segments are below the floor segment size and it takes 15 segments to go above the floor + // segment size. We get 3 merges of 15 segments each + mergePolicy.setFloorSegmentMB(15); + mergeSpec = mergePolicy.findMerges(MergeTrigger.FULL_FLUSH, infos, mergeContext); + assertNotNull(mergeSpec); + assertEquals(3, mergeSpec.merges.size()); + for (OneMerge oneMerge : mergeSpec.merges) { + assertEquals(15, oneMerge.segments.size()); + } + + // Segments are below the floor segment size and we'd need to merge more than maxMergeAtOnce + // segments to go above the minimum segment size. We get 1 merge of maxMergeAtOnce=30 segments + // and 1 merge of 50-30=20 segments. 
+    mergePolicy.setFloorSegmentMB(60);
+    mergeSpec = mergePolicy.findMerges(MergeTrigger.FULL_FLUSH, infos, mergeContext);
+    assertNotNull(mergeSpec);
+    assertEquals(2, mergeSpec.merges.size());
+    assertEquals(30, mergeSpec.merges.get(0).segments.size());
+    assertEquals(20, mergeSpec.merges.get(1).segments.size());
+  }
+
   public void testFullFlushMerges() throws IOException {
     AtomicLong segNameGenerator = new AtomicLong();
     IOStats stats = new IOStats();
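A hedged configuration sketch of how the new TieredMergePolicy behavior can be exercised from user code; the concrete values (30, 10, 16 MB) are illustrative, not recommendations. With this change, merges whose combined size stays below the floor segment size may pack up to maxMergeAtOnce segments, while merges above the floor remain bounded by segmentsPerTier.

    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.TieredMergePolicy;

    final class FloorMergeConfigExample {
      static IndexWriterConfig newConfig() {
        TieredMergePolicy tmp = new TieredMergePolicy();
        tmp.setSegmentsPerTier(10);  // merges above the floor stay at ~10 segments per merge
        tmp.setMaxMergeAtOnce(30);   // sub-floor merges may now combine up to 30 segments
        tmp.setFloorSegmentMB(16);   // segments below 16 MB are merged more aggressively
        return new IndexWriterConfig().setMergePolicy(tmp);
      }
    }
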