Skip to content

Commit

Permalink
LUCENE-9539: Remove caches from SortingCodecReader (#1909)
Browse files Browse the repository at this point in the history
SortingCodecReader kept every doc-values field it loaded in memory.
Yet this reader should only be used for merging, which happens sequentially,
so caching doc values is unnecessary.

Co-authored-by: Jim Ferenczi <[email protected]>
  • Loading branch information
s1monw and jimczi committed Sep 23, 2020
1 parent 12664dd commit 427e11c
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 102 deletions.
7 changes: 7 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,13 @@ Documentation

* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)

Changes in Runtime Behavior
---------------------

* LUCENE-9539: SortingCodecReader no longer caches doc values fields. Previously, SortingCodecReader
  cached all doc values fields after they were loaded into memory. This reader should only be used
  to sort segments after the fact using IndexWriter#addIndexes. (Simon Willnauer)


Other
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsu
if (finalLengths == null) {
finalLengths = this.lengths.build();
}
final CachedBinaryDVs sorted;
final BinaryDVs sorted;
if (sortMap != null) {
sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap,
sorted = new BinaryDVs(state.segmentInfo.maxDoc(), sortMap,
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
} else {
sorted = null;
Expand Down Expand Up @@ -189,11 +189,11 @@ public BytesRef binaryValue() {
}

static class SortingBinaryDocValues extends BinaryDocValues {
private final CachedBinaryDVs dvs;
private final BinaryDVs dvs;
private final BytesRefBuilder spare = new BytesRefBuilder();
private int docID = -1;

SortingBinaryDocValues(CachedBinaryDVs dvs) {
SortingBinaryDocValues(BinaryDVs dvs) {
this.dvs = dvs;
}

Expand Down Expand Up @@ -235,10 +235,10 @@ public long cost() {
}
}

static final class CachedBinaryDVs {
static final class BinaryDVs {
final int[] offsets;
final BytesRefArray values;
CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
BinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
offsets = new int[maxDoc];
values = new BytesRefArray(Counter.newCounter());
int offset = 1; // 0 means no values for this document
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public void finish(int maxDoc) {

public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException {
final PackedLongValues values = pending.build();
final NumericDocValuesWriter.CachedNumericDVs sorted;
final NumericDocValuesWriter.NumericDVs sorted;
if (sortMap != null) {
sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedNorms(values, docsWithField.iterator()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ NumericDocValues getDocValues() {
return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
}

static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
static NumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
long[] values = new long[maxDoc];
while (true) {
Expand All @@ -89,15 +89,15 @@ static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, Numeric
docsWithField.set(newDocID);
values[newDocID] = oldDocValues.longValue();
}
return new CachedNumericDVs(values, docsWithField);
return new NumericDVs(values, docsWithField);
}

@Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
if (finalValues == null) {
finalValues = pending.build();
}
final CachedNumericDVs sorted;
final NumericDVs sorted;
if (sortMap != null) {
NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator());
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
Expand Down Expand Up @@ -169,11 +169,11 @@ public long longValue() {

static class SortingNumericDocValues extends NumericDocValues {

private final CachedNumericDVs dvs;
private final NumericDVs dvs;
private int docID = -1;
private long cost = -1;

SortingNumericDocValues(CachedNumericDVs dvs) {
SortingNumericDocValues(NumericDVs dvs) {
this.dvs = dvs;
}

Expand Down Expand Up @@ -218,11 +218,11 @@ public long cost() {
}
}

static class CachedNumericDVs {
static class NumericDVs {
private final long[] values;
private final BitSet docsWithField;

CachedNumericDVs(long[] values, BitSet docsWithField) {
NumericDVs(long[] values, BitSet docsWithField) {
this.values = values;
this.docsWithField = docsWithField;
}
Expand Down
166 changes: 80 additions & 86 deletions lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.packed.PackedInts;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
Expand All @@ -41,21 +43,13 @@
 * {@link Sort}. This can be used to re-sort an index after it has been created by wrapping all
* readers of the index with this reader and adding it to a fresh IndexWriter via
* {@link IndexWriter#addIndexes(CodecReader...)}.
 * NOTE: This reader should only be used for merging. Pulling fields from this reader can be very
 * costly and memory-intensive.
*
* @lucene.experimental
*/
public final class SortingCodecReader extends FilterCodecReader {

private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNumericDVs = new HashMap<>();

private final Map<String, BinaryDocValuesWriter.CachedBinaryDVs> cachedBinaryDVs = new HashMap<>();

private final Map<String, int[]> cachedSortedDVs = new HashMap<>();

private final Map<String, SortedSetDocValuesWriter.DocOrds> cachedSortedSetDVs = new HashMap<>();

private final Map<String, SortedNumericDocValuesWriter.LongValues> cachedSortedNumericDVs = new HashMap<>();

private static class SortingBits implements Bits {

private final Bits in;
Expand Down Expand Up @@ -148,10 +142,6 @@ public int getDocCount() {
}
}





/** Return a sorted view of <code>reader</code> according to the order
* defined by <code>sort</code>. If the reader is already sorted, this
* method might return the reader as-is. */
Expand Down Expand Up @@ -313,15 +303,13 @@ public long ramBytesUsed() {
};
}

private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms = new HashMap<>();

@Override
public NormsProducer getNormsReader() {
final NormsProducer delegate = in.getNormsReader();
return new NormsProducer() {
@Override
public NumericDocValues getNorms(FieldInfo field) throws IOException {
return produceNumericDocValues(field, delegate.getNorms(field), cachedNorms);
return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateNorms(field.name, () -> getNumericDocValues(delegate.getNorms(field))));
}

@Override
Expand All @@ -341,101 +329,48 @@ public long ramBytesUsed() {
};
}

/**
 * Loads all values of {@code oldNorms} into an in-memory {@code CachedNumericDVs}, remapping each
 * doc ID through {@code docMap} to the sorted order, and returns a sorted view over the cached
 * values. The result is memoized in {@code cachedNorms} so subsequent calls for the same field
 * reuse the loaded values.
 *
 * @param field       the field whose values are being sorted
 * @param oldNorms    the unsorted values, consumed to exhaustion
 * @param cachedNorms per-field cache, keyed by field name; also used as the synchronization monitor
 * @return a {@code NumericDocValues} iterating the values in sorted-doc order
 * @throws IOException if reading from {@code oldNorms} fails
 */
private NumericDocValues produceNumericDocValues(FieldInfo field, NumericDocValues oldNorms,
                                  Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms) throws IOException {
  NumericDocValuesWriter.CachedNumericDVs norms;
  // synchronize on the map itself: loading is expensive and must happen at most once per field
  synchronized (cachedNorms) {
    // NOTE(review): the map is keyed by String but looked up with a FieldInfo; Map.get(Object)
    // compiles but can never hit an entry stored under field.name (see put below), so every call
    // reloads the values and the cache only grows — likely a latent bug in this removed code.
    norms = cachedNorms.get(field);
    if (norms == null) {
      // dense scratch arrays sized to maxDoc(); 'docsWithField' records which slots hold a value
      FixedBitSet docsWithField = new FixedBitSet(maxDoc());
      long[] values = new long[maxDoc()];
      while (true) {
        int docID = oldNorms.nextDoc();
        if (docID == NO_MORE_DOCS) {
          break;
        }
        // remap old doc ID to its position in the sorted segment
        int newDocID = docMap.oldToNew(docID);
        docsWithField.set(newDocID);
        values[newDocID] = oldNorms.longValue();
      }
      norms = new NumericDocValuesWriter.CachedNumericDVs(values, docsWithField);
      // stored under the field name (a String) — inconsistent with the FieldInfo lookup above
      cachedNorms.put(field.name, norms);
    }
  }
  return new NumericDocValuesWriter.SortingNumericDocValues(norms);
}

@Override
public DocValuesProducer getDocValuesReader() {
final DocValuesProducer delegate = in.getDocValuesReader();
return new DocValuesProducer() {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
return produceNumericDocValues(field,delegate.getNumeric(field), cachedNumericDVs);
return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateDV(field.name, () -> getNumericDocValues(delegate.getNumeric(field))));
}

@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
final BinaryDocValues oldDocValues = delegate.getBinary(field);
BinaryDocValuesWriter.CachedBinaryDVs dvs;
synchronized (cachedBinaryDVs) {
dvs = cachedBinaryDVs.get(field);
if (dvs == null) {
dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues);
cachedBinaryDVs.put(field.name, dvs);
}
}
return new BinaryDocValuesWriter.SortingBinaryDocValues(dvs);
return new BinaryDocValuesWriter.SortingBinaryDocValues(getOrCreateDV(field.name,
() -> new BinaryDocValuesWriter.BinaryDVs(maxDoc(), docMap, delegate.getBinary(field))));
}

@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedDocValues oldDocValues = delegate.getSorted(field);
int[] ords;
synchronized (cachedSortedDVs) {
ords = cachedSortedDVs.get(field);
if (ords == null) {
ords = new int[maxDoc()];
Arrays.fill(ords, -1);
int docID;
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
ords[newDocID] = oldDocValues.ordValue();
}
cachedSortedDVs.put(field.name, ords);
return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, getOrCreateDV(field.name, () -> {
int[] ords = new int[maxDoc()];
Arrays.fill(ords, -1);
int docID;
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
ords[newDocID] = oldDocValues.ordValue();
}
}

return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, ords);
return ords;
}));
}

@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field);
SortedNumericDocValuesWriter.LongValues values;
synchronized (cachedSortedNumericDVs) {
values = cachedSortedNumericDVs.get(field);
if (values == null) {
values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST);
cachedSortedNumericDVs.put(field.name, values);
}
}

return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, values);
return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, getOrCreateDV(field.name, () ->
new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
}

@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetDocValues oldDocValues = delegate.getSortedSet(field);
SortedSetDocValuesWriter.DocOrds ords;
synchronized (cachedSortedSetDVs) {
ords = cachedSortedSetDVs.get(field);
if (ords == null) {
ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST);
cachedSortedSetDVs.put(field.name, ords);
}
}
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, ords);
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, getOrCreateDV(field.name, () ->
new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
}

@Override
Expand All @@ -455,6 +390,18 @@ public long ramBytesUsed() {
};
}

/**
 * Drains {@code oldNumerics} and materializes its values in sorted-doc order: each doc ID is
 * remapped through {@code docMap} and its value stored at the remapped slot, with a bit set
 * recording which slots actually hold a value.
 *
 * @param oldNumerics the unsorted values, consumed to exhaustion
 * @return the remapped values plus their docs-with-field bit set
 * @throws IOException if reading from {@code oldNumerics} fails
 */
private NumericDocValuesWriter.NumericDVs getNumericDocValues(NumericDocValues oldNumerics) throws IOException {
  final int numDocs = maxDoc();
  final long[] sortedValues = new long[numDocs];
  final FixedBitSet present = new FixedBitSet(numDocs);
  for (int oldDoc = oldNumerics.nextDoc(); oldDoc != NO_MORE_DOCS; oldDoc = oldNumerics.nextDoc()) {
    final int sortedDoc = docMap.oldToNew(oldDoc);
    present.set(sortedDoc);
    sortedValues[sortedDoc] = oldNumerics.longValue();
  }
  return new NumericDocValuesWriter.NumericDVs(sortedValues, present);
}

@Override
public TermVectorsReader getTermVectorsReader() {
return newTermVectorsReader(in.getTermVectorsReader());
Expand Down Expand Up @@ -510,4 +457,51 @@ public LeafMetaData getMetaData() {
return metaData;
}

// We try to cache the last used DV or Norms instance since during merge
// this instance is used more than once. We could, in addition to this single instance,
// also cache the fields that are used for sorting since we do the work twice for these fields.
// Norms and doc values are tracked separately (cacheIsNorms) because the same field name can
// have both.
private String cachedField;        // field name of the currently cached instance, or null
private Object cachedObject;       // the cached DV/Norms instance for cachedField
private boolean cacheIsNorms;      // true if cachedObject holds norms rather than doc values

/** Returns the cached norms instance for {@code field}, creating it via {@code supplier} on a cache miss. */
private <T> T getOrCreateNorms(String field, IOSupplier<T> supplier) throws IOException {
  return getOrCreate(field, true, supplier);
}

/**
 * Single-slot cache: if the last request was for the same (field, norms) pair, the previously
 * created instance is returned; otherwise {@code supplier} is invoked and its result replaces
 * the slot. Synchronized so concurrent producers see a consistent slot.
 */
@SuppressWarnings("unchecked")
private synchronized <T> T getOrCreate(String field, boolean norms, IOSupplier<T> supplier) throws IOException {
  if ((field.equals(cachedField) && cacheIsNorms == norms) == false) {
    // only with assertions enabled: verify each field is loaded at most the expected number of times
    assert assertCreatedOnlyOnce(field, norms);
    cachedObject = supplier.get();
    cachedField = field;
    cacheIsNorms = norms;
  }
  assert cachedObject != null;
  return (T) cachedObject;
}

private final Map<String, Integer> cacheStats = new HashMap<>(); // only with assertions enabled

/**
 * Assertion-only bookkeeping: counts loads per (field, norms) key and fails if a field is loaded
 * more often than merging can legitimately require — norms at most once; doc values at most twice,
 * and only for index-sort fields (which are read once for sorting and once for merging).
 * Always returns {@code true} so it can sit inside an {@code assert}.
 */
private boolean assertCreatedOnlyOnce(String field, boolean norms) {
  assert Thread.holdsLock(this);
  // mainly here so that if we change anything in the way we merge, we notice it early
  Integer timesCached = cacheStats.compute(field + "N:" + norms, (s, i) -> i == null ? 1 : i.intValue() + 1);
  if (timesCached > 1) {
    assert norms == false :"[" + field + "] norms must not be cached twice";
    boolean isSortField = false;
    // sort fields are the only ones legitimately pulled twice during a sorted merge
    for (SortField sf : metaData.getSort().getSort()) {
      if (field.equals(sf.getField())) {
        isSortField = true;
        break;
      }
    }
    assert timesCached == 2 : "[" + field + "] must not be cached more than twice but was cached: "
        + timesCached + " times isSortField: " + isSortField;
    assert isSortField : "only sort fields should be cached twice but [" + field + "] is not a sort field";
  }
  return true;
}

/** Returns the cached doc-values instance for {@code field}, creating it via {@code supplier} on a cache miss. */
private <T> T getOrCreateDV(String field, IOSupplier<T> supplier) throws IOException {
  return getOrCreate(field, false, supplier);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,18 @@ public void testSortOnAddIndicesRandom() throws IOException {
for (int i = 0; i < numDocs; i++) {
int docId = docIds.get(i);
Document doc = new Document();
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES));
doc.add(new LongPoint("id", docId));
doc.add(new TextField("text_field", RandomStrings.randomRealisticUnicodeOfLength(random(), 25), Field.Store.YES));
String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25);
doc.add(new TextField("text_field", s, Field.Store.YES));
doc.add(new BinaryDocValuesField("text_field", new BytesRef(s)));
doc.add(new TextField("another_text_field", s, Field.Store.YES));
doc.add(new BinaryDocValuesField("another_text_field", new BytesRef(s)));
doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId));
doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId))));
doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId))));
doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId))));
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));

FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
Expand Down

0 comments on commit 427e11c

Please sign in to comment.