Skip to content

Commit

Permalink
LUCENE-9539: Remove caches from SortingCodecReader (#1909)
Browse files Browse the repository at this point in the history
SortingCodecReader kept every doc-values field it loaded in memory.
Yet this reader should only be used for merging, which happens sequentially,
so caching doc values is unnecessary.

Co-authored-by: Jim Ferenczi <[email protected]>
  • Loading branch information
s1monw and jimczi committed Sep 23, 2020
1 parent 12664dd commit 427e11c
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 102 deletions.
7 changes: 7 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,13 @@ Documentation

* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)

Changes in Runtime Behavior
---------------------

* LUCENE-9539: SortingCodecReader no longer caches doc values fields. Previously, SortingCodecReader
  cached all doc values fields after they were loaded into memory. This reader should only be used
  to sort segments after the fact using IndexWriter#addIndexes. (Simon Willnauer)


Other
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsu
if (finalLengths == null) {
finalLengths = this.lengths.build();
}
final CachedBinaryDVs sorted;
final BinaryDVs sorted;
if (sortMap != null) {
sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap,
sorted = new BinaryDVs(state.segmentInfo.maxDoc(), sortMap,
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
} else {
sorted = null;
Expand Down Expand Up @@ -189,11 +189,11 @@ public BytesRef binaryValue() {
}

static class SortingBinaryDocValues extends BinaryDocValues {
private final CachedBinaryDVs dvs;
private final BinaryDVs dvs;
private final BytesRefBuilder spare = new BytesRefBuilder();
private int docID = -1;

SortingBinaryDocValues(CachedBinaryDVs dvs) {
SortingBinaryDocValues(BinaryDVs dvs) {
this.dvs = dvs;
}

Expand Down Expand Up @@ -235,10 +235,10 @@ public long cost() {
}
}

static final class CachedBinaryDVs {
static final class BinaryDVs {
final int[] offsets;
final BytesRefArray values;
CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
BinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
offsets = new int[maxDoc];
values = new BytesRefArray(Counter.newCounter());
int offset = 1; // 0 means no values for this document
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public void finish(int maxDoc) {

public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException {
final PackedLongValues values = pending.build();
final NumericDocValuesWriter.CachedNumericDVs sorted;
final NumericDocValuesWriter.NumericDVs sorted;
if (sortMap != null) {
sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedNorms(values, docsWithField.iterator()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ NumericDocValues getDocValues() {
return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
}

static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
static NumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
long[] values = new long[maxDoc];
while (true) {
Expand All @@ -89,15 +89,15 @@ static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, Numeric
docsWithField.set(newDocID);
values[newDocID] = oldDocValues.longValue();
}
return new CachedNumericDVs(values, docsWithField);
return new NumericDVs(values, docsWithField);
}

@Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
if (finalValues == null) {
finalValues = pending.build();
}
final CachedNumericDVs sorted;
final NumericDVs sorted;
if (sortMap != null) {
NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator());
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
Expand Down Expand Up @@ -169,11 +169,11 @@ public long longValue() {

static class SortingNumericDocValues extends NumericDocValues {

private final CachedNumericDVs dvs;
private final NumericDVs dvs;
private int docID = -1;
private long cost = -1;

SortingNumericDocValues(CachedNumericDVs dvs) {
SortingNumericDocValues(NumericDVs dvs) {
this.dvs = dvs;
}

Expand Down Expand Up @@ -218,11 +218,11 @@ public long cost() {
}
}

static class CachedNumericDVs {
static class NumericDVs {
private final long[] values;
private final BitSet docsWithField;

CachedNumericDVs(long[] values, BitSet docsWithField) {
NumericDVs(long[] values, BitSet docsWithField) {
this.values = values;
this.docsWithField = docsWithField;
}
Expand Down
166 changes: 80 additions & 86 deletions lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.packed.PackedInts;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
Expand All @@ -41,21 +43,13 @@
 * {@link Sort}. This can be used to re-sort an index after it has been created by wrapping all
* readers of the index with this reader and adding it to a fresh IndexWriter via
* {@link IndexWriter#addIndexes(CodecReader...)}.
 * NOTE: This reader should only be used for merging. Pulling fields from this reader can be very
 * costly and memory-intensive.
*
* @lucene.experimental
*/
public final class SortingCodecReader extends FilterCodecReader {

private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNumericDVs = new HashMap<>();

private final Map<String, BinaryDocValuesWriter.CachedBinaryDVs> cachedBinaryDVs = new HashMap<>();

private final Map<String, int[]> cachedSortedDVs = new HashMap<>();

private final Map<String, SortedSetDocValuesWriter.DocOrds> cachedSortedSetDVs = new HashMap<>();

private final Map<String, SortedNumericDocValuesWriter.LongValues> cachedSortedNumericDVs = new HashMap<>();

private static class SortingBits implements Bits {

private final Bits in;
Expand Down Expand Up @@ -148,10 +142,6 @@ public int getDocCount() {
}
}





/** Return a sorted view of <code>reader</code> according to the order
* defined by <code>sort</code>. If the reader is already sorted, this
* method might return the reader as-is. */
Expand Down Expand Up @@ -313,15 +303,13 @@ public long ramBytesUsed() {
};
}

private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms = new HashMap<>();

@Override
public NormsProducer getNormsReader() {
final NormsProducer delegate = in.getNormsReader();
return new NormsProducer() {
@Override
public NumericDocValues getNorms(FieldInfo field) throws IOException {
return produceNumericDocValues(field, delegate.getNorms(field), cachedNorms);
return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateNorms(field.name, () -> getNumericDocValues(delegate.getNorms(field))));
}

@Override
Expand All @@ -341,101 +329,48 @@ public long ramBytesUsed() {
};
}

/**
 * Loads all values of {@code oldNorms} into an in-memory {@code CachedNumericDVs}, remapping each
 * doc ID through {@code docMap} to the sorted order, and returns a sorted view over the cached
 * values. The result is memoized in {@code cachedNorms} so subsequent calls for the same field
 * reuse the loaded values.
 *
 * @param field       the field whose values are being sorted
 * @param oldNorms    the unsorted values, consumed to exhaustion
 * @param cachedNorms per-field cache, keyed by field name; also used as the synchronization monitor
 * @return a {@code NumericDocValues} iterating the values in sorted-doc order
 * @throws IOException if reading from {@code oldNorms} fails
 */
private NumericDocValues produceNumericDocValues(FieldInfo field, NumericDocValues oldNorms,
                                  Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms) throws IOException {
  NumericDocValuesWriter.CachedNumericDVs norms;
  // synchronize on the map itself: loading is expensive and must happen at most once per field
  synchronized (cachedNorms) {
    // NOTE(review): the map is keyed by String but looked up with a FieldInfo; Map.get(Object)
    // compiles but can never hit an entry stored under field.name (see put below), so every call
    // reloads the values and the cache only grows — likely a latent bug in this removed code.
    norms = cachedNorms.get(field);
    if (norms == null) {
      // dense scratch arrays sized to maxDoc(); 'docsWithField' records which slots hold a value
      FixedBitSet docsWithField = new FixedBitSet(maxDoc());
      long[] values = new long[maxDoc()];
      while (true) {
        int docID = oldNorms.nextDoc();
        if (docID == NO_MORE_DOCS) {
          break;
        }
        // remap old doc ID to its position in the sorted segment
        int newDocID = docMap.oldToNew(docID);
        docsWithField.set(newDocID);
        values[newDocID] = oldNorms.longValue();
      }
      norms = new NumericDocValuesWriter.CachedNumericDVs(values, docsWithField);
      // stored under the field name (a String) — inconsistent with the FieldInfo lookup above
      cachedNorms.put(field.name, norms);
    }
  }
  return new NumericDocValuesWriter.SortingNumericDocValues(norms);
}

@Override
public DocValuesProducer getDocValuesReader() {
final DocValuesProducer delegate = in.getDocValuesReader();
return new DocValuesProducer() {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
return produceNumericDocValues(field,delegate.getNumeric(field), cachedNumericDVs);
return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateDV(field.name, () -> getNumericDocValues(delegate.getNumeric(field))));
}

@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
final BinaryDocValues oldDocValues = delegate.getBinary(field);
BinaryDocValuesWriter.CachedBinaryDVs dvs;
synchronized (cachedBinaryDVs) {
dvs = cachedBinaryDVs.get(field);
if (dvs == null) {
dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues);
cachedBinaryDVs.put(field.name, dvs);
}
}
return new BinaryDocValuesWriter.SortingBinaryDocValues(dvs);
return new BinaryDocValuesWriter.SortingBinaryDocValues(getOrCreateDV(field.name,
() -> new BinaryDocValuesWriter.BinaryDVs(maxDoc(), docMap, delegate.getBinary(field))));
}

@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedDocValues oldDocValues = delegate.getSorted(field);
int[] ords;
synchronized (cachedSortedDVs) {
ords = cachedSortedDVs.get(field);
if (ords == null) {
ords = new int[maxDoc()];
Arrays.fill(ords, -1);
int docID;
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
ords[newDocID] = oldDocValues.ordValue();
}
cachedSortedDVs.put(field.name, ords);
return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, getOrCreateDV(field.name, () -> {
int[] ords = new int[maxDoc()];
Arrays.fill(ords, -1);
int docID;
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
ords[newDocID] = oldDocValues.ordValue();
}
}

return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, ords);
return ords;
}));
}

@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field);
SortedNumericDocValuesWriter.LongValues values;
synchronized (cachedSortedNumericDVs) {
values = cachedSortedNumericDVs.get(field);
if (values == null) {
values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST);
cachedSortedNumericDVs.put(field.name, values);
}
}

return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, values);
return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, getOrCreateDV(field.name, () ->
new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
}

@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetDocValues oldDocValues = delegate.getSortedSet(field);
SortedSetDocValuesWriter.DocOrds ords;
synchronized (cachedSortedSetDVs) {
ords = cachedSortedSetDVs.get(field);
if (ords == null) {
ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST);
cachedSortedSetDVs.put(field.name, ords);
}
}
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, ords);
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, getOrCreateDV(field.name, () ->
new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
}

@Override
Expand All @@ -455,6 +390,18 @@ public long ramBytesUsed() {
};
}

/**
 * Drains {@code oldNumerics} and materializes its values in sorted-doc order: each doc ID is
 * remapped through {@code docMap} and its value stored at the remapped slot, with a bit set
 * recording which slots actually hold a value.
 *
 * @param oldNumerics the unsorted values, consumed to exhaustion
 * @return the remapped values plus their docs-with-field bit set
 * @throws IOException if reading from {@code oldNumerics} fails
 */
private NumericDocValuesWriter.NumericDVs getNumericDocValues(NumericDocValues oldNumerics) throws IOException {
  final int numDocs = maxDoc();
  final long[] sortedValues = new long[numDocs];
  final FixedBitSet present = new FixedBitSet(numDocs);
  for (int oldDoc = oldNumerics.nextDoc(); oldDoc != NO_MORE_DOCS; oldDoc = oldNumerics.nextDoc()) {
    final int sortedDoc = docMap.oldToNew(oldDoc);
    present.set(sortedDoc);
    sortedValues[sortedDoc] = oldNumerics.longValue();
  }
  return new NumericDocValuesWriter.NumericDVs(sortedValues, present);
}

@Override
public TermVectorsReader getTermVectorsReader() {
return newTermVectorsReader(in.getTermVectorsReader());
Expand Down Expand Up @@ -510,4 +457,51 @@ public LeafMetaData getMetaData() {
return metaData;
}

// We try to cache the last used DV or Norms instance since during merge
// this instance is used more than once. We could, in addition to this single instance,
// also cache the fields that are used for sorting since we do the work twice for these fields.
// Norms and doc values are tracked separately (cacheIsNorms) because the same field name can
// have both.
private String cachedField;        // field name of the currently cached instance, or null
private Object cachedObject;       // the cached DV/Norms instance for cachedField
private boolean cacheIsNorms;      // true if cachedObject holds norms rather than doc values

/** Returns the cached norms instance for {@code field}, creating it via {@code supplier} on a cache miss. */
private <T> T getOrCreateNorms(String field, IOSupplier<T> supplier) throws IOException {
  return getOrCreate(field, true, supplier);
}

/**
 * Single-slot cache: if the last request was for the same (field, norms) pair, the previously
 * created instance is returned; otherwise {@code supplier} is invoked and its result replaces
 * the slot. Synchronized so concurrent producers see a consistent slot.
 */
@SuppressWarnings("unchecked")
private synchronized <T> T getOrCreate(String field, boolean norms, IOSupplier<T> supplier) throws IOException {
  if ((field.equals(cachedField) && cacheIsNorms == norms) == false) {
    // only with assertions enabled: verify each field is loaded at most the expected number of times
    assert assertCreatedOnlyOnce(field, norms);
    cachedObject = supplier.get();
    cachedField = field;
    cacheIsNorms = norms;
  }
  assert cachedObject != null;
  return (T) cachedObject;
}

private final Map<String, Integer> cacheStats = new HashMap<>(); // only with assertions enabled

/**
 * Assertion-only bookkeeping: counts loads per (field, norms) key and fails if a field is loaded
 * more often than merging can legitimately require — norms at most once; doc values at most twice,
 * and only for index-sort fields (which are read once for sorting and once for merging).
 * Always returns {@code true} so it can sit inside an {@code assert}.
 */
private boolean assertCreatedOnlyOnce(String field, boolean norms) {
  assert Thread.holdsLock(this);
  // mainly here so that if we change anything in the way we merge, we notice it early
  Integer timesCached = cacheStats.compute(field + "N:" + norms, (s, i) -> i == null ? 1 : i.intValue() + 1);
  if (timesCached > 1) {
    assert norms == false :"[" + field + "] norms must not be cached twice";
    boolean isSortField = false;
    // sort fields are the only ones legitimately pulled twice during a sorted merge
    for (SortField sf : metaData.getSort().getSort()) {
      if (field.equals(sf.getField())) {
        isSortField = true;
        break;
      }
    }
    assert timesCached == 2 : "[" + field + "] must not be cached more than twice but was cached: "
        + timesCached + " times isSortField: " + isSortField;
    assert isSortField : "only sort fields should be cached twice but [" + field + "] is not a sort field";
  }
  return true;
}

/** Returns the cached doc-values instance for {@code field}, creating it via {@code supplier} on a cache miss. */
private <T> T getOrCreateDV(String field, IOSupplier<T> supplier) throws IOException {
  return getOrCreate(field, false, supplier);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,18 @@ public void testSortOnAddIndicesRandom() throws IOException {
for (int i = 0; i < numDocs; i++) {
int docId = docIds.get(i);
Document doc = new Document();
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES));
doc.add(new LongPoint("id", docId));
doc.add(new TextField("text_field", RandomStrings.randomRealisticUnicodeOfLength(random(), 25), Field.Store.YES));
String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25);
doc.add(new TextField("text_field", s, Field.Store.YES));
doc.add(new BinaryDocValuesField("text_field", new BytesRef(s)));
doc.add(new TextField("another_text_field", s, Field.Store.YES));
doc.add(new BinaryDocValuesField("another_text_field", new BytesRef(s)));
doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId));
doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId))));
doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId))));
doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId))));
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));

FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
Expand Down

0 comments on commit 427e11c

Please sign in to comment.