Skip to content

Commit

Permalink
LUCENE-9378: Make it possible to configure how to trade speed for com…
Browse files Browse the repository at this point in the history
…pression on doc values. (#2069)

This adds a switch to `Lucene80DocValuesFormat` which allows to
configure whether to prioritize retrieval speed over compression ratio
or the other way around. When prioritizing retrieval speed, binary doc
values are written using the exact same format as before more aggressive
compression got introduced.
  • Loading branch information
jpountz committed Nov 12, 2020
1 parent ad27dd7 commit a48dc12
Show file tree
Hide file tree
Showing 13 changed files with 225 additions and 35 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ New Features
* LUCENE-9594: FeatureField supports newLinearQuery that for scoring uses raw indexed
values of features without any transformation. (Mayya Sharipova, Adrien Grand)

* LUCENE-9378: Doc values now allow configuring how to trade compression for
retrieval speed. (Adrien Grand)

Improvements
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,14 @@
/** writer for {@link Lucene80DocValuesFormat} */
final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable {

final Lucene80DocValuesFormat.Mode mode;
IndexOutput data, meta;
final int maxDoc;
private final SegmentWriteState state;

/** expert: Creates a new writer */
public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension, Lucene80DocValuesFormat.Mode mode) throws IOException {
this.mode = mode;
boolean success = false;
try {
this.state = state;
Expand Down Expand Up @@ -490,13 +492,86 @@ public void close() throws IOException {
}

}


@Override
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
meta.writeByte(Lucene80DocValuesFormat.BINARY);

switch (mode) {
case BEST_SPEED:
meta.writeByte((byte) 0);
doAddUncompressedBinaryField(field, valuesProducer);
break;
case BEST_COMPRESSION:
meta.writeByte((byte) 1);
doAddCompressedBinaryField(field, valuesProducer);
break;
default:
throw new AssertionError();
}
}

private void doAddUncompressedBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
meta.writeLong(start); // dataOffset
int numDocsWithField = 0;
int minLength = Integer.MAX_VALUE;
int maxLength = 0;
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
numDocsWithField++;
BytesRef v = values.binaryValue();
int length = v.length;
data.writeBytes(v.bytes, v.offset, v.length);
minLength = Math.min(length, minLength);
maxLength = Math.max(length, maxLength);
}
assert numDocsWithField <= maxDoc;
meta.writeLong(data.getFilePointer() - start); // dataLength

if (numDocsWithField == 0) {
meta.writeLong(-2); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getBinary(field);
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableEntryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}

meta.writeInt(numDocsWithField);
meta.writeInt(minLength);
meta.writeInt(maxLength);
if (maxLength > minLength) {
start = data.getFilePointer();
meta.writeLong(start);
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);

final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
long addr = 0;
writer.add(addr);
values = valuesProducer.getBinary(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
addr += values.binaryValue().length;
writer.add(addr);
}
writer.finish();
meta.writeLong(data.getFilePointer() - start);
}
}

private void doAddCompressedBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
try (CompressedBinaryBlockWriter blockWriter = new CompressedBinaryBlockWriter()){
BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
Expand Down Expand Up @@ -542,7 +617,6 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th
meta.writeInt(maxLength);

blockWriter.writeMetaData();

}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@


import java.io.IOException;
import java.util.Objects;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
Expand Down Expand Up @@ -131,14 +132,30 @@
*/
public final class Lucene80DocValuesFormat extends DocValuesFormat {

/** Sole Constructor */
/** Configuration option for doc values. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED,
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION
}

private final Mode mode;

/** Default constructor. */
public Lucene80DocValuesFormat() {
this(Mode.BEST_SPEED);
}

/** Constructor */
public Lucene80DocValuesFormat(Mode mode) {
super("Lucene80");
this.mode = Objects.requireNonNull(mode);
}

@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new Lucene80DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
return new Lucene80DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION, mode);
}

@Override
Expand All @@ -152,7 +169,8 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
static final int VERSION_BIN_COMPRESSED = 1;
static final int VERSION_CURRENT = VERSION_BIN_COMPRESSED;
static final int VERSION_CONFIGURABLE_COMPRESSION = 2;
static final int VERSION_CURRENT = VERSION_CONFIGURABLE_COMPRESSION;

// indicates docvalues type
static final byte NUMERIC = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,20 @@ private void readNumeric(ChecksumIndexInput meta, NumericEntry entry) throws IOE

private BinaryEntry readBinary(ChecksumIndexInput meta) throws IOException {
BinaryEntry entry = new BinaryEntry();
if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) {
int b = meta.readByte();
switch (b) {
case 0:
case 1:
// valid
break;
default:
throw new CorruptIndexException("Unexpected byte: " + b + ", expected 0 or 1", meta);
}
entry.compressed = b != 0;
} else {
entry.compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED;
}
entry.dataOffset = meta.readLong();
entry.dataLength = meta.readLong();
entry.docsWithFieldOffset = meta.readLong();
Expand All @@ -183,19 +197,19 @@ private BinaryEntry readBinary(ChecksumIndexInput meta) throws IOException {
entry.numDocsWithField = meta.readInt();
entry.minLength = meta.readInt();
entry.maxLength = meta.readInt();
if ((version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED && entry.numDocsWithField > 0) || entry.minLength < entry.maxLength) {
if ((entry.compressed && entry.numDocsWithField > 0) || entry.minLength < entry.maxLength) {
entry.addressesOffset = meta.readLong();

// Old count of uncompressed addresses
long numAddresses = entry.numDocsWithField + 1L;
// New count of compressed addresses - the number of compresseed blocks
if (version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED) {
if (entry.compressed) {
entry.numCompressedChunks = meta.readVInt();
entry.docsPerChunkShift = meta.readVInt();
entry.maxUncompressedChunkSize = meta.readVInt();
numAddresses = entry.numCompressedChunks;
}

final int blockShift = meta.readVInt();
entry.addressesMeta = DirectMonotonicReader.loadMeta(meta, numAddresses, blockShift);
ramBytesUsed += entry.addressesMeta.ramBytesUsed();
Expand Down Expand Up @@ -303,6 +317,7 @@ private static class NumericEntry {
}

private static class BinaryEntry {
boolean compressed;
long dataOffset;
long dataLength;
long docsWithFieldOffset;
Expand Down Expand Up @@ -680,9 +695,7 @@ public boolean advanceExact(int target) throws IOException {
}
}

// BWC - old binary format
private BinaryDocValues getUncompressedBinary(FieldInfo field) throws IOException {
BinaryEntry entry = binaries.get(field.name);
private BinaryDocValues getUncompressedBinary(BinaryEntry entry) throws IOException {
if (entry.docsWithFieldOffset == -2) {
return DocValues.emptyBinary();
}
Expand Down Expand Up @@ -844,11 +857,16 @@ BytesRef decode(int docNumber) throws IOException {

@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
if (version < Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED) {
return getUncompressedBinary(field);
BinaryEntry entry = binaries.get(field.name);
if (entry.compressed) {
return getCompressedBinary(entry);
} else {
return getUncompressedBinary(entry);
}
}

private BinaryDocValues getCompressedBinary(BinaryEntry entry) throws IOException {

BinaryEntry entry = binaries.get(field.name);
if (entry.docsWithFieldOffset == -2) {
return DocValues.emptyBinary();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
Expand All @@ -54,6 +55,23 @@
* @lucene.experimental
*/
public class Lucene87Codec extends Codec {

/** Configuration option for the codec. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED(Lucene87StoredFieldsFormat.Mode.BEST_SPEED, Lucene80DocValuesFormat.Mode.BEST_SPEED),
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION(Lucene87StoredFieldsFormat.Mode.BEST_COMPRESSION, Lucene80DocValuesFormat.Mode.BEST_COMPRESSION);

private final Lucene87StoredFieldsFormat.Mode storedMode;
private final Lucene80DocValuesFormat.Mode dvMode;

private Mode(Lucene87StoredFieldsFormat.Mode storedMode, Lucene80DocValuesFormat.Mode dvMode) {
this.storedMode = Objects.requireNonNull(storedMode);
this.dvMode = Objects.requireNonNull(dvMode);
}
}

private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene60FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
Expand Down Expand Up @@ -82,7 +100,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) {
* Instantiates a new codec.
*/
public Lucene87Codec() {
this(Lucene87StoredFieldsFormat.Mode.BEST_SPEED);
this(Mode.BEST_SPEED);
}

/**
Expand All @@ -91,10 +109,11 @@ public Lucene87Codec() {
* @param mode stored fields compression mode to use for newly
* flushed/merged segments.
*/
public Lucene87Codec(Lucene87StoredFieldsFormat.Mode mode) {
public Lucene87Codec(Mode mode) {
super("Lucene87");
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode));
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultFormat = new Lucene84PostingsFormat();
this.defaultDVFormat = new Lucene80DocValuesFormat(mode.dvMode);
}

@Override
Expand Down Expand Up @@ -168,7 +187,7 @@ public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}

private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
private final DocValuesFormat defaultDVFormat;

private final NormsFormat normsFormat = new Lucene80NormsFormat();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import java.util.function.Supplier;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingCodec;
Expand Down Expand Up @@ -71,15 +70,8 @@

/**
* Tests Lucene80DocValuesFormat
* Copied directly from the lucene70 package for separation of codec-code
*/
public class TestLucene80DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene80DocValuesFormat());

@Override
protected Codec getCodec() {
return codec;
}
public abstract class BaseLucene80DocValuesFormatTestCase extends BaseCompressingDocValuesFormatTestCase {

// TODO: these big methods can easily blow up some of the other ram-hungry codecs...
// for now just keep them here, as we want to test this for this format.
Expand Down Expand Up @@ -287,7 +279,7 @@ private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer)
conf.setMergeScheduler(new SerialMergeScheduler());
// set to duel against a codec which has ordinals:
final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
final DocValuesFormat dv = new Lucene80DocValuesFormat();
final DocValuesFormat dv = getCodec().docValuesFormat();
conf.setCodec(new AssertingCodec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene80;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.util.TestUtil;

/**
* Tests Lucene80DocValuesFormat
*/
public class TestBestCompressionLucene80DocValuesFormat extends BaseLucene80DocValuesFormatTestCase {
private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene80DocValuesFormat(Lucene80DocValuesFormat.Mode.BEST_COMPRESSION));

@Override
protected Codec getCodec() {
return codec;
}

}
Loading

0 comments on commit a48dc12

Please sign in to comment.