Skip to content

Commit

Permalink
Enable BloomFilter for _id of non-datastream indices (#88409)
Browse files Browse the repository at this point in the history
This PR adds BloomFilter to Elasticsearch and enables it for the _id 
field of non-data stream indices. BloomFilter should speed up the
performance of mget and update requests at a small expense of refresh,
merge, and storage.
  • Loading branch information
dnhatn authored Aug 8, 2022
1 parent 92dc846 commit cfad420
Show file tree
Hide file tree
Showing 23 changed files with 781 additions and 28 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/88409.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 88409
summary: Enable `BloomFilter` for `_id` of non-datastream indices
area: Search
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import org.elasticsearch.action.admin.indices.forcemerge.ForceMergeResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.IndexSettings;
Expand Down Expand Up @@ -62,7 +63,7 @@ EngineConfig engineConfigWithLargerIndexingMemory(EngineConfig config) {
config.getMergePolicy(),
config.getAnalyzer(),
config.getSimilarity(),
new CodecService(null),
new CodecService(null, BigArrays.NON_RECYCLING_INSTANCE),
config.getEventListener(),
config.getQueryCache(),
config.getQueryCachingPolicy(),
Expand Down
3 changes: 3 additions & 0 deletions server/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@
exports org.elasticsearch.index.cache.query;
exports org.elasticsearch.index.cache.request;
exports org.elasticsearch.index.codec;
exports org.elasticsearch.index.codec.bloomfilter;
exports org.elasticsearch.index.engine;
exports org.elasticsearch.index.fielddata;
exports org.elasticsearch.index.fielddata.fieldcomparator;
Expand Down Expand Up @@ -362,4 +363,6 @@
org.elasticsearch.index.shard.ShardToolCliProvider;

uses org.elasticsearch.reservedstate.ReservedClusterStateHandlerProvider;

provides org.apache.lucene.codecs.PostingsFormat with org.elasticsearch.index.codec.bloomfilter.ES85BloomFilterPostingsFormat;
}
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
DiskThresholdDecider.SETTING_IGNORE_DISK_WATERMARKS,
ShardLimitValidator.INDEX_SETTING_SHARD_LIMIT_GROUP,
DataTier.TIER_PREFERENCE_SETTING,
IndexSettings.BLOOM_FILTER_ID_FIELD_ENABLED_SETTING,

// validate that built-in similarities don't get redefined
Setting.groupSetting("index.similarity.", (s) -> {
Expand Down
12 changes: 12 additions & 0 deletions server/src/main/java/org/elasticsearch/index/IndexSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,18 @@ public final class IndexSettings {
Setting.Property.IndexScope
);

/**
* This index setting is intentionally undocumented and should be used as an escape hatch to disable BloomFilter of the
* _id field of non-data-stream indices, which is enabled by default. This setting doesn't affect data-stream indices.
*/
public static final Setting<Boolean> BLOOM_FILTER_ID_FIELD_ENABLED_SETTING = Setting.boolSetting(
"index.bloom_filter_for_id_field.enabled",
true,
Setting.Property.Dynamic,
Setting.Property.IndexScope,
Property.DeprecatedWarning
);

/**
* Is the {@code index.mode} enabled? It should only be enabled if you
* pass a jvm parameter or are running a snapshot build.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene92.Lucene92Codec;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.index.mapper.MapperService;

Expand All @@ -31,14 +32,14 @@ public class CodecService {
/** the raw unfiltered lucene default. useful for testing */
public static final String LUCENE_DEFAULT_CODEC = "lucene_default";

public CodecService(@Nullable MapperService mapperService) {
public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) {
final var codecs = new HashMap<String, Codec>();
if (mapperService == null) {
codecs.put(DEFAULT_CODEC, new Lucene92Codec());
codecs.put(BEST_COMPRESSION_CODEC, new Lucene92Codec(Lucene92Codec.Mode.BEST_COMPRESSION));
} else {
codecs.put(DEFAULT_CODEC, new PerFieldMapperCodec(Lucene92Codec.Mode.BEST_SPEED, mapperService));
codecs.put(BEST_COMPRESSION_CODEC, new PerFieldMapperCodec(Lucene92Codec.Mode.BEST_COMPRESSION, mapperService));
codecs.put(DEFAULT_CODEC, new PerFieldMapperCodec(Lucene92Codec.Mode.BEST_SPEED, mapperService, bigArrays));
codecs.put(BEST_COMPRESSION_CODEC, new PerFieldMapperCodec(Lucene92Codec.Mode.BEST_COMPRESSION, mapperService, bigArrays));
}
codecs.put(LUCENE_DEFAULT_CODEC, Codec.getDefault());
for (String codec : Codec.availableCodecs()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene92.Lucene92Codec;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.codec.bloomfilter.ES85BloomFilterPostingsFormat;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
Expand All @@ -31,24 +35,39 @@ public class PerFieldMapperCodec extends Lucene92Codec {
private final MapperService mapperService;

private final DocValuesFormat docValuesFormat = new Lucene90DocValuesFormat();
private final ES85BloomFilterPostingsFormat bloomFilterPostingsFormat;

static {
assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMapperCodec.class)
: "PerFieldMapperCodec must subclass the latest " + "lucene codec: " + Lucene.LATEST_CODEC;
}

public PerFieldMapperCodec(Mode compressionMode, MapperService mapperService) {
public PerFieldMapperCodec(Mode compressionMode, MapperService mapperService, BigArrays bigArrays) {
super(compressionMode);
this.mapperService = mapperService;
this.bloomFilterPostingsFormat = new ES85BloomFilterPostingsFormat(bigArrays, this::internalGetPostingsFormatForField);
}

@Override
public PostingsFormat getPostingsFormatForField(String field) {
PostingsFormat format = mapperService.mappingLookup().getPostingsFormat(field);
if (format == null) {
return super.getPostingsFormatForField(field);
if (useBloomFilter(field)) {
return bloomFilterPostingsFormat;
}
return format;
return internalGetPostingsFormatForField(field);
}

/**
 * Resolves the postings format for {@code field} without applying the bloom filter.
 *
 * @param field name of the field to look up
 * @return the format reported by the mapping lookup when it is non-null,
 *         otherwise the format chosen by the superclass
 */
private PostingsFormat internalGetPostingsFormatForField(String field) {
    final PostingsFormat mappedFormat = mapperService.mappingLookup().getPostingsFormat(field);
    // A null result from the mapping lookup means we defer to the parent codec's choice.
    return mappedFormat == null ? super.getPostingsFormatForField(field) : mappedFormat;
}

/**
 * Decides whether the bloom filter postings format applies to {@code field}.
 *
 * @param field name of the field being written
 * @return {@code true} only when the field is {@link IdFieldMapper#NAME}, the mapping lookup
 *         reports that the data stream timestamp field is not enabled, and
 *         {@link IndexSettings#BLOOM_FILTER_ID_FIELD_ENABLED_SETTING} evaluates to {@code true}
 *         for this index's settings
 */
private boolean useBloomFilter(String field) {
    // Checks run in the same order as a short-circuiting conjunction: cheapest test first.
    if (IdFieldMapper.NAME.equals(field) == false) {
        return false;
    }
    if (mapperService.mappingLookup().isDataStreamTimestampFieldEnabled()) {
        return false;
    }
    return IndexSettings.BLOOM_FILTER_ID_FIELD_ENABLED_SETTING.get(mapperService.getIndexSettings().getSettings());
}

@Override
Expand Down
Loading

0 comments on commit cfad420

Please sign in to comment.