Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Fuzzy codec for doc id fields using a bloom filter #11027

Merged
merged 9 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- New DateTime format for RFC3339 compatible date fields ([#11465](https://github.com/opensearch-project/OpenSearch/pull/11465))
- Add support for Google Application Default Credentials in repository-gcs ([#8394](https://github.com/opensearch-project/OpenSearch/pull/8394))
- Remove concurrent segment search feature flag for GA launch ([#12074](https://github.com/opensearch-project/OpenSearch/pull/12074))
- Enable Fuzzy codec for doc id fields using a bloom filter ([#11022](https://github.com/opensearch-project/OpenSearch/pull/11022))

### Dependencies
- Bumps jetty version to 9.4.52.v20230823 to fix GMS-2023-1857 ([#9822](https://github.com/opensearch-project/OpenSearch/pull/9822))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.benchmark.index.codec.fuzzy;

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.UUIDs;
import org.opensearch.index.codec.fuzzy.FuzzySet;
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
import org.opensearch.index.mapper.IdFieldMapper;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

@Fork(3)
@Warmup(iterations = 2)
@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
public class FilterConstructionBenchmark {

private List<BytesRef> items;

@Param({ "1000000", "10000000", "50000000" })
private int numIds;

@Param({ "0.0511", "0.1023", "0.2047" })
private double fpp;

private FuzzySetFactory fuzzySetFactory;
private String fieldName;

@Setup
public void setupIds() {
this.fieldName = IdFieldMapper.NAME;
this.items = IntStream.range(0, numIds).mapToObj(i -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
FuzzySetParameters parameters = new FuzzySetParameters(() -> fpp);
this.fuzzySetFactory = new FuzzySetFactory(Map.of(fieldName, parameters));
}

@Benchmark
public FuzzySet buildFilter() throws IOException {
return fuzzySetFactory.createFuzzySet(items.size(), fieldName, () -> items.iterator());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.benchmark.index.codec.fuzzy;

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.UUIDs;
import org.opensearch.index.codec.fuzzy.FuzzySet;
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
import org.opensearch.index.mapper.IdFieldMapper;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

@Fork(3)
@Warmup(iterations = 2)
@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
public class FilterLookupBenchmark {

@Param({ "50000000", "1000000" })
private int numItems;

@Param({ "1000000" })
private int searchKeyCount;

@Param({ "0.0511", "0.1023", "0.2047" })
private double fpp;

private FuzzySet fuzzySet;
private List<BytesRef> items;
private Random random = new Random();

@Setup
public void setupFilter() throws IOException {
String fieldName = IdFieldMapper.NAME;
items = IntStream.range(0, numItems).mapToObj(i -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
FuzzySetParameters parameters = new FuzzySetParameters(() -> fpp);
fuzzySet = new FuzzySetFactory(Map.of(fieldName, parameters)).createFuzzySet(numItems, fieldName, () -> items.iterator());
}

@Benchmark
public void contains_withExistingKeys(Blackhole blackhole) throws IOException {
for (int i = 0; i < searchKeyCount; i++) {
blackhole.consume(fuzzySet.contains(items.get(random.nextInt(items.size()))) == FuzzySet.Result.MAYBE);
}
}

@Benchmark
public void contains_withRandomKeys(Blackhole blackhole) throws IOException {
for (int i = 0; i < searchKeyCount; i++) {
blackhole.consume(fuzzySet.contains(new BytesRef(UUIDs.base64UUID())));
}
}
}
1 change: 1 addition & 0 deletions qa/rolling-upgrade/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ for (Version bwcVersion : BuildParams.bwcVersions.wireCompatible) {
setting 'repositories.url.allowed_urls', 'http://snapshot.test*'
setting 'path.repo', "${buildDir}/cluster/shared/repo/${baseName}"
setting 'http.content_type.required', 'true'
systemProperty 'opensearch.experimental.optimize_doc_id_lookup.fuzzy_set.enabled', 'true'
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@
import org.opensearch.common.Booleans;
import org.opensearch.common.io.Streams;
import org.opensearch.common.settings.Settings;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.codec.CodecService;
import org.opensearch.index.engine.EngineConfig;
import org.opensearch.indices.replication.common.ReplicationType;
import org.opensearch.test.OpenSearchIntegTestCase;
import org.opensearch.test.rest.yaml.ObjectPath;

import java.io.IOException;
Expand Down Expand Up @@ -344,6 +344,92 @@ public void testIndexingWithSegRep() throws Exception {
}
}

public void testIndexingWithFuzzyFilterPostings() throws Exception {
if (UPGRADE_FROM_VERSION.onOrBefore(Version.V_2_11_1)) {
logger.info("--> Skip test for version {} where fuzzy filter postings format feature is not available", UPGRADE_FROM_VERSION);
return;
}
final String indexName = "test-index-fuzzy-set";
final int shardCount = 3;
final int replicaCount = 1;
logger.info("--> Case {}", CLUSTER_TYPE);
printClusterNodes();
logger.info("--> _cat/shards before test execution \n{}", EntityUtils.toString(client().performRequest(new Request("GET", "/_cat/shards?v")).getEntity()));
switch (CLUSTER_TYPE) {
case OLD:
mgodwan marked this conversation as resolved.
Show resolved Hide resolved
Settings.Builder settings = Settings.builder()
.put(IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), shardCount)
.put(IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), replicaCount)
.put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT)
.put(
EngineConfig.INDEX_CODEC_SETTING.getKey(),
randomFrom(new ArrayList<>(CODECS) {
{
add(CodecService.LUCENE_DEFAULT_CODEC);
}
})
)
.put(INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "100ms");
createIndex(indexName, settings.build());
waitForClusterHealthWithNoShardMigration(indexName, "green");
bulk(indexName, "_OLD", 5);
break;
case MIXED:
waitForClusterHealthWithNoShardMigration(indexName, "yellow");
break;
case UPGRADED:
Settings.Builder settingsBuilder = Settings.builder()
.put(IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING.getKey(), true);
updateIndexSettings(indexName, settingsBuilder);
waitForClusterHealthWithNoShardMigration(indexName, "green");
break;
default:
throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
}

int expectedCount;
switch (CLUSTER_TYPE) {
case OLD:
expectedCount = 5;
break;
case MIXED:
if (Booleans.parseBoolean(System.getProperty("tests.first_round"))) {
expectedCount = 5;
} else {
expectedCount = 10;
}
break;
case UPGRADED:
expectedCount = 15;
break;
default:
throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
}

waitForSearchableDocs(indexName, shardCount, replicaCount);
assertCount(indexName, expectedCount);

if (CLUSTER_TYPE != ClusterType.OLD) {
bulk(indexName, "_" + CLUSTER_TYPE, 5);
logger.info("--> Index one doc (to be deleted next) and verify doc count");
Request toBeDeleted = new Request("PUT", "/" + indexName + "/_doc/to_be_deleted");
toBeDeleted.addParameter("refresh", "true");
toBeDeleted.setJsonEntity("{\"f1\": \"delete-me\"}");
client().performRequest(toBeDeleted);
waitForSearchableDocs(indexName, shardCount, replicaCount);
assertCount(indexName, expectedCount + 6);

logger.info("--> Delete previously added doc and verify doc count");
Request delete = new Request("DELETE", "/" + indexName + "/_doc/to_be_deleted");
delete.addParameter("refresh", "true");
client().performRequest(delete);
waitForSearchableDocs(indexName, shardCount, replicaCount);
assertCount(indexName, expectedCount + 5);

//forceMergeAndVerify(indexName, shardCount * (1 + replicaCount));
shwetathareja marked this conversation as resolved.
Show resolved Hide resolved
}
}

public void testAutoIdWithOpTypeCreate() throws IOException {
final String indexName = "auto_id_and_op_type_create_index";
StringBuilder b = new StringBuilder();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ protected FeatureFlagSettings(
FeatureFlags.IDENTITY_SETTING,
FeatureFlags.TELEMETRY_SETTING,
FeatureFlags.DATETIME_FORMATTER_CACHING_SETTING,
FeatureFlags.WRITEABLE_REMOTE_INDEX_SETTING
FeatureFlags.WRITEABLE_REMOTE_INDEX_SETTING,
FeatureFlags.DOC_ID_FUZZY_SET_SETTING
);
}
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
IndexMetadata.INDEX_REMOTE_SEGMENT_STORE_REPOSITORY_SETTING,
IndexMetadata.INDEX_REMOTE_TRANSLOG_REPOSITORY_SETTING,

IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING,
IndexSettings.INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING,

// Settings for concurrent segment search
IndexSettings.INDEX_CONCURRENT_SEGMENT_SEARCH_SETTING,

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ public class FeatureFlags {
*/
public static final String WRITEABLE_REMOTE_INDEX = "opensearch.experimental.feature.writeable_remote_index.enabled";

/**
* Gates the optimization to enable bloom filters for doc id lookup.
*/
public static final String DOC_ID_FUZZY_SET = "opensearch.experimental.optimize_doc_id_lookup.fuzzy_set.enabled";

/**
* Should store the settings from opensearch.yml.
*/
Expand Down Expand Up @@ -110,4 +115,6 @@ public static boolean isEnabled(Setting<Boolean> featureFlag) {
false,
Property.NodeScope
);

public static final Setting<Boolean> DOC_ID_FUZZY_SET_SETTING = Setting.boolSetting(DOC_ID_FUZZY_SET, false, Property.NodeScope);
}
Loading
Loading