From 37b755a85fb8942dc19959aaff3c2f062d3e269d Mon Sep 17 00:00:00 2001
From: mgodwan <mgodwan@amazon.com>
Date: Fri, 27 Oct 2023 13:59:57 +0530
Subject: [PATCH 1/8] Enable Fuzzy codec for doc id fields using a bloom filter

Signed-off-by: mgodwan <mgodwan@amazon.com>
---
 CHANGELOG.md                                  |   1 +
 .../fuzzy/FilterConstructionBenchmark.java    |  67 +++
 .../codec/fuzzy/FilterLookupBenchmark.java    |  80 +++
 .../org/opensearch/upgrades/IndexingIT.java   |  88 +++-
 .../common/settings/IndexScopedSettings.java  |   3 +
 .../org/opensearch/index/IndexSettings.java   |  52 ++
 .../PerFieldMappingPostingFormatCodec.java    |  19 +
 .../index/codec/fuzzy/AbstractFuzzySet.java   |  61 +++
 .../index/codec/fuzzy/BloomFilter.java        | 147 ++++++
 .../fuzzy/FuzzyFilterPostingsFormat.java      | 488 ++++++++++++++++++
 .../index/codec/fuzzy/FuzzySet.java           |  98 ++++
 .../index/codec/fuzzy/FuzzySetFactory.java    |  49 ++
 .../index/codec/fuzzy/FuzzySetParameters.java |  34 ++
 .../fuzzy/IndexInputImmutableLongArray.java   |  68 +++
 .../codec/fuzzy/LongArrayBackedBitSet.java    |  80 +++
 .../index/codec/fuzzy/package-info.java       |  10 +
 .../index/engine/SegmentsStats.java           |   4 +-
 .../org.apache.lucene.codecs.PostingsFormat   |   1 +
 .../index/codec/fuzzy/BloomFilterTests.java   |  82 +++
 .../fuzzy/FuzzyFilterPostingsFormatTests.java |  34 ++
 .../test/OpenSearchIntegTestCase.java         |   7 +
 21 files changed, 1471 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/src/main/java/org/opensearch/benchmark/index/codec/fuzzy/FilterConstructionBenchmark.java
 create mode 100644 benchmarks/src/main/java/org/opensearch/benchmark/index/codec/fuzzy/FilterLookupBenchmark.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySet.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetFactory.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetParameters.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputImmutableLongArray.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
 create mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/package-info.java
 create mode 100644 server/src/test/java/org/opensearch/index/codec/fuzzy/BloomFilterTests.java
 create mode 100644 server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f9c2eda92ec73..afd4e547380c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -136,6 +136,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Add `org.opensearch.rest.MethodHandlers` and `RestController#getAllHandlers` ([11876](https://github.com/opensearch-project/OpenSearch/pull/11876))
 - New DateTime format for RFC3339 compatible date fields ([#11465](https://github.com/opensearch-project/OpenSearch/pull/11465))
 - Add support for Google Application Default Credentials in repository-gcs ([#8394](https://github.com/opensearch-project/OpenSearch/pull/8394))
+- Enable Fuzzy codec for doc id fields using a bloom filter ([#11022](https://github.com/opensearch-project/OpenSearch/pull/11022))
 
 ### Dependencies
 - Bumps jetty version to 9.4.52.v20230823 to fix GMS-2023-1857 ([#9822](https://github.com/opensearch-project/OpenSearch/pull/9822))
diff --git a/benchmarks/src/main/java/org/opensearch/benchmark/index/codec/fuzzy/FilterConstructionBenchmark.java b/benchmarks/src/main/java/org/opensearch/benchmark/index/codec/fuzzy/FilterConstructionBenchmark.java
new file mode 100644
index 0000000000000..4e995f5a5067c
--- /dev/null
+++ b/benchmarks/src/main/java/org/opensearch/benchmark/index/codec/fuzzy/FilterConstructionBenchmark.java
@@ -0,0 +1,67 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.benchmark.index.codec.fuzzy;
+
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.UUIDs;
+import org.opensearch.index.codec.fuzzy.FuzzySet;
+import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
+import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
+import org.opensearch.index.mapper.IdFieldMapper;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+@Fork(3)
+@Warmup(iterations = 2)
+@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+public class FilterConstructionBenchmark {
+
+    private List<BytesRef> items;
+
+    @Param({ "1000000", "10000000", "50000000" })
+    private int numIds;
+
+    @Param({ "0.0511", "0.1023", "0.2047" })
+    private double fpp;
+
+    private FuzzySetFactory fuzzySetFactory;
+    private String fieldName;
+
+    @Setup
+    public void setupIds() {
+        // Ids and factory are prepared here so that buildFilter() only performs the filter construction itself.
+        fieldName = IdFieldMapper.NAME;
+        fuzzySetFactory = new FuzzySetFactory(Map.of(fieldName, new FuzzySetParameters(() -> fpp)));
+        items = IntStream.range(0, numIds).mapToObj(ignored -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
+    }
+
+    @Benchmark
+    public FuzzySet buildFilter() throws IOException {
+        return fuzzySetFactory.createFuzzySet(items.size(), fieldName, items::iterator); // new iterator per request
+    }
+}
diff --git a/benchmarks/src/main/java/org/opensearch/benchmark/index/codec/fuzzy/FilterLookupBenchmark.java b/benchmarks/src/main/java/org/opensearch/benchmark/index/codec/fuzzy/FilterLookupBenchmark.java
new file mode 100644
index 0000000000000..383539219830e
--- /dev/null
+++ b/benchmarks/src/main/java/org/opensearch/benchmark/index/codec/fuzzy/FilterLookupBenchmark.java
@@ -0,0 +1,80 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.benchmark.index.codec.fuzzy;
+
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.UUIDs;
+import org.opensearch.index.codec.fuzzy.FuzzySet;
+import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
+import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
+import org.opensearch.index.mapper.IdFieldMapper;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+@Fork(3)
+@Warmup(iterations = 2)
+@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+public class FilterLookupBenchmark {
+
+    @Param({ "50000000", "1000000" })
+    private int numItems;
+
+    @Param({ "1000000" })
+    private int searchKeyCount;
+
+    @Param({ "0.0511", "0.1023", "0.2047" })
+    private double fpp;
+
+    private FuzzySet fuzzySet;
+    private List<BytesRef> items;
+    private Random random = new Random();
+
+    @Setup
+    public void setupFilter() throws IOException {
+        // Build the id list and the filter over it before any lookups are measured.
+        items = IntStream.range(0, numItems).mapToObj(ignored -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
+        FuzzySetFactory factory = new FuzzySetFactory(Map.of(IdFieldMapper.NAME, new FuzzySetParameters(() -> fpp)));
+        fuzzySet = factory.createFuzzySet(numItems, IdFieldMapper.NAME, () -> items.iterator());
+    }
+
+    @Benchmark
+    public void contains_withExistingKeys(Blackhole blackhole) throws IOException {
+        for (int i = 0; i < searchKeyCount; i++) { // each pass looks up a randomly chosen id that was added to the filter
+            blackhole.consume(fuzzySet.contains(items.get(random.nextInt(items.size()))) == FuzzySet.Result.MAYBE);
+        }
+    }
+
+    @Benchmark
+    public void contains_withRandomKeys(Blackhole blackhole) throws IOException {
+        for (int i = 0; i < searchKeyCount; i++) { // NOTE: the UUID is generated inside the loop, so its cost is measured too
+            blackhole.consume(fuzzySet.contains(new BytesRef(UUIDs.base64UUID())));
+        }
+    }
+}
diff --git a/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java b/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java
index 1577260e145d4..08a5e92fc2d02 100644
--- a/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java
+++ b/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java
@@ -40,10 +40,10 @@
 import org.opensearch.common.Booleans;
 import org.opensearch.common.io.Streams;
 import org.opensearch.common.settings.Settings;
+import org.opensearch.index.IndexSettings;
 import org.opensearch.index.codec.CodecService;
 import org.opensearch.index.engine.EngineConfig;
 import org.opensearch.indices.replication.common.ReplicationType;
-import org.opensearch.test.OpenSearchIntegTestCase;
 import org.opensearch.test.rest.yaml.ObjectPath;
 
 import java.io.IOException;
@@ -344,6 +344,92 @@ public void testIndexingWithSegRep() throws Exception {
         }
     }
 
+    public void testIndexingWithFuzzyFilterPostings() throws Exception {
+        if (UPGRADE_FROM_VERSION.onOrBefore(Version.V_2_11_1)) {
+            logger.info("--> Skip test for version {} where fuzzy filter postings format feature is not available", UPGRADE_FROM_VERSION);
+            return;
+        }
+        final String indexName = "test-index-fuzzy-set";
+        final int shardCount = 3;
+        final int replicaCount = 1;
+        logger.info("--> Case {}", CLUSTER_TYPE);
+        printClusterNodes();
+        logger.info("--> _cat/shards before test execution \n{}", EntityUtils.toString(client().performRequest(new Request("GET", "/_cat/shards?v")).getEntity()));
+        switch (CLUSTER_TYPE) {
+            case OLD:
+                Settings.Builder settings = Settings.builder()
+                    .put(IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), shardCount)
+                    .put(IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), replicaCount)
+                    .put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT)
+                    .put(
+                        EngineConfig.INDEX_CODEC_SETTING.getKey(),
+                        randomFrom(new ArrayList<>(CODECS) {
+                            {
+                                add(CodecService.LUCENE_DEFAULT_CODEC);
+                            }
+                        })
+                    )
+                    .put(INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "100ms");
+                createIndex(indexName, settings.build());
+                waitForClusterHealthWithNoShardMigration(indexName, "green");
+                bulk(indexName, "_OLD", 5);
+                break;
+            case MIXED:
+                waitForClusterHealthWithNoShardMigration(indexName, "yellow");
+                break;
+            case UPGRADED:
+                Settings.Builder settingsBuilder = Settings.builder()
+                    .put(IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING.getKey(), true);
+                updateIndexSettings(indexName, settingsBuilder);
+                waitForClusterHealthWithNoShardMigration(indexName, "green");
+                break;
+            default:
+                throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
+        }
+
+        int expectedCount;
+        switch (CLUSTER_TYPE) {
+            case OLD:
+                expectedCount = 5;
+                break;
+            case MIXED:
+                if (Booleans.parseBoolean(System.getProperty("tests.first_round"))) {
+                    expectedCount = 5;
+                } else {
+                    expectedCount = 10;
+                }
+                break;
+            case UPGRADED:
+                expectedCount = 15;
+                break;
+            default:
+                throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
+        }
+
+        waitForSearchableDocs(indexName, shardCount, replicaCount);
+        assertCount(indexName, expectedCount);
+
+        if (CLUSTER_TYPE != ClusterType.OLD) {
+            bulk(indexName, "_" + CLUSTER_TYPE, 5);
+            logger.info("--> Index one doc (to be deleted next) and verify doc count");
+            Request toBeDeleted = new Request("PUT", "/" + indexName + "/_doc/to_be_deleted");
+            toBeDeleted.addParameter("refresh", "true");
+            toBeDeleted.setJsonEntity("{\"f1\": \"delete-me\"}");
+            client().performRequest(toBeDeleted);
+            waitForSearchableDocs(indexName, shardCount, replicaCount);
+            assertCount(indexName, expectedCount + 6);
+
+            logger.info("--> Delete previously added doc and verify doc count");
+            Request delete = new Request("DELETE", "/" + indexName + "/_doc/to_be_deleted");
+            delete.addParameter("refresh", "true");
+            client().performRequest(delete);
+            waitForSearchableDocs(indexName, shardCount, replicaCount);
+            assertCount(indexName, expectedCount + 5);
+
+            // TODO: re-enable or remove: forceMergeAndVerify(indexName, shardCount * (1 + replicaCount));
+        }
+    }
+
     public void testAutoIdWithOpTypeCreate() throws IOException {
         final String indexName = "auto_id_and_op_type_create_index";
         StringBuilder b = new StringBuilder();
diff --git a/server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java b/server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java
index b34a2aaffe408..6168ad5a31d51 100644
--- a/server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java
+++ b/server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java
@@ -230,6 +230,9 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
                 IndexMetadata.INDEX_REMOTE_SEGMENT_STORE_REPOSITORY_SETTING,
                 IndexMetadata.INDEX_REMOTE_TRANSLOG_REPOSITORY_SETTING,
 
+                IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING,
+                IndexSettings.INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING,
+
                 // validate that built-in similarities don't get redefined
                 Setting.groupSetting("index.similarity.", (s) -> {
                     Map<String, Settings> groups = s.getAsGroups();
diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java
index 00e765d73f77f..3966bca42c249 100644
--- a/server/src/main/java/org/opensearch/index/IndexSettings.java
+++ b/server/src/main/java/org/opensearch/index/IndexSettings.java
@@ -66,6 +66,7 @@
 
 import static org.opensearch.Version.V_2_7_0;
 import static org.opensearch.common.util.FeatureFlags.SEARCHABLE_SNAPSHOT_EXTENDED_COMPATIBILITY;
+import static org.opensearch.index.codec.fuzzy.FuzzySetParameters.DEFAULT_FALSE_POSITIVE_PROBABILITY;
 import static org.opensearch.index.mapper.MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING;
 import static org.opensearch.index.mapper.MapperService.INDEX_MAPPING_FIELD_NAME_LENGTH_LIMIT_SETTING;
 import static org.opensearch.index.mapper.MapperService.INDEX_MAPPING_NESTED_DOCS_LIMIT_SETTING;
@@ -658,6 +659,22 @@ public static IndexMergePolicy fromString(String text) {
         Property.Dynamic
     );
 
+    public static final Setting<Boolean> INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING = Setting.boolSetting(
+        "index.doc_id_fuzzy_set.enabled",
+        false,
+        Property.IndexScope,
+        Property.Dynamic
+    );
+
+    public static final Setting<Double> INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING = Setting.doubleSetting(
+        "index.doc_id_fuzzy_set.false_positive_probability",
+        DEFAULT_FALSE_POSITIVE_PROBABILITY,
+        0.01,
+        0.50,
+        Property.IndexScope,
+        Property.Dynamic
+    );
+
     public static final TimeValue DEFAULT_REMOTE_TRANSLOG_BUFFER_INTERVAL = new TimeValue(650, TimeUnit.MILLISECONDS);
     public static final TimeValue MINIMUM_REMOTE_TRANSLOG_BUFFER_INTERVAL = TimeValue.ZERO;
     public static final Setting<TimeValue> INDEX_REMOTE_TRANSLOG_BUFFER_INTERVAL_SETTING = Setting.timeSetting(
@@ -787,6 +804,16 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) {
      */
     private volatile UnaryOperator<MergePolicy> mergeOnFlushPolicy;
 
+    /**
+     * Is fuzzy set enabled for doc id
+     */
+    private volatile boolean enableFuzzySetForDocId;
+
+    /**
+     * False positive probability to use while creating fuzzy set.
+     */
+    private volatile double docIdFuzzySetFalsePositiveProbability;
+
     /**
      * Returns the default search fields for this index.
      */
@@ -926,6 +953,10 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
          * Now this sortField (IndexSort) is stored in SegmentInfo and we need to maintain backward compatibility for them.
          */
         widenIndexSortType = IndexMetadata.SETTING_INDEX_VERSION_CREATED.get(settings).before(V_2_7_0);
+
+        enableFuzzySetForDocId = scopedSettings.get(INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING);
+        docIdFuzzySetFalsePositiveProbability = scopedSettings.get(INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING);
+
         scopedSettings.addSettingsUpdateConsumer(
             TieredMergePolicyProvider.INDEX_COMPOUND_FORMAT_SETTING,
             tieredMergePolicyProvider::setNoCFSRatio
@@ -1032,6 +1063,11 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
             this::setRemoteTranslogUploadBufferInterval
         );
         scopedSettings.addSettingsUpdateConsumer(INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING, this::setRemoteTranslogKeepExtraGen);
+        scopedSettings.addSettingsUpdateConsumer(INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING, this::setEnableFuzzySetForDocId);
+        scopedSettings.addSettingsUpdateConsumer(
+            INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING,
+            this::setDocIdFuzzySetFalsePositiveProbability
+        );
     }
 
     private void setSearchIdleAfter(TimeValue searchIdleAfter) {
@@ -1801,4 +1837,20 @@ public void setDefaultSearchPipeline(String defaultSearchPipeline) {
     public boolean shouldWidenIndexSortType() {
         return this.widenIndexSortType;
     }
+
+    public boolean isEnableFuzzySetForDocId() {
+        return enableFuzzySetForDocId;
+    }
+
+    public void setEnableFuzzySetForDocId(boolean enableFuzzySetForDocId) {
+        this.enableFuzzySetForDocId = enableFuzzySetForDocId;
+    }
+
+    public double getDocIdFuzzySetFalsePositiveProbability() {
+        return docIdFuzzySetFalsePositiveProbability;
+    }
+
+    public void setDocIdFuzzySetFalsePositiveProbability(double docIdFuzzySetFalsePositiveProbability) {
+        this.docIdFuzzySetFalsePositiveProbability = docIdFuzzySetFalsePositiveProbability;
+    }
 }
diff --git a/server/src/main/java/org/opensearch/index/codec/PerFieldMappingPostingFormatCodec.java b/server/src/main/java/org/opensearch/index/codec/PerFieldMappingPostingFormatCodec.java
index dc28ad2d6dc07..1ad17f121560c 100644
--- a/server/src/main/java/org/opensearch/index/codec/PerFieldMappingPostingFormatCodec.java
+++ b/server/src/main/java/org/opensearch/index/codec/PerFieldMappingPostingFormatCodec.java
@@ -39,10 +39,16 @@
 import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
 import org.apache.lucene.codecs.lucene99.Lucene99Codec;
 import org.opensearch.common.lucene.Lucene;
+import org.opensearch.index.codec.fuzzy.FuzzyFilterPostingsFormat;
+import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
+import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
 import org.opensearch.index.mapper.CompletionFieldMapper;
+import org.opensearch.index.mapper.IdFieldMapper;
 import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.index.mapper.MapperService;
 
+import java.util.Map;
+
 /**
  * {@link PerFieldMappingPostingFormatCodec This postings format} is the default
  * {@link PostingsFormat} for OpenSearch. It utilizes the
@@ -57,6 +63,8 @@ public class PerFieldMappingPostingFormatCodec extends Lucene99Codec {
     private final Logger logger;
     private final MapperService mapperService;
     private final DocValuesFormat dvFormat = new Lucene90DocValuesFormat();
+    private final FuzzySetFactory fuzzySetFactory;
+    private PostingsFormat docIdPostingsFormat;
 
     static {
         assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMappingPostingFormatCodec.class)
@@ -67,6 +75,12 @@ public PerFieldMappingPostingFormatCodec(Mode compressionMode, MapperService map
         super(compressionMode);
         this.mapperService = mapperService;
         this.logger = logger;
+        fuzzySetFactory = new FuzzySetFactory(
+            Map.of(
+                IdFieldMapper.NAME,
+                new FuzzySetParameters(() -> mapperService.getIndexSettings().getDocIdFuzzySetFalsePositiveProbability())
+            )
+        );
     }
 
     @Override
@@ -76,6 +90,11 @@ public PostingsFormat getPostingsFormatForField(String field) {
             logger.warn("no index mapper found for field: [{}] returning default postings format", field);
         } else if (fieldType instanceof CompletionFieldMapper.CompletionFieldType) {
             return CompletionFieldMapper.CompletionFieldType.postingsFormat();
+        } else if (IdFieldMapper.NAME.equals(field) && mapperService.getIndexSettings().isEnableFuzzySetForDocId()) {
+            if (docIdPostingsFormat == null) {
+                docIdPostingsFormat = new FuzzyFilterPostingsFormat(super.getPostingsFormatForField(field), fuzzySetFactory);
+            }
+            return docIdPostingsFormat;
         }
         return super.getPostingsFormatForField(field);
     }
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java
new file mode 100644
index 0000000000000..09976297361fa
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java
@@ -0,0 +1,61 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.CheckedSupplier;
+import org.opensearch.common.hash.T1ha1;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Encapsulates common behaviour implementation for a fuzzy set.
+ */
+public abstract class AbstractFuzzySet implements FuzzySet {
+
+    /**
+     * Add an item to this fuzzy set.
+     * @param value The value to be added
+     */
+    protected abstract void add(BytesRef value);
+
+    /**
+     * Add all items to the underlying set.
+     * Implementations can choose to perform this using an optimized strategy based on the type of set.
+     * @param valuesIteratorProvider Supplier for an iterator over All values which should be added to the set.
+     */
+    protected void addAll(CheckedSupplier<Iterator<BytesRef>, IOException> valuesIteratorProvider) throws IOException {
+        // One pass over a freshly supplied iterator; every value is routed through add(BytesRef).
+        for (Iterator<BytesRef> it = valuesIteratorProvider.get(); it.hasNext();) {
+            add(it.next());
+        }
+    }
+
+    public Result contains(BytesRef val) {
+        return containsHash(generateKey(val)); // membership is decided on the 64-bit key only, not on the raw bytes
+    }
+
+    protected abstract Result containsHash(long hash);
+
+    protected long generateKey(BytesRef value) {
+        return T1ha1.hash(value.bytes, value.offset, value.length, 0L); // hashes only the referenced slice; 0L is presumably the seed
+    }
+
+    /** Debug check: asserts that every supplied element is reported as {@link Result#MAYBE} by this set. */
+    protected void assertAllElementsExist(CheckedSupplier<Iterator<BytesRef>, IOException> iteratorProvider) throws IOException {
+        // matched counts the elements verified so far and is reported when the assertion trips
+        int matched = 0;
+        for (Iterator<BytesRef> it = iteratorProvider.get(); it.hasNext(); matched++) {
+            BytesRef element = it.next();
+            assert contains(element) == Result.MAYBE
+                : "Expected Filter to return positive response for elements added to it. Elements matched: " + matched;
+        }
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
new file mode 100644
index 0000000000000..5870ca0e16984
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
@@ -0,0 +1,147 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/*
+ * Based on code from the Apache Lucene project (https://github.com/apache/lucene) under the Apache License, version 2.0.
+ * Copyright 2001-2022 The Apache Software Foundation
+ * Modifications (C) OpenSearch Contributors. All Rights Reserved.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.opensearch.common.CheckedSupplier;
+import org.opensearch.common.util.io.IOUtils;
+import org.opensearch.core.Assertions;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * The code is based on Lucene's implementation of Bloom Filter.
+ * It represents a subset of the Lucene implementation needed for OpenSearch use cases.
+ * Since the Lucene implementation is marked experimental,
+ * this aims to ensure we can provide a bwc implementation during upgrades.
+ */
+public class BloomFilter extends AbstractFuzzySet {
+
+    private static final Logger logger = LogManager.getLogger(BloomFilter.class);
+
+    // The sizes of BitSet used are all numbers that, when expressed in binary form,
+    // are all ones. This is to enable fast downsizing from one bitset to another
+    // by simply ANDing each set index in one bitset with the size of the target bitset
+    // - this provides a fast modulo of the number. Values previously accumulated in
+    // a large bitset and then mapped to a smaller set can be looked up using a single
+    // AND operation of the query term's hash rather than needing to perform a 2-step
+    // translation of the query term that mirrors the stored content's reprojections.
+    static final int[] usableBitSetSizes;
+
+    static {
+        usableBitSetSizes = new int[26];
+        for (int i = 0; i < usableBitSetSizes.length; i++) {
+            usableBitSetSizes[i] = (1 << (i + 6)) - 1;
+        }
+    }
+
+    private final LongArrayBackedBitSet bitset;
+    private final int setSize;
+    private final int hashCount;
+
+    /** Builds a filter sized for {@code maxDocs} keys at a target false positive probability of {@code maxFpp}. */
+    BloomFilter(long maxDocs, double maxFpp, CheckedSupplier<Iterator<BytesRef>, IOException> fieldIteratorProvider) throws IOException {
+        int idealBits = (int) Math.ceil((maxDocs * Math.log(maxFpp)) / Math.log(1 / Math.pow(2, Math.log(2))));
+        // 2L * idealBits avoids the int wrap-around (which picked a 63-bit set); the clamp keeps setSize + 1 from overflowing.
+        this.setSize = getNearestSetSize((int) Math.min(2L * idealBits, Integer.MAX_VALUE - 1));
+        this.hashCount = (int) Math.round(((double) setSize / maxDocs) * Math.log(2));
+        this.bitset = new LongArrayBackedBitSet(setSize + 1);
+        addAll(fieldIteratorProvider);
+        if (Assertions.ENABLED) {
+            assertAllElementsExist(fieldIteratorProvider);
+        }
+        logger.trace("Bloom filter created with fpp: {}, setSize: {}, hashCount: {}", maxFpp, setSize, hashCount);
+    }
+
+    BloomFilter(IndexInput in) throws IOException {
+        hashCount = in.readInt(); // read order mirrors writeTo(DataOutput): hashCount, setSize, then the bitset
+        setSize = in.readInt();
+        this.bitset = new LongArrayBackedBitSet(in);
+    }
+
+    @Override
+    public void writeTo(DataOutput out) throws IOException {
+        out.writeInt(hashCount); // write order is what BloomFilter(IndexInput) reads back: hashCount, setSize, then the bitset
+        out.writeInt(setSize);
+        bitset.writeTo(out);
+    }
+
+    /** Returns the largest usable size not exceeding {@code maxNumberOfBits}, or the smallest usable size if none fits. */
+    private static int getNearestSetSize(int maxNumberOfBits) {
+        int nearest = usableBitSetSizes[0];
+        // The table is ascending, so the last candidate that fits is also the largest one.
+        for (int candidate : usableBitSetSizes) {
+            nearest = candidate <= maxNumberOfBits ? candidate : nearest;
+        }
+        return nearest;
+    }
+
+    @Override
+    public SetType setType() {
+        return SetType.BLOOM_FILTER_V1;
+    }
+
+    @Override
+    public Result containsHash(long hash) {
+        final int upperHalf = (int) (hash >>> Integer.SIZE);
+        final int lowerHalf = (int) hash;
+        // Probe i is lowerHalf + i * upperHalf; the first probe whose bit is clear yields NO.
+        for (int probe = 0; probe < hashCount; probe++) {
+            if (mayContainValue(lowerHalf + probe * upperHalf) == false) {
+                return Result.NO;
+            }
+        }
+        return Result.MAYBE;
+    }
+
+    @Override
+    protected void add(BytesRef value) {
+        long hash = generateKey(value);
+        int msb = (int) (hash >>> Integer.SIZE);
+        int lsb = (int) hash;
+        for (int i = 0; i < hashCount; i++) {
+            // setSize is 2^k - 1 (all ones in binary), so AND-ing with it is a cheap modulo onto the bit positions
+            bitset.set((lsb + i * msb) & setSize);
+        }
+    }
+
+    @Override
+    public boolean isSaturated() {
+        long numBitsSet = bitset.cardinality();
+        return (float) numBitsSet / (float) setSize > 0.9f; // saturated once the set-bit count exceeds 90% of setSize
+    }
+
+    @Override
+    public long ramBytesUsed() {
+        return bitset.ramBytesUsed(); // report the bitset's own figure; sizeOf(...) on a long would only measure a boxed Long
+    }
+
+    /** Tests the bit that {@code aHash} maps to in the backing bitset. */
+    private boolean mayContainValue(int aHash) {
+        // setSize is an all-ones mask (2^k - 1), so the AND acts as a fast modulo
+        return bitset.isSet(aHash & setSize);
+    }
+
+    @Override
+    public void close() throws IOException {
+        IOUtils.close(bitset);
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
new file mode 100644
index 0000000000000..a31bd41259e46
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
@@ -0,0 +1,488 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/*
+ * Based on code from the Apache Lucene project (https://github.com/apache/lucene) under the Apache License, version 2.0.
+ * Copyright 2001-2022 The Apache Software Foundation
+ * Modifications (C) OpenSearch Contributors. All Rights Reserved.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.BaseTermsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.ImpactsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.opensearch.common.util.io.IOUtils;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Based on Lucene's BloomFilterPostingsFormat.
+ * Discussion with Lucene community based on which the decision to have this in OpenSearch code was taken
+ * is captured here: https://github.com/apache/lucene/issues/12986
+ *
+ * The class deals with persisting the bloom filter through the postings format,
+ * and reading the field via a bloom filter fronted terms enum (to reduce disk seeks in case of absence of requested values)
+ * The class should be handled during lucene upgrades. There are bwc tests present to verify the format continues to work after upgrade.
+ */
+
+public final class FuzzyFilterPostingsFormat extends PostingsFormat {
+
+    private static final Logger logger = LogManager.getLogger(FuzzyFilterPostingsFormat.class);
+
+    /**
+     * This name is stored in headers. If changing the implementation for the format, this name/version should be updated
+     * so that reads can work as expected.
+     */
+    public static final String FUZZY_SET_CODEC_NAME = "FuzzySetCodec99";
+
+    public static final int VERSION_START = 0;
+    public static final int VERSION_CURRENT = VERSION_START;
+
+    /** Extension of Fuzzy Filters file */
+    public static final String FUZZY_FILTER_FILE_EXTENSION = "fzd";
+
+    private final PostingsFormat delegatePostingsFormat;
+    private final FuzzySetFactory fuzzySetFactory;
+
+    public FuzzyFilterPostingsFormat(PostingsFormat delegatePostingsFormat, FuzzySetFactory fuzzySetFactory) {
+        super(FUZZY_SET_CODEC_NAME);
+        this.delegatePostingsFormat = delegatePostingsFormat; // writes the actual postings; its name is stored in the filter file
+        this.fuzzySetFactory = fuzzySetFactory; // builds one fuzzy set per field at write time
+    }
+
+    // Needed for SPI. With no delegate such an instance can only read: fieldsConsumer() throws, the reader loads the delegate by name.
+    public FuzzyFilterPostingsFormat() {
+        this(null, null);
+    }
+
+    @Override
+    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+        if (delegatePostingsFormat == null) {
+            throw new UnsupportedOperationException(
+                "Error - " + getClass().getName() + " has been constructed without a choice of PostingsFormat"
+            );
+        }
+        // Wrap the delegate's consumer so that fuzzy sets are built alongside the postings it writes.
+        return new FuzzySetFieldsConsumer(delegatePostingsFormat.fieldsConsumer(state), state);
+    }
+
+    @Override
+    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+        return new FuzzySetFieldsProducer(state); // the producer reads the delegate format's name from the filter file itself
+    }
+
+    static class FuzzySetFieldsProducer extends FieldsProducer {
+        private FieldsProducer delegateFieldsProducer;
+        HashMap<String, FuzzySet> fuzzySetsByFieldName = new HashMap<>();
+        private List<Closeable> closeables = new ArrayList<>();
+
+        public FuzzySetFieldsProducer(SegmentReadState state) throws IOException {
+            String fuzzySetFileName = IndexFileNames.segmentFileName(
+                state.segmentInfo.name,
+                state.segmentSuffix,
+                FUZZY_FILTER_FILE_EXTENSION
+            );
+            IndexInput filterIn = null;
+            boolean success = false;
+            try {
+                filterIn = state.directory.openInput(fuzzySetFileName, state.context);
+                // Check the header against this segment's id and suffix before reading anything else from the file
+                CodecUtil.checkIndexHeader(
+                    filterIn,
+                    FUZZY_SET_CODEC_NAME,
+                    VERSION_START,
+                    VERSION_CURRENT,
+                    state.segmentInfo.getId(),
+                    state.segmentSuffix
+                );
+                // The file records the name of the postings format that wrote the postings; look it up and open its producer
+                PostingsFormat delegatePostingsFormat = PostingsFormat.forName(filterIn.readString());
+                this.delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state);
+                int numFilters = filterIn.readInt(); // number of (field number, fuzzy set) pairs that follow
+                for (int i = 0; i < numFilters; i++) {
+                    int fieldNum = filterIn.readInt();
+                    FuzzySet set = FuzzySetFactory.deserializeFuzzySet(filterIn);
+                    closeables.add(set);
+                    FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
+                    fuzzySetsByFieldName.put(fieldInfo.name, set);
+                }
+                CodecUtil.retrieveChecksum(filterIn);
+                CodecUtil.checksumEntireFile(filterIn);
+                success = true;
+                closeables.add(filterIn); // stays open until close(); deserialized sets may read through slices of this input
+            } finally {
+                if (!success) {
+                    IOUtils.closeWhileHandlingException(filterIn, delegateFieldsProducer); // NOTE(review): sets in closeables stay open
+                }
+            }
+        }
+
+        @Override
+        public Iterator<String> iterator() {
+            return delegateFieldsProducer.iterator(); // field names come straight from the delegate
+        }
+
+        @Override
+        public void close() throws IOException {
+            IOUtils.closeWhileHandlingException(closeables); // fuzzy sets and the filter file input; close failures are suppressed
+            delegateFieldsProducer.close();
+        }
+
+        /**
+         * Returns the delegate's terms for {@code field}, fronted by its fuzzy set when one was loaded for that field.
+         */
+        @Override
+        public Terms terms(String field) throws IOException {
+            final Terms delegateTerms = delegateFieldsProducer.terms(field);
+            final FuzzySet fieldFilter = fuzzySetsByFieldName.get(field);
+            // Without terms, or without a filter for this field, there is nothing to put in front of the delegate.
+            if (delegateTerms == null || fieldFilter == null) {
+                return delegateTerms;
+            }
+            return new FuzzySetFieldsProducer.FuzzyFilterFrontedTerms(delegateTerms, fieldFilter);
+        }
+
+        @Override
+        public int size() {
+            return delegateFieldsProducer.size(); // delegated unchanged
+        }
+
+        static class FuzzyFilterFrontedTerms extends Terms { // Terms view whose iterator() consults a fuzzy set before the delegate
+            private Terms delegateTerms;
+            private FuzzySet filter;
+
+            public FuzzyFilterFrontedTerms(Terms terms, FuzzySet filter) {
+                this.delegateTerms = terms;
+                this.filter = filter;
+            }
+
+            @Override
+            public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
+                return delegateTerms.intersect(compiled, startTerm); // automaton intersection goes to the delegate; filter not consulted
+            }
+
+            @Override
+            public TermsEnum iterator() throws IOException {
+                return new FilterAppliedTermsEnum(delegateTerms, filter); // its seekExact(BytesRef) checks the filter first
+            }
+
+            // Everything below is forwarded to the delegate unchanged.
+            @Override
+            public long size() throws IOException {
+                return delegateTerms.size();
+            }
+
+            @Override
+            public long getSumTotalTermFreq() throws IOException {
+                return delegateTerms.getSumTotalTermFreq();
+            }
+
+            @Override
+            public long getSumDocFreq() throws IOException {
+                return delegateTerms.getSumDocFreq();
+            }
+
+            @Override
+            public int getDocCount() throws IOException {
+                return delegateTerms.getDocCount();
+            }
+
+            @Override
+            public boolean hasFreqs() {
+                return delegateTerms.hasFreqs();
+            }
+
+            @Override
+            public boolean hasOffsets() {
+                return delegateTerms.hasOffsets();
+            }
+
+            @Override
+            public boolean hasPositions() {
+                return delegateTerms.hasPositions();
+            }
+
+            @Override
+            public boolean hasPayloads() {
+                return delegateTerms.hasPayloads();
+            }
+
+            @Override
+            public BytesRef getMin() throws IOException {
+                return delegateTerms.getMin();
+            }
+
+            @Override
+            public BytesRef getMax() throws IOException {
+                return delegateTerms.getMax();
+            }
+        }
+
+        static final class FilterAppliedTermsEnum extends BaseTermsEnum {
+            // Terms enum that answers seekExact(BytesRef) from the fuzzy set when it can, and otherwise forwards to a lazily pulled delegate.
+            private Terms delegateTerms;
+            private TermsEnum delegateTermsEnum; // null until delegate() is first called
+            private final FuzzySet filter;
+
+            public FilterAppliedTermsEnum(Terms delegateTerms, FuzzySet filter) throws IOException {
+                this.delegateTerms = delegateTerms;
+                this.filter = filter;
+            }
+
+            void reset(Terms delegateTerms) throws IOException {
+                this.delegateTerms = delegateTerms;
+                this.delegateTermsEnum = null; // forces delegate() to pull a fresh enum from the new terms
+            }
+
+            private TermsEnum delegate() throws IOException {
+                if (delegateTermsEnum == null) {
+                    /* pull the iterator only if we really need it -
+                     * this can be a relatively heavy operation depending on the
+                     * delegate postings format and the underlying directory
+                     * (clone IndexInput) */
+                    delegateTermsEnum = delegateTerms.iterator();
+                }
+                return delegateTermsEnum;
+            }
+
+            @Override
+            public BytesRef next() throws IOException {
+                return delegate().next();
+            }
+
+            @Override
+            public boolean seekExact(BytesRef text) throws IOException {
+                // The magical fail-fast speed up that is the entire point of all of
+                // this code - save a disk seek if there is a match on an in-memory
+                // structure
+                // that may occasionally give a false positive but guaranteed no false
+                // negatives
+                if (filter.contains(text) == FuzzySet.Result.NO) {
+                    return false; // the delegate enum is neither pulled nor repositioned in this case
+                }
+                return delegate().seekExact(text);
+            }
+
+            @Override
+            public SeekStatus seekCeil(BytesRef text) throws IOException {
+                return delegate().seekCeil(text); // the filter is not consulted for ceiling seeks
+            }
+
+            @Override
+            public void seekExact(long ord) throws IOException {
+                delegate().seekExact(ord);
+            }
+
+            @Override
+            public BytesRef term() throws IOException {
+                return delegate().term();
+            }
+
+            @Override
+            public long ord() throws IOException {
+                return delegate().ord();
+            }
+
+            @Override
+            public int docFreq() throws IOException {
+                return delegate().docFreq();
+            }
+
+            @Override
+            public long totalTermFreq() throws IOException {
+                return delegate().totalTermFreq();
+            }
+
+            @Override
+            public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
+                return delegate().postings(reuse, flags);
+            }
+
+            @Override
+            public ImpactsEnum impacts(int flags) throws IOException {
+                return delegate().impacts(flags);
+            }
+
+            @Override
+            public String toString() {
+                return getClass().getSimpleName() + "(filter=" + filter.toString() + ")";
+            }
+        }
+
+        @Override
+        public void checkIntegrity() throws IOException {
+            delegateFieldsProducer.checkIntegrity(); // only the delegate is verified here; the filter file was checksummed in the constructor
+        }
+
+        @Override
+        public String toString() {
+            // "fields" is the number of fields that have a fuzzy set loaded, not the delegate's field count
+            return getClass().getSimpleName() + "(fields=" + fuzzySetsByFieldName.size() + ",delegate=" + delegateFieldsProducer + ")";
+        }
+    }
+
+    class FuzzySetFieldsConsumer extends FieldsConsumer {
+        private FieldsConsumer delegateFieldsConsumer;
+        private Map<FieldInfo, FuzzySet> fuzzySets = new HashMap<>();
+        private SegmentWriteState state;
+        private List<Closeable> closeables = new ArrayList<>();
+
+        public FuzzySetFieldsConsumer(FieldsConsumer fieldsConsumer, SegmentWriteState state) {
+            this.delegateFieldsConsumer = fieldsConsumer; // receives every write() before any fuzzy set is built
+            this.state = state; // supplies field infos, maxDoc and the directory for the filter file
+        }
+
+        /**
+         * Writes the postings through the delegate, then builds one fuzzy set per field from that field's terms.
+         * The sets are only held in memory here; they are persisted when this consumer is closed.
+         */
+        @Override
+        public void write(Fields fields, NormsProducer norms) throws IOException {
+            // Delegate must write first: it may have opened files on creating the class
+            // (e.g. Lucene41PostingsConsumer), and write() will close them.
+            delegateFieldsConsumer.write(fields, norms);
+
+            for (String field : fields) {
+                Terms terms = fields.terms(field);
+                if (terms == null) {
+                    continue;
+                }
+                FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+                FuzzySet fuzzySet = fuzzySetFactory.createFuzzySet(state.segmentInfo.maxDoc(), fieldInfo.name, () -> iterator(terms));
+                if (fuzzySet == null) {
+                    // No set for this field only: keep going, the remaining fields may still need one.
+                    continue;
+                }
+                assert fuzzySets.containsKey(fieldInfo) == false;
+                closeables.add(fuzzySet);
+                fuzzySets.put(fieldInfo, fuzzySet);
+            }
+        }
+
+        /** Iterates the terms that have at least one posting, in the order the terms enum returns them. */
+        private Iterator<BytesRef> iterator(Terms terms) throws IOException {
+            TermsEnum termIterator = terms.iterator();
+            return new Iterator<>() {
+                // NOTE: hasNext() itself advances the enum, so call it exactly once before each next().
+                private BytesRef currentTerm;
+                private PostingsEnum postingsEnum;
+                @Override
+                public boolean hasNext() {
+                    try {
+                        do {
+                            currentTerm = termIterator.next();
+                            if (currentTerm == null) {
+                                return false;
+                            }
+                            postingsEnum = termIterator.postings(postingsEnum, 0);
+                            if (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
+                                return true; // terms without any posting are skipped
+                            }
+                        } while (true);
+                    } catch (IOException ex) {
+                        throw new IllegalStateException("Cannot read terms: " + termIterator.attributes(), ex); // keep the cause
+                    }
+                }
+
+                @Override
+                public BytesRef next() {
+                    return currentTerm;
+                }
+            };
+        }
+
+        private boolean closed;
+
+        /**
+         * Closes the delegate, then writes every non-saturated fuzzy set to the segment's filter file. Repeat calls are no-ops.
+         * The in-memory sets are released in a finally block, so they are freed even if the delegate or the write fails.
+         */
+        @Override
+        public void close() throws IOException {
+            if (closed) {
+                return;
+            }
+            closed = true;
+            try {
+                delegateFieldsConsumer.close();
+                // Now we are done accumulating values for these fields; saturated sets are skipped
+                List<Map.Entry<FieldInfo, FuzzySet>> nonSaturatedSets = new ArrayList<>();
+                for (Map.Entry<FieldInfo, FuzzySet> entry : fuzzySets.entrySet()) {
+                    if (entry.getValue().isSaturated() == false) {
+                        nonSaturatedSets.add(entry);
+                    }
+                }
+                String fuzzyFilterFileName = IndexFileNames.segmentFileName(
+                    state.segmentInfo.name,
+                    state.segmentSuffix,
+                    FUZZY_FILTER_FILE_EXTENSION
+                );
+                try (IndexOutput fuzzyFilterFileOutput = state.directory.createOutput(fuzzyFilterFileName, state.context)) {
+                    logger.trace(
+                        "Writing fuzzy filter postings with version: {} for segment: {}",
+                        VERSION_CURRENT,
+                        state.segmentInfo.toString()
+                    );
+                    CodecUtil.writeIndexHeader(
+                        fuzzyFilterFileOutput,
+                        FUZZY_SET_CODEC_NAME,
+                        VERSION_CURRENT,
+                        state.segmentInfo.getId(),
+                        state.segmentSuffix
+                    );
+                    // remember the name of the postings format we will delegate to
+                    fuzzyFilterFileOutput.writeString(delegatePostingsFormat.getName());
+                    // Next comes the number of (field, set) pairs saved, followed by the pairs themselves
+                    fuzzyFilterFileOutput.writeInt(nonSaturatedSets.size());
+                    for (Map.Entry<FieldInfo, FuzzySet> entry : nonSaturatedSets) {
+                        saveAppropriatelySizedFuzzySet(fuzzyFilterFileOutput, entry.getValue(), entry.getKey());
+                    }
+                    CodecUtil.writeFooter(fuzzyFilterFileOutput);
+                }
+            } finally {
+                // We are done with large bitsets so no need to keep them hanging around
+                fuzzySets.clear();
+                IOUtils.closeWhileHandlingException(closeables);
+            }
+        }
+
+        private void saveAppropriatelySizedFuzzySet(IndexOutput fileOutput, FuzzySet fuzzySet, FieldInfo fieldInfo) throws IOException {
+            fileOutput.writeInt(fieldInfo.number); // read back first by the producer to find the field
+            fileOutput.writeString(fuzzySet.setType().getSetName()); // lets the reader choose the deserializer
+            fuzzySet.writeTo(fileOutput); // set payload, in the set's own format
+        }
+    }
+
+    @Override
+    public String toString() {
+        return "FuzzyFilterPostingsFormat(" + delegatePostingsFormat + ")"; // delegate is null for SPI-created instances
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySet.java
new file mode 100644
index 0000000000000..df443ffbca33d
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySet.java
@@ -0,0 +1,98 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.CheckedFunction;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Fuzzy Filter interface
+ */
+public interface FuzzySet extends Accountable, Closeable {
+
+    /**
+     * Name used for a codec to be aware of what fuzzy set has been used.
+     */
+    SetType setType();
+
+    /**
+     * @param value the item whose membership needs to be checked.
+     */
+    Result contains(BytesRef value);
+
+    boolean isSaturated(); // sets reporting true are left out of the filter file by the postings format
+
+    void writeTo(DataOutput out) throws IOException; // serializes the set's payload; the type name is written by the caller
+
+    /**
+     * Enum to represent result of membership check on a fuzzy set.
+     */
+    enum Result {
+        /**
+         * A definite no for the set membership of an item.
+         */
+        NO,
+
+        /**
+         * Fuzzy sets cannot guarantee that a given item is present in the set or not due to the data being stored in
+         * a lossy format (e.g. fingerprint, hash).
+         * Hence, we return a response denoting that the item may be present.
+         */
+        MAYBE
+    }
+
+    /**
+     * Enum to declare supported properties and mappings for a fuzzy set implementation.
+     */
+    enum SetType {
+        BLOOM_FILTER_V1("bloom_filter_v1", BloomFilter::new, List.of("bloom_filter"));
+
+        /**
+         * Name persisted in postings file. This will be used when reading to determine the bloom filter implementation.
+         */
+        private final String setName;
+
+        /**
+         * Interface for reading the actual fuzzy set implementation into java object.
+         */
+        private final CheckedFunction<IndexInput, ? extends FuzzySet, IOException> deserializer;
+
+        SetType(String setName, CheckedFunction<IndexInput, ? extends FuzzySet, IOException> deserializer, List<String> aliases) {
+            if (aliases.size() < 1) { // NOTE(review): aliases are only validated here; they are not stored or used for lookup
+                throw new IllegalArgumentException("Alias list is empty. Could not create Set Type: " + setName);
+            }
+            this.setName = setName;
+            this.deserializer = deserializer;
+        }
+
+        public String getSetName() {
+            return setName;
+        }
+
+        public CheckedFunction<IndexInput, ? extends FuzzySet, IOException> getDeserializer() {
+            return deserializer;
+        }
+
+        public static SetType from(String name) { // exact match on the persisted set name only
+            for (SetType type : SetType.values()) {
+                if (type.setName.equals(name)) {
+                    return type;
+                }
+            }
+            throw new IllegalArgumentException("There is no implementation for fuzzy set: " + name);
+        }
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetFactory.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetFactory.java
new file mode 100644
index 0000000000000..5d1fd03f099d4
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetFactory.java
@@ -0,0 +1,49 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.CheckedSupplier;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Factory class to create fuzzy set.
+ * Supports bloom filters for now. More sets can be added as required.
+ */
+public class FuzzySetFactory {
+    // Fuzzy set configuration per field, keyed by field name.
+    private final Map<String, FuzzySetParameters> setTypeForField;
+
+    public FuzzySetFactory(Map<String, FuzzySetParameters> setTypeForField) {
+        this.setTypeForField = setTypeForField;
+    }
+    // Builds the set configured for fieldName; throws IllegalArgumentException when the field has no configuration.
+    public FuzzySet createFuzzySet(int maxDocs, String fieldName, CheckedSupplier<Iterator<BytesRef>, IOException> iteratorProvider)
+        throws IOException {
+        FuzzySetParameters params = setTypeForField.get(fieldName);
+        if (params == null) {
+            throw new IllegalArgumentException("No fuzzy set defined for field: " + fieldName);
+        }
+        switch (params.getSetType()) {
+            case BLOOM_FILTER_V1:
+                return new BloomFilter(maxDocs, params.getFalsePositiveProbability(), iteratorProvider);
+            default:
+                throw new IllegalArgumentException("No Implementation for set type: " + params.getSetType());
+        }
+    }
+    // Reads the set type name that precedes the payload and hands the input to that type's deserializer.
+    public static FuzzySet deserializeFuzzySet(IndexInput in) throws IOException {
+        FuzzySet.SetType setType = FuzzySet.SetType.from(in.readString());
+        return setType.getDeserializer().apply(in);
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetParameters.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetParameters.java
new file mode 100644
index 0000000000000..7bb96e7c34f0b
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzySetParameters.java
@@ -0,0 +1,34 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import java.util.function.Supplier;
+
+/**
+ * Wrapper for params to create a fuzzy set.
+ */
+public class FuzzySetParameters {
+    private final Supplier<Double> falsePositiveProbabilityProvider; // queried on every getFalsePositiveProbability() call
+    private final FuzzySet.SetType setType;
+
+    public static final double DEFAULT_FALSE_POSITIVE_PROBABILITY = 0.2047d;
+
+    public FuzzySetParameters(Supplier<Double> falsePositiveProbabilityProvider) {
+        this.falsePositiveProbabilityProvider = falsePositiveProbabilityProvider;
+        this.setType = FuzzySet.SetType.BLOOM_FILTER_V1; // the set type is fixed; only the probability is configurable
+    }
+
+    public double getFalsePositiveProbability() {
+        return falsePositiveProbabilityProvider.get(); // unboxes the supplied Double; a null from the supplier would throw here
+    }
+
+    public FuzzySet.SetType getSetType() {
+        return setType;
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputImmutableLongArray.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputImmutableLongArray.java
new file mode 100644
index 0000000000000..6c56cfcc3118b
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputImmutableLongArray.java
@@ -0,0 +1,68 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.opensearch.OpenSearchException;
+import org.opensearch.common.util.LongArray;
+
+import java.io.IOException;
+
+/**
+ * A Long array backed by RandomAccessInput.
+ * This implementation supports read operations only.
+ */
+class IndexInputImmutableLongArray implements LongArray {
+
+    private final RandomAccessInput input;
+    private final long size; // number of long elements, as reported by size()
+
+    IndexInputImmutableLongArray(long size, RandomAccessInput input) {
+        this.size = size;
+        this.input = input;
+    }
+
+    @Override
+    public void close() {} // nothing to release here: this class does not close the input it was given
+
+    @Override
+    public long size() {
+        return size;
+    }
+
+    @Override
+    public synchronized long get(long index) {
+        try {
+            return input.readLong(index << 3); // element index -> byte offset (8 bytes per long)
+        } catch (IOException ex) {
+            throw new OpenSearchException(ex);
+        }
+    }
+
+    @Override
+    public long set(long index, long value) {
+        throw new UnsupportedOperationException(); // read-only view
+    }
+
+    @Override
+    public long increment(long index, long inc) {
+        throw new UnsupportedOperationException(); // read-only view
+    }
+
+    @Override
+    public void fill(long fromIndex, long toIndex, long value) {
+        throw new UnsupportedOperationException(); // read-only view
+    }
+
+    @Override
+    public long ramBytesUsed() {
+        return RamUsageEstimator.shallowSizeOfInstance(IndexInputImmutableLongArray.class); // only this object; the data is read via the input
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
new file mode 100644
index 0000000000000..74737845975ef
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
@@ -0,0 +1,80 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.opensearch.common.util.BigArrays;
+import org.opensearch.common.util.LongArray;
+import org.opensearch.common.util.io.IOUtils;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/**
+ * A bitset backed by a long-indexed array.
+ */
+class LongArrayBackedBitSet implements Accountable, Closeable {
+    // Bit i lives in word (i >> 6) of longArray, at bit position (i mod 64) within that word.
+    private long underlyingArrayLength = 0L;
+    private LongArray longArray;
+
+    LongArrayBackedBitSet(long capacity) {
+        // Since the bitset is backed by a long array, we only need 1 element for every 64 bits in the underlying array.
+        underlyingArrayLength = ((capacity - 1L) >> 6) + 1;
+        this.longArray = BigArrays.NON_RECYCLING_INSTANCE.withCircuitBreaking().newLongArray(underlyingArrayLength);
+    }
+
+    LongArrayBackedBitSet(IndexInput in) throws IOException {
+        underlyingArrayLength = in.readLong();
+        long streamLength = underlyingArrayLength << 3; // 8 bytes per word
+        this.longArray = new IndexInputImmutableLongArray(underlyingArrayLength, in.randomAccessSlice(in.getFilePointer(), streamLength));
+        in.skipBytes(streamLength); // leave the input positioned just past this bitset's words
+    }
+
+    public void writeTo(DataOutput out) throws IOException {
+        out.writeLong(underlyingArrayLength);
+        for (long idx = 0; idx < underlyingArrayLength; idx++) { // long counter: the length is a long and may exceed int range
+            out.writeLong(longArray.get(idx));
+        }
+    }
+
+    public long cardinality() {
+        long tot = 0;
+        for (long i = 0; i < underlyingArrayLength; ++i) { // long counter: the length is a long and may exceed int range
+            tot += Long.bitCount(longArray.get(i));
+        }
+        return tot;
+    }
+
+    public boolean isSet(long index) {
+        long i = index >> 6; // div 64
+        long val = longArray.get(i);
+        long bitmask = 1L << index; // a long shift uses only the low 6 bits of the distance, i.e. index mod 64
+        return (val & bitmask) != 0;
+    }
+
+    public void set(long index) {
+        long wordNum = index >> 6; // div 64
+        long bitmask = 1L << index; // a long shift uses only the low 6 bits of the distance, i.e. index mod 64
+        long val = longArray.get(wordNum);
+        longArray.set(wordNum, val | bitmask);
+    }
+
+    @Override
+    public long ramBytesUsed() {
+        return 128L + longArray.ramBytesUsed(); // NOTE(review): 128L looks like a fixed allowance for this object -- confirm
+    }
+
+    @Override
+    public void close() throws IOException {
+        IOUtils.close(longArray);
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/package-info.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/package-info.java
new file mode 100644
index 0000000000000..7aeac68cd192a
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/package-info.java
@@ -0,0 +1,10 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/** classes responsible for handling all fuzzy codecs and operations */
+package org.opensearch.index.codec.fuzzy;
diff --git a/server/src/main/java/org/opensearch/index/engine/SegmentsStats.java b/server/src/main/java/org/opensearch/index/engine/SegmentsStats.java
index d4a97f0267222..34aecfc62b8b2 100644
--- a/server/src/main/java/org/opensearch/index/engine/SegmentsStats.java
+++ b/server/src/main/java/org/opensearch/index/engine/SegmentsStats.java
@@ -41,6 +41,7 @@
 import org.opensearch.core.xcontent.ToXContentFragment;
 import org.opensearch.core.xcontent.XContentBuilder;
 import org.opensearch.index.ReplicationStats;
+import org.opensearch.index.codec.fuzzy.FuzzyFilterPostingsFormat;
 import org.opensearch.index.remote.RemoteSegmentStats;
 
 import java.io.IOException;
@@ -95,7 +96,8 @@ public class SegmentsStats implements Writeable, ToXContentFragment {
         Map.entry("tvx", "Term Vector Index"),
         Map.entry("tvd", "Term Vector Documents"),
         Map.entry("tvf", "Term Vector Fields"),
-        Map.entry("liv", "Live Documents")
+        Map.entry("liv", "Live Documents"),
+        Map.entry(FuzzyFilterPostingsFormat.FUZZY_FILTER_FILE_EXTENSION, "Fuzzy Filter")
     );
 
     public SegmentsStats() {
diff --git a/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
index 2c92f0ecd3f51..80b1d25064885 100644
--- a/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
+++ b/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
@@ -1 +1,2 @@
 org.apache.lucene.search.suggest.document.Completion50PostingsFormat
+org.opensearch.index.codec.fuzzy.FuzzyFilterPostingsFormat
diff --git a/server/src/test/java/org/opensearch/index/codec/fuzzy/BloomFilterTests.java b/server/src/test/java/org/opensearch/index/codec/fuzzy/BloomFilterTests.java
new file mode 100644
index 0000000000000..92669d5bc1d92
--- /dev/null
+++ b/server/src/test/java/org/opensearch/index/codec/fuzzy/BloomFilterTests.java
@@ -0,0 +1,82 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.lucene.store.ByteArrayIndexInput;
+import org.opensearch.test.OpenSearchTestCase;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+
+public class BloomFilterTests extends OpenSearchTestCase {
+
+    public void testBloomFilterSerializationDeserialization() throws IOException {
+        int elementCount = randomIntBetween(1, 100);
+        long maxDocs = elementCount * 10L; // Keeping this high so that it ensures some bits are not set.
+        BloomFilter filter = new BloomFilter(maxDocs, getFpp(), () -> idIterator(elementCount));
+        byte[] buffer = new byte[(int) maxDocs * 5];
+        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
+
+        // Write in the format readable through factory
+        out.writeString(filter.setType().getSetName());
+        filter.writeTo(out);
+
+        FuzzySet reconstructedFilter = FuzzySetFactory.deserializeFuzzySet(new ByteArrayIndexInput("filter", buffer));
+        assertEquals(FuzzySet.SetType.BLOOM_FILTER_V1, reconstructedFilter.setType());
+
+        Iterator<BytesRef> idIterator = idIterator(elementCount);
+        while (idIterator.hasNext()) {
+            BytesRef element = idIterator.next();
+            assertEquals(FuzzySet.Result.MAYBE, reconstructedFilter.contains(element));
+            assertEquals(FuzzySet.Result.MAYBE, filter.contains(element));
+        }
+    }
+
+    public void testBloomFilterIsSaturated_returnsTrue() throws IOException {
+        BloomFilter bloomFilter = new BloomFilter(1L, getFpp(), () -> idIterator(1000));
+        assertEquals(FuzzySet.SetType.BLOOM_FILTER_V1, bloomFilter.setType());
+        assertEquals(true, bloomFilter.isSaturated());
+    }
+
+    public void testBloomFilterIsSaturated_returnsFalse() throws IOException {
+        int elementCount = randomIntBetween(1, 100);
+        BloomFilter bloomFilter = new BloomFilter(20000, getFpp(), () -> idIterator(elementCount));
+        assertEquals(FuzzySet.SetType.BLOOM_FILTER_V1, bloomFilter.setType());
+        assertEquals(false, bloomFilter.isSaturated());
+    }
+
+    public void testBloomFilterWithLargeCapacity() throws IOException {
+        long maxDocs = randomLongBetween(Integer.MAX_VALUE, 5L * Integer.MAX_VALUE);
+        BloomFilter bloomFilter = new BloomFilter(maxDocs, getFpp(), () -> List.of(new BytesRef("bar")).iterator());
+        assertEquals(FuzzySet.SetType.BLOOM_FILTER_V1, bloomFilter.setType());
+    }
+
+    private double getFpp() {
+        return randomDoubleBetween(0.01, 0.50, true);
+    }
+
+    private Iterator<BytesRef> idIterator(int count) { // yields the ids "count-1" down to "0", one per next() call
+        return new Iterator<BytesRef>() {
+            int remaining = count; // number of ids not yet handed out
+
+            @Override
+            public boolean hasNext() {
+                return remaining > 0; // side-effect free, so repeated hasNext() calls never skip an id
+            }
+
+            @Override
+            public BytesRef next() {
+                return new BytesRef(Integer.toString(--remaining)); // consume the id here rather than in hasNext()
+            }
+        };
+    }
+}
diff --git a/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java b/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java
new file mode 100644
index 0000000000000..868c2175d0689
--- /dev/null
+++ b/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java
@@ -0,0 +1,34 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.codec.fuzzy;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.tests.index.BasePostingsFormatTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+
+import java.util.TreeMap;
+
+public class FuzzyFilterPostingsFormatTests extends BasePostingsFormatTestCase {
+
+    private TreeMap<String, FuzzySetParameters> params = new TreeMap<>() {
+        @Override
+        public FuzzySetParameters get(Object k) {
+            return new FuzzySetParameters(() -> FuzzySetParameters.DEFAULT_FALSE_POSITIVE_PROBABILITY);
+        }
+    };
+
+    private Codec fuzzyFilterCodec = TestUtil.alwaysPostingsFormat(
+        new FuzzyFilterPostingsFormat(TestUtil.getDefaultPostingsFormat(), new FuzzySetFactory(params))
+    );
+
+    @Override
+    protected Codec getCodec() {
+        return fuzzyFilterCodec;
+    }
+}
diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
index 5e79396de8d8d..9adcafc21dba3 100644
--- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
+++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
@@ -197,6 +197,8 @@
 import static org.opensearch.core.common.util.CollectionUtils.eagerPartition;
 import static org.opensearch.discovery.DiscoveryModule.DISCOVERY_SEED_PROVIDERS_SETTING;
 import static org.opensearch.discovery.SettingsBasedSeedHostsProvider.DISCOVERY_SEED_HOSTS_SETTING;
+import static org.opensearch.index.IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING;
+import static org.opensearch.index.IndexSettings.INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING;
 import static org.opensearch.index.IndexSettings.INDEX_SOFT_DELETES_RETENTION_LEASE_PERIOD_SETTING;
 import static org.opensearch.index.query.QueryBuilders.matchAllQuery;
 import static org.opensearch.test.XContentTestUtils.convertToMap;
@@ -630,6 +632,11 @@ public Settings indexSettings() {
             );
         }
 
+        if (randomBoolean()) {
+            builder.put(INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING.getKey(), true);
+            builder.put(INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING.getKey(), randomDoubleBetween(0.01, 0.50, true));
+        }
+
         return builder.build();
     }
 

From 8cdf2c0b7107497320ff227da2b5881543ca0038 Mon Sep 17 00:00:00 2001
From: mgodwan <mgodwan@amazon.com>
Date: Thu, 1 Feb 2024 11:58:04 +0530
Subject: [PATCH 2/8] Address PR comments

Signed-off-by: mgodwan <mgodwan@amazon.com>
---
 .../org/opensearch/index/IndexSettings.java   |  4 +--
 .../index/codec/fuzzy/BloomFilter.java        |  6 +++--
 .../fuzzy/FuzzyFilterPostingsFormat.java      | 12 ++++-----
 .../codec/fuzzy/LongArrayBackedBitSet.java    | 25 ++++++++++++++++++-
 4 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java
index 3966bca42c249..b3a3775325e87 100644
--- a/server/src/main/java/org/opensearch/index/IndexSettings.java
+++ b/server/src/main/java/org/opensearch/index/IndexSettings.java
@@ -660,14 +660,14 @@ public static IndexMergePolicy fromString(String text) {
     );
 
     public static final Setting<Boolean> INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING = Setting.boolSetting(
-        "index.doc_id_fuzzy_set.enabled",
+        "index.optimize_doc_id_lookup.fuzzy_set.enabled",
         false,
         Property.IndexScope,
         Property.Dynamic
     );
 
     public static final Setting<Double> INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING = Setting.doubleSetting(
-        "index.doc_id_fuzzy_set.false_positive_probability",
+        "index.optimize_doc_id_lookup.fuzzy_set.false_positive_probability",
         DEFAULT_FALSE_POSITIVE_PROBABILITY,
         0.01,
         0.50,
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
index 5870ca0e16984..bb0d5e5c2168a 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
@@ -68,7 +68,7 @@ public class BloomFilter extends AbstractFuzzySet {
         if (Assertions.ENABLED) {
             assertAllElementsExist(fieldIteratorProvider);
         }
-        logger.trace("Bloom filter created with fpp: {}, setSize: {}, hashCount: {}", maxFpp, setSize, hashCount);
+        logger.debug("Bloom filter created with fpp: {}, setSize: {}, hashCount: {}", maxFpp, setSize, hashCount);
     }
 
     BloomFilter(IndexInput in) throws IOException {
@@ -126,6 +126,8 @@ protected void add(BytesRef value) {
     @Override
     public boolean isSaturated() {
         long numBitsSet = bitset.cardinality();
+        // Don't bother saving bitsets if >90% of bits are set - we don't want to
+        // throw any more memory at this problem.
         return (float) numBitsSet / (float) setSize > 0.9f;
     }
 
@@ -137,7 +139,7 @@ public long ramBytesUsed() {
     private boolean mayContainValue(int aHash) {
         // Bloom sizes are always base 2 and so can be ANDed for a fast modulo
         int pos = aHash & setSize;
-        return bitset.isSet(pos);
+        return bitset.get(pos);
     }
 
     @Override
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
index a31bd41259e46..73023ef3bd0fc 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
@@ -93,7 +93,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException
             );
         }
         FieldsConsumer fieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
-        return new FuzzySetFieldsConsumer(fieldsConsumer, state);
+        return new FuzzyFilteredFieldsConsumer(fieldsConsumer, state);
     }
 
     @Override
@@ -168,7 +168,7 @@ public Terms terms(String field) throws IOException {
                 if (result == null) {
                     return null;
                 }
-                return new FuzzySetFieldsProducer.FuzzyFilterFrontedTerms(result, filter);
+                return new FuzzyFilteredTerms(result, filter);
             }
         }
 
@@ -177,11 +177,11 @@ public int size() {
             return delegateFieldsProducer.size();
         }
 
-        static class FuzzyFilterFrontedTerms extends Terms {
+        static class FuzzyFilteredTerms extends Terms {
             private Terms delegateTerms;
             private FuzzySet filter;
 
-            public FuzzyFilterFrontedTerms(Terms terms, FuzzySet filter) {
+            public FuzzyFilteredTerms(Terms terms, FuzzySet filter) {
                 this.delegateTerms = terms;
                 this.filter = filter;
             }
@@ -349,13 +349,13 @@ public String toString() {
         }
     }
 
-    class FuzzySetFieldsConsumer extends FieldsConsumer {
+    class FuzzyFilteredFieldsConsumer extends FieldsConsumer {
         private FieldsConsumer delegateFieldsConsumer;
         private Map<FieldInfo, FuzzySet> fuzzySets = new HashMap<>();
         private SegmentWriteState state;
         private List<Closeable> closeables = new ArrayList<>();
 
-        public FuzzySetFieldsConsumer(FieldsConsumer fieldsConsumer, SegmentWriteState state) {
+        public FuzzyFilteredFieldsConsumer(FieldsConsumer fieldsConsumer, SegmentWriteState state) {
             this.delegateFieldsConsumer = fieldsConsumer;
             this.state = state;
         }
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
index 74737845975ef..a27f9ba7a0613 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
@@ -26,12 +26,21 @@ class LongArrayBackedBitSet implements Accountable, Closeable {
     private long underlyingArrayLength = 0L;
     private LongArray longArray;
 
+    /**
+     * Constructor which uses an on-heap array. This should be used during construction of the bitset.
+     * @param capacity The maximum capacity to provision for the bitset.
+     */
     LongArrayBackedBitSet(long capacity) {
         // Since the bitset is backed by a long array, we only need 1 element for every 64 bits in the underlying array.
         underlyingArrayLength = ((capacity - 1L) >> 6) + 1;
         this.longArray = BigArrays.NON_RECYCLING_INSTANCE.withCircuitBreaking().newLongArray(underlyingArrayLength);
     }
 
+    /**
+     * Constructor which uses Lucene's IndexInput to read the bitset into a read-only buffer.
+     * @param in IndexInput containing the serialized bitset.
+     * @throws IOException if the serialized bitset cannot be read from the input
+     */
     LongArrayBackedBitSet(IndexInput in) throws IOException {
         underlyingArrayLength = in.readLong();
         long streamLength = underlyingArrayLength << 3;
@@ -46,6 +55,11 @@ public void writeTo(DataOutput out) throws IOException {
         }
     }
 
+    /**
+     * This is an O(n) operation, and will iterate over all the elements in the underlying long array
+     * to determine cardinality of the set.
+     * @return number of set bits in the bitset.
+     */
     public long cardinality() {
         long tot = 0;
         for (int i = 0; i < underlyingArrayLength; ++i) {
@@ -54,13 +68,22 @@ public long cardinality() {
         return tot;
     }
 
-    public boolean isSet(long index) {
+    /**
+     * Retrieves whether the bit is set or not at the given index.
+     * @param index the index to look up for the bit
+     * @return true if bit is set, false otherwise
+     */
+    public boolean get(long index) {
         long i = index >> 6; // div 64
         long val = longArray.get(i);
         long bitmask = 1L << index;
         return (val & bitmask) != 0;
     }
 
+    /**
+     * Sets the bit at the given index.
+     * @param index the index to set the bit at.
+     */
     public void set(long index) {
         long wordNum = index >> 6; // div 64
         long bitmask = 1L << index;

From 91150dc1d7a7d1db0390e385c5f121a57de65a17 Mon Sep 17 00:00:00 2001
From: mgodwan <mgodwan@amazon.com>
Date: Thu, 1 Feb 2024 17:41:25 +0530
Subject: [PATCH 3/8] Address PR comments, add feature flag

Signed-off-by: mgodwan <mgodwan@amazon.com>
---
 .../common/settings/FeatureFlagSettings.java  |  3 ++-
 .../opensearch/common/util/FeatureFlags.java  | 11 ++++++++++
 .../org/opensearch/index/IndexSettings.java   | 22 +++++++++++++++----
 .../index/codec/fuzzy/BloomFilter.java        |  3 ++-
 .../fuzzy/FuzzyFilterPostingsFormat.java      | 22 +++++++++++--------
 .../codec/fuzzy/LongArrayBackedBitSet.java    |  1 +
 6 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/server/src/main/java/org/opensearch/common/settings/FeatureFlagSettings.java b/server/src/main/java/org/opensearch/common/settings/FeatureFlagSettings.java
index e19f8e8370d5b..aed9d9b4668b9 100644
--- a/server/src/main/java/org/opensearch/common/settings/FeatureFlagSettings.java
+++ b/server/src/main/java/org/opensearch/common/settings/FeatureFlagSettings.java
@@ -35,6 +35,7 @@ protected FeatureFlagSettings(
         FeatureFlags.CONCURRENT_SEGMENT_SEARCH_SETTING,
         FeatureFlags.TELEMETRY_SETTING,
         FeatureFlags.DATETIME_FORMATTER_CACHING_SETTING,
-        FeatureFlags.WRITEABLE_REMOTE_INDEX_SETTING
+        FeatureFlags.WRITEABLE_REMOTE_INDEX_SETTING,
+        FeatureFlags.DOC_ID_FUZZY_SET_SETTING
     );
 }
diff --git a/server/src/main/java/org/opensearch/common/util/FeatureFlags.java b/server/src/main/java/org/opensearch/common/util/FeatureFlags.java
index d4ab161527cc0..56a754209d9fd 100644
--- a/server/src/main/java/org/opensearch/common/util/FeatureFlags.java
+++ b/server/src/main/java/org/opensearch/common/util/FeatureFlags.java
@@ -60,6 +60,11 @@ public class FeatureFlags {
      */
     public static final String WRITEABLE_REMOTE_INDEX = "opensearch.experimental.feature.writeable_remote_index.enabled";
 
+    /**
+     * Gates the optimization to enable bloom filters for doc id lookup.
+     */
+    public static final String DOC_ID_FUZZY_SET = "opensearch.experimental.optimize_doc_id_lookup.fuzzy_set.enabled";
+
     /**
      * Should store the settings from opensearch.yml.
      */
@@ -122,4 +127,10 @@ public static boolean isEnabled(Setting<Boolean> featureFlag) {
         false,
         Property.NodeScope
     );
+
+    public static final Setting<Boolean> DOC_ID_FUZZY_SET_SETTING = Setting.boolSetting(
+        DOC_ID_FUZZY_SET,
+        false,
+        Property.NodeScope
+    );
 }
diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java
index b3a3775325e87..34c109de8e124 100644
--- a/server/src/main/java/org/opensearch/index/IndexSettings.java
+++ b/server/src/main/java/org/opensearch/index/IndexSettings.java
@@ -65,6 +65,7 @@
 import java.util.function.UnaryOperator;
 
 import static org.opensearch.Version.V_2_7_0;
+import static org.opensearch.common.util.FeatureFlags.DOC_ID_FUZZY_SET_SETTING;
 import static org.opensearch.common.util.FeatureFlags.SEARCHABLE_SNAPSHOT_EXTENDED_COMPATIBILITY;
 import static org.opensearch.index.codec.fuzzy.FuzzySetParameters.DEFAULT_FALSE_POSITIVE_PROBABILITY;
 import static org.opensearch.index.mapper.MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING;
@@ -954,8 +955,11 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
          */
         widenIndexSortType = IndexMetadata.SETTING_INDEX_VERSION_CREATED.get(settings).before(V_2_7_0);
 
-        enableFuzzySetForDocId = scopedSettings.get(INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING);
-        docIdFuzzySetFalsePositiveProbability = scopedSettings.get(INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING);
+        boolean isOptimizeDocIdLookupUsingFuzzySetFeatureEnabled = FeatureFlags.isEnabled(DOC_ID_FUZZY_SET_SETTING);
+        if (isOptimizeDocIdLookupUsingFuzzySetFeatureEnabled) {
+            enableFuzzySetForDocId = scopedSettings.get(INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING);
+            docIdFuzzySetFalsePositiveProbability = scopedSettings.get(INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING);
+        }
 
         scopedSettings.addSettingsUpdateConsumer(
             TieredMergePolicyProvider.INDEX_COMPOUND_FORMAT_SETTING,
@@ -1843,7 +1847,7 @@ public boolean isEnableFuzzySetForDocId() {
     }
 
     public void setEnableFuzzySetForDocId(boolean enableFuzzySetForDocId) {
-        this.enableFuzzySetForDocId = enableFuzzySetForDocId;
+        verifyFeatureToSetDocIdFuzzySetSetting(enabled -> this.enableFuzzySetForDocId = enabled, enableFuzzySetForDocId);
     }
 
     public double getDocIdFuzzySetFalsePositiveProbability() {
@@ -1851,6 +1855,16 @@ public double getDocIdFuzzySetFalsePositiveProbability() {
     }
 
     public void setDocIdFuzzySetFalsePositiveProbability(double docIdFuzzySetFalsePositiveProbability) {
-        this.docIdFuzzySetFalsePositiveProbability = docIdFuzzySetFalsePositiveProbability;
+        verifyFeatureToSetDocIdFuzzySetSetting(
+            fpp -> this.docIdFuzzySetFalsePositiveProbability = fpp, docIdFuzzySetFalsePositiveProbability);
+    }
+
+    private static <T> void verifyFeatureToSetDocIdFuzzySetSetting(Consumer<T> settingUpdater, T val) {
+        if (FeatureFlags.isEnabled(DOC_ID_FUZZY_SET_SETTING)) {
+            settingUpdater.accept(val);
+        } else {
+            throw new IllegalArgumentException(String.format("Fuzzy set for optimizing doc id lookup " +
+                "cannot be enabled with feature flag %s set to false", FeatureFlags.DOC_ID_FUZZY_SET));
+        }
     }
 }
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
index bb0d5e5c2168a..edff33e9dd580 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
@@ -59,7 +59,7 @@ public class BloomFilter extends AbstractFuzzySet {
 
     BloomFilter(long maxDocs, double maxFpp, CheckedSupplier<Iterator<BytesRef>, IOException> fieldIteratorProvider) throws IOException {
         int setSize = (int) Math.ceil((maxDocs * Math.log(maxFpp)) / Math.log(1 / Math.pow(2, Math.log(2))));
-        setSize = getNearestSetSize(2 * setSize);
+        setSize = getNearestSetSize(setSize < Integer.MAX_VALUE / 2 ? 2 * setSize : Integer.MAX_VALUE);
         int optimalK = (int) Math.round(((double) setSize / maxDocs) * Math.log(2));
         this.bitset = new LongArrayBackedBitSet(setSize + 1);
         this.setSize = setSize;
@@ -85,6 +85,7 @@ public void writeTo(DataOutput out) throws IOException {
     }
 
     private static int getNearestSetSize(int maxNumberOfBits) {
+        assert maxNumberOfBits > 0 : "Provided size estimate for bloom filter is illegal (<=0) : " + maxNumberOfBits;
         int result = usableBitSetSizes[0];
         for (int i = 0; i < usableBitSetSizes.length; i++) {
             if (usableBitSetSizes[i] <= maxNumberOfBits) {
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
index 73023ef3bd0fc..01f8054fc91be 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java
@@ -63,7 +63,7 @@ public final class FuzzyFilterPostingsFormat extends PostingsFormat {
      * This name is stored in headers. If changing the implementation for the format, this name/version should be updated
      * so that reads can work as expected.
      */
-    public static final String FUZZY_SET_CODEC_NAME = "FuzzySetCodec99";
+    public static final String FUZZY_FILTER_CODEC_NAME = "FuzzyFilterCodec99";
 
     public static final int VERSION_START = 0;
     public static final int VERSION_CURRENT = VERSION_START;
@@ -75,7 +75,7 @@ public final class FuzzyFilterPostingsFormat extends PostingsFormat {
     private final FuzzySetFactory fuzzySetFactory;
 
     public FuzzyFilterPostingsFormat(PostingsFormat delegatePostingsFormat, FuzzySetFactory fuzzySetFactory) {
-        super(FUZZY_SET_CODEC_NAME);
+        super(FUZZY_FILTER_CODEC_NAME);
         this.delegatePostingsFormat = delegatePostingsFormat;
         this.fuzzySetFactory = fuzzySetFactory;
     }
@@ -98,16 +98,16 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException
 
     @Override
     public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-        return new FuzzySetFieldsProducer(state);
+        return new FuzzyFilteredFieldsProducer(state);
     }
 
-    static class FuzzySetFieldsProducer extends FieldsProducer {
+    static class FuzzyFilteredFieldsProducer extends FieldsProducer {
         private FieldsProducer delegateFieldsProducer;
         HashMap<String, FuzzySet> fuzzySetsByFieldName = new HashMap<>();
         private List<Closeable> closeables = new ArrayList<>();
 
-        public FuzzySetFieldsProducer(SegmentReadState state) throws IOException {
-            String fuzzySetFileName = IndexFileNames.segmentFileName(
+        public FuzzyFilteredFieldsProducer(SegmentReadState state) throws IOException {
+            String fuzzyFilterFileName = IndexFileNames.segmentFileName(
                 state.segmentInfo.name,
                 state.segmentSuffix,
                 FUZZY_FILTER_FILE_EXTENSION
@@ -115,11 +115,12 @@ public FuzzySetFieldsProducer(SegmentReadState state) throws IOException {
             IndexInput filterIn = null;
             boolean success = false;
             try {
-                filterIn = state.directory.openInput(fuzzySetFileName, state.context);
+                // Using IndexInput directly instead of ChecksumIndexInput since we want to support RandomAccessInput
+                filterIn = state.directory.openInput(fuzzyFilterFileName, state.context);
 
                 CodecUtil.checkIndexHeader(
                     filterIn,
-                    FUZZY_SET_CODEC_NAME,
+                    FUZZY_FILTER_CODEC_NAME,
                     VERSION_START,
                     VERSION_CURRENT,
                     state.segmentInfo.getId(),
@@ -137,6 +138,8 @@ public FuzzySetFieldsProducer(SegmentReadState state) throws IOException {
                     fuzzySetsByFieldName.put(fieldInfo.name, set);
                 }
                 CodecUtil.retrieveChecksum(filterIn);
+
+                // Can we disable it if we foresee performance issues?
                 CodecUtil.checksumEntireFile(filterIn);
                 success = true;
                 closeables.add(filterIn);
@@ -154,6 +157,7 @@ public Iterator<String> iterator() {
 
         @Override
         public void close() throws IOException {
+            // Why closing here?
             IOUtils.closeWhileHandlingException(closeables);
             delegateFieldsProducer.close();
         }
@@ -451,7 +455,7 @@ public void close() throws IOException {
                 );
                 CodecUtil.writeIndexHeader(
                     fuzzyFilterFileOutput,
-                    FUZZY_SET_CODEC_NAME,
+                    FUZZY_FILTER_CODEC_NAME,
                     VERSION_CURRENT,
                     state.segmentInfo.getId(),
                     state.segmentSuffix
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
index a27f9ba7a0613..5de3156202a17 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
@@ -43,6 +43,7 @@ class LongArrayBackedBitSet implements Accountable, Closeable {
      */
     LongArrayBackedBitSet(IndexInput in) throws IOException {
         underlyingArrayLength = in.readLong();
+        // Multiplying by 8 since this
         long streamLength = underlyingArrayLength << 3;
         this.longArray = new IndexInputImmutableLongArray(underlyingArrayLength, in.randomAccessSlice(in.getFilePointer(), streamLength));
         in.skipBytes(streamLength);

From 36925d2881fe4d844c0451ff3801286245cd1eb0 Mon Sep 17 00:00:00 2001
From: mgodwan <mgodwan@amazon.com>
Date: Thu, 1 Feb 2024 17:45:08 +0530
Subject: [PATCH 4/8] Address PR comments

Signed-off-by: mgodwan <mgodwan@amazon.com>
---
 .../index/codec/fuzzy/IndexInputImmutableLongArray.java        | 2 ++
 .../opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java    | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputImmutableLongArray.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputImmutableLongArray.java
index 6c56cfcc3118b..08d6059c1e82e 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputImmutableLongArray.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputImmutableLongArray.java
@@ -40,6 +40,8 @@ public long size() {
     @Override
     public synchronized long get(long index) {
         try {
+            // Multiplying by 8 since each long is 8 bytes, and we need to get the long value at (index * 8) in the
+            // RandomAccessInput being accessed.
             return input.readLong(index << 3);
         } catch (IOException ex) {
             throw new OpenSearchException(ex);
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
index 5de3156202a17..714be24d28e00 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
@@ -43,7 +43,8 @@ class LongArrayBackedBitSet implements Accountable, Closeable {
      */
     LongArrayBackedBitSet(IndexInput in) throws IOException {
         underlyingArrayLength = in.readLong();
-        // Multiplying by 8 since this
+        // Multiplying by 8 since the length above is of the long array, so we will have
+        // 8 times the number of bytes in our stream.
         long streamLength = underlyingArrayLength << 3;
         this.longArray = new IndexInputImmutableLongArray(underlyingArrayLength, in.randomAccessSlice(in.getFilePointer(), streamLength));
         in.skipBytes(streamLength);

From 886757e9a5f48d441ba63abbd2026ce2befc3768 Mon Sep 17 00:00:00 2001
From: mgodwan <mgodwan@amazon.com>
Date: Thu, 1 Feb 2024 18:05:09 +0530
Subject: [PATCH 5/8] Address PR comments

Signed-off-by: mgodwan <mgodwan@amazon.com>
---
 .../main/java/org/opensearch/common/util/FeatureFlags.java  | 6 +-----
 .../src/main/java/org/opensearch/index/IndexSettings.java   | 4 ++--
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/server/src/main/java/org/opensearch/common/util/FeatureFlags.java b/server/src/main/java/org/opensearch/common/util/FeatureFlags.java
index 56a754209d9fd..3ee54a74c53d8 100644
--- a/server/src/main/java/org/opensearch/common/util/FeatureFlags.java
+++ b/server/src/main/java/org/opensearch/common/util/FeatureFlags.java
@@ -128,9 +128,5 @@ public static boolean isEnabled(Setting<Boolean> featureFlag) {
         Property.NodeScope
     );
 
-    public static final Setting<Boolean> DOC_ID_FUZZY_SET_SETTING = Setting.boolSetting(
-        DOC_ID_FUZZY_SET,
-        false,
-        Property.NodeScope
-    );
+    public static final Setting<Boolean> DOC_ID_FUZZY_SET_SETTING = Setting.boolSetting(DOC_ID_FUZZY_SET, false, Property.NodeScope);
 }
diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java
index 34c109de8e124..386926f683c4d 100644
--- a/server/src/main/java/org/opensearch/index/IndexSettings.java
+++ b/server/src/main/java/org/opensearch/index/IndexSettings.java
@@ -1863,8 +1863,8 @@ private static <T> void verifyFeatureToSetDocIdFuzzySetSetting(Consumer<T> setti
         if (FeatureFlags.isEnabled(DOC_ID_FUZZY_SET_SETTING)) {
             settingUpdater.accept(val);
         } else {
-            throw new IllegalArgumentException(String.format("Fuzzy set for optimizing doc id lookup " +
-                "cannot be enabled with feature flag %s set to false", FeatureFlags.DOC_ID_FUZZY_SET));
+            throw new IllegalArgumentException("Fuzzy set for optimizing doc id lookup " +
+                "cannot be enabled with feature flag [" +  FeatureFlags.DOC_ID_FUZZY_SET + "] set to false");
         }
     }
 }

From 278dfea15099f89d7a65d9008e9aab1956f7ecb1 Mon Sep 17 00:00:00 2001
From: mgodwan <mgodwan@amazon.com>
Date: Thu, 1 Feb 2024 18:26:48 +0530
Subject: [PATCH 6/8] Address PR comments

Signed-off-by: mgodwan <mgodwan@amazon.com>
---
 .../java/org/opensearch/index/IndexSettings.java     | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java
index 386926f683c4d..43a3a49418fbf 100644
--- a/server/src/main/java/org/opensearch/index/IndexSettings.java
+++ b/server/src/main/java/org/opensearch/index/IndexSettings.java
@@ -1856,15 +1856,21 @@ public double getDocIdFuzzySetFalsePositiveProbability() {
 
     public void setDocIdFuzzySetFalsePositiveProbability(double docIdFuzzySetFalsePositiveProbability) {
         verifyFeatureToSetDocIdFuzzySetSetting(
-            fpp -> this.docIdFuzzySetFalsePositiveProbability = fpp, docIdFuzzySetFalsePositiveProbability);
+            fpp -> this.docIdFuzzySetFalsePositiveProbability = fpp,
+            docIdFuzzySetFalsePositiveProbability
+        );
     }
 
     private static <T> void verifyFeatureToSetDocIdFuzzySetSetting(Consumer<T> settingUpdater, T val) {
         if (FeatureFlags.isEnabled(DOC_ID_FUZZY_SET_SETTING)) {
             settingUpdater.accept(val);
         } else {
-            throw new IllegalArgumentException("Fuzzy set for optimizing doc id lookup " +
-                "cannot be enabled with feature flag [" +  FeatureFlags.DOC_ID_FUZZY_SET + "] set to false");
+            throw new IllegalArgumentException(
+                "Fuzzy set for optimizing doc id lookup "
+                    + "cannot be enabled with feature flag ["
+                    + FeatureFlags.DOC_ID_FUZZY_SET
+                    + "] set to false"
+            );
         }
     }
 }

From b89878545ecce8d81da63b0666d8c3143ebab875 Mon Sep 17 00:00:00 2001
From: mgodwan <mgodwan@amazon.com>
Date: Thu, 1 Feb 2024 21:09:12 +0530
Subject: [PATCH 7/8] Fix tests after feature flag

Signed-off-by: mgodwan <mgodwan@amazon.com>
---
 .../java/org/opensearch/index/codec/fuzzy/BloomFilter.java     | 2 +-
 .../opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java    | 2 +-
 .../main/java/org/opensearch/test/OpenSearchIntegTestCase.java | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
index edff33e9dd580..b8a8352183ca8 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
@@ -61,7 +61,7 @@ public class BloomFilter extends AbstractFuzzySet {
         int setSize = (int) Math.ceil((maxDocs * Math.log(maxFpp)) / Math.log(1 / Math.pow(2, Math.log(2))));
         setSize = getNearestSetSize(setSize < Integer.MAX_VALUE / 2 ? 2 * setSize : Integer.MAX_VALUE);
         int optimalK = (int) Math.round(((double) setSize / maxDocs) * Math.log(2));
-        this.bitset = new LongArrayBackedBitSet(setSize + 1);
+        this.bitset = new LongArrayBackedBitSet(setSize);
         this.setSize = setSize;
         this.hashCount = optimalK;
         addAll(fieldIteratorProvider);
diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
index 714be24d28e00..bd4936aeec366 100644
--- a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
+++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java
@@ -32,7 +32,7 @@ class LongArrayBackedBitSet implements Accountable, Closeable {
      */
     LongArrayBackedBitSet(long capacity) {
         // Since the bitset is backed by a long array, we only need 1 element for every 64 bits in the underlying array.
-        underlyingArrayLength = ((capacity - 1L) >> 6) + 1;
+        underlyingArrayLength = (capacity >> 6) + 1L;
         this.longArray = BigArrays.NON_RECYCLING_INSTANCE.withCircuitBreaking().newLongArray(underlyingArrayLength);
     }
 
diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
index 9adcafc21dba3..9b00149e936b6 100644
--- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
+++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
@@ -653,6 +653,9 @@ protected Settings featureFlagSettings() {
         }
         // Enabling Telemetry setting by default
         featureSettings.put(FeatureFlags.TELEMETRY_SETTING.getKey(), true);
+
+        // Enabling fuzzy set for tests by default
+        featureSettings.put(FeatureFlags.DOC_ID_FUZZY_SET_SETTING.getKey(), true);
         return featureSettings.build();
     }
 

From fed8863e8e4867e093003959bc8e15d0c5f71296 Mon Sep 17 00:00:00 2001
From: mgodwan <mgodwan@amazon.com>
Date: Thu, 1 Feb 2024 21:16:17 +0530
Subject: [PATCH 8/8] Fix upgrade tests after feature flag

Signed-off-by: mgodwan <mgodwan@amazon.com>
---
 qa/rolling-upgrade/build.gradle | 1 +
 1 file changed, 1 insertion(+)

diff --git a/qa/rolling-upgrade/build.gradle b/qa/rolling-upgrade/build.gradle
index 3dff452be855f..777377f04e8b9 100644
--- a/qa/rolling-upgrade/build.gradle
+++ b/qa/rolling-upgrade/build.gradle
@@ -62,6 +62,7 @@ for (Version bwcVersion : BuildParams.bwcVersions.wireCompatible) {
       setting 'repositories.url.allowed_urls', 'http://snapshot.test*'
       setting 'path.repo', "${buildDir}/cluster/shared/repo/${baseName}"
       setting 'http.content_type.required', 'true'
+      systemProperty 'opensearch.experimental.optimize_doc_id_lookup.fuzzy_set.enabled', 'true'
     }
   }