From 3b18ac8877b9741fd4d96a421c822481183beaff Mon Sep 17 00:00:00 2001 From: conggguan <157357330+conggguan@users.noreply.github.com> Date: Sat, 3 Aug 2024 05:52:32 +0800 Subject: [PATCH] Init a sparse model auto-tracing workflow. (#394) * Init a sparse model auto-tracing workflow. Signed-off-by: conggguan * Change the minimum approvals of the sparse model uploader to 2. Add some test cases. Remove some redundant lines. Signed-off-by: conggguan * Fix some test cases. Signed-off-by: conggguan * Remove the temp test Jupyter notebook. Signed-off-by: conggguan * Change the inner model's variable name, and optimize the license verification. Signed-off-by: conggguan * Address some comments, and run nox format. Signed-off-by: conggguan * Fix a bug in NeuralSparseModel's init, and remove a redundant save_pretrained. Signed-off-by: conggguan * [Fix] Deleted some redundant code that caused a failing test case; fixed it. Signed-off-by: conggguan * [Style] Run nox -s format to keep formatting consistent. Signed-off-by: conggguan * [Fix] Simplify the SparseEncodingModel and fix a bug in multi-text embeddings. Signed-off-by: conggguan * [Fix] Make register_and_deploy_sparse_encoding_model return a proper list rather than a single map. Signed-off-by: conggguan * [Fix] Fix a bug in register_and_deploy_sparse_encoding_model; it now generates the correct list of embeddings for the input texts. Signed-off-by: conggguan * [Fix] Fix the sparse encoding model's test_check_required_fields test case. Signed-off-by: conggguan * [Fix] Renamed an improperly named variable. Signed-off-by: conggguan * [Refactor] Add some comments and extract some constants to a new file. Signed-off-by: conggguan * [Refactor] Simplify and reuse some code from model auto-tracing. Signed-off-by: conggguan * [Refactor] Simplify and reuse some code from model auto-tracing. Signed-off-by: conggguan * [Refactor] Add function comments and merge the sparse and dense model trace workflows. Signed-off-by: conggguan * [Refactor] Merge the sparse and dense models' CI branches. Signed-off-by: conggguan * [Refactor] Switch to a more common API, and add a line of comments.
Signed-off-by: conggguan --------- Signed-off-by: conggguan --- .ci/run-repository.sh | 20 +- .github/CODEOWNERS | 2 +- .github/workflows/model_uploader.yml | 29 +- CHANGELOG.md | 2 +- noxfile.py | 17 + .../ml_commons/ml_common_utils.py | 8 +- .../ml_commons/ml_commons_client.py | 18 + opensearch_py_ml/ml_commons/model_uploader.py | 9 +- opensearch_py_ml/ml_models/__init__.py | 3 +- opensearch_py_ml/ml_models/base_models.py | 117 ++++++ .../ml_models/sentencetransformermodel.py | 62 +--- .../ml_models/sparse_encoding_model.py | 334 +++++++++++++++++ .../test_sparseencondingmodel_pytest.py | 270 ++++++++++++++ utils/model_uploader/autotracing_utils.py | 304 ++++++++++++++++ utils/model_uploader/model_autotracing.py | 222 +++--------- .../sparse_model_autotracing.py | 339 ++++++++++++++++++ 16 files changed, 1503 insertions(+), 253 deletions(-) create mode 100644 opensearch_py_ml/ml_models/base_models.py create mode 100644 opensearch_py_ml/ml_models/sparse_encoding_model.py create mode 100644 tests/ml_models/test_sparseencondingmodel_pytest.py create mode 100644 utils/model_uploader/autotracing_utils.py create mode 100644 utils/model_uploader/sparse_model_autotracing.py diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index bd97c17b6..f94e3e439 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -65,7 +65,7 @@ elif [[ "$TASK_TYPE" == "doc" ]]; then docker cp opensearch-py-ml-doc-runner:/code/opensearch-py-ml/docs/build/ ./docs/ docker rm opensearch-py-ml-doc-runner -elif [[ "$TASK_TYPE" == "trace" ]]; then +elif [[ "$TASK_TYPE" == "SentenceTransformerTrace" || "$TASK_TYPE" == "SparseTrace" ]]; then # Set up OpenSearch cluster & Run model autotracing (Invoked by model_uploader.yml workflow) echo -e "\033[34;1mINFO:\033[0m MODEL_ID: ${MODEL_ID}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_VERSION: ${MODEL_VERSION}\033[0m" @@ -74,6 +74,17 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m" + if [[ "$TASK_TYPE" == "SentenceTransformerTrace" ]]; then + NOX_TRACE_TYPE="trace" + EXTRA_ARGS="-ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE}" + elif [[ "$TASK_TYPE" == "SparseTrace" ]]; then + NOX_TRACE_TYPE="sparsetrace" + EXTRA_ARGS="" + else + echo "Unknown TASK_TYPE: $TASK_TYPE" + exit 1 + fi + docker run \ --network=${network_name} \ --env "STACK_VERSION=${STACK_VERSION}" \ @@ -84,9 +95,12 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then --env "TEST_TYPE=server" \ --name opensearch-py-ml-trace-runner \ opensearch-project/opensearch-py-ml \ - nox -s "trace-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} - + nox -s "${NOX_TRACE_TYPE}-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} ${EXTRA_ARGS} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} + + # To upload a model, we need the model artifact, description, and license files in a local path. + # trace_output should include the description and license files.
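+ # Illustrative sketch (assumed values, not part of this change): with PYTHON_VERSION=3.9,
+ # a SparseTrace run resolves the nox invocation above to roughly:
+ #   nox -s "sparsetrace-3.9" -- opensearch-project/opensearch-neural-sparse-encoding-v1 1.0.1 TORCH_SCRIPT
+ # (the model id and version here are hypothetical); a SentenceTransformerTrace run additionally passes the -ed/-pm flags via EXTRA_ARGS.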
docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/ docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/ + # Remove the docker container docker rm opensearch-py-ml-trace-runner fi diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e04a41ea8..fc5796e95 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @dhrubo-os @greaa-aws @ylwu-amzn @b4sjoo @jngz-es @rbhavna \ No newline at end of file +* @dhrubo-os @greaa-aws @ylwu-amzn @b4sjoo @jngz-es @rbhavna \ No newline at end of file diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 68d75c90f..1c7362b26 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -17,13 +17,21 @@ on: required: true type: string tracing_format: - description: "Model format for auto-tracing (torch_script/onnx)" + description: "Model format for auto-tracing (torch_script/onnx); note that sparse models currently support only the torch_script format." required: true type: choice options: - "BOTH" - "TORCH_SCRIPT" - "ONNX" + model_type: + description: "Model type for auto-tracing (SentenceTransformer/Sparse)" + required: true + type: choice + options: + - "SentenceTransformer" + - "Sparse" + default: "SentenceTransformer" embedding_dimension: description: "(Optional) Embedding Dimension (Specify here if it does not exist in original config.json file, or you want to overwrite it.)" required: false @@ -66,14 +74,14 @@ jobs: run: | model_id=${{ github.event.inputs.model_id }} echo "model_folder=ml-models/${{github.event.inputs.model_source}}/${model_id}" >> $GITHUB_OUTPUT - echo "sentence_transformer_folder=ml-models/${{github.event.inputs.model_source}}/${model_id%%/*}/" >> $GITHUB_OUTPUT + echo "model_prefix_folder=ml-models/${{github.event.inputs.model_source}}/${model_id%%/*}/" >> $GITHUB_OUTPUT - name: Initiate workflow_info id: init_workflow_info run: | embedding_dimension=${{ github.event.inputs.embedding_dimension }} pooling_mode=${{ github.event.inputs.pooling_mode }} model_description="${{ github.event.inputs.model_description }}" - + model_type=${{ github.event.inputs.model_type }} workflow_info=" ============= Workflow Details ============== - Workflow Name: ${{ github.workflow }} @@ -84,6 +92,7 @@ jobs: ========= Workflow Input Information ========= - Model ID: ${{ github.event.inputs.model_id }} - Model Version: ${{ github.event.inputs.model_version }} + - Model Type: ${{ github.event.inputs.model_type }} - Tracing Format: ${{ github.event.inputs.tracing_format }} - Embedding Dimension: ${embedding_dimension:-N/A} - Pooling Mode: ${pooling_mode:-N/A} @@ -103,7 +112,7 @@ jobs: echo "unverified=- [ ] :warning: The license cannot be verified.
Please confirm for yourself that the model is licensed under Apache 2.0 :warning:" >> $GITHUB_OUTPUT outputs: model_folder: ${{ steps.init_folders.outputs.model_folder }} - sentence_transformer_folder: ${{ steps.init_folders.outputs.sentence_transformer_folder }} + model_prefix_folder: ${{ steps.init_folders.outputs.model_prefix_folder }} workflow_info: ${{ steps.init_workflow_info.outputs.workflow_info }} verified_license_line: ${{ steps.init_license_line.outputs.verified }} unverified_license_line: ${{ steps.init_license_line.outputs.unverified }} @@ -133,7 +142,7 @@ jobs: if: github.event.inputs.allow_overwrite == 'NO' && (github.event.inputs.tracing_format == 'TORCH_SCRIPT' || github.event.inputs.tracing_format == 'BOTH') run: | TORCH_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \ - ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} ${{ github.event.inputs.model_id }} \ + ${{ needs.init-workflow-var.outputs.model_prefix_folder }} ${{ github.event.inputs.model_id }} \ ${{ github.event.inputs.model_version }} TORCH_SCRIPT) aws s3api head-object --bucket ${{ secrets.MODEL_BUCKET }} --key $TORCH_FILE_PATH > /dev/null 2>&1 || TORCH_MODEL_NOT_EXIST=true if [[ -z $TORCH_MODEL_NOT_EXIST ]] @@ -145,7 +154,7 @@ if: github.event.inputs.allow_overwrite == 'NO' && (github.event.inputs.tracing_format == 'ONNX' || github.event.inputs.tracing_format == 'BOTH') run: | ONNX_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \ - ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} ${{ github.event.inputs.model_id }} \ + ${{ needs.init-workflow-var.outputs.model_prefix_folder }} ${{ github.event.inputs.model_id }} \ ${{ github.event.inputs.model_version }} ONNX) aws s3api head-object --bucket ${{ secrets.MODEL_BUCKET }} --key $ONNX_FILE_PATH > /dev/null 2>&1 || ONNX_MODEL_NOT_EXIST=true if [[ -z $ONNX_MODEL_NOT_EXIST ]] @@ -168,7 +177,7 @@ cluster: ["opensearch"] secured: ["true"] entry: - - { opensearch_version: 2.7.0 } + - { opensearch_version: 2.11.0 } steps: - name: Checkout uses: actions/checkout@v3 @@ -181,7 +190,7 @@ echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} - run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace" + run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} ${{github.event.inputs.model_type}}Trace" - name: Limit Model Size to 2GB run: | upload_size_in_binary_bytes=$(ls -lR ./upload/ | awk '{ SUM += $5} END {print SUM}') @@ -226,7 +235,7 @@ - name: Dryrun model uploading id: dryrun_model_uploading run: | - dryrun_output=$(aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} --dryrun \ + dryrun_output=$(aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.model_prefix_folder }} --dryrun \ | sed 's|s3://${{ secrets.MODEL_BUCKET }}/|s3://(MODEL_BUCKET)/|' ) echo "dryrun_output<<EOF" >> $GITHUB_OUTPUT @@ -301,7 +310,7 @@ - name: Copy Files to the Bucket id: copying_to_bucket run: | - aws s3 sync ./upload/ s3://${{
secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.model_prefix_folder }} echo "upload_time=$(TZ='America/Los_Angeles' date "+%Y-%m-%d %T")" >> $GITHUB_OUTPUT outputs: upload_time: ${{ steps.copying_to_bucket.outputs.upload_time }} diff --git a/CHANGELOG.md b/CHANGELOG.md index adac4dbf7..f81687625 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Add support for model profiles by @rawwar in ([#358](https://github.com/opensearch-project/opensearch-py-ml/pull/358)) - Support for security default admin credential changes in 2.12.0 in ([#365](https://github.com/opensearch-project/opensearch-py-ml/pull/365)) - adding cross encoder models in the pre-trained traced list ([#378](https://github.com/opensearch-project/opensearch-py-ml/pull/378)) - +- Add workflows and scripts for sparse encoding model tracing and uploading process by @conggguan in ([#394](https://github.com/opensearch-project/opensearch-py-ml/pull/394)) ### Changed - Modify ml-models.JenkinsFile so that it takes model format into account and can be triggered with generic webhook by @thanawan-atc in ([#211](https://github.com/opensearch-project/opensearch-py-ml/pull/211)) diff --git a/noxfile.py b/noxfile.py index c7da4f32e..0ae8ec697 100644 --- a/noxfile.py +++ b/noxfile.py @@ -166,3 +166,20 @@ def trace(session): "utils/model_uploader/model_autotracing.py", *(session.posargs), ) + + +@nox.session(python=["3.9"]) +def sparsetrace(session): + session.install( + "-r", + "requirements-dev.txt", + "--timeout", + "1500", + ) + session.install(".") + + session.run( + "python", + "utils/model_uploader/sparse_model_autotracing.py", + *(session.posargs), + ) diff --git a/opensearch_py_ml/ml_commons/ml_common_utils.py b/opensearch_py_ml/ml_commons/ml_common_utils.py index 8ca5bab23..4af11a198 100644 --- a/opensearch_py_ml/ml_commons/ml_common_utils.py +++ b/opensearch_py_ml/ml_commons/ml_common_utils.py @@ -11,7 +11,7 @@ MODEL_CHUNK_MAX_SIZE = 10_000_000 MODEL_MAX_SIZE = 4_000_000_000 BUF_SIZE = 65536 # lets read stuff in 64kb chunks! -TIMEOUT = 120 # timeout for synchronous method calls in seconds +TIMEOUT = 240 # timeout for synchronous method calls in seconds META_API_ENDPOINT = "models/meta" MODEL_NAME_FIELD = "name" MODEL_VERSION_FIELD = "version" @@ -24,6 +24,12 @@ FRAMEWORK_TYPE = "framework_type" MODEL_CONTENT_HASH_VALUE = "model_content_hash_value" MODEL_GROUP_ID = "model_group_id" +MODEL_FUNCTION_NAME = "function_name" +MODEL_TASK_TYPE = "model_task_type" +# URL of the license file for the OpenSearch project +LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE" +# Name of the function used for sparse encoding +SPARSE_ENCODING_FUNCTION_NAME = "SPARSE_ENCODING" def _generate_model_content_hash_value(model_file_path: str) -> str: diff --git a/opensearch_py_ml/ml_commons/ml_commons_client.py b/opensearch_py_ml/ml_commons/ml_commons_client.py index 2f387ad11..5509eaa71 100644 --- a/opensearch_py_ml/ml_commons/ml_commons_client.py +++ b/opensearch_py_ml/ml_commons/ml_commons_client.py @@ -498,6 +498,24 @@ def get_model_info(self, model_id: str) -> object: url=API_URL, ) + def generate_model_inference(self, model_id: str, request_body: dict) -> object: + """ + Generates inference result for the given input using the specified request body. + + :param model_id: Unique ID of the model. + :type model_id: string + :param request_body: Request body to send to the API. 
+ :type request_body: dict + :return: Returns a JSON object `inference_results` containing the results for the given input. + :rtype: object + """ + API_URL = f"{ML_BASE_URI}/models/{model_id}/_predict/" + return self._client.transport.perform_request( + method="POST", + url=API_URL, + body=request_body, + ) + def generate_embedding(self, model_id: str, sentences: List[str]) -> object: """ This method return embedding for given sentences (using ml commons _predict api) diff --git a/opensearch_py_ml/ml_commons/model_uploader.py b/opensearch_py_ml/ml_commons/model_uploader.py index 850f6a80a..53cad82d0 100644 --- a/opensearch_py_ml/ml_commons/model_uploader.py +++ b/opensearch_py_ml/ml_commons/model_uploader.py @@ -22,9 +22,11 @@ MODEL_CONTENT_HASH_VALUE, MODEL_CONTENT_SIZE_IN_BYTES_FIELD, MODEL_FORMAT_FIELD, + MODEL_FUNCTION_NAME, MODEL_GROUP_ID, MODEL_MAX_SIZE, MODEL_NAME_FIELD, + MODEL_TASK_TYPE, MODEL_TYPE, MODEL_VERSION_FIELD, TOTAL_CHUNKS_FIELD, @@ -167,6 +169,7 @@ def _check_mandatory_field(self, model_meta: dict) -> bool: """ if model_meta: + if not model_meta.get(MODEL_NAME_FIELD): raise ValueError(f"{MODEL_NAME_FIELD} can not be empty") if not model_meta.get(MODEL_VERSION_FIELD): @@ -178,7 +181,11 @@ def _check_mandatory_field(self, model_meta: dict) -> bool: if not model_meta.get(TOTAL_CHUNKS_FIELD): raise ValueError(f"{TOTAL_CHUNKS_FIELD} can not be empty") if not model_meta.get(MODEL_CONFIG_FIELD): - raise ValueError(f"{MODEL_CONFIG_FIELD} can not be empty") + if ( + model_meta.get(MODEL_FUNCTION_NAME) != "SPARSE_ENCODING" + and model_meta.get(MODEL_TASK_TYPE) != "SPARSE_ENCODING" + ): + raise ValueError(f"{MODEL_CONFIG_FIELD} can not be empty") else: if not isinstance(model_meta.get(MODEL_CONFIG_FIELD), dict): raise TypeError( diff --git a/opensearch_py_ml/ml_models/__init__.py b/opensearch_py_ml/ml_models/__init__.py index 3ec96ebd5..ccc204dae 100644 --- a/opensearch_py_ml/ml_models/__init__.py +++ b/opensearch_py_ml/ml_models/__init__.py @@ -7,5 +7,6 @@ from .metrics_correlation.mcorr import MCorr from .sentencetransformermodel import SentenceTransformerModel +from .sparse_encoding_model import SparseEncodingModel -__all__ = ["SentenceTransformerModel", "MCorr"] +__all__ = ["SentenceTransformerModel", "MCorr", "SparseEncodingModel"] diff --git a/opensearch_py_ml/ml_models/base_models.py b/opensearch_py_ml/ml_models/base_models.py new file mode 100644 index 000000000..36d3ca397 --- /dev/null +++ b/opensearch_py_ml/ml_models/base_models.py @@ -0,0 +1,117 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. +import json +import os +from abc import ABC, abstractmethod +from zipfile import ZipFile + +import requests + +from opensearch_py_ml.ml_commons.ml_common_utils import ( + LICENSE_URL, + SPARSE_ENCODING_FUNCTION_NAME, +) + + +class BaseUploadModel(ABC): + """ + A base class for uploading models to OpenSearch pretrained model hub. 
+ """ + + def __init__( + self, model_id: str, folder_path: str = None, overwrite: bool = False + ) -> None: + self.model_id = model_id + self.folder_path = folder_path + self.overwrite = overwrite + + @abstractmethod + def save_as_pt(self, *args, **kwargs): + pass + + @abstractmethod + def save_as_onnx(self, *args, **kwargs): + pass + + @abstractmethod + def make_model_config_json( + self, + version_number: str, + model_format: str, + description: str, + ) -> str: + pass + + def _fill_null_truncation_field( + self, + save_json_folder_path: str, + max_length: int, + ) -> None: + """ + Fill truncation field in tokenizer.json when it is null + + :param save_json_folder_path: + path to save model json file, e.g, "home/save_pre_trained_model_json/") + :type save_json_folder_path: string + :param max_length: + maximum sequence length for model + :type max_length: int + :return: no return value expected + :rtype: None + """ + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + with open(tokenizer_file_path) as user_file: + parsed_json = json.load(user_file) + if "truncation" not in parsed_json or parsed_json["truncation"] is None: + parsed_json["truncation"] = { + "direction": "Right", + "max_length": max_length, + "strategy": "LongestFirst", + "stride": 0, + } + with open(tokenizer_file_path, "w") as file: + json.dump(parsed_json, file, indent=2) + + def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str): + """ + Add Apache-2.0 license file to the model zip file at model_zip_file_path + + :param model_zip_file_path: + Path to the model zip file + :type model_zip_file_path: string + :return: no return value expected + :rtype: None + """ + r = requests.get(LICENSE_URL) + assert r.status_code == 200, "Failed to add license file to the model zip file" + + with ZipFile(str(model_zip_file_path), "a") as zipObj: + zipObj.writestr("LICENSE", r.content) + + +class SparseModel(BaseUploadModel, ABC): + """ + Class for autotracing the Sparse Encoding model. + """ + + def __init__( + self, + model_id: str, + folder_path: str = "./model_files/", + overwrite: bool = False, + ): + super().__init__(model_id, folder_path, overwrite) + self.model_id = model_id + self.folder_path = folder_path + self.overwrite = overwrite + self.function_name = SPARSE_ENCODING_FUNCTION_NAME + + def pre_process(self): + pass + + def post_process(self): + pass diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 10f1174b5..0fad0dfc6 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -21,7 +21,6 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -import requests import torch import yaml from accelerate import Accelerator, notebook_launcher @@ -37,10 +36,10 @@ _generate_model_content_hash_value, ) -LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE" +from .base_models import BaseUploadModel -class SentenceTransformerModel: +class SentenceTransformerModel(BaseUploadModel): """ Class for training, exporting and configuring the SentenceTransformers model. 
""" @@ -73,6 +72,7 @@ def __init__( :return: no return value expected :rtype: None """ + super().__init__(model_id, folder_path, overwrite) default_folder_path = os.path.join( os.getcwd(), "sentence_transformer_model_files" ) @@ -641,22 +641,6 @@ def train_model( print("Model saved to path: " + self.folder_path + "\n") return traced_cpu - def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str): - """ - Add Apache-2.0 license file to the model zip file at model_zip_file_path - - :param model_zip_file_path: - Path to the model zip file - :type model_zip_file_path: string - :return: no return value expected - :rtype: None - """ - r = requests.get(LICENSE_URL) - assert r.status_code == 200, "Failed to add license file to the model zip file" - - with ZipFile(str(model_zip_file_path), "a") as zipObj: - zipObj.writestr("LICENSE", r.content) - def zip_model( self, model_path: str = None, @@ -728,40 +712,10 @@ def zip_model( arcname="tokenizer.json", ) if add_apache_license: - self._add_apache_license_to_model_zip_file(zip_file_path) + super()._add_apache_license_to_model_zip_file(zip_file_path) print("zip file is saved to " + zip_file_path + "\n") - def _fill_null_truncation_field( - self, - save_json_folder_path: str, - max_length: int, - ) -> None: - """ - Fill truncation field in tokenizer.json when it is null - - :param save_json_folder_path: - path to save model json file, e.g, "home/save_pre_trained_model_json/") - :type save_json_folder_path: string - :param max_length: - maximum sequence length for model - :type max_length: int - :return: no return value expected - :rtype: None - """ - tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") - with open(tokenizer_file_path) as user_file: - parsed_json = json.load(user_file) - if "truncation" not in parsed_json or parsed_json["truncation"] is None: - parsed_json["truncation"] = { - "direction": "Right", - "max_length": max_length, - "strategy": "LongestFirst", - "stride": 0, - } - with open(tokenizer_file_path, "w") as file: - json.dump(parsed_json, file, indent=2) - def save_as_pt( self, sentences: [str], @@ -833,7 +787,7 @@ def save_as_pt( # save tokenizer.json in save_json_folder_name model.save(save_json_folder_path) - self._fill_null_truncation_field( + super()._fill_null_truncation_field( save_json_folder_path, model.tokenizer.model_max_length ) @@ -869,7 +823,7 @@ def save_as_pt( arcname="tokenizer.json", ) if add_apache_license: - self._add_apache_license_to_model_zip_file(zip_file_path) + super()._add_apache_license_to_model_zip_file(zip_file_path) self.torch_script_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") @@ -943,7 +897,7 @@ def save_as_onnx( # save tokenizer.json in output_path model.save(save_json_folder_path) - self._fill_null_truncation_field( + super()._fill_null_truncation_field( save_json_folder_path, model.tokenizer.model_max_length ) @@ -967,7 +921,7 @@ def save_as_onnx( arcname="tokenizer.json", ) if add_apache_license: - self._add_apache_license_to_model_zip_file(zip_file_path) + super()._add_apache_license_to_model_zip_file(zip_file_path) self.onnx_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") diff --git a/opensearch_py_ml/ml_models/sparse_encoding_model.py b/opensearch_py_ml/ml_models/sparse_encoding_model.py new file mode 100644 index 000000000..747862759 --- /dev/null +++ b/opensearch_py_ml/ml_models/sparse_encoding_model.py @@ -0,0 +1,334 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch 
Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. +import json +import os +from zipfile import ZipFile + +import torch +from transformers import AutoModelForMaskedLM, AutoTokenizer + +from opensearch_py_ml.ml_commons.ml_common_utils import ( + SPARSE_ENCODING_FUNCTION_NAME, + _generate_model_content_hash_value, +) +from opensearch_py_ml.ml_models.base_models import SparseModel + + +def _generate_default_model_description() -> str: + """ + Generate default model description + + :return: Description of the model + :rtype: string + """ + print( + "Using default description (you can overwrite this by specifying the description parameter in " + "the make_model_config_json function)" + ) + description = "This is a neural sparse model: It maps sentences & paragraphs to sparse vector space." + return description + + +class SparseEncodingModel(SparseModel): + """ + Class for exporting and configuring a neural sparse encoding model. + """ + + DEFAULT_MODEL_ID = "opensearch-project/opensearch-neural-sparse-encoding-v1" + + def __init__( + self, + model_id: str = DEFAULT_MODEL_ID, + folder_path: str = None, + overwrite: bool = False, + ) -> None: + + super().__init__(model_id, folder_path, overwrite) + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + self.backbone_model = AutoModelForMaskedLM.from_pretrained(model_id) + default_folder_path = os.path.join( + os.getcwd(), "opensearch_neural_sparse_model_files" + ) + if folder_path is None: + self.folder_path = default_folder_path + else: + self.folder_path = folder_path + + if os.path.exists(self.folder_path) and not overwrite: + print( + "To prevent overwriting, please enter a different folder path, delete the folder, or enable " + "overwrite = True " + ) + raise Exception( + str("The default folder path already exists at : " + self.folder_path) + ) + self.model_id = model_id + self.torch_script_zip_file_path = None + self.onnx_zip_file_path = None + + def save_as_pt( + self, + sentences: [str], + model_id=DEFAULT_MODEL_ID, + model_name: str = None, + save_json_folder_path: str = None, + model_output_path: str = None, + zip_file_name: str = None, + add_apache_license: bool = True, + ) -> str: + """ + Download the sparse encoding model directly from Hugging Face, convert it to TorchScript format, and + zip the model file with its tokenizer.json file to prepare for upload to the OpenSearch cluster + + :param sentences: + Required, for example sentences = ['today is sunny'] + :type sentences: List of string [str] + :param model_id: + model ID of the sparse encoding model to download. + default model_id = "opensearch-project/opensearch-neural-sparse-encoding-v1" + :type model_id: string + :param model_name: + Optional, model name to name the model file, e.g., "sample_model.pt". If None, default takes the + model_id and adds the extension ".pt" + :type model_name: string + :param save_json_folder_path: + Optional, path to save the model json file, e.g., "home/save_pre_trained_model_json/". If None, defaults to + default_folder_path from the constructor + :type save_json_folder_path: string + :param model_output_path: + Optional, path to save the traced model zip file. If None, defaults to + default_folder_path from the constructor + :type model_output_path: string + :param zip_file_name: + Optional, file name for the zip file, e.g., "sample_model.zip".
If None, default takes the model_id + and adds the extension ".zip" + :type zip_file_name: string + :param add_apache_license: + Optional, whether to add the Apache-2.0 license file to the model zip file + :type add_apache_license: bool + + :return: model zip file path. The file path where the zip file is being saved + :rtype: string + """ + if model_name is None: + model_name = str(model_id.split("/")[-1] + ".pt") + + model_path = os.path.join(self.folder_path, model_name) + + if save_json_folder_path is None: + save_json_folder_path = self.folder_path + + if model_output_path is None: + model_output_path = self.folder_path + + if zip_file_name is None: + zip_file_name = str(model_id.split("/")[-1] + ".zip") + zip_file_path = os.path.join(model_output_path, zip_file_name) + + model = NeuralSparseModel(self.backbone_model, self.tokenizer) + + # save tokenizer.json in save_json_folder_name + self.tokenizer.save_pretrained(save_json_folder_path) + + super()._fill_null_truncation_field( + save_json_folder_path, self.tokenizer.model_max_length + ) + + # conversion to the .pt format needs to run on cpu: + # set the device to cpu, create input_ids and attention_mask on cpu, and save as .pt format + device = torch.device("cpu") + cpu_model = model.to(device) + + features = self.tokenizer( + sentences, + add_special_tokens=True, + padding=True, + truncation=True, + max_length=self.tokenizer.model_max_length, + return_attention_mask=True, + return_token_type_ids=False, + return_tensors="pt", + ).to(device) + + compiled_model = torch.jit.trace(cpu_model, dict(features), strict=False) + torch.jit.save(compiled_model, model_path) + print("model file is saved to ", model_path) + + # zip the model file along with tokenizer.json (and the license file) as output + with ZipFile(str(zip_file_path), "w") as zipObj: + zipObj.write( + model_path, + arcname=str(model_name), + ) + zipObj.write( + os.path.join(save_json_folder_path, "tokenizer.json"), + arcname="tokenizer.json", + ) + if add_apache_license: + super()._add_apache_license_to_model_zip_file(zip_file_path) + + self.torch_script_zip_file_path = zip_file_path + print("zip file is saved to ", zip_file_path, "\n") + return zip_file_path + + def save_as_onnx(self, *args, **kwargs): + raise NotImplementedError + + def make_model_config_json( + self, + model_name: str = None, + version_number: str = "1.0.0", + model_format: str = "TORCH_SCRIPT", + description: str = None, + model_zip_file_path: str = None, + ) -> str: + folder_path = self.folder_path + if model_name is None: + model_name = self.model_id + + model_config_content = { + "name": model_name, + "version": version_number, + "model_format": model_format, + "function_name": SPARSE_ENCODING_FUNCTION_NAME, + } + if model_zip_file_path is None: + model_zip_file_path = ( + self.torch_script_zip_file_path + if model_format == "TORCH_SCRIPT" + else self.onnx_zip_file_path + ) + if model_zip_file_path is None: + raise Exception( + "The model configuration JSON file currently lacks the 'model_content_size_in_bytes' and " + "'model_content_hash_value' fields. You can include these fields by specifying the " + "'model_zip_file_path' parameter. Failure to do so may result in the model registration process " + "encountering issues."
+ ) + else: + model_config_content["model_content_size_in_bytes"] = os.stat( + model_zip_file_path + ).st_size + model_config_content["model_content_hash_value"] = ( + _generate_model_content_hash_value(model_zip_file_path) + ) + if description is not None: + model_config_content["description"] = description + + model_config_file_path = os.path.join( + folder_path, "ml-commons_model_config.json" + ) + os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True) + with open(model_config_file_path, "w") as file: + json.dump(model_config_content, file, indent=4) + print( + "ml-commons_model_config.json file is saved at : ", model_config_file_path + ) + return model_config_file_path + + def get_backbone_model(self): + if self.backbone_model is not None: + return self.backbone_model + else: + return AutoModelForMaskedLM.from_pretrained(self.model_id) + + def get_model(self): + return NeuralSparseModel(self.get_backbone_model(), self.get_tokenizer()) + + def save(self, path): + backbone_model = self.get_backbone_model() + tokenizer = self.get_tokenizer() + backbone_model.save_pretrained(path) + tokenizer.save_pretrained(path) + + def post_process(self): + pass + + def pre_process(self): + pass + + def get_tokenizer(self): + if self.tokenizer is not None: + return self.tokenizer + else: + return AutoTokenizer.from_pretrained(self.model_id) + + def process_sparse_encoding(self, queries): + return self.get_model().process_sparse_encoding(queries) + + def init_tokenizer(self, model_id=None): + if model_id is None: + model_id = self.model_id + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + + +INPUT_ID_KEY = "input_ids" +ATTENTION_MASK_KEY = "attention_mask" +OUTPUT_KEY = "output" + + +class NeuralSparseModel(torch.nn.Module): + """ + A PyTorch module for transforming input text to sparse vector representation using a pre-trained internal BERT model. + This class encapsulates the BERT model and provides methods to process text queries into sparse vectors, + which are easier to handle in sparse data scenarios such as information retrieval. 
+ """ + + def __init__(self, backbone_model, tokenizer=None): + super().__init__() + self.backbone_model = backbone_model + if tokenizer is not None: + self.tokenizer = tokenizer + self.special_token_ids = [ + tokenizer.vocab[token] + for token in tokenizer.special_tokens_map.values() + ] + self.id_to_token = ["" for _ in range(len(tokenizer.vocab))] + for token, idx in tokenizer.vocab.items(): + self.id_to_token[idx] = token + + def forward(self, input: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + result = self.backbone_model( + input_ids=input[INPUT_ID_KEY], attention_mask=input[ATTENTION_MASK_KEY] + )[0] + values, _ = torch.max(result * input[ATTENTION_MASK_KEY].unsqueeze(-1), dim=1) + values = torch.log(1 + torch.relu(values)) + values[:, self.special_token_ids] = 0 + return {OUTPUT_KEY: values} + + def get_sparse_vector(self, feature): + output = self.forward(feature) + values = output[OUTPUT_KEY] + return values + + def transform_sparse_vector_to_dict(self, sparse_vector): + all_sparse_dicts = [] + for vector in sparse_vector: + tokens = [ + self.id_to_token[i] + for i in torch.nonzero(vector, as_tuple=True)[0].tolist() + ] + sparse_dict = { + token: weight.item() + for token, weight in zip( + tokens, vector[torch.nonzero(vector, as_tuple=True)] + ) + } + all_sparse_dicts.append(sparse_dict) + return all_sparse_dicts + + def process_sparse_encoding(self, queries): + features = self.tokenizer( + queries, + padding=True, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + ) + sparse_vector = self.get_sparse_vector(features) + sparse_dict = self.transform_sparse_vector_to_dict(sparse_vector) + return sparse_dict diff --git a/tests/ml_models/test_sparseencondingmodel_pytest.py b/tests/ml_models/test_sparseencondingmodel_pytest.py new file mode 100644 index 000000000..328331bba --- /dev/null +++ b/tests/ml_models/test_sparseencondingmodel_pytest.py @@ -0,0 +1,270 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. 
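For reference, NeuralSparseModel above derives term weights by max-pooling the masked-LM logits over the sequence dimension, applying log(1 + relu(x)), and zeroing out special tokens. A minimal usage sketch of the wrapper (it assumes the patch's default model id can be downloaded from Hugging Face; the printed weights are illustrative, not real output):

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from opensearch_py_ml.ml_models.sparse_encoding_model import NeuralSparseModel

model_id = "opensearch-project/opensearch-neural-sparse-encoding-v1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
backbone = AutoModelForMaskedLM.from_pretrained(model_id)

# Wrap the masked-LM backbone; the wrapper maps each query to a {token: weight} dict.
sparse_model = NeuralSparseModel(backbone, tokenizer)
with torch.no_grad():
    encodings = sparse_model.process_sparse_encoding(["today is sunny"])
print(encodings[0])  # e.g. {"sunny": 2.3, "today": 1.8, ...} (illustrative values)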
+ +import json +import os +import shutil +from zipfile import ZipFile + +import pytest + +from opensearch_py_ml.ml_models import SparseEncodingModel + +TEST_FOLDER = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "test_model_files" +) +TESTDATA_FILENAME = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip.zip" +) +TESTDATA_UNZIP_FOLDER = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip" +) + + +def clean_test_folder(TEST_FOLDER): + if os.path.exists(TEST_FOLDER): + for files in os.listdir(TEST_FOLDER): + sub_path = os.path.join(TEST_FOLDER, files) + if os.path.isfile(sub_path): + os.remove(sub_path) + else: + try: + shutil.rmtree(sub_path) + except OSError as err: + print( + "Failed to delete files, please delete all files in " + + str(TEST_FOLDER) + + " " + + str(err) + ) + + shutil.rmtree(TEST_FOLDER) + + +def compare_model_config( + model_config_path, model_id, model_format, expected_model_description=None +): + try: + with open(model_config_path) as json_file: + model_config_data = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "name" in model_config_data and model_config_data["name"] == model_id + ), f"Missing or Wrong model name in {model_format} model config file" + + assert ( + "model_format" in model_config_data + and model_config_data["model_format"] == model_format + ), f"Missing or Wrong model_format in {model_format} model config file" + + assert ( + "function_name" in model_config_data + and model_config_data["function_name"] == "SPARSE_ENCODING" + ), f"Missing or Wrong function_name in {model_format} model config file" + + if expected_model_description is not None: + assert ( + "description" in model_config_data + and model_config_data["description"] == expected_model_description + ), f"Missing or Wrong model description in {model_format} model config file" + assert ( + "model_content_size_in_bytes" in model_config_data + ), f"Missing 'model_content_size_in_bytes' in {model_format} model config file" + + assert ( + "model_content_hash_value" in model_config_data + ), f"Missing 'model_content_hash_value' in {model_format} model config file" + + +def compare_model_zip_file(zip_file_path, expected_filenames, model_format): + with ZipFile(zip_file_path, "r") as f: + filenames = set(f.namelist()) + assert ( + filenames == expected_filenames + ), f"The content in the {model_format} model zip file does not match the expected content: {filenames} != {expected_filenames}" + + +clean_test_folder(TEST_FOLDER) +# test model with a default model id opensearch-project/opensearch-neural-sparse-encoding-v1 +test_model = SparseEncodingModel(folder_path=TEST_FOLDER) + + +def test_check_attribute(): + try: + check_attribute = getattr(test_model, "model_id", "folder_path") + except AttributeError: + check_attribute = False + assert check_attribute + + assert test_model.folder_path == TEST_FOLDER + assert ( + test_model.model_id == "opensearch-project/opensearch-neural-sparse-encoding-v1" + ) + + default_folder = os.path.join(os.getcwd(), "opensearch_neural_sparse_model_files") + + clean_test_folder(default_folder) + test_model0 = SparseEncodingModel() + assert test_model0.folder_path == default_folder + clean_test_folder(default_folder) + + clean_test_folder(TEST_FOLDER) + test_model1 = SparseEncodingModel( + folder_path=TEST_FOLDER,
model_id="sentence-transformers/all-MiniLM-L6-v2" + ) + assert test_model1.model_id == "sentence-transformers/all-MiniLM-L6-v2" + + +def test_folder_path(): + with pytest.raises(Exception) as exc_info: + test_non_empty_path = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests" + ) + SparseEncodingModel(folder_path=test_non_empty_path, overwrite=False) + assert exc_info.type is Exception + assert "The default folder path already exists" in exc_info.value.args[0] + + +def test_check_required_fields(): + # test without required_fields should raise TypeError + with pytest.raises(TypeError): + test_model.process_sparse_encoding() + with pytest.raises(TypeError): + test_model.save_as_pt() + + +def test_save_as_pt(): + try: + test_model.save_as_pt(sentences=["today is sunny"]) + except Exception as exec: + assert False, f"Tracing model in TorchScript raised an exception {exec}" + + +def test_make_model_config_json_for_torch_script(): + model_format = "TORCH_SCRIPT" + expected_model_description = ( + "This is a sparse encoding model for opensearch-neural-sparse-encoding-v1." + ) + model_id = "opensearch-project/opensearch-neural-sparse-encoding-v1" + clean_test_folder(TEST_FOLDER) + test_model3 = SparseEncodingModel(folder_path=TEST_FOLDER) + test_model3.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model3.make_model_config_json( + model_format="TORCH_SCRIPT", description=expected_model_description + ) + + compare_model_config( + model_config_path_torch, + model_id, + model_format, + expected_model_description=expected_model_description, + ) + + clean_test_folder(TEST_FOLDER) + + +def test_overwrite_description(): + model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" + model_format = "TORCH_SCRIPT" + expected_model_description = "Expected Description" + + clean_test_folder(TEST_FOLDER) + test_model4 = SparseEncodingModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model4.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model4.make_model_config_json( + model_format=model_format, description=expected_model_description + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] == expected_model_description + ), "Cannot overwrite description in model config file" + + clean_test_folder(TEST_FOLDER) + + +def test_long_description(): + model_id = "opensearch-project/opensearch-neural-sparse-encoding-v1" + model_format = "TORCH_SCRIPT" + expected_model_description = ( + "This is a sparse encoding model: it generates lots of tokens with different weights, " + "which are used for semantic search." + " The model was specifically trained for the task of semantic search."
+ ) + + clean_test_folder(TEST_FOLDER) + test_model5 = SparseEncodingModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model5.make_model_config_json( + model_format=model_format, description=expected_model_description + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] == expected_model_description + ), "Missing or Wrong model description in model config file when the description is longer than normal." + + clean_test_folder(TEST_FOLDER) + + +def test_save_as_pt_with_license(): + model_id = "opensearch-project/opensearch-neural-sparse-encoding-v1" + model_format = "TORCH_SCRIPT" + torch_script_zip_file_path = os.path.join( + TEST_FOLDER, "opensearch-neural-sparse-encoding-v1.zip" + ) + torch_script_expected_filenames = { + "opensearch-neural-sparse-encoding-v1.pt", + "tokenizer.json", + "LICENSE", + } + + clean_test_folder(TEST_FOLDER) + test_model6 = SparseEncodingModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model6.save_as_pt( + model_id=model_id, sentences=["today is sunny"], add_apache_license=True + ) + + compare_model_zip_file( + torch_script_zip_file_path, torch_script_expected_filenames, model_format + ) + + clean_test_folder(TEST_FOLDER) + + +clean_test_folder(TEST_FOLDER) +clean_test_folder(TESTDATA_UNZIP_FOLDER) diff --git a/utils/model_uploader/autotracing_utils.py b/utils/model_uploader/autotracing_utils.py new file mode 100644 index 000000000..affd36eb7 --- /dev/null +++ b/utils/model_uploader/autotracing_utils.py @@ -0,0 +1,304 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. +import json +import os +import shutil +import warnings +from typing import Type, TypeVar + +from huggingface_hub import HfApi + +from opensearch_py_ml.ml_commons import MLCommonClient + +# We need to append ROOT_DIR path so that we can import +# OPENSEARCH_TEST_CLIENT and opensearch_py_ml since this +# python script is not in the root directory.
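For reference, the deployment helpers defined below are chained in a fixed order by both tracing scripts. A condensed sketch of that lifecycle (the zip and config paths are placeholders for artifacts produced by tracing, and the request-body shape for the predict call is assumed from the ml-commons SPARSE_ENCODING _predict API):

from opensearch_py_ml.ml_commons import MLCommonClient
from tests import OPENSEARCH_TEST_CLIENT  # same client the tracing scripts use

ml_client = MLCommonClient(OPENSEARCH_TEST_CLIENT)

# "model.zip" / "config.json" stand in for the traced artifacts;
# TORCH_SCRIPT_FORMAT and SPARSE_ALGORITHM are the module constants below.
model_id = register_and_deploy_model(ml_client, TORCH_SCRIPT_FORMAT, "model.zip", "config.json")
check_model_status(ml_client, model_id, TORCH_SCRIPT_FORMAT, SPARSE_ALGORITHM)

# Request-body shape assumed from the ml-commons sparse _predict API.
result = ml_client.generate_model_inference(model_id, {"text_docs": ["today is sunny"]})

undeploy_model(ml_client, model_id, TORCH_SCRIPT_FORMAT)
delete_model(ml_client, model_id, TORCH_SCRIPT_FORMAT)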
+ + BOTH_FORMAT = "BOTH" + TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT" + ONNX_FORMAT = "ONNX" + + DENSE_MODEL_ALGORITHM = "TEXT_EMBEDDING" + SPARSE_ALGORITHM = "SPARSE_ENCODING" + TEMP_MODEL_PATH = "temp_model_path" + TORCHSCRIPT_FOLDER_PATH = "model-torchscript/" + ONNX_FOLDER_PATH = "model-onnx/" + UPLOAD_FOLDER_PATH = "upload/" + MODEL_CONFIG_FILE_NAME = "ml-commons_model_config.json" + OUTPUT_DIR = "trace_output/" + LICENSE_VAR_FILE = "apache_verified.txt" + DESCRIPTION_VAR_FILE = "description.txt" + RTOL_TEST = 1e-03 + ATOL_TEST = 1e-05 + + + def register_and_deploy_model( + ml_client: "MLCommonClient", + model_format: str, + model_path: str, + model_config_path: str, + ): + """ + Register the model to OpenSearch and deploy it. + + Args: + ml_client: The ml client used to register and deploy the model + model_format: The format of the model, one of [TORCH_SCRIPT, ONNX] + model_path: The path of the model + model_config_path: The path of the model config + + Returns: + model_id: The model_id of the registered model in OpenSearch + + """ + try: + model_id = ml_client.register_model( + model_path=model_path, + model_config_path=model_config_path, + deploy_model=True, + isVerbose=True, + ) + print(f"\n{model_format}_model_id:", model_id) + assert model_id is not None and model_id != "" + return model_id + except Exception as e: + assert ( + False + ), f"Raised Exception in {model_format} model registration/deployment: {e}" + + + def check_model_status( + ml_client: "MLCommonClient", model_id: str, model_format: str, model_algorithm: str + ): + """ + Check the status of the model. + + Args: + ml_client: Ml client used to register and deploy the model + model_id: The model_id of the registered model in OpenSearch + model_format: The format of the model, one of [TORCH_SCRIPT, ONNX] + model_algorithm: The expected algorithm of the deployed model, e.g. TEXT_EMBEDDING or SPARSE_ENCODING + + Returns: + + """ + try: + ml_model_status = ml_client.get_model_info(model_id) + print("\nModel Status:") + print(ml_model_status) + assert ml_model_status.get("model_state") == "DEPLOYED" + assert ml_model_status.get("model_format") == model_format + assert ml_model_status.get("algorithm") == model_algorithm + except Exception as e: + assert False, f"Raised Exception in getting {model_format} model info: {e}" + + + def undeploy_model(ml_client: "MLCommonClient", model_id: str, model_format: str): + """ + Undeploy the model from the OpenSearch cluster. + + Args: + ml_client: Ml client used to register and deploy the model + model_id: The model_id of the registered model in OpenSearch + model_format: The format of the model, one of [TORCH_SCRIPT, ONNX] + + Returns: + + """ + try: + ml_client.undeploy_model(model_id) + ml_model_status = ml_client.get_model_info(model_id) + assert ml_model_status.get("model_state") == "UNDEPLOYED" + except Exception as e: + assert False, f"Raised Exception in {model_format} model undeployment: {e}" + + + def delete_model(ml_client: "MLCommonClient", model_id: str, model_format: str): + """ + Delete the model from the OpenSearch cluster.
+ + Args: + ml_client: Ml client used to register and deploy the model + model_id: The model_id of the registered model in OpenSearch + model_format: The format of the model, one of [TORCH_SCRIPT, ONNX] + + Returns: + + """ + try: + delete_model_obj = ml_client.delete_model(model_id) + assert delete_model_obj.get("result") == "deleted" + except Exception as e: + assert False, f"Raised Exception in deleting {model_format} model: {e}" + + + def autotracing_warning_filters(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + warnings.filterwarnings("ignore", category=FutureWarning) + warnings.filterwarnings("ignore", message="Unverified HTTPS request") + warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor") + warnings.filterwarnings( + "ignore", message="using SSL with verify_certs=False is insecure." + ) + + + def store_description_variable(config_path_for_checking_description: str) -> None: + """ + Store model description in OUTPUT_DIR/DESCRIPTION_VAR_FILE + to be used to generate issue body for manual approval + + :param config_path_for_checking_description: Path to config json file + :type config_path_for_checking_description: str + :return: No return value expected + :rtype: None + """ + # Defaults set before the try block so the except clause can always reference them + description = "-" + description_var_filepath = OUTPUT_DIR + "/" + DESCRIPTION_VAR_FILE + try: + os.makedirs(OUTPUT_DIR, exist_ok=True) + with open(config_path_for_checking_description, "r") as f: + config_dict = json.load(f) + description = ( + config_dict["description"] if "description" in config_dict else "-" + ) + print(f"Storing the following description at {description_var_filepath}") + print(description) + with open(description_var_filepath, "w") as f: + f.write(description) + except Exception as e: + print( + f"Cannot store description ({description}) in {description_var_filepath}: {e}" + ) + + + def store_license_verified_variable(license_verified: bool) -> None: + """ + Store whether the model is licensed under Apache 2.0 in OUTPUT_DIR/LICENSE_VAR_FILE + to be used to generate issue body for manual approval + + :param license_verified: Whether the model is licensed under Apache 2.0 + :type license_verified: bool + :return: No return value expected + :rtype: None + """ + license_var_filepath = OUTPUT_DIR + "/" + LICENSE_VAR_FILE + try: + os.makedirs(OUTPUT_DIR, exist_ok=True) + with open(license_var_filepath, "w") as f: + f.write(str(license_verified)) + except Exception as e: + print( + f"Cannot store license_verified ({license_verified}) in {license_var_filepath}: {e}" + ) + + + def preview_model_config(model_format: str, model_config_path: str) -> None: + print(f"\n+++++ {model_format} Model Config +++++\n") + with open(model_config_path, "r") as f: + model_config = json.load(f) + print(json.dumps(model_config, indent=4)) + print("\n+++++++++++++++++++++++++++++++++++++++\n") + + + class ModelTraceError(Exception): + """Custom exception for errors during the model tracing process.""" + + def __init__(self, stage: str, model_format: str, original_exception: Exception): + super().__init__( + f"Error during {stage} for {model_format} model: {original_exception}" + ) + self.stage = stage + self.model_format = model_format + self.original_exception = original_exception + + + T = TypeVar("T") + + + def init_sparse_model(model_class: Type[T], model_id, model_format, folder_path) -> T: + try: + pre_trained_model = model_class( + model_id=model_id, folder_path=folder_path, overwrite=True + ) + except Exception as e: + raise ModelTraceError( + "initiating a sparse encoding model class object", model_format, e
+ ) + return pre_trained_model + + +def prepare_files_for_uploading( + model_id: str, + model_version: str, + model_format: str, + src_model_path: str, + src_model_config_path: str, +) -> tuple[str, str]: + """ + Prepare files for uploading by storing them in UPLOAD_FOLDER_PATH + + :param model_id: Model ID of the pretrained model + :type model_id: string + :param model_version: Version of the pretrained model for registration + :type model_version: string + :param model_format: Model format ("TORCH_SCRIPT" or "ONNX") + :type model_format: string + :param src_model_path: Path to model files for uploading + :type src_model_path: string + :param src_model_config_path: Path to model config files for uploading + :type src_model_config_path: string + :return: Tuple of dst_model_path (path to model zip file) and dst_model_config_path + (path to model config json file) in the UPLOAD_FOLDER_PATH + :rtype: Tuple[str, str] + """ + model_type, model_name = model_id.split("/") + model_format = model_format.lower() + folder_to_delete = ( + TORCHSCRIPT_FOLDER_PATH if model_format == "torch_script" else ONNX_FOLDER_PATH + ) + + # Store to be uploaded files in UPLOAD_FOLDER_PATH + try: + dst_model_dir = ( + f"{UPLOAD_FOLDER_PATH}{model_name}/{model_version}/{model_format}" + ) + os.makedirs(dst_model_dir, exist_ok=True) + dst_model_filename = ( + f"{model_type}_{model_name}-{model_version}-{model_format}.zip" + ) + dst_model_path = dst_model_dir + "/" + dst_model_filename + shutil.copy(src_model_path, dst_model_path) + print(f"\nCopied {src_model_path} to {dst_model_path}") + + dst_model_config_dir = ( + f"{UPLOAD_FOLDER_PATH}{model_name}/{model_version}/{model_format}" + ) + os.makedirs(dst_model_config_dir, exist_ok=True) + dst_model_config_filename = "config.json" + dst_model_config_path = dst_model_config_dir + "/" + dst_model_config_filename + shutil.copy(src_model_config_path, dst_model_config_path) + print(f"Copied {src_model_config_path} to {dst_model_config_path}") + except Exception as e: + assert ( + False + ), f"Raised Exception during preparing {model_format} files for uploading: {e}" + + # Delete model folder downloaded from HuggingFace during model tracing + try: + shutil.rmtree(folder_to_delete) + except Exception as e: + assert False, f"Raised Exception while deleting {folder_to_delete}: {e}" + + return dst_model_path, dst_model_config_path + + +def verify_license_by_hfapi(model_id: str): + api = HfApi() + model_info = api.model_info(model_id) + model_license = model_info.cardData.get("license", "License information not found.") + if model_license == "apache-2.0": + return True + else: + return False diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 3794087a8..7f9f837ff 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -10,11 +10,9 @@ # files for uploading to OpenSearch model hub. 
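For reference, the verify_license_by_hfapi helper added in autotracing_utils.py above simply reads the license field from the HuggingFace model card. A quick sanity check might look like this (requires network access to the Hub; the model id is this patch's default and its card is expected to declare apache-2.0):

from utils.model_uploader.autotracing_utils import verify_license_by_hfapi

# True only when the model card declares license: apache-2.0
print(verify_license_by_hfapi("opensearch-project/opensearch-neural-sparse-encoding-v1"))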
import argparse -import json import os import shutil import sys -import warnings from typing import List, Optional, Tuple import numpy as np @@ -32,26 +30,36 @@ from opensearch_py_ml.ml_commons import MLCommonClient from opensearch_py_ml.ml_models.sentencetransformermodel import SentenceTransformerModel from tests import OPENSEARCH_TEST_CLIENT +from utils.model_uploader.autotracing_utils import ( + ATOL_TEST, + BOTH_FORMAT, + DENSE_MODEL_ALGORITHM, + ONNX_FOLDER_PATH, + ONNX_FORMAT, + RTOL_TEST, + TEMP_MODEL_PATH, + TORCH_SCRIPT_FORMAT, + TORCHSCRIPT_FOLDER_PATH, + autotracing_warning_filters, + check_model_status, + prepare_files_for_uploading, + preview_model_config, + register_and_deploy_model, + store_description_variable, +) +from utils.model_uploader.sparse_model_autotracing import ( + store_license_verified_variable, +) -BOTH_FORMAT = "BOTH" -TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT" -ONNX_FORMAT = "ONNX" - -TEMP_MODEL_PATH = "temp_model_path" -TORCHSCRIPT_FOLDER_PATH = "model-torchscript/" -ONNX_FOLDER_PATH = "model-onnx/" -UPLOAD_FOLDER_PATH = "upload/" -MODEL_CONFIG_FILE_NAME = "ml-commons_model_config.json" -OUTPUT_DIR = "trace_output/" -LICENSE_VAR_FILE = "apache_verified.txt" -DESCRIPTION_VAR_FILE = "description.txt" TEST_SENTENCES = [ "First test sentence", "This is another sentence used for testing model embedding outputs.", - "OpenSearch is a scalable, flexible, and extensible open-source software suite for search, analytics, and observability applications licensed under Apache 2.0. Powered by Apache Lucene and driven by the OpenSearch Project community, OpenSearch offers a vendor-agnostic toolset you can use to build secure, high-performance, cost-efficient applications. Use OpenSearch as an end-to-end solution or connect it with your preferred open-source tools or partner projects.", + "OpenSearch is a scalable, flexible, and extensible open-source software suite for search, analytics, " + "and observability applications licensed under Apache 2.0. Powered by Apache Lucene and driven by the OpenSearch " + "Project community, OpenSearch offers a vendor-agnostic toolset you can use to build secure, high-performance, " + "cost-efficient applications. Use OpenSearch as an end-to-end solution or connect it with your preferred " + "open-source tools or partner projects.", ] -RTOL_TEST = 1e-03 -ATOL_TEST = 1e-05 def verify_license_in_md_file() -> bool: @@ -160,11 +168,7 @@ def trace_sentence_transformer_model( ), f"Raised Exception during making model config file for {model_format} model: {e}" # 4.) Preview model config - print(f"\n+++++ {model_format} Model Config +++++\n") - with open(model_config_path, "r") as f: - model_config = json.load(f) - print(json.dumps(model_config, indent=4)) - print("\n+++++++++++++++++++++++++++++++++++++++\n") + preview_model_config(model_format, model_config_path) # 5.) Return model_path & model_config_path for model registration return model_path, model_config_path @@ -195,32 +199,11 @@ def register_and_deploy_sentence_transformer_model( embedding_data = None # 1.) 
Register & Deploy the model - model_id = "" - try: - model_id = ml_client.register_model( - model_path=model_path, - model_config_path=model_config_path, - deploy_model=True, - isVerbose=True, - ) - print(f"\n{model_format}_model_id:", model_id) - assert model_id != "" or model_id is not None - except Exception as e: - assert ( - False - ), f"Raised Exception in {model_format} model registration/deployment: {e}" - + model_id = register_and_deploy_model( + ml_client, model_format, model_path, model_config_path + ) # 2.) Check model status - try: - ml_model_status = ml_client.get_model_info(model_id) - print("\nModel Status:") - print(ml_model_status) - assert ml_model_status.get("model_state") == "DEPLOYED" - assert ml_model_status.get("model_format") == model_format - assert ml_model_status.get("algorithm") == "TEXT_EMBEDDING" - except Exception as e: - assert False, f"Raised Exception in getting {model_format} model info: {e}" - + check_model_status(ml_client, model_id, model_format, DENSE_MODEL_ALGORITHM) # 3.) Generate embeddings try: embedding_output = ml_client.generate_embedding(model_id, TEST_SENTENCES) @@ -291,120 +274,6 @@ def verify_embedding_data( return True -def prepare_files_for_uploading( - model_id: str, - model_version: str, - model_format: str, - src_model_path: str, - src_model_config_path: str, -) -> None: - """ - Prepare files for uploading by storing them in UPLOAD_FOLDER_PATH - - :param model_id: Model ID of the pretrained model - :type model_id: string - :param model_version: Version of the pretrained model for registration - :type model_version: string - :param model_format: Model format ("TORCH_SCRIPT" or "ONNX") - :type model_format: string - :param src_model_path: Path to model files for uploading - :type src_model_path: string - :param src_model_config_path: Path to model config files for uploading - :type src_model_config_path: string - :return: Tuple of dst_model_path (path to model zip file) and dst_model_config_path - (path to model config json file) in the UPLOAD_FOLDER_PATH - :rtype: Tuple[str, str] - """ - model_type, model_name = model_id.split("/") - model_format = model_format.lower() - folder_to_delete = ( - TORCHSCRIPT_FOLDER_PATH if model_format == "torch_script" else ONNX_FOLDER_PATH - ) - - # Store to be uploaded files in UPLOAD_FOLDER_PATH - try: - dst_model_dir = ( - f"{UPLOAD_FOLDER_PATH}{model_name}/{model_version}/{model_format}" - ) - os.makedirs(dst_model_dir, exist_ok=True) - dst_model_filename = ( - f"{model_type}_{model_name}-{model_version}-{model_format}.zip" - ) - dst_model_path = dst_model_dir + "/" + dst_model_filename - shutil.copy(src_model_path, dst_model_path) - print(f"\nCopied {src_model_path} to {dst_model_path}") - - dst_model_config_dir = ( - f"{UPLOAD_FOLDER_PATH}{model_name}/{model_version}/{model_format}" - ) - os.makedirs(dst_model_config_dir, exist_ok=True) - dst_model_config_filename = "config.json" - dst_model_config_path = dst_model_config_dir + "/" + dst_model_config_filename - shutil.copy(src_model_config_path, dst_model_config_path) - print(f"Copied {src_model_config_path} to {dst_model_config_path}") - except Exception as e: - assert ( - False - ), f"Raised Exception during preparing {model_format} files for uploading: {e}" - - # Delete model folder downloaded from HuggingFace during model tracing - try: - shutil.rmtree(folder_to_delete) - except Exception as e: - assert False, f"Raised Exception while deleting {folder_to_delete}: {e}" - - return dst_model_path, dst_model_config_path - - -def 
store_license_verified_variable(license_verified: bool) -> None: - """ - Store whether the model is licensed under Apache 2.0 in OUTPUT_DIR/LICENSE_VAR_FILE - to be used to generate issue body for manual approval - - :param license_verified: Whether the model is licensed under Apache 2.0 - :type model_path: bool - :return: No return value expected - :rtype: None - """ - try: - os.makedirs(OUTPUT_DIR, exist_ok=True) - license_var_filepath = OUTPUT_DIR + "/" + LICENSE_VAR_FILE - with open(license_var_filepath, "w") as f: - f.write(str(license_verified)) - except Exception as e: - print( - f"Cannot store license_verified ({license_verified}) in {license_var_filepath}: {e}" - ) - - -def store_description_variable(config_path_for_checking_description: str) -> None: - """ - Store model description in OUTPUT_DIR/DESCRIPTION_VAR_FILE - to be used to generate issue body for manual approval - - :param config_path_for_checking_description: Path to config json file - :type config_path_for_checking_description: str - :return: No return value expected - :rtype: None - """ - try: - os.makedirs(OUTPUT_DIR, exist_ok=True) - description_var_filepath = OUTPUT_DIR + "/" + DESCRIPTION_VAR_FILE - with open(config_path_for_checking_description, "r") as f: - config_dict = json.load(f) - description = ( - config_dict["description"] if "description" in config_dict else "-" - ) - print(f"Storing the following description at {description_var_filepath}") - print(description) - with open(description_var_filepath, "w") as f: - f.write(description) - except Exception as e: - print( - f"Cannot store description ({description}) in {description_var_filepath}: {e}" - ) - - def main( model_id: str, model_version: str, @@ -431,21 +300,18 @@ def main( :return: No return value expected :rtype: None """ - - print("\n=== Begin running model_autotracing.py ===") - print("Model ID: ", model_id) - print("Model Version: ", model_version) - print("Tracing Format: ", tracing_format) print( - "Embedding Dimension: ", - embedding_dimension if embedding_dimension is not None else "N/A", - ) - print("Pooling Mode: ", pooling_mode if pooling_mode is not None else "N/A") - print( - "Model Description: ", - model_description if model_description is not None else "N/A", + f""" + === Begin running model_autotracing.py === + Model ID: {model_id} + Model Version: {model_version} + Tracing Format: {tracing_format} + Embedding Dimension: {embedding_dimension if embedding_dimension is not None else 'N/A'} + Pooling Mode: {pooling_mode if pooling_mode is not None else 'N/A'} + Model Description: {model_description if model_description is not None else 'N/A'} + ========================================== + """ ) - print("==========================================") ml_client = MLCommonClient(OPENSEARCH_TEST_CLIENT) @@ -543,13 +409,7 @@ def main( if __name__ == "__main__": - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - warnings.filterwarnings("ignore", message="Unverified HTTPS request") - warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor") - warnings.filterwarnings( - "ignore", message="using SSL with verify_certs=False is insecure." 
- ) + autotracing_warning_filters() parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( diff --git a/utils/model_uploader/sparse_model_autotracing.py b/utils/model_uploader/sparse_model_autotracing.py new file mode 100644 index 000000000..b03435d8b --- /dev/null +++ b/utils/model_uploader/sparse_model_autotracing.py @@ -0,0 +1,339 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +import argparse +import os +import shutil +import sys +from typing import Optional, Tuple + +import numpy as np + +THIS_DIR = os.path.dirname(__file__) +ROOT_DIR = os.path.join(THIS_DIR, "../..") +sys.path.append(ROOT_DIR) + +from opensearch_py_ml.ml_commons import MLCommonClient +from opensearch_py_ml.ml_models import SparseEncodingModel +from tests import OPENSEARCH_TEST_CLIENT +from utils.model_uploader.autotracing_utils import ( + ATOL_TEST, + BOTH_FORMAT, + ONNX_FOLDER_PATH, + ONNX_FORMAT, + RTOL_TEST, + SPARSE_ALGORITHM, + TEMP_MODEL_PATH, + TORCH_SCRIPT_FORMAT, + TORCHSCRIPT_FOLDER_PATH, + ModelTraceError, + autotracing_warning_filters, + check_model_status, + delete_model, + init_sparse_model, + prepare_files_for_uploading, + preview_model_config, + register_and_deploy_model, + store_description_variable, + store_license_verified_variable, + undeploy_model, + verify_license_by_hfapi, +) + +TEST_SENTENCES = ["Nice to meet you.", "I like playing football.", "Thanks."] + + +def trace_sparse_encoding_model( + model_id: str, + model_version: str, + model_format: str, + model_description: Optional[str] = None, +) -> Tuple[str, str]: + """ + Trace the pretrained sparse encoding model, create a model config file, + and return a path to the model file and a path to the model config file required for model registration + + :param model_id: Model ID of the pretrained model + :type model_id: string + :param model_version: Version of the pretrained model for registration + :type model_version: string + :param model_format: Model format ("TORCH_SCRIPT" or "ONNX") + :type model_format: string + :param model_description: Model description input + :type model_description: string + :return: Tuple of model_path (path to model zip file) and model_config_path (path to model config json file) + :rtype: Tuple[str, str] + """ + + folder_path = ( + TORCHSCRIPT_FOLDER_PATH + if model_format == TORCH_SCRIPT_FORMAT + else ONNX_FOLDER_PATH + ) + + # 1.) Initiate a sparse encoding model class object + pre_trained_model = init_sparse_model( + SparseEncodingModel, model_id, model_format, folder_path + ) + + # 2.) Save the model in the specified format + + try: + if model_format == TORCH_SCRIPT_FORMAT: + model_path = pre_trained_model.save_as_pt( + model_id=model_id, + sentences=TEST_SENTENCES, + add_apache_license=True, + ) + else: + model_path = pre_trained_model.save_as_onnx( + model_id=model_id, add_apache_license=True + ) + except Exception as e: + raise ModelTraceError("saving model", model_format, e) + + # 3.) Create a model config json file + try: + model_config_path = pre_trained_model.make_model_config_json( + version_number=model_version, + model_format=model_format, + description=model_description, + ) + except Exception as e: + raise ModelTraceError("making model config file", model_format, e) + + # 4.) 
Preview model config
+    preview_model_config(model_format, model_config_path)
+
+    # 5.) Return model_path & model_config_path for model registration
+    return model_path, model_config_path
+
+
+def register_and_deploy_sparse_encoding_model(
+    ml_client: "MLCommonClient",
+    model_path: str,
+    model_config_path: str,
+    model_format: str,
+    texts: list[str],
+) -> list:
+    """
+    Register and deploy the traced model, generate one sparse encoding per input
+    text, then undeploy and delete the model and return the encodings
+    """
+    encoding_datas = None
+    model_id = register_and_deploy_model(
+        ml_client, model_format, model_path, model_config_path
+    )
+    check_model_status(ml_client, model_id, model_format, SPARSE_ALGORITHM)
+    try:
+        encoding_input = {"text_docs": texts}
+        encoding_output = ml_client.generate_model_inference(model_id, encoding_input)
+        encoding_datas = [
+            encoding_output["inference_results"][i]["output"][0]["dataAsMap"][
+                "response"
+            ][0]
+            for i in range(len(texts))
+        ]
+    except Exception as e:
+        assert (
+            False
+        ), f"Raised Exception in generating sparse encoding with {model_format} model: {e}"
+    undeploy_model(ml_client, model_id, model_format)
+    delete_model(ml_client, model_id, model_format)
+    return encoding_datas
+
+
+def verify_embedding_data_vectors(original_embedding_datas, tracing_embedding_datas):
+    """
+    Verify that the two lists of sparse encodings have the same length and that
+    every pair of encodings matches within tolerance
+    """
+    if len(original_embedding_datas) != len(tracing_embedding_datas):
+        print(
+            f"The lengths of original_embedding_datas ({len(original_embedding_datas)}) and "
+            f"tracing_embedding_datas ({len(tracing_embedding_datas)}) are different"
+        )
+        return False
+
+    for index, (original, tracing) in enumerate(
+        zip(original_embedding_datas, tracing_embedding_datas)
+    ):
+        if not verify_sparse_encoding(original, tracing):
+            print(
+                f"Verification failed for index {index}, whose input is {TEST_SENTENCES[index]}."
+            )
+            return False
+
+    return True
+
+
+def verify_sparse_encoding(
+    original_embedding_data: dict,
+    tracing_embedding_data: dict,
+) -> bool:
+    """
+    Verify that two sparse encodings (token -> weight maps) contain the same
+    tokens and that corresponding weights are close within RTOL_TEST/ATOL_TEST
+    """
+    if original_embedding_data.keys() != tracing_embedding_data.keys():
+        print("The two encodings contain different token keys")
+        return False
+    for key in original_embedding_data:
+        a = original_embedding_data[key]
+        b = tracing_embedding_data[key]
+        if not np.allclose(a, b, rtol=RTOL_TEST, atol=ATOL_TEST):
+            print(
+                f"Score mismatch for token '{key}': {original_embedding_data[key]} != {tracing_embedding_data[key]}"
+            )
+            return False
+    return True
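To make the tolerance concrete: each sparse encoding is a token-to-weight map, and two encodings are accepted when they share a token set and every weight pair passes numpy's closeness test, |a - b| <= ATOL_TEST + RTOL_TEST * |b|. A minimal sketch with invented weights (not real model output):

```python
# Worked example of the comparison rule used by verify_sparse_encoding.
# The token weights below are made up for illustration only.
import numpy as np

RTOL_TEST = 1e-03  # relative tolerance, as in autotracing_utils
ATOL_TEST = 1e-05  # absolute tolerance, as in autotracing_utils

original = {"football": 1.4720, "play": 0.9360, "like": 0.5180}
traced = {"football": 1.4721, "play": 0.9361, "like": 0.5179}

assert original.keys() == traced.keys(), "token sets differ"
for token in original:
    # |a - b| <= ATOL_TEST + RTOL_TEST * |b| must hold for every token
    assert np.allclose(
        original[token], traced[token], rtol=RTOL_TEST, atol=ATOL_TEST
    ), f"score mismatch for token '{token}'"
print("encodings match within tolerance")
```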
+
+
+def main(
+    model_id: str,
+    model_version: str,
+    tracing_format: str,
+    model_description: Optional[str] = None,
+) -> None:
+    """
+    Perform model auto-tracing and prepare files for uploading to OpenSearch model hub
+
+    :param model_id: Model ID of the pretrained model
+    :type model_id: string
+    :param model_version: Version of the pretrained model for registration
+    :type model_version: string
+    :param tracing_format: Tracing format ("TORCH_SCRIPT", "ONNX", or "BOTH")
+    :type tracing_format: string
+    :param model_description: Model description input
+    :type model_description: string
+    :return: No return value expected
+    :rtype: None
+    """
+
+    print(
+        f"""
+        === Begin running sparse_model_autotracing.py ===
+        Model ID: {model_id}
+        Model Version: {model_version}
+        Tracing Format: {tracing_format}
+        Model Description: {model_description if model_description is not None else 'N/A'}
+        ==========================================
+        """
+    )
+
+    # Currently, sparse model auto-tracing only supports TORCH_SCRIPT.
+    assert (
+        tracing_format == TORCH_SCRIPT_FORMAT
+    ), f"Currently only {TORCH_SCRIPT_FORMAT} is supported."
+
+    ml_client = MLCommonClient(OPENSEARCH_TEST_CLIENT)
+    pre_trained_model = SparseEncodingModel(model_id)
+    original_encoding_datas = pre_trained_model.process_sparse_encoding(TEST_SENTENCES)
+    pre_trained_model.save(path=TEMP_MODEL_PATH)
+    license_verified = verify_license_by_hfapi(model_id)
+
+    try:
+        shutil.rmtree(TEMP_MODEL_PATH)
+    except Exception as e:
+        assert False, f"Raised Exception while deleting {TEMP_MODEL_PATH}: {e}"
+
+    if tracing_format in [TORCH_SCRIPT_FORMAT, BOTH_FORMAT]:
+        print("--- Begin tracing a model in TORCH_SCRIPT ---")
+        (
+            torchscript_model_path,
+            torchscript_model_config_path,
+        ) = trace_sparse_encoding_model(
+            model_id,
+            model_version,
+            TORCH_SCRIPT_FORMAT,
+            model_description=model_description,
+        )
+
+        torchscript_encoding_datas = register_and_deploy_sparse_encoding_model(
+            ml_client,
+            torchscript_model_path,
+            torchscript_model_config_path,
+            TORCH_SCRIPT_FORMAT,
+            TEST_SENTENCES,
+        )
+
+        pass_test = verify_embedding_data_vectors(
+            original_encoding_datas, torchscript_encoding_datas
+        )
+        assert (
+            pass_test
+        ), f"Failed while verifying embeddings of {model_id} model in TORCH_SCRIPT format"
+
+        (
+            torchscript_dst_model_path,
+            torchscript_dst_model_config_path,
+        ) = prepare_files_for_uploading(
+            model_id,
+            model_version,
+            TORCH_SCRIPT_FORMAT,
+            torchscript_model_path,
+            torchscript_model_config_path,
+        )
+
+        config_path_for_checking_description = torchscript_dst_model_config_path
+        print("--- Finished tracing a model in TORCH_SCRIPT ---")
+
+    # This branch is unreachable while the assertion above restricts
+    # tracing_format to TORCH_SCRIPT; it is kept for future ONNX support.
+    if tracing_format in [ONNX_FORMAT, BOTH_FORMAT]:
+        print("--- Begin tracing a model in ONNX ---")
+        (
+            onnx_model_path,
+            onnx_model_config_path,
+        ) = trace_sparse_encoding_model(
+            model_id,
+            model_version,
+            ONNX_FORMAT,
+            model_description=model_description,
+        )
+
+        onnx_encoding_datas = register_and_deploy_sparse_encoding_model(
+            ml_client,
+            onnx_model_path,
+            onnx_model_config_path,
+            ONNX_FORMAT,
+            TEST_SENTENCES,
+        )
+
+        pass_test = verify_embedding_data_vectors(
+            original_encoding_datas, onnx_encoding_datas
+        )
+        assert (
+            pass_test
+        ), f"Failed while verifying embeddings of {model_id} model in ONNX format"
+
+        onnx_dst_model_path, onnx_dst_model_config_path = prepare_files_for_uploading(
+            model_id,
+            model_version,
+            ONNX_FORMAT,
+            onnx_model_path,
+            onnx_model_config_path,
+        )
+
+        config_path_for_checking_description = onnx_dst_model_config_path
+        print("--- Finished tracing a model in ONNX ---")
+
+    store_license_verified_variable(license_verified)
+    store_description_variable(config_path_for_checking_description)
+
+    print("\n=== Finished running sparse_model_autotracing.py ===")
+
+
+if __name__ == "__main__":
+    autotracing_warning_filters()
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "model_id",
+        type=str,
+        help="Model ID for auto-tracing and uploading (e.g. opensearch-project/opensearch-neural-sparse-encoding-v1)",
+    )
+    parser.add_argument(
+        "model_version", type=str, help="Model version number (e.g. 1.0.1)"
+    )
+    parser.add_argument(
+        "tracing_format",
+        choices=["BOTH", "TORCH_SCRIPT", "ONNX"],
+        help="Model format for auto-tracing",
+    )
+    parser.add_argument(
+        "-md",
+        "--model_description",
+        type=str,
+        nargs="?",
+        default=None,
+        const=None,
+        help="Model description if you want to overwrite the default description",
+    )
+    args = parser.parse_args()
+
+    main(args.model_id, args.model_version, args.tracing_format, args.model_description)
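For completeness, a hypothetical end-to-end invocation of this script, driven programmatically rather than through the CLI; it assumes an OpenSearch test cluster is reachable through OPENSEARCH_TEST_CLIENT, and reuses the example model ID and version from the argparse help above:

```python
# Hypothetical programmatic invocation of main(), equivalent to running:
#   python utils/model_uploader/sparse_model_autotracing.py \
#       opensearch-project/opensearch-neural-sparse-encoding-v1 1.0.1 TORCH_SCRIPT
# Requires a reachable OpenSearch test cluster (OPENSEARCH_TEST_CLIENT).
from utils.model_uploader.sparse_model_autotracing import main

main(
    model_id="opensearch-project/opensearch-neural-sparse-encoding-v1",
    model_version="1.0.1",
    tracing_format="TORCH_SCRIPT",  # currently the only supported format
    model_description=None,  # keep the default description
)
```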