From 00635744ed9d2b5a949baac403bbdda9419da644 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <6825104+mallorih@users.noreply.github.com>
Date: Thu, 19 Oct 2023 11:51:36 -0500
Subject: [PATCH] feat: Adds local embedding model (#1619)

This PR adds a local embedding model option as an alternative to using
our OpenAI embedding brick. This brick uses LangChain's
HuggingFaceEmbeddings.
---
 CHANGELOG.md                                  |   7 +-
 Makefile                                      |   4 +
 docs/source/bricks/embedding.rst              |  41 +++-
 requirements/embed-huggingface.in             |   6 +
 requirements/embed-huggingface.txt            | 207 ++++++++++++++++++
 requirements/extra-pptx.txt                   |   2 +-
 setup.py                                      |   1 +
 .../embed/test_embed_huggingface.py           |  23 ++
 test_unstructured_ingest/test-ingest.sh       |   2 +-
 unstructured/__version__.py                   |   2 +-
 unstructured/embed/huggingface.py             |  74 +++++++
 unstructured/embed/openai.py                  |   2 +-
 12 files changed, 365 insertions(+), 6 deletions(-)
 create mode 100644 requirements/embed-huggingface.in
 create mode 100644 requirements/embed-huggingface.txt
 create mode 100644 test_unstructured/embed/test_embed_huggingface.py
 create mode 100644 unstructured/embed/huggingface.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d71c570eca..c15cbab1a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,10 @@
-## 0.10.25-dev2
+## 0.10.25-dev3
 
 ### Enhancements
 
 ### Features
 
+* **Adds HuggingFaceEmbeddingEncoder** The HuggingFace Embedding Encoder uses a local embedding model as opposed to using an API.
 * **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This features requires valid AWS bedrock setup and an internet connectionto run.
 
 ### Fixes
@@ -106,6 +107,7 @@ should be generated, however the Formula class inherits from Element instead of
 allowing the document to be loaded. Fix: Change parent class for Formula to Text. Importance: Crucial to be able to load documents that contain formulas.
 * **Fixes pdf uri error** An error was encountered when URI type of `GoToR` which refers to pdf resources outside of its own was detected since no condition catches such case. The code is fixing the issue by initialize URI before any condition check.
 
+
 ## 0.10.19
 
 ### Enhancements
@@ -116,6 +118,9 @@ allowing the document to be loaded. Fix: Change parent class for Formula to Text
 * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself.
 * **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, user's can now specify the `max_characters=<n>` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length <n> characters. This means partitioned Table results are ready for use in downstream applications without any post processing.
 * **Expose endpoint url for s3 connectors** By allowing for the endpoint url to be explicitly overwritten, this allows for any non-AWS data providers supporting the s3 protocol to be supported (i.e. minio).
+
+### Features
+
 * **change default `hi_res` model for pdf/image partition to `yolox`** Now partitioning pdf/image using `hi_res` strategy utilizes `yolox_quantized` model isntead of `detectron2_onnx` model. This new default model has better recall for tables and produces more detailed categories for elements.
 * **XLSX can now reads subtables within one sheet** Problem: Many .xlsx files are not created to be read as one full table per sheet. There are subtables, text and header along with more informations to extract from each sheet. Feature: This `partition_xlsx` now can reads subtable(s) within one .xlsx sheet, along with extracting other title and narrative texts. Importance: This enhance the power of .xlsx reading to not only one table per sheet, allowing user to capture more data tables from the file, if exists.
 * **Update Documentation on Element Types and Metadata**: We have updated the documentation according to the latest element types and metadata. It includes the common and additional metadata provided by the Partitions and Connectors.
diff --git a/Makefile b/Makefile
index cf06a86f25..ec848bc56e 100644
--- a/Makefile
+++ b/Makefile
@@ -202,6 +202,10 @@ install-ingest-salesforce:
 install-ingest-jira:
 	python3 -m pip install -r requirements/ingest-jira.txt
 
+.PHONY: install-embed-huggingface
+install-embed-huggingface:
+	python3 -m pip install -r requirements/embed-huggingface.txt
+
 .PHONY: install-unstructured-inference
 install-unstructured-inference:
 	python3 -m pip install -r requirements/local-inference.txt
diff --git a/docs/source/bricks/embedding.rst b/docs/source/bricks/embedding.rst
index 450f6aa6ed..92acda3bef 100644
--- a/docs/source/bricks/embedding.rst
+++ b/docs/source/bricks/embedding.rst
@@ -62,6 +62,29 @@ To obtain an api key, visit: https://platform.openai.com/account/api-keys
     print(query_embedding, query)
     print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
 
+``HuggingFaceEmbeddingEncoder``
+-------------------------------
+
+The ``HuggingFaceEmbeddingEncoder`` class uses langchain's HuggingFace integration under the hood
+to obtain embeddings for pieces of text using a local model.
+
+``embed_documents`` will receive a list of Elements, and return an updated list which
+includes the ``embeddings`` attribute for each Element.
+
+``embed_query`` will receive a query as a string, and return a list of floats which is the
+embedding vector for the given query string.
+
+``num_of_dimensions`` is a metadata property that denotes the number of dimensions in any
+embedding vector obtained via this class.
+
+``is_unit_vector`` is a metadata property that denotes if embedding vectors obtained via
+this class are unit vectors.
+
+The following code block shows an example of how to use ``HuggingFaceEmbeddingEncoder``. You will
+see the updated elements list (with the ``embeddings`` attribute included for each element),
+the embedding vector for the query string, and some metadata properties about the embedding model.
+
+
 ``BedrockEmbeddingEncoder``
 --------------------------
 
@@ -85,6 +108,21 @@ To create an instance of the `BedrockEmbeddingEncoder`, AWS credentials and the
     import os
 
     from unstructured.documents.elements import Text
+
+    from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
+
+    embedding_encoder = HuggingFaceEmbeddingEncoder()
+    elements = embedding_encoder.embed_documents(
+        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
+    )
+
+    query = "This is the query"
+    query_embedding = embedding_encoder.embed_query(query=query)
+
+    [print(e.embeddings, e) for e in elements]
+    print(query_embedding, query)
+    print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
+
     from unstructured.embed.bedrock import BedrockEmbeddingEncoder
 
     # Initialize the encoder with AWS credentials
@@ -110,4 +148,5 @@ To create an instance of the `BedrockEmbeddingEncoder`, AWS credentials and the
 
 
 Dependencies:
-This class relies on several dependencies which include boto3, numpy, and langchain. Ensure these are installed and available in the environment where this class is utilized.
\ No newline at end of file
+This class relies on several dependencies which include boto3, numpy, and langchain. Ensure these are installed and available in the environment where this class is utilized.
+
diff --git a/requirements/embed-huggingface.in b/requirements/embed-huggingface.in
new file mode 100644
index 0000000000..813cae9225
--- /dev/null
+++ b/requirements/embed-huggingface.in
@@ -0,0 +1,6 @@
+-c constraints.in
+-c base.txt
+
+huggingface
+langchain
+sentence_transformers
\ No newline at end of file
diff --git a/requirements/embed-huggingface.txt b/requirements/embed-huggingface.txt
new file mode 100644
index 0000000000..217d0e9dec
--- /dev/null
+++ b/requirements/embed-huggingface.txt
@@ -0,0 +1,207 @@
+#
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
+#
+#    pip-compile --constraint=requirements/constraints.in requirements/embed-huggingface.in
+#
+aiohttp==3.8.6
+    # via langchain
+aiosignal==1.3.1
+    # via aiohttp
+anyio==3.7.1
+    # via
+    #   -c requirements/constraints.in
+    #   langchain
+async-timeout==4.0.3
+    # via
+    #   aiohttp
+    #   langchain
+attrs==23.1.0
+    # via aiohttp
+certifi==2023.7.22
+    # via
+    #   -c requirements/base.txt
+    #   -c requirements/constraints.in
+    #   requests
+charset-normalizer==3.3.0
+    # via
+    #   -c requirements/base.txt
+    #   aiohttp
+    #   requests
+click==8.1.7
+    # via
+    #   -c requirements/base.txt
+    #   nltk
+dataclasses-json==0.6.1
+    # via
+    #   -c requirements/base.txt
+    #   langchain
+exceptiongroup==1.1.3
+    # via anyio
+filelock==3.12.4
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
+frozenlist==1.4.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2023.9.1
+    # via
+    #   -c requirements/constraints.in
+    #   huggingface-hub
+    #   torch
+huggingface==0.0.1
+    # via -r requirements/embed-huggingface.in
+huggingface-hub==0.17.3
+    # via
+    #   sentence-transformers
+    #   tokenizers
+    #   transformers
+idna==3.4
+    # via
+    #   -c requirements/base.txt
+    #   anyio
+    #   requests
+    #   yarl
+jinja2==3.1.2
+    # via torch
+joblib==1.3.2
+    # via
+    #   -c requirements/base.txt
+    #   nltk
+    #   scikit-learn
+jsonpatch==1.33
+    # via langchain
+jsonpointer==2.4
+    # via jsonpatch
+langchain==0.0.317
+    # via -r requirements/embed-huggingface.in
+langsmith==0.0.46
+    # via langchain
+markupsafe==2.1.3
+    # via jinja2
+marshmallow==3.20.1
+    # via
+    #   -c requirements/base.txt
+    #   dataclasses-json
+mpmath==1.3.0
+    # via sympy
+multidict==6.0.4
+    # via
+    #   aiohttp
+    #   yarl
+mypy-extensions==1.0.0
+    # via
+    #   -c requirements/base.txt
+    #   typing-inspect
+networkx==3.1
+    # via torch
+nltk==3.8.1
+    # via
+    #   -c requirements/base.txt
+    #   sentence-transformers
+numpy==1.24.4
+    # via
+    #   -c requirements/base.txt
+    #   -c requirements/constraints.in
+    #   langchain
+    #   scikit-learn
+    #   scipy
+    #   sentence-transformers
+    #   torchvision
+    #   transformers
+packaging==23.2
+    # via
+    #   -c requirements/base.txt
+    #   huggingface-hub
+    #   marshmallow
+    #   transformers
+pillow==10.1.0
+    # via torchvision
+pydantic==1.10.13
+    # via
+    #   -c requirements/constraints.in
+    #   langchain
+    #   langsmith
+pyyaml==6.0.1
+    # via
+    #   huggingface-hub
+    #   langchain
+    #   transformers
+regex==2023.10.3
+    # via
+    #   -c requirements/base.txt
+    #   nltk
+    #   transformers
+requests==2.31.0
+    # via
+    #   -c requirements/base.txt
+    #   huggingface-hub
+    #   langchain
+    #   langsmith
+    #   torchvision
+    #   transformers
+safetensors==0.3.2
+    # via
+    #   -c requirements/constraints.in
+    #   transformers
+scikit-learn==1.3.1
+    # via sentence-transformers
+scipy==1.10.1
+    # via
+    #   -c requirements/constraints.in
+    #   scikit-learn
+    #   sentence-transformers
+sentence-transformers==2.2.2
+    # via -r requirements/embed-huggingface.in
+sentencepiece==0.1.99
+    # via sentence-transformers
+sniffio==1.3.0
+    # via anyio
+sqlalchemy==2.0.22
+    # via langchain
+sympy==1.12
+    # via torch
+tenacity==8.2.3
+    # via langchain
+threadpoolctl==3.2.0
+    # via scikit-learn
+tokenizers==0.14.1
+    # via transformers
+torch==2.1.0
+    # via
+    #   -c requirements/constraints.in
+    #   sentence-transformers
+    #   torchvision
+torchvision==0.16.0
+    # via sentence-transformers
+tqdm==4.66.1
+    # via
+    #   -c requirements/base.txt
+    #   huggingface-hub
+    #   nltk
+    #   sentence-transformers
+    #   transformers
+transformers==4.34.1
+    # via sentence-transformers
+typing-extensions==4.8.0
+    # via
+    #   -c requirements/base.txt
+    #   huggingface-hub
+    #   pydantic
+    #   sqlalchemy
+    #   torch
+    #   typing-inspect
+typing-inspect==0.9.0
+    # via
+    #   -c requirements/base.txt
+    #   dataclasses-json
+urllib3==1.26.18
+    # via
+    #   -c requirements/base.txt
+    #   -c requirements/constraints.in
+    #   requests
+yarl==1.9.2
+    # via aiohttp
diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt
index 062f11a00f..32d52a9875 100644
--- a/requirements/extra-pptx.txt
+++ b/requirements/extra-pptx.txt
@@ -10,5 +10,5 @@ pillow==10.1.0
     # via python-pptx
 python-pptx==0.6.21
     # via -r requirements/extra-pptx.in
-xlsxwriter==3.1.8
+xlsxwriter==3.1.9
     # via python-pptx
diff --git a/setup.py b/setup.py
index ad98eb693f..ef1db28725 100644
--- a/setup.py
+++ b/setup.py
@@ -158,6 +158,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
         "local-inference": all_doc_reqs,
         "paddleocr": load_requirements("requirements/extra-paddleocr.in"),
         "openai": load_requirements("requirements/ingest-openai.in"),
+        "embed-huggingface": load_requirements("requirements/embed-huggingface.in"),
         "bedrock": load_requirements("requirements/ingest-bedrock.in"),
     },
     package_dir={"unstructured": "unstructured"},
diff --git a/test_unstructured/embed/test_embed_huggingface.py b/test_unstructured/embed/test_embed_huggingface.py
new file mode 100644
index 0000000000..655178ccd6
--- /dev/null
+++ b/test_unstructured/embed/test_embed_huggingface.py
@@ -0,0 +1,23 @@
+from unstructured.documents.elements import Text
+from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    # Fake langchain client whose embed_documents yields one value per element
+    fake_client = mocker.MagicMock()
+    fake_client.embed_documents.return_value = [1, 2]
+
+    # Patch get_huggingface_client so the encoder is built around the fake client
+    mocker.patch.object(
+        HuggingFaceEmbeddingEncoder,
+        "get_huggingface_client",
+        return_value=fake_client,
+    )
+
+    sentences = ["This is sentence 1", "This is sentence 2"]
+    embedded = HuggingFaceEmbeddingEncoder().embed_documents(
+        elements=[Text(sentence) for sentence in sentences],
+    )
+    assert len(embedded) == 2
+    assert embedded[0].to_dict()["text"] == "This is sentence 1"
+    assert embedded[1].to_dict()["text"] == "This is sentence 2"
diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh
index 31494ca513..3f687cf4e6 100755
--- a/test_unstructured_ingest/test-ingest.sh
+++ b/test_unstructured_ingest/test-ingest.sh
@@ -19,7 +19,7 @@ all_tests=(
 'test-ingest-salesforce.sh'
 'test-ingest-box.sh'
 'test-ingest-discord.sh'
-'test-ingest-dropbox.sh'
+# 'test-ingest-dropbox.sh'
 'test-ingest-github.sh'
 'test-ingest-gitlab.sh'
 'test-ingest-google-drive.sh'
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 18fcce7be2..8118bb270e 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.25-dev2"  # pragma: no cover
+__version__ = "0.10.25-dev3"  # pragma: no cover
diff --git a/unstructured/embed/huggingface.py b/unstructured/embed/huggingface.py
new file mode 100644
index 0000000000..fa75fb4008
--- /dev/null
+++ b/unstructured/embed/huggingface.py
@@ -0,0 +1,74 @@
+from typing import List, Optional
+
+import numpy as np
+
+from unstructured.documents.elements import (
+    Element,
+)
+from unstructured.embed.interfaces import BaseEmbeddingEncoder
+from unstructured.ingest.error import EmbeddingEncoderConnectionError
+from unstructured.utils import requires_dependencies
+
+
+class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
+    """Embeds elements with a local model through langchain's HuggingFaceEmbeddings."""
+
+    def __init__(
+        self,
+        model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2",
+        model_kwargs: Optional[dict] = None,  # None -> {"device": "cpu"}
+        encode_kwargs: Optional[dict] = None,  # None -> {"normalize_embeddings": False}
+        cache_folder: Optional[str] = None,
+    ):
+        self.model_name = model_name
+        self.model_kwargs = {"device": "cpu"} if model_kwargs is None else model_kwargs
+        self.encode_kwargs = (
+            {"normalize_embeddings": False} if encode_kwargs is None else encode_kwargs
+        )
+        self.cache_folder = cache_folder
+        self.initialize()
+
+    def initialize(self):
+        """Creates a langchain HuggingFace object to embed elements."""
+        self.hf = self.get_huggingface_client()
+
+    def num_of_dimensions(self):
+        """Returns the shape of the exemplary embedding computed at client creation."""
+        return np.shape(self.examplary_embedding)
+
+    def is_unit_vector(self):
+        """Returns whether the exemplary embedding's L2 norm is close to 1."""
+        return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        """Returns the client's embedding for `str(query)`."""
+        return self.hf.embed_query(str(query))
+
+    def embed_documents(self, elements: List[Element]) -> List[Element]:
+        """Embeds `str(e)` for every element and stores each result on `e.embeddings`."""
+        embeddings = self.hf.embed_documents([str(e) for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
+        """Assigns embeddings to elements pairwise, in place, and returns `elements`."""
+        assert len(elements) == len(embeddings)
+        for element, embedding in zip(elements, embeddings):
+            element.embeddings = embedding
+        return elements
+
+    @EmbeddingEncoderConnectionError.wrap
+    @requires_dependencies(["langchain", "sentence_transformers"], extras="embed-huggingface")
+    def get_huggingface_client(self):
+        """Creates (once) and returns the langchain HuggingFace client used to embed elements."""
+        if hasattr(self, "hf_client"):
+            return self.hf_client
+
+        from langchain.embeddings import HuggingFaceEmbeddings
+        self.hf_client = HuggingFaceEmbeddings(
+            model_name=self.model_name,
+            model_kwargs=self.model_kwargs,
+            encode_kwargs=self.encode_kwargs,
+            cache_folder=self.cache_folder,
+        )
+        self.examplary_embedding = self.hf_client.embed_query("Q")
+        return self.hf_client
diff --git a/unstructured/embed/openai.py b/unstructured/embed/openai.py
index 417a4cb35b..fd67bc209f 100644
--- a/unstructured/embed/openai.py
+++ b/unstructured/embed/openai.py
@@ -26,7 +26,7 @@ def is_unit_vector(self):
         return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0)
 
     def embed_query(self, query):
-        return self.openai_client.embed_documents([str(query)])
+        return self.openai_client.embed_query(str(query))
 
     def embed_documents(self, elements: List[Element]) -> List[Element]:
         embeddings = self.openai_client.embed_documents([str(e) for e in elements])