Unstructured-IO · ahmetmeleq · Nov 29, 2023 · Oct 17, 2023 · Oct 18, 2023 · Oct 19, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -287,6 +287,7 @@ jobs:
         AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
         AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
         TABLE_OCR: "tesseract"
         OCR_AGENT: "tesseract"
         CI: "true"
@@ -347,6 +348,7 @@ jobs:
         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         MONGODB_URI: ${{ secrets.MONGODB_URI }}
         MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
+        PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
         TABLE_OCR: "tesseract"
         OCR_AGENT: "tesseract"
         CI: "true"

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.11.1-dev4
+## 0.11.1-dev5
 
 ### Enhancements
 
@@ -7,13 +7,15 @@
 ### Features
 
 * **Adds HubSpot connector** Adds connector to retrieve call, communications, emails, notes, products and tickets from HubSpot
+* **Add Pinecone destination connector.** Problem: After ingesting data from a source, users might want to produce embeddings for their data and write these into a vector DB. Pinecone is an option among these vector databases. Feature: Added Pinecone destination connector to be able to ingest documents from any supported source, embed them and write the embeddings / documents into Pinecone.
 
 ### Fixes
 
 * **Do not extract text of `<style>` tags in HTML.** `<style>` tags containing CSS in invalid positions previously contributed to element text. Do not consider text node of a `<style>` element as textual content.
 * **Fix DOCX merged table cell repeats cell text.** Only include text for a merged cell, not for each underlying cell spanned by the merge.
 * **Fix tables not extracted from DOCX header/footers.** Headers and footers in DOCX documents skip tables defined in the header and commonly used for layout/alignment purposes. Extract text from tables as a string and include in the `Header` and `Footer` document elements.
 * **Fix output filepath for fsspec-based source connectors.** Previously the base directory was being included in the output filepath unnecessarily.
+* **Process chunking parameter names in ingest correctly** Solves a bug where chunking parameters weren't being processed and used by ingest cli by renaming faulty parameter names and prepends; adds relevant parameters to ingest pinecone test to verify that the parameters are functional.
 
 ## 0.11.0
 

diff --git a/Makefile b/Makefile
@@ -211,6 +211,10 @@ install-ingest-jira:
 install-ingest-hubspot:
 	python3 -m pip install -r requirements/ingest-hubspot.txt
 
+.PHONY: install-ingest-pinecone
+install-ingest-pinecone:
+	python3 -m pip install -r requirements/ingest/pinecone.txt
+
 .PHONY: install-embed-huggingface
 install-embed-huggingface:
 	python3 -m pip install -r requirements/ingest/embed-huggingface.txt

diff --git a/docs/source/core/chunking.rst b/docs/source/core/chunking.rst
@@ -26,11 +26,11 @@ that span between pages. This kwarg is ``True`` by default.
 not split elements, it is possible for a section to exceed that length, for
 example if a ``NarrativeText`` element exceeds ``1500`` characters on its own.
 
-Similarly, sections under ``combine_under_n_chars`` will be combined if they
+Similarly, sections under ``combine_text_under_n_chars`` will be combined if they
 do not exceed the specified threshold, which defaults to ``500``. This will combine
 a series of ``Title`` elements that occur one after another, which sometimes
 happens in lists that are not detected as ``ListItem`` elements. Set
-``combine_under_n_chars=0`` to turn off this behavior.
+``combine_text_under_n_chars=0`` to turn off this behavior.
 
 The following shows an example of how to use ``chunk_by_title``. You will
 see the document chunked into sections instead of elements.

diff --git a/docs/source/ingest/configs/chunking_config.rst b/docs/source/ingest/configs/chunking_config.rst
@@ -16,4 +16,5 @@ Configs
 * ``chunk_elements (default False)``: Boolean flag whether to run chunking as part of the ingest process.
 * ``multipage_sections (default True)``: If True, sections can span multiple pages.
 * ``combine_text_under_n_chars (default 500)``: Combines elements (for example a series of titles) until a section reaches a length of n characters. Defaults to `max_characters` which combines chunks whenever space allows. Specifying 0 for this argument suppresses combining of small chunks. Note this value is "capped" at the `new_after_n_chars` value since a value higher than that would not change this parameter's effect.
+* ``new_after_n_chars (default 1500)``: Cuts off new sections once they reach a length of n characters (soft max). Defaults to `max_characters` when not specified, which effectively disables any soft window. Specifying 0 for this argument causes each element to appear in a chunk by itself (although an element with text longer than `max_characters` will still be split into two or more chunks).
 * ``max_characters (default 1500)``: Chunks elements text and text_as_html (if present) into chunks of length n characters (hard max)
diff --git a/docs/source/ingest/configs/embedding_config.rst b/docs/source/ingest/configs/embedding_config.rst
@@ -10,5 +10,6 @@ the dataset.
 
 Configs
 ---------------------
-* ``api_key``: If an api key is required to generate the embeddings via an api (i.e. OpenAI)
-* ``model_name``: The model to use for the embedder.
+* ``embedding_provider``: An unstructured embedding provider to use while doing embedding. A few examples: langchain-openai, langchain-huggingface, langchain-aws-bedrock.
+* ``embedding_api_key``: If an api key is required to generate the embeddings via an api (e.g. OpenAI)
+* ``embedding_model_name``: The model to use for the embedder, if necessary.
diff --git a/docs/source/ingest/destination_connectors.rst b/docs/source/ingest/destination_connectors.rst
@@ -11,3 +11,5 @@ in our community `Slack. <https://short.unstructured.io/pzw05l7>`_
    destination_connectors/azure_cognitive_search
    destination_connectors/delta_table
    destination_connectors/mongodb
+   destination_connectors/pinecone
+   destination_connectors/s3
diff --git a/docs/source/ingest/destination_connectors/pinecone.rst b/docs/source/ingest/destination_connectors/pinecone.rst
@@ -0,0 +1,79 @@
+Pinecone
+===========
+
+Batch process all your records using ``unstructured-ingest`` to store structured outputs and embeddings locally on your filesystem and upload those to a Pinecone index.
+
+First you'll need to install the Pinecone dependencies as shown here.
+
+.. code:: shell
+
+  pip install "unstructured[pinecone]"
+
+Run Locally
+-----------
+The upstream connector can be any of the supported ones; for convenience, the sample command below uses the
+upstream local connector. This will create new files on your local filesystem.
+
+.. tabs::
+
+   .. tab:: Shell
+
+      .. code:: shell
+
+        unstructured-ingest \
+            local \
+            --input-path example-docs/book-war-and-peace-1225p.txt \
+            --output-dir local-to-pinecone \
+            --strategy fast \
+            --chunk-elements \
+            --embedding-provider <an unstructured embedding provider, ie. langchain-huggingface> \
+            --num-processes 2 \
+            --verbose \
+            --work-dir "<directory for intermediate outputs to be saved>" \
+            pinecone \
+            --api-key <your pinecone api key here> \
+            --index-name <your index name here, ie. ingest-test> \
+            --environment <your environment name here, ie. gcp-starter> \
+            --batch-size <number of elements to be uploaded per batch, ie. 80> \
+            --num-processes <number of processes to be used to upload, ie. 2>
+
+   .. tab:: Python
+
+      .. code:: python
+
+        import os
+
+        from unstructured.ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig, ChunkingConfig, EmbeddingConfig
+        from unstructured.ingest.runner import LocalRunner
+        if __name__ == "__main__":
+            runner = LocalRunner(
+                processor_config=ProcessorConfig(
+                    verbose=True,
+                    output_dir="local-output-to-pinecone",
+                    num_processes=2,
+                ),
+                read_config=ReadConfig(),
+                partition_config=PartitionConfig(),
+                chunking_config=ChunkingConfig(
+                  chunk_elements=True
+                ),
+                embedding_config=EmbeddingConfig(
+                  provider="langchain-huggingface",
+                ),
+                writer_type="pinecone",
+                writer_kwargs={
+                    "api_key": os.getenv("PINECONE_API_KEY"),
+                    "index_name": os.getenv("PINECONE_INDEX_NAME"),
+                    "environment": os.getenv("PINECONE_ENVIRONMENT_NAME"),
+                    "batch_size": 80,
+                    "num_processes": 2,
+                }
+            )
+            runner.run(
+                input_path="example-docs/fake-memo.pdf",
+            )
+
+
+For a full list of the options the CLI accepts check ``unstructured-ingest <upstream connector> pinecone --help``.
+
+NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you're running this locally. You can find more information about this in the `installation guide <https://unstructured-io.github.io/unstructured/installing.html>`_.
diff --git a/docs/source/ingest/destination_connectors/s3.rst b/docs/source/ingest/destination_connectors/s3.rst
@@ -0,0 +1,73 @@
+S3
+===========
+
+Batch process all your records using ``unstructured-ingest`` to store structured outputs locally on your filesystem and upload those local files to an S3 bucket.
+
+First you'll need to install the S3 dependencies as shown here.
+
+.. code:: shell
+
+  pip install "unstructured[s3]"
+
+Run Locally
+-----------
+The upstream connector can be any of the supported ones; for convenience, the sample command below uses the
+upstream local connector. This will create new files on your local filesystem.
+
+.. tabs::
+
+   .. tab:: Shell
+
+      .. code:: shell
+
+        unstructured-ingest \
+            local \
+            --input-path example-docs/book-war-and-peace-1225p.txt \
+            --output-dir local-to-s3 \
+            --strategy fast \
+            --chunk-elements \
+            --embedding-provider <an unstructured embedding provider, ie. langchain-huggingface> \
+            --num-processes 2 \
+            --verbose \
+            --work-dir "<directory for intermediate outputs to be saved>" \
+            s3 \
+            --anonymous \
+            --remote-url "<your destination path here, ie 's3://unstructured/war-and-peace-output'>"
+
+   .. tab:: Python
+
+      .. code:: python
+
+        import os
+
+        from unstructured.ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig, ChunkingConfig, EmbeddingConfig
+        from unstructured.ingest.runner import LocalRunner
+        if __name__ == "__main__":
+            runner = LocalRunner(
+                processor_config=ProcessorConfig(
+                    verbose=True,
+                    output_dir="local-output-to-s3",
+                    num_processes=2,
+                ),
+                read_config=ReadConfig(),
+                partition_config=PartitionConfig(),
+                chunking_config=ChunkingConfig(
+                  chunk_elements=True
+                ),
+                embedding_config=EmbeddingConfig(
+                  provider="langchain-huggingface",
+                ),
+                writer_type="s3",
+                writer_kwargs={
+                    "anonymous": True,
+                    "--remote-url": "<your destination path here, ie 's3://unstructured/war-and-peace-output'>",
+                }
+            )
+            runner.run(
+                input_path="example-docs/book-war-and-peace-1225p.txt",
+            )
+
+
+For a full list of the options the CLI accepts check ``unstructured-ingest <upstream connector> s3 --help``.
+
+NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you're running this locally. You can find more information about this in the `installation guide <https://unstructured-io.github.io/unstructured/installing.html>`_.
diff --git a/examples/ingest/pinecone/ingest.sh b/examples/ingest/pinecone/ingest.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Processes example-docs/book-war-and-peace-1225p.txt with the local source connector,
+# embeds the processed documents, and writes the results to a Pinecone index.
+
+# Structured outputs are stored in local-to-pinecone/
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd "$SCRIPT_DIR"/../../.. || exit 1
+
+# Placeholders below are quoted so bash does not parse '<' and '>' as redirections.
+# As an example we're using the local source connector,
+# however ingesting from any supported source connector is possible.
+# shellcheck disable=2094
+PYTHONPATH=. ./unstructured/ingest/main.py \
+        local \
+        --input-path example-docs/book-war-and-peace-1225p.txt \
+        --output-dir local-to-pinecone \
+        --strategy fast \
+        --chunk-elements \
+        --embedding-provider "<an unstructured embedding provider, ie. langchain-huggingface>" \
+        --num-processes 2 \
+        --verbose \
+        --work-dir "<directory for intermediate outputs to be saved>" \
+        pinecone \
+        --api-key "<Pinecone API Key to write into a Pinecone index>" \
+        --index-name "<Pinecone index name, ie: ingest-test>" \
+        --environment "<Pinecone environment name, ie: gcp-starter>" \
+        --batch-size "<Number of elements to be uploaded per batch, ie. 80>" \
+        --num-processes "<Number of processes to be used to upload, ie. 2>"
diff --git a/requirements/ingest/pinecone.in b/requirements/ingest/pinecone.in
@@ -0,0 +1,3 @@
+-c ../constraints.in
+-c ../base.txt
+pinecone-client
diff --git a/requirements/ingest/pinecone.txt b/requirements/ingest/pinecone.txt
@@ -0,0 +1,56 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile requirements/ingest-pinecone.in
+#
+certifi==2023.7.22
+    # via
+    #   -c requirements/base.txt
+    #   -c requirements/constraints.in
+    #   requests
+charset-normalizer==3.3.0
+    # via
+    #   -c requirements/base.txt
+    #   requests
+dnspython==2.4.2
+    # via pinecone-client
+idna==3.4
+    # via
+    #   -c requirements/base.txt
+    #   requests
+loguru==0.7.2
+    # via pinecone-client
+numpy==1.24.4
+    # via
+    #   -c requirements/base.txt
+    #   -c requirements/constraints.in
+    #   pinecone-client
+pinecone-client==2.2.4
+    # via -r requirements/ingest-pinecone.in
+python-dateutil==2.8.2
+    # via pinecone-client
+pyyaml==6.0.1
+    # via pinecone-client
+requests==2.31.0
+    # via
+    #   -c requirements/base.txt
+    #   pinecone-client
+six==1.16.0
+    # via
+    #   -c requirements/base.txt
+    #   python-dateutil
+tqdm==4.66.1
+    # via
+    #   -c requirements/base.txt
+    #   pinecone-client
+typing-extensions==4.8.0
+    # via
+    #   -c requirements/base.txt
+    #   pinecone-client
+urllib3==1.26.18
+    # via
+    #   -c requirements/base.txt
+    #   -c requirements/constraints.in
+    #   pinecone-client
+    #   requests
diff --git a/setup.py b/setup.py
@@ -127,33 +127,34 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
         "tsv": tsv_reqs,
         "xlsx": xlsx_reqs,
         # Extra requirements for data connectors
-        "s3": load_requirements("requirements/ingest/s3.in"),
+        "airtable": load_requirements("requirements/ingest/airtable.in"),
         "azure": load_requirements("requirements/ingest/azure.in"),
         "azure-cognitive-search": load_requirements(
             "requirements/ingest/azure-cognitive-search.in",
         ),
         "biomed": load_requirements("requirements/ingest/biomed.in"),
+        "box": load_requirements("requirements/ingest/box.in"),
+        "confluence": load_requirements("requirements/ingest/confluence.in"),
+        "delta-table": load_requirements("requirements/ingest/delta-table.in"),
         "discord": load_requirements("requirements/ingest/discord.in"),
+        "dropbox": load_requirements("requirements/ingest/dropbox.in"),
+        "elasticsearch": load_requirements("requirements/ingest/elasticsearch.in"),
+        "gcs": load_requirements("requirements/ingest/gcs.in"),
         "github": load_requirements("requirements/ingest/github.in"),
         "gitlab": load_requirements("requirements/ingest/gitlab.in"),
-        "reddit": load_requirements("requirements/ingest/reddit.in"),
-        "notion": load_requirements("requirements/ingest/notion.in"),
-        "slack": load_requirements("requirements/ingest/slack.in"),
-        "wikipedia": load_requirements("requirements/ingest/wikipedia.in"),
         "google-drive": load_requirements("requirements/ingest/google-drive.in"),
-        "gcs": load_requirements("requirements/ingest/gcs.in"),
-        "elasticsearch": load_requirements("requirements/ingest/elasticsearch.in"),
-        "dropbox": load_requirements("requirements/ingest/dropbox.in"),
-        "box": load_requirements("requirements/ingest/box.in"),
+        "hubspot": load_requirements("requirements/ingest/hubspot.in"),
+        "jira": load_requirements("requirements/ingest/jira.in"),
+        "notion": load_requirements("requirements/ingest/notion.in"),
         "onedrive": load_requirements("requirements/ingest/onedrive.in"),
         "outlook": load_requirements("requirements/ingest/outlook.in"),
-        "confluence": load_requirements("requirements/ingest/confluence.in"),
-        "airtable": load_requirements("requirements/ingest/airtable.in"),
+        "pinecone": load_requirements("requirements/ingest/pinecone.in"),
+        "reddit": load_requirements("requirements/ingest/reddit.in"),
+        "s3": load_requirements("requirements/ingest/s3.in"),
         "sharepoint": load_requirements("requirements/ingest/sharepoint.in"),
-        "delta-table": load_requirements("requirements/ingest/delta-table.in"),
         "salesforce": load_requirements("requirements/ingest/salesforce.in"),
-        "jira": load_requirements("requirements/ingest/jira.in"),
-        "hubspot": load_requirements("requirements/ingest/hubspot.in"),
+        "slack": load_requirements("requirements/ingest/slack.in"),
+        "wikipedia": load_requirements("requirements/ingest/wikipedia.in"),
         # Legacy extra requirements
         "huggingface": load_requirements("requirements/huggingface.in"),
         "local-inference": all_doc_reqs,