From 9fea85dc210a52f0aaafecad97a43fd708a98304 Mon Sep 17 00:00:00 2001 From: David Potter Date: Tue, 23 Jan 2024 13:52:11 -0800 Subject: [PATCH 1/3] fix: remove none value keys from flattened dictionary (#2442) When a partitioned or embedded document JSON has null values, those get converted to a dictionary with None values. This happens in the metadata; I have not seen it in other keys. Chroma and Pinecone reject those None values. `flatten_dict` has been modified with a `remove_none` arg to remove keys with None values (a usage sketch appears at the end of this patch series). Also, Pinecone has been pinned at 2.2.4 because versions 3.0 and above break our code. --------- Co-authored-by: potter-potter --- CHANGELOG.md | 3 +- requirements/ingest/pinecone.in | 2 +- .../staging/test_base_staging.py | 28 +++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/ingest/connector/chroma.py | 4 ++- unstructured/ingest/connector/pinecone.py | 1 + unstructured/staging/base.py | 17 +++++++++-- 7 files changed, 50 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 941c847c2f..a1bc828652 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.12.3-dev3 +## 0.12.3-dev4 ### Enhancements @@ -12,6 +12,7 @@ ### Fixes * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector. * **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path. +* **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values with Pinecone and Chroma destinations. 
Pins Pinecone dependency ## 0.12.2 diff --git a/requirements/ingest/pinecone.in b/requirements/ingest/pinecone.in index ebaedb531b..d1cc814f89 100644 --- a/requirements/ingest/pinecone.in +++ b/requirements/ingest/pinecone.in @@ -1,3 +1,3 @@ -c ../constraints.in -c ../base.txt -pinecone-client +pinecone-client==2.2.4 diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py index f113f1f3d5..52f1410c17 100644 --- a/test_unstructured/staging/test_base_staging.py +++ b/test_unstructured/staging/test_base_staging.py @@ -464,6 +464,34 @@ def test_flatten_dict_flatten_list_omit_keys(): ) +def test_flatten_dict_flatten_list_omit_keys_remove_none(): + """Flattening a dictionary with flatten_lists set to True and also omitting keys + and setting remove_none to True""" + dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}} + keys_to_omit = ["c"] + expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c": {"d": None, "e": [6, 7]}} + assert ( + base.flatten_dict( + dictionary, keys_to_omit=keys_to_omit, flatten_lists=True, remove_none=True + ) + == expected_result + ) + + +def test_flatten_dict_flatten_list_remove_none(): + """Flattening a dictionary with flatten_lists set to True and setting remove_none to True""" + dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}} + expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6, "c_e_1": 7} + assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result + + +def test_flatten_dict_flatten_list_none_in_list_remove_none(): + """Flattening a dictionary with flatten_lists and remove_none set to True and None in list""" + dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}} + expected_result = {"a": 1, "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6} + assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result + + def test_flatten_dict_flatten_list_omit_keys2(): """Flattening a dictionary with flatten_lists set to True and also omitting keys""" dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}} diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e62434f7cd..5b743183bf 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.3-dev3" # pragma: no cover +__version__ = "0.12.3-dev4" # pragma: no cover diff --git a/unstructured/ingest/connector/chroma.py b/unstructured/ingest/connector/chroma.py index ae9912ec53..688f4f2dad 100644 --- a/unstructured/ingest/connector/chroma.py +++ b/unstructured/ingest/connector/chroma.py @@ -151,5 +151,7 @@ def normalize_dict(self, element_dict: dict) -> dict: "id": str(uuid.uuid4()), "embedding": element_dict.pop("embeddings", None), "document": element_dict.pop("text", None), - "metadata": flatten_dict(element_dict, separator="-", flatten_lists=True), + "metadata": flatten_dict( + element_dict, separator="-", flatten_lists=True, remove_none=True + ), } diff --git a/unstructured/ingest/connector/pinecone.py b/unstructured/ingest/connector/pinecone.py index e117043e1b..dd6f5d023c 100644 --- a/unstructured/ingest/connector/pinecone.py +++ b/unstructured/ingest/connector/pinecone.py @@ -135,6 +135,7 @@ def normalize_dict(self, element_dict: dict) -> dict: element_dict, separator="-", flatten_lists=True, + remove_none=True, ), }, } diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 08f29e9d1b..b7e17a0659 100644 --- a/unstructured/staging/base.py +++ 
b/unstructured/staging/base.py @@ -177,20 +177,30 @@ def elements_from_json( def flatten_dict( - dictionary, parent_key="", separator="_", flatten_lists=False, keys_to_omit: List[str] = None + dictionary, + parent_key="", + separator="_", + flatten_lists=False, + remove_none=False, + keys_to_omit: List[str] = None, ): """Flattens a nested dictionary into a single level dictionary. keys_to_omit is a list of keys that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}. - If flatten_lists is True, then lists and tuples are flattened as well.""" + If flatten_lists is True, then lists and tuples are flattened as well. + If remove_none is True, then entries with a None value are removed from the flattened dictionary.""" keys_to_omit = keys_to_omit if keys_to_omit else [] flattened_dict = {} for key, value in dictionary.items(): new_key = f"{parent_key}{separator}{key}" if parent_key else key if new_key in keys_to_omit: flattened_dict[new_key] = value + elif value is None and remove_none: + continue elif isinstance(value, dict): flattened_dict.update( - flatten_dict(value, new_key, separator, flatten_lists, keys_to_omit=keys_to_omit), + flatten_dict( + value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit + ), ) elif isinstance(value, (list, tuple)) and flatten_lists: for index, item in enumerate(value): @@ -200,6 +210,7 @@ def flatten_dict( "", separator, flatten_lists, + remove_none, keys_to_omit=keys_to_omit, ) ) From 4613e52e11c6d5cfe9f4c82d61fb6c0cd53b797c Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 24 Jan 2024 12:48:36 -0500 Subject: [PATCH 2/3] fix: treat yaml files as plain text (#2446) ### Summary Closes #2412. Adds support for YAML MIME types and treats them as plain text. This addresses the `500` errors that the API currently returns when the MIME type is `text/yaml`. --- CHANGELOG.md | 2 ++ test_unstructured/file_utils/test_filetype.py | 21 +++++++++++++++++++ unstructured/file_utils/filetype.py | 11 +++++++++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a1bc828652..0e8ea10274 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ * **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service. ### Fixes +* **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those + files as text. * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector. * **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path. * **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values with Pinecone and Chroma destinations. 
Pins Pinecone dependency diff --git a/test_unstructured/file_utils/test_filetype.py index 814b4c726d..2388b07aaf 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -4,6 +4,7 @@ import magic import pytest +import yaml from PIL import Image from unstructured.file_utils import filetype @@ -481,3 +482,23 @@ def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"): def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"): with open(filename, "rb") as f: assert detect_filetype(file=f) == FileType.WAV + + +def test_detect_yaml_as_text_from_filename(tmpdir): + data = {"hi": "there", "this is": "yaml"} + filename = os.path.join(tmpdir.dirname, "test.yaml") + with open(filename, "w") as f: + yaml.dump(data, f) + + assert detect_filetype(filename=filename) == FileType.TXT + + +def test_detect_yaml_as_text_from_file(tmpdir, monkeypatch): + monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/yaml") + data = {"hi": "there", "this is": "yaml"} + filename = os.path.join(tmpdir.dirname, "test.yaml") + with open(filename, "w") as f: + yaml.dump(data, f) + + with open(filename, "rb") as f: + assert detect_filetype(file=f) == FileType.TXT diff --git a/unstructured/file_utils/filetype.py index 96eac3a118..6353b0278f 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -114,6 +114,13 @@ def __lt__(self, other): "image/png": FileType.PNG, "image/tiff": FileType.TIFF, "image/bmp": FileType.BMP, + # NOTE(robinson) - https://mimetype.io/application/yaml + # In the future, we may have special processing for YAML + # files instead of treating them as plaintext + "application/yaml": FileType.TXT, + "application/x-yaml": FileType.TXT, + "text/x-yaml": FileType.TXT, + "text/yaml": FileType.TXT, "text/plain": FileType.TXT, "text/x-csv": FileType.CSV, "application/csv": FileType.CSV, @@ -209,6 +216,8 @@ def __lt__(self, other): ".swift": FileType.TXT, ".ts": FileType.TXT, ".go": FileType.TXT, + ".yaml": FileType.TXT, + ".yml": FileType.TXT, None: FileType.UNK, } @@ -349,7 +358,7 @@ def detect_filetype( return FileType.EML if extension in PLAIN_TEXT_EXTENSIONS: - return EXT_TO_FILETYPE.get(extension) + return EXT_TO_FILETYPE.get(extension, FileType.UNK) # Safety catch if mime_type in STR_TO_FILETYPE: From d8b3bdb919579dddba8bfc925e4b99ca807e3005 Mon Sep 17 00:00:00 2001 From: Antonio Jose Jimeno Yepes Date: Thu, 25 Jan 2024 13:33:32 +1100 Subject: [PATCH 3/3] Check chipper version and prevent running pdfminer with chipper (#2347) We have added a new version of Chipper (Chipperv3), so unstructured needs to work effectively with all current Chipper versions. This implies resizing images to the appropriate resolution and making sure that Chipper elements are not sorted by unstructured. In addition, PDFMiner was being called alongside Chipper, which adds repeated elements from both Chipper and PDFMiner. To evaluate this PR, you can test the code below with the attached PDF. The code writes a JSON file with the generated elements. The output can be examined with `cat out.un.json | python -m json.tool`. There are three things to check: 1. 
The size of the image passed to Chipper, which can be identified in the layout_height and layout_width attributes; these should have the values 3301 and 2550, as shown in the example below: ``` [ { "element_id": "c0493a7872f227e4172c4192c5f48a06", "metadata": { "coordinates": { "layout_height": 3301, "layout_width": 2550, ``` 2. There should be no repeated elements. 3. Order should be closer to reading order. The script to run Chipper from unstructured is: ``` from unstructured import __version__ print(__version__.__version__) import json from unstructured.partition.auto import partition from unstructured.staging.base import elements_to_json elements = json.loads(elements_to_json(partition("Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf", strategy="hi_res", model_name="chipperv3"))) with open('out.un.json', 'w') as w: json.dump(elements, w) ``` [Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf](https://github.com/Unstructured-IO/unstructured/files/13817273/Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf) --------- Co-authored-by: Antonio Jimeno Yepes --- CHANGELOG.md | 3 +- .../partition/pdf_image/test_chipper.py | 10 +++ unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 77 ++++++++++--------- 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e8ea10274..702919886b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.12.3-dev4 +## 0.12.3-dev5 ### Enhancements @@ -10,6 +10,7 @@ * **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service. ### Fixes +* **Fix support for different Chipper versions and prevent running PDFMiner with Chipper** * **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those files as text. * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector. 
diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py index d625f97876..81f421159c 100644 --- a/test_unstructured/partition/pdf_image/test_chipper.py +++ b/test_unstructured/partition/pdf_image/test_chipper.py @@ -30,3 +30,13 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children): [el for el in chipper_results if el.id == child.metadata.parent_id] for child in chipper_children ) + + +def chipper_test_pdfminer_repeated(chipper_results): + """ + Test to verify that PDFMiner has not been run together with Chipper + """ + elements = chipper_results + assert len([element.text for element in elements]) == len( + {element.text for element in elements} + ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5b743183bf..230fc8d54b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.3-dev4" # pragma: no cover +__version__ = "0.12.3-dev5" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 8a0b003028..867a53a075 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -298,8 +298,8 @@ def _partition_pdf_or_image_local( hi_res_model_name or model_name or default_hi_res_model(infer_table_structure) ) if pdf_image_dpi is None: - pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200 - if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"): + pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200 + if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")): logger.warning( "The Chipper model performs better when images are rendered with DPI >= 300 " f"(currently {pdf_image_dpi}).", @@ -313,32 +313,33 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, ) - extracted_layout = ( - process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) - if pdf_text_extractable - else [] - ) + if hi_res_model_name.startswith("chipper"): + # NOTE(alan): We shouldn't do OCR with chipper + # NOTE(antonio): We shouldn't do PDFMiner with chipper + final_document_layout = inferred_document_layout + else: + extracted_layout = ( + process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) + if pdf_text_extractable + else [] + ) + + if analysis: + annotate_layout_elements( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + filename=filename, + output_dir_path=analyzed_image_output_dir_path, + pdf_image_dpi=pdf_image_dpi, + is_image=is_image, + ) - if analysis: - annotate_layout_elements( + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = merge_inferred_with_extracted_layout( inferred_document_layout=inferred_document_layout, extracted_layout=extracted_layout, - filename=filename, - output_dir_path=analyzed_image_output_dir_path, - pdf_image_dpi=pdf_image_dpi, - is_image=is_image, ) - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = merge_inferred_with_extracted_layout( - inferred_document_layout=inferred_document_layout, - extracted_layout=extracted_layout, - ) - - if hi_res_model_name.startswith("chipper"): - # NOTE(alan): We shouldn't do OCR with chipper - final_document_layout = merged_document_layout - else: final_document_layout = process_file_with_ocr( filename, merged_document_layout, @@ -355,23 +356,27 @@ def _partition_pdf_or_image_local( 
model_name=hi_res_model_name, pdf_image_dpi=pdf_image_dpi, ) - if hasattr(file, "seek"): - file.seek(0) - - extracted_layout = ( - process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else [] - ) - - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = merge_inferred_with_extracted_layout( - inferred_document_layout=inferred_document_layout, - extracted_layout=extracted_layout, - ) if hi_res_model_name.startswith("chipper"): # NOTE(alan): We shouldn't do OCR with chipper + # NOTE(antonio): We shouldn't do PDFMiner with chipper final_document_layout = merged_document_layout else: + if hasattr(file, "seek"): + file.seek(0) + + extracted_layout = ( + process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) + if pdf_text_extractable + else [] + ) + + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = merge_inferred_with_extracted_layout( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + ) + if hasattr(file, "seek"): file.seek(0) final_document_layout = process_data_with_ocr( @@ -385,7 +390,7 @@ def _partition_pdf_or_image_local( ) # NOTE(alan): starting with v2, chipper sorts the elements itself. - if hi_res_model_name == "chipper": + if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1": kwargs["sort_mode"] = SORT_MODE_DONT final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
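The three patches above each lend themselves to a short, self-contained example. First, the usage sketch promised in the description of patch 1: this calls the real `flatten_dict` from `unstructured.staging.base`; only the sample metadata dictionary is invented, mirroring the tests that the patch adds.

```
from unstructured.staging.base import flatten_dict

# Sample dictionary (invented for illustration) shaped like element metadata
# after partitioning, with None leaves at several nesting levels.
metadata = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}}

# Default behavior: the None leaves survive flattening, and Chroma/Pinecone
# reject metadata that contains them.
assert flatten_dict(metadata, flatten_lists=True) == {
    "a": None, "b_0": 2, "b_1": 3, "b_2": 4, "c_d": None, "c_e_0": 6, "c_e_1": None,
}

# With remove_none=True, every None-valued entry is dropped, including ones
# nested inside dictionaries and lists.
assert flatten_dict(metadata, flatten_lists=True, remove_none=True) == {
    "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6,
}
```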
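Next, patch 2's YAML handling, condensed from its own tests. The names used (`detect_filetype`, `FileType`) are the real ones the patch touches; the temporary file and its contents are illustrative.

```
import tempfile

from unstructured.file_utils.filetype import FileType, detect_filetype

# Write a small YAML document to a temporary file; with the new MIME and
# extension mappings it is detected as plain text instead of triggering
# a 500 from file-type detection.
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
    f.write("hi: there\nthis is: yaml\n")

assert detect_filetype(filename=f.name) == FileType.TXT
```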
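Finally, the `pdf.py` hunks of patch 3 reduce to a small decision table keyed on the model name. The sketch below is a standalone summary rather than library code: the helper `plan_hi_res_pipeline` and the shape of its return value are invented, while the individual decisions (DPI default, skipping PDFMiner and OCR, disabling sorting for Chipper v2 and later) are taken straight from the diff.

```
from typing import Optional


def plan_hi_res_pipeline(hi_res_model_name: str, pdf_image_dpi: Optional[int] = None) -> dict:
    """Summarize which steps the hi_res code path runs for a given model."""
    is_chipper = hi_res_model_name.startswith("chipper")
    return {
        # Chipper performs best on high-resolution renders, so it defaults
        # to 300 DPI where other hi_res models default to 200.
        "pdf_image_dpi": pdf_image_dpi if pdf_image_dpi is not None else (300 if is_chipper else 200),
        # PDFMiner extraction and OCR would only duplicate Chipper's output.
        "run_pdfminer": not is_chipper,
        "run_ocr": not is_chipper,
        # From v2 onward Chipper emits elements in reading order itself, so
        # unstructured's own sorting is switched off (SORT_MODE_DONT).
        "disable_sorting": is_chipper and hi_res_model_name != "chipperv1",
    }


assert plan_hi_res_pipeline("chipperv3") == {
    "pdf_image_dpi": 300,
    "run_pdfminer": False,
    "run_ocr": False,
    "disable_sorting": True,
}
assert plan_hi_res_pipeline("yolox")["run_pdfminer"] is True
```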