Commit a43e528

Merge remote-tracking branch 'origin/main' into fix/default-hi-res-rely-on-inference-setting
badGarnet committed Jan 25, 2024
2 parents 7d535a6 + d8b3bdb commit a43e528
Showing 11 changed files with 136 additions and 45 deletions.
8 changes: 6 additions & 2 deletions CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.12.3-dev4
+## 0.12.3-dev6

### Enhancements

@@ -11,9 +11,13 @@

### Fixes

-* **Default `hi_res_model_name` now relies on `unstructured-inference`.** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image`, the default model is picked by `unstructured-inference`'s settings or by the OS environment variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; the default is now the same regardless of `infer_table_structure`'s value. The resolving function will be deprecated in the future, after which the default model name will rely solely on `unstructured-inference` and the environment variable will no longer be consulted.
* **Fix support for different Chipper versions and prevent running PDFMiner with Chipper**
* **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those
files as text.
* **Fix FSSpec destination connectors `check_connection`.** FSSpec destination connectors did not use `check_connection`, and calling `ls` on the destination directory could fail because the directory may not exist when the connector is created. Now `check_connection` calls `ls` on the bucket root, and the method is invoked in the destination connector's `initialize`.
* **Fix databricks-volumes extra location.** `setup.py` was pointing to the wrong location for the databricks-volumes extra requirements, which caused errors when building the wheel for unstructured. This change points it at the correct path.
* **Fix uploading None values to Chroma and Pinecone.** Removes keys with `None` values when writing to the Pinecone and Chroma destinations. Pins the Pinecone dependency.
+* **Default `hi_res_model_name` now relies on `unstructured-inference`.** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image`, the default model is picked by `unstructured-inference`'s settings or by the OS environment variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; the default is now the same regardless of `infer_table_structure`'s value. The resolving function will be deprecated in the future, after which the default model name will rely solely on `unstructured-inference` and the environment variable will no longer be consulted.

## 0.12.2

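The `hi_res_model_name` entry above amounts to the following resolution order. A minimal sketch, assuming `default_hi_res_model` consults the environment variable directly; the final fallback shown is only a placeholder, since the real default comes from `unstructured-inference`'s settings:

```python
import os

def default_hi_res_model() -> str:
    # Sketch of the resolution order described in the CHANGELOG entry,
    # not the library's actual implementation.
    # 1. UNSTRUCTURED_HI_RES_MODEL_NAME wins when set; per the entry, this
    #    environment lookup is itself slated for deprecation.
    env_model = os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
    if env_model:
        return env_model
    # 2. Otherwise unstructured-inference's own default applies, now the
    #    same regardless of infer_table_structure ("yolox" is a placeholder).
    return "yolox"
```

An explicit `hi_res_model_name=` argument bypasses this entirely, as the `pdf.py` hunk later in this commit shows (`hi_res_model_name or model_name or default_hi_res_model()`).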
2 changes: 1 addition & 1 deletion requirements/ingest/pinecone.in
@@ -1,3 +1,3 @@
 -c ../constraints.in
 -c ../base.txt
-pinecone-client
+pinecone-client==2.2.4
21 changes: 21 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
@@ -4,6 +4,7 @@

 import magic
 import pytest
+import yaml
 from PIL import Image

 from unstructured.file_utils import filetype
@@ -481,3 +482,23 @@ def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"):
 def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"):
     with open(filename, "rb") as f:
         assert detect_filetype(file=f) == FileType.WAV
+
+
+def test_detect_yaml_as_text_from_filename(tmpdir):
+    data = {"hi": "there", "this is": "yaml"}
+    filename = os.path.join(tmpdir.dirname, "test.yaml")
+    with open(filename, "w") as f:
+        yaml.dump(data, f)
+
+    assert detect_filetype(filename=filename) == FileType.TXT
+
+
+def test_detect_yaml_as_text_from_file(tmpdir, monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/yaml")
+    data = {"hi": "there", "this is": "yaml"}
+    filename = os.path.join(tmpdir.dirname, "test.yaml")
+    with open(filename, "w") as f:
+        yaml.dump(data, f)
+
+    with open(filename, "rb") as f:
+        assert detect_filetype(file=f) == FileType.TXT
10 changes: 10 additions & 0 deletions test_unstructured/partition/pdf_image/test_chipper.py
@@ -30,3 +30,13 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
         [el for el in chipper_results if el.id == child.metadata.parent_id]
         for child in chipper_children
     )
+
+
+def chipper_test_pdfminer_repeated(chipper_results):
+    """
+    Test to verify that PDFMiner has not been run together with Chipper
+    """
+    elements = chipper_results
+    assert len([element.text for element in elements]) == len(
+        {element.text for element in elements}
+    )
28 changes: 28 additions & 0 deletions test_unstructured/staging/test_base_staging.py
@@ -464,6 +464,34 @@ def test_flatten_dict_flatten_list_omit_keys():
     )


+def test_flatten_dict_flatten_list_omit_keys_remove_none():
+    """Flattening a dictionary with flatten_lists set to True and also omitting keys
+    and setting remove_none to True"""
+    dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}}
+    keys_to_omit = ["c"]
+    expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c": {"d": None, "e": [6, 7]}}
+    assert (
+        base.flatten_dict(
+            dictionary, keys_to_omit=keys_to_omit, flatten_lists=True, remove_none=True
+        )
+        == expected_result
+    )
+
+
+def test_flatten_dict_flatten_list_remove_none():
+    """Flattening a dictionary with flatten_lists set to True and setting remove_none to True"""
+    dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}}
+    expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6, "c_e_1": 7}
+    assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result
+
+
+def test_flatten_dict_flatten_list_none_in_list_remove_none():
+    """Flattening a dictionary with flatten_lists and remove_none set to True and None in list"""
+    dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}}
+    expected_result = {"a": 1, "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6}
+    assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result
+
+
 def test_flatten_dict_flatten_list_omit_keys2():
     """Flattening a dictionary with flatten_lists set to True and also omitting keys"""
     dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}}
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.12.3-dev4" # pragma: no cover
__version__ = "0.12.3-dev6" # pragma: no cover
11 changes: 10 additions & 1 deletion unstructured/file_utils/filetype.py
@@ -114,6 +114,13 @@ def __lt__(self, other):
"image/png": FileType.PNG,
"image/tiff": FileType.TIFF,
"image/bmp": FileType.BMP,
# NOTE(robinson) - https://mimetype.io/application/yaml
# In the future, we may have special processing for YAML
# files instead of treating them as plaintext
"application/yaml": FileType.TXT,
"application/x-yaml": FileType.TXT,
"text/x-yaml": FileType.TXT,
"text/yaml": FileType.TXT,
"text/plain": FileType.TXT,
"text/x-csv": FileType.CSV,
"application/csv": FileType.CSV,
@@ -209,6 +216,8 @@ def __lt__(self, other):
".swift": FileType.TXT,
".ts": FileType.TXT,
".go": FileType.TXT,
".yaml": FileType.TXT,
".yml": FileType.TXT,
None: FileType.UNK,
}

@@ -349,7 +358,7 @@ def detect_filetype(
         return FileType.EML

     if extension in PLAIN_TEXT_EXTENSIONS:
-        return EXT_TO_FILETYPE.get(extension)
+        return EXT_TO_FILETYPE.get(extension, FileType.UNK)

     # Safety catch
     if mime_type in STR_TO_FILETYPE:
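With the two tables above extended, YAML resolves to `FileType.TXT` whether detection goes by extension or by MIME type. A quick illustration (the path is hypothetical; the file must exist on disk for detection to run):

```python
from unstructured.file_utils.filetype import FileType, detect_filetype

# ".yaml"/".yml" hit EXT_TO_FILETYPE; "text/yaml" and the other YAML MIME
# types hit STR_TO_FILETYPE. Either route now lands on FileType.TXT.
assert detect_filetype(filename="config.yaml") == FileType.TXT  # hypothetical existing file
```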
4 changes: 3 additions & 1 deletion unstructured/ingest/connector/chroma.py
@@ -151,5 +151,7 @@ def normalize_dict(self, element_dict: dict) -> dict:
"id": str(uuid.uuid4()),
"embedding": element_dict.pop("embeddings", None),
"document": element_dict.pop("text", None),
"metadata": flatten_dict(element_dict, separator="-", flatten_lists=True),
"metadata": flatten_dict(
element_dict, separator="-", flatten_lists=True, remove_none=True
),
}
1 change: 1 addition & 0 deletions unstructured/ingest/connector/pinecone.py
@@ -135,6 +135,7 @@ def normalize_dict(self, element_dict: dict) -> dict:
                 element_dict,
                 separator="-",
                 flatten_lists=True,
+                remove_none=True,
             ),
         },
     }
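Both connectors call the same helper, so the effect on uploaded metadata looks like this. A small sketch with a hypothetical element dict (both stores are strict about null metadata values, which is what the `remove_none=True` change addresses):

```python
from unstructured.staging.base import flatten_dict

# Lists are flattened with the "-" separator, and None-valued keys are
# dropped rather than uploaded to Chroma or Pinecone.
element_dict = {"page_number": 1, "languages": ["eng", "deu"], "coordinates": None}
metadata = flatten_dict(element_dict, separator="-", flatten_lists=True, remove_none=True)
assert metadata == {"page_number": 1, "languages-0": "eng", "languages-1": "deu"}
```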
77 changes: 41 additions & 36 deletions unstructured/partition/pdf.py
@@ -303,8 +303,8 @@ def _partition_pdf_or_image_local(

     hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
     if pdf_image_dpi is None:
-        pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200
-    if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"):
+        pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
+    if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
         logger.warning(
             "The Chipper model performs better when images are rendered with DPI >= 300 "
             f"(currently {pdf_image_dpi}).",
@@ -318,32 +318,33 @@
             pdf_image_dpi=pdf_image_dpi,
         )

-        extracted_layout = (
-            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
-            if pdf_text_extractable
-            else []
-        )
+        if hi_res_model_name.startswith("chipper"):
+            # NOTE(alan): We shouldn't do OCR with chipper
+            # NOTE(antonio): We shouldn't do PDFMiner with chipper
+            final_document_layout = inferred_document_layout
+        else:
+            extracted_layout = (
+                process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+                if pdf_text_extractable
+                else []
+            )

-        if analysis:
-            annotate_layout_elements(
-                inferred_document_layout=inferred_document_layout,
-                extracted_layout=extracted_layout,
-                filename=filename,
-                output_dir_path=analyzed_image_output_dir_path,
-                pdf_image_dpi=pdf_image_dpi,
-                is_image=is_image,
-            )
+            if analysis:
+                annotate_layout_elements(
+                    inferred_document_layout=inferred_document_layout,
+                    extracted_layout=extracted_layout,
+                    filename=filename,
+                    output_dir_path=analyzed_image_output_dir_path,
+                    pdf_image_dpi=pdf_image_dpi,
+                    is_image=is_image,
+                )

-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = merge_inferred_with_extracted_layout(
-            inferred_document_layout=inferred_document_layout,
-            extracted_layout=extracted_layout,
-        )
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = merge_inferred_with_extracted_layout(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+            )

-        if hi_res_model_name.startswith("chipper"):
-            # NOTE(alan): We shouldn't do OCR with chipper
-            final_document_layout = merged_document_layout
-        else:
             final_document_layout = process_file_with_ocr(
                 filename,
                 merged_document_layout,
@@ -360,23 +361,27 @@
             model_name=hi_res_model_name,
             pdf_image_dpi=pdf_image_dpi,
         )
-        if hasattr(file, "seek"):
-            file.seek(0)

-        extracted_layout = (
-            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
-        )
-
-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = merge_inferred_with_extracted_layout(
-            inferred_document_layout=inferred_document_layout,
-            extracted_layout=extracted_layout,
-        )
-
         if hi_res_model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
-            final_document_layout = merged_document_layout
+            # NOTE(antonio): We shouldn't do PDFMiner with chipper
+            final_document_layout = inferred_document_layout
         else:
+            if hasattr(file, "seek"):
+                file.seek(0)
+
+            extracted_layout = (
+                process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
+                if pdf_text_extractable
+                else []
+            )
+
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = merge_inferred_with_extracted_layout(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+            )
+
             if hasattr(file, "seek"):
                 file.seek(0)
             final_document_layout = process_data_with_ocr(
@@ -390,7 +395,7 @@
            )

     # NOTE(alan): starting with v2, chipper sorts the elements itself.
-    if hi_res_model_name == "chipper":
+    if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
         kwargs["sort_mode"] = SORT_MODE_DONT

     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
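Taken together, the `pdf.py` changes mean any `chipper*` model now skips both the PDFMiner pass and the OCR pass, using the inferred layout directly as the final layout, with external sorting disabled for Chipper v2 and later. A minimal sketch of exercising this path (the file path is hypothetical):

```python
from unstructured.partition.pdf import partition_pdf

# With any chipper* hi_res model, the inferred layout is used as-is:
# no PDFMiner pass (hence no duplicated text) and no OCR pass.
elements = partition_pdf(
    filename="example-docs/sample.pdf",  # hypothetical path
    strategy="hi_res",
    hi_res_model_name="chipper",
)
```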
17 changes: 14 additions & 3 deletions unstructured/staging/base.py
@@ -177,20 +177,30 @@ def elements_from_json(


 def flatten_dict(
-    dictionary, parent_key="", separator="_", flatten_lists=False, keys_to_omit: List[str] = None
+    dictionary,
+    parent_key="",
+    separator="_",
+    flatten_lists=False,
+    remove_none=False,
+    keys_to_omit: List[str] = None,
 ):
     """Flattens a nested dictionary into a single level dictionary. keys_to_omit is a list of keys
     that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}.
-    If flatten_lists is True, then lists and tuples are flattened as well."""
+    If flatten_lists is True, then lists and tuples are flattened as well.
+    If remove_none is True, then None keys/values are removed from the flattened dictionary."""
     keys_to_omit = keys_to_omit if keys_to_omit else []
     flattened_dict = {}
     for key, value in dictionary.items():
         new_key = f"{parent_key}{separator}{key}" if parent_key else key
         if new_key in keys_to_omit:
             flattened_dict[new_key] = value
+        elif value is None and remove_none:
+            continue
         elif isinstance(value, dict):
             flattened_dict.update(
-                flatten_dict(value, new_key, separator, flatten_lists, keys_to_omit=keys_to_omit),
+                flatten_dict(
+                    value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit
+                ),
             )
         elif isinstance(value, (list, tuple)) and flatten_lists:
             for index, item in enumerate(value):
@@ -200,6 +210,7 @@
"",
separator,
flatten_lists,
remove_none,
keys_to_omit=keys_to_omit,
)
)
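One edge case the new tests pin down: `keys_to_omit` takes precedence over `remove_none`, so an omitted key keeps its value verbatim, contained `None`s included. A quick demonstration:

```python
from unstructured.staging.base import flatten_dict

d = {"a": None, "b": [2, 3], "c": {"d": None}}
out = flatten_dict(d, keys_to_omit=["c"], flatten_lists=True, remove_none=True)
# Top-level None ("a") is dropped; the omitted key "c" is copied through
# untouched, so the None nested under it survives.
assert out == {"b_0": 2, "b_1": 3, "c": {"d": None}}
```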
