From 9fea85dc210a52f0aaafecad97a43fd708a98304 Mon Sep 17 00:00:00 2001 From: David Potter Date: Tue, 23 Jan 2024 13:52:11 -0800 Subject: [PATCH 1/3] fix: remove none value keys from flattened dictionary (#2442) When a partitioned or embedded document JSON has null values, those get converted to a dictionary with None values. This happens in the metadata; I have not seen it in other keys. Chroma and Pinecone reject those None values. `flatten_dict` has been modified with a `remove_none` arg to remove keys with None values (a usage sketch appears at the end of this patch series). Also, Pinecone has been pinned at 2.2.4 because versions 3.0 and above break our code. --------- Co-authored-by: potter-potter --- CHANGELOG.md | 3 +- requirements/ingest/pinecone.in | 2 +- .../staging/test_base_staging.py | 28 +++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/ingest/connector/chroma.py | 4 ++- unstructured/ingest/connector/pinecone.py | 1 + unstructured/staging/base.py | 17 +++++++++-- 7 files changed, 50 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 941c847c2f..a1bc828652 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.12.3-dev3 +## 0.12.3-dev4 ### Enhancements @@ -12,6 +12,7 @@ ### Fixes * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector. * **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path. +* **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values with Pinecone and Chroma destinations. 
Pins Pinecone dependency ## 0.12.2 diff --git a/requirements/ingest/pinecone.in b/requirements/ingest/pinecone.in index ebaedb531b..d1cc814f89 100644 --- a/requirements/ingest/pinecone.in +++ b/requirements/ingest/pinecone.in @@ -1,3 +1,3 @@ -c ../constraints.in -c ../base.txt -pinecone-client +pinecone-client==2.2.4 diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py index f113f1f3d5..52f1410c17 100644 --- a/test_unstructured/staging/test_base_staging.py +++ b/test_unstructured/staging/test_base_staging.py @@ -464,6 +464,34 @@ def test_flatten_dict_flatten_list_omit_keys(): ) +def test_flatten_dict_flatten_list_omit_keys_remove_none(): + """Flattening a dictionary with flatten_lists set to True and also omitting keys + and setting remove_none to True""" + dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}} + keys_to_omit = ["c"] + expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c": {"d": None, "e": [6, 7]}} + assert ( + base.flatten_dict( + dictionary, keys_to_omit=keys_to_omit, flatten_lists=True, remove_none=True + ) + == expected_result + ) + + +def test_flatten_dict_flatten_list_remove_none(): + """Flattening a dictionary with flatten_lists set to True and setting remove_none to True""" + dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}} + expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6, "c_e_1": 7} + assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result + + +def test_flatten_dict_flatten_list_none_in_list_remove_none(): + """Flattening a dictionary with flatten_lists and remove_none set to True and None in list""" + dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}} + expected_result = {"a": 1, "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6} + assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result + + def test_flatten_dict_flatten_list_omit_keys2(): """Flattening a dictionary with flatten_lists set to True and also omitting keys""" dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}} diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e62434f7cd..5b743183bf 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.3-dev3" # pragma: no cover +__version__ = "0.12.3-dev4" # pragma: no cover diff --git a/unstructured/ingest/connector/chroma.py b/unstructured/ingest/connector/chroma.py index ae9912ec53..688f4f2dad 100644 --- a/unstructured/ingest/connector/chroma.py +++ b/unstructured/ingest/connector/chroma.py @@ -151,5 +151,7 @@ def normalize_dict(self, element_dict: dict) -> dict: "id": str(uuid.uuid4()), "embedding": element_dict.pop("embeddings", None), "document": element_dict.pop("text", None), - "metadata": flatten_dict(element_dict, separator="-", flatten_lists=True), + "metadata": flatten_dict( + element_dict, separator="-", flatten_lists=True, remove_none=True + ), } diff --git a/unstructured/ingest/connector/pinecone.py b/unstructured/ingest/connector/pinecone.py index e117043e1b..dd6f5d023c 100644 --- a/unstructured/ingest/connector/pinecone.py +++ b/unstructured/ingest/connector/pinecone.py @@ -135,6 +135,7 @@ def normalize_dict(self, element_dict: dict) -> dict: element_dict, separator="-", flatten_lists=True, + remove_none=True, ), }, } diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 08f29e9d1b..b7e17a0659 100644 --- a/unstructured/staging/base.py +++ 
b/unstructured/staging/base.py @@ -177,20 +177,30 @@ def elements_from_json( def flatten_dict( - dictionary, parent_key="", separator="_", flatten_lists=False, keys_to_omit: List[str] = None + dictionary, + parent_key="", + separator="_", + flatten_lists=False, + remove_none=False, + keys_to_omit: List[str] = None, ): """Flattens a nested dictionary into a single level dictionary. keys_to_omit is a list of keys that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}. - If flatten_lists is True, then lists and tuples are flattened as well.""" + If flatten_lists is True, then lists and tuples are flattened as well. + If remove_none is True, then entries with a None value are removed from the flattened dictionary.""" keys_to_omit = keys_to_omit if keys_to_omit else [] flattened_dict = {} for key, value in dictionary.items(): new_key = f"{parent_key}{separator}{key}" if parent_key else key if new_key in keys_to_omit: flattened_dict[new_key] = value + elif value is None and remove_none: + continue elif isinstance(value, dict): flattened_dict.update( - flatten_dict(value, new_key, separator, flatten_lists, keys_to_omit=keys_to_omit), + flatten_dict( + value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit + ), ) elif isinstance(value, (list, tuple)) and flatten_lists: for index, item in enumerate(value): @@ -200,6 +210,7 @@ def flatten_dict( "", separator, flatten_lists, + remove_none, keys_to_omit=keys_to_omit, ) ) From 4613e52e11c6d5cfe9f4c82d61fb6c0cd53b797c Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 24 Jan 2024 12:48:36 -0500 Subject: [PATCH 2/3] fix: treat yaml files as plain text (#2446) ### Summary Closes #2412. Adds support for YAML MIME types and treats them as plain text. This addresses the `500` errors that the API currently returns when the MIME type is `text/yaml`. --- CHANGELOG.md | 2 ++ test_unstructured/file_utils/test_filetype.py | 21 +++++++++++++++++++ unstructured/file_utils/filetype.py | 11 +++++++++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a1bc828652..0e8ea10274 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ * **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service. ### Fixes +* **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those + files as text. * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector. * **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path. * **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values with Pinecone and Chroma destinations. 
Pins Pinecone dependency diff --git a/test_unstructured/file_utils/test_filetype.py index 814b4c726d..2388b07aaf 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -4,6 +4,7 @@ import magic import pytest +import yaml from PIL import Image from unstructured.file_utils import filetype @@ -481,3 +482,23 @@ def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"): def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"): with open(filename, "rb") as f: assert detect_filetype(file=f) == FileType.WAV + + +def test_detect_yaml_as_text_from_filename(tmpdir): + data = {"hi": "there", "this is": "yaml"} + filename = os.path.join(tmpdir.dirname, "test.yaml") + with open(filename, "w") as f: + yaml.dump(data, f) + + assert detect_filetype(filename=filename) == FileType.TXT + + +def test_detect_yaml_as_text_from_file(tmpdir, monkeypatch): + monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/yaml") + data = {"hi": "there", "this is": "yaml"} + filename = os.path.join(tmpdir.dirname, "test.yaml") + with open(filename, "w") as f: + yaml.dump(data, f) + + with open(filename, "rb") as f: + assert detect_filetype(file=f) == FileType.TXT diff --git a/unstructured/file_utils/filetype.py index 96eac3a118..6353b0278f 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -114,6 +114,13 @@ def __lt__(self, other): "image/png": FileType.PNG, "image/tiff": FileType.TIFF, "image/bmp": FileType.BMP, + # NOTE(robinson) - https://mimetype.io/application/yaml + # In the future, we may have special processing for YAML + # files instead of treating them as plaintext + "application/yaml": FileType.TXT, + "application/x-yaml": FileType.TXT, + "text/x-yaml": FileType.TXT, + "text/yaml": FileType.TXT, "text/plain": FileType.TXT, "text/x-csv": FileType.CSV, "application/csv": FileType.CSV, @@ -209,6 +216,8 @@ def __lt__(self, other): ".swift": FileType.TXT, ".ts": FileType.TXT, ".go": FileType.TXT, + ".yaml": FileType.TXT, + ".yml": FileType.TXT, None: FileType.UNK, } @@ -349,7 +358,7 @@ def detect_filetype( return FileType.EML if extension in PLAIN_TEXT_EXTENSIONS: - return EXT_TO_FILETYPE.get(extension) + return EXT_TO_FILETYPE.get(extension, FileType.UNK) # Safety catch if mime_type in STR_TO_FILETYPE: From d8b3bdb919579dddba8bfc925e4b99ca807e3005 Mon Sep 17 00:00:00 2001 From: Antonio Jose Jimeno Yepes Date: Thu, 25 Jan 2024 13:33:32 +1100 Subject: [PATCH 3/3] Check chipper version and prevent running pdfminer with chipper (#2347) We have added a new version of Chipper (Chipperv3), so unstructured needs to work effectively with all current Chipper versions. This implies resizing images to the appropriate resolution and making sure that Chipper elements are not sorted by unstructured. In addition, PDFMiner was being called alongside Chipper, which adds repeated elements from both Chipper and PDFMiner. To evaluate this PR, you can test the code below with the attached PDF. The code writes a JSON file with the generated elements. The output can be examined with `cat out.un.json | python -m json.tool`. There are three things to check: 1. 
The size of the image passed to Chipper, which can be identified in the layout_height and layout_width attributes; these should have the values 3301 and 2550, as shown in the example below: ``` [ { "element_id": "c0493a7872f227e4172c4192c5f48a06", "metadata": { "coordinates": { "layout_height": 3301, "layout_width": 2550, ``` 2. There should be no repeated elements. 3. Order should be closer to reading order. The script to run Chipper from unstructured is: ``` from unstructured import __version__ print(__version__.__version__) import json from unstructured.partition.auto import partition from unstructured.staging.base import elements_to_json elements = json.loads(elements_to_json(partition("Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf", strategy="hi_res", model_name="chipperv3"))) with open('out.un.json', 'w') as w: json.dump(elements, w) ``` [Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf](https://github.com/Unstructured-IO/unstructured/files/13817273/Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf) --------- Co-authored-by: Antonio Jimeno Yepes --- CHANGELOG.md | 3 +- .../partition/pdf_image/test_chipper.py | 10 +++ unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 77 ++++++++++--------- 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e8ea10274..702919886b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.12.3-dev4 +## 0.12.3-dev5 ### Enhancements @@ -10,6 +10,7 @@ * **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service. ### Fixes +* **Fix support for different Chipper versions and prevent running PDFMiner with Chipper** * **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those files as text. * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector. 
diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py index d625f97876..81f421159c 100644 --- a/test_unstructured/partition/pdf_image/test_chipper.py +++ b/test_unstructured/partition/pdf_image/test_chipper.py @@ -30,3 +30,13 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children): [el for el in chipper_results if el.id == child.metadata.parent_id] for child in chipper_children ) + + +def chipper_test_pdfminer_repeated(chipper_results): + """ + Test to verify that PDFMiner has not been run together with Chipper + """ + elements = chipper_results + assert len([element.text for element in elements]) == len( + {element.text for element in elements} + ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5b743183bf..230fc8d54b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.3-dev4" # pragma: no cover +__version__ = "0.12.3-dev5" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 8a0b003028..867a53a075 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -298,8 +298,8 @@ def _partition_pdf_or_image_local( hi_res_model_name or model_name or default_hi_res_model(infer_table_structure) ) if pdf_image_dpi is None: - pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200 - if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"): + pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200 + if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")): logger.warning( "The Chipper model performs better when images are rendered with DPI >= 300 " f"(currently {pdf_image_dpi}).", @@ -313,32 +313,33 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, ) - extracted_layout = ( - process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) - if pdf_text_extractable - else [] - ) + if hi_res_model_name.startswith("chipper"): + # NOTE(alan): We shouldn't do OCR with chipper + # NOTE(antonio): We shouldn't do PDFMiner with chipper + final_document_layout = inferred_document_layout + else: + extracted_layout = ( + process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) + if pdf_text_extractable + else [] + ) + + if analysis: + annotate_layout_elements( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + filename=filename, + output_dir_path=analyzed_image_output_dir_path, + pdf_image_dpi=pdf_image_dpi, + is_image=is_image, + ) - if analysis: - annotate_layout_elements( + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = merge_inferred_with_extracted_layout( inferred_document_layout=inferred_document_layout, extracted_layout=extracted_layout, - filename=filename, - output_dir_path=analyzed_image_output_dir_path, - pdf_image_dpi=pdf_image_dpi, - is_image=is_image, ) - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = merge_inferred_with_extracted_layout( - inferred_document_layout=inferred_document_layout, - extracted_layout=extracted_layout, - ) - - if hi_res_model_name.startswith("chipper"): - # NOTE(alan): We shouldn't do OCR with chipper - final_document_layout = merged_document_layout - else: final_document_layout = process_file_with_ocr( filename, merged_document_layout, @@ -355,23 +356,27 @@ def _partition_pdf_or_image_local( 
model_name=hi_res_model_name, pdf_image_dpi=pdf_image_dpi, ) - if hasattr(file, "seek"): - file.seek(0) - - extracted_layout = ( - process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else [] - ) - - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = merge_inferred_with_extracted_layout( - inferred_document_layout=inferred_document_layout, - extracted_layout=extracted_layout, - ) if hi_res_model_name.startswith("chipper"): # NOTE(alan): We shouldn't do OCR with chipper + # NOTE(antonio): We shouldn't do PDFMiner with chipper final_document_layout = merged_document_layout else: + if hasattr(file, "seek"): + file.seek(0) + + extracted_layout = ( + process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) + if pdf_text_extractable + else [] + ) + + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = merge_inferred_with_extracted_layout( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + ) + if hasattr(file, "seek"): file.seek(0) final_document_layout = process_data_with_ocr( @@ -385,7 +390,7 @@ def _partition_pdf_or_image_local( ) # NOTE(alan): starting with v2, chipper sorts the elements itself. - if hi_res_model_name == "chipper": + if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1": kwargs["sort_mode"] = SORT_MODE_DONT final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
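The three patches above each lend themselves to a short, self-contained example. First, the usage sketch promised in the description of patch 1: this calls the real `flatten_dict` from `unstructured.staging.base`; only the sample metadata dictionary is invented, mirroring the tests that the patch adds.

```
from unstructured.staging.base import flatten_dict

# Sample dictionary (invented for illustration) shaped like element metadata
# after partitioning, with None leaves at several nesting levels.
metadata = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}}

# Default behavior: the None leaves survive flattening, and Chroma/Pinecone
# reject metadata that contains them.
assert flatten_dict(metadata, flatten_lists=True) == {
    "a": None, "b_0": 2, "b_1": 3, "b_2": 4, "c_d": None, "c_e_0": 6, "c_e_1": None,
}

# With remove_none=True, every None-valued entry is dropped, including ones
# nested inside dictionaries and lists.
assert flatten_dict(metadata, flatten_lists=True, remove_none=True) == {
    "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6,
}
```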
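Next, patch 2's YAML handling, condensed from its own tests. The names used (`detect_filetype`, `FileType`) are the real ones the patch touches; the temporary file and its contents are illustrative.

```
import tempfile

from unstructured.file_utils.filetype import FileType, detect_filetype

# Write a small YAML document to a temporary file; with the new MIME and
# extension mappings it is detected as plain text instead of triggering
# a 500 from file-type detection.
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
    f.write("hi: there\nthis is: yaml\n")

assert detect_filetype(filename=f.name) == FileType.TXT
```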
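Finally, the `pdf.py` hunks of patch 3 reduce to a small decision table keyed on the model name. The sketch below is a standalone summary rather than library code: the helper `plan_hi_res_pipeline` and the shape of its return value are invented, while the individual decisions (DPI default, skipping PDFMiner and OCR, disabling sorting for Chipper v2 and later) are taken straight from the diff.

```
from typing import Optional


def plan_hi_res_pipeline(hi_res_model_name: str, pdf_image_dpi: Optional[int] = None) -> dict:
    """Summarize which steps the hi_res code path runs for a given model."""
    is_chipper = hi_res_model_name.startswith("chipper")
    return {
        # Chipper performs best on high-resolution renders, so it defaults
        # to 300 DPI where other hi_res models default to 200.
        "pdf_image_dpi": pdf_image_dpi if pdf_image_dpi is not None else (300 if is_chipper else 200),
        # PDFMiner extraction and OCR would only duplicate Chipper's output.
        "run_pdfminer": not is_chipper,
        "run_ocr": not is_chipper,
        # From v2 onward Chipper emits elements in reading order itself, so
        # unstructured's own sorting is switched off (SORT_MODE_DONT).
        "disable_sorting": is_chipper and hi_res_model_name != "chipperv1",
    }


assert plan_hi_res_pipeline("chipperv3") == {
    "pdf_image_dpi": 300,
    "run_pdfminer": False,
    "run_ocr": False,
    "disable_sorting": True,
}
assert plan_hi_res_pipeline("yolox")["run_pdfminer"] is True
```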