Commit a43e528

Merge remote-tracking branch 'origin/main' into fix/default-hi-res-rely-on-inference-setting
badGarnet committed Jan 25, 2024
2 parents 7d535a6 + d8b3bdb commit a43e528
Showing 11 changed files with 136 additions and 45 deletions.
8 changes: 6 additions & 2 deletions CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.12.3-dev4
+## 0.12.3-dev6

### Enhancements

@@ -11,9 +11,13 @@

### Fixes

-* **Default `hi_res_model_name` now relies on `unstructured-inference`.** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image`, the default model is picked by `unstructured-inference`'s settings or by the OS environment variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; the default is now the same regardless of `infer_table_structure`'s value. The resolving function will be deprecated in the future, after which the default model name will rely solely on `unstructured-inference` and the environment variable will no longer be consulted.
* **Fix support for different Chipper versions and prevent running PDFMiner with Chipper**
* **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those
files as text.
* **Fix FSSpec destination connectors `check_connection`.** FSSpec destination connectors did not use `check_connection`, and calling `ls` on the destination directory could fail because the directory may not exist when the connector is created. Now `check_connection` calls `ls` on the bucket root, and the method is invoked in the destination connector's `initialize`.
* **Fix databricks-volumes extra location.** `setup.py` was pointing to the wrong location for the databricks-volumes extra requirements, which caused errors when building the wheel for unstructured. This change points it at the correct path.
* **Fix uploading None values to Chroma and Pinecone.** Removes keys with `None` values when writing to the Pinecone and Chroma destinations. Pins the Pinecone dependency.
+* **Default `hi_res_model_name` now relies on `unstructured-inference`.** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image`, the default model is picked by `unstructured-inference`'s settings or by the OS environment variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; the default is now the same regardless of `infer_table_structure`'s value. The resolving function will be deprecated in the future, after which the default model name will rely solely on `unstructured-inference` and the environment variable will no longer be consulted.

## 0.12.2

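The `hi_res_model_name` entry above amounts to the following resolution order. A minimal sketch, assuming `default_hi_res_model` consults the environment variable directly; the final fallback shown is only a placeholder, since the real default comes from `unstructured-inference`'s settings:

```python
import os

def default_hi_res_model() -> str:
    # Sketch of the resolution order described in the CHANGELOG entry,
    # not the library's actual implementation.
    # 1. UNSTRUCTURED_HI_RES_MODEL_NAME wins when set; per the entry, this
    #    environment lookup is itself slated for deprecation.
    env_model = os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
    if env_model:
        return env_model
    # 2. Otherwise unstructured-inference's own default applies, now the
    #    same regardless of infer_table_structure ("yolox" is a placeholder).
    return "yolox"
```

An explicit `hi_res_model_name=` argument bypasses this entirely, as the `pdf.py` hunk later in this commit shows (`hi_res_model_name or model_name or default_hi_res_model()`).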
2 changes: 1 addition & 1 deletion requirements/ingest/pinecone.in
@@ -1,3 +1,3 @@
 -c ../constraints.in
 -c ../base.txt
-pinecone-client
+pinecone-client==2.2.4
21 changes: 21 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
@@ -4,6 +4,7 @@

 import magic
 import pytest
+import yaml
 from PIL import Image

 from unstructured.file_utils import filetype
@@ -481,3 +482,23 @@ def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"):
 def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"):
     with open(filename, "rb") as f:
         assert detect_filetype(file=f) == FileType.WAV
+
+
+def test_detect_yaml_as_text_from_filename(tmpdir):
+    data = {"hi": "there", "this is": "yaml"}
+    filename = os.path.join(tmpdir.dirname, "test.yaml")
+    with open(filename, "w") as f:
+        yaml.dump(data, f)
+
+    assert detect_filetype(filename=filename) == FileType.TXT
+
+
+def test_detect_yaml_as_text_from_file(tmpdir, monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/yaml")
+    data = {"hi": "there", "this is": "yaml"}
+    filename = os.path.join(tmpdir.dirname, "test.yaml")
+    with open(filename, "w") as f:
+        yaml.dump(data, f)
+
+    with open(filename, "rb") as f:
+        assert detect_filetype(file=f) == FileType.TXT
10 changes: 10 additions & 0 deletions test_unstructured/partition/pdf_image/test_chipper.py
@@ -30,3 +30,13 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
         [el for el in chipper_results if el.id == child.metadata.parent_id]
         for child in chipper_children
     )
+
+
+def chipper_test_pdfminer_repeated(chipper_results):
+    """
+    Test to verify that PDFMiner has not been run together with Chipper
+    """
+    elements = chipper_results
+    assert len([element.text for element in elements]) == len(
+        {element.text for element in elements}
+    )
28 changes: 28 additions & 0 deletions test_unstructured/staging/test_base_staging.py
@@ -464,6 +464,34 @@ def test_flatten_dict_flatten_list_omit_keys():
     )


+def test_flatten_dict_flatten_list_omit_keys_remove_none():
+    """Flattening a dictionary with flatten_lists set to True and also omitting keys
+    and setting remove_none to True"""
+    dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}}
+    keys_to_omit = ["c"]
+    expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c": {"d": None, "e": [6, 7]}}
+    assert (
+        base.flatten_dict(
+            dictionary, keys_to_omit=keys_to_omit, flatten_lists=True, remove_none=True
+        )
+        == expected_result
+    )
+
+
+def test_flatten_dict_flatten_list_remove_none():
+    """Flattening a dictionary with flatten_lists set to True and setting remove_none to True"""
+    dictionary = {"a": None, "b": [2, 3, 4], "c": {"d": None, "e": [6, 7]}}
+    expected_result = {"b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6, "c_e_1": 7}
+    assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result
+
+
+def test_flatten_dict_flatten_list_none_in_list_remove_none():
+    """Flattening a dictionary with flatten_lists and remove_none set to True and None in list"""
+    dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": None, "e": [6, None]}}
+    expected_result = {"a": 1, "b_0": 2, "b_1": 3, "b_2": 4, "c_e_0": 6}
+    assert base.flatten_dict(dictionary, flatten_lists=True, remove_none=True) == expected_result
+
+
 def test_flatten_dict_flatten_list_omit_keys2():
     """Flattening a dictionary with flatten_lists set to True and also omitting keys"""
     dictionary = {"a": 1, "b": [2, 3, 4], "c": {"d": 5, "e": [6, 7]}}
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.12.3-dev4" # pragma: no cover
__version__ = "0.12.3-dev6" # pragma: no cover
11 changes: 10 additions & 1 deletion unstructured/file_utils/filetype.py
@@ -114,6 +114,13 @@ def __lt__(self, other):
"image/png": FileType.PNG,
"image/tiff": FileType.TIFF,
"image/bmp": FileType.BMP,
# NOTE(robinson) - https://mimetype.io/application/yaml
# In the future, we may have special processing for YAML
# files instead of treating them as plaintext
"application/yaml": FileType.TXT,
"application/x-yaml": FileType.TXT,
"text/x-yaml": FileType.TXT,
"text/yaml": FileType.TXT,
"text/plain": FileType.TXT,
"text/x-csv": FileType.CSV,
"application/csv": FileType.CSV,
@@ -209,6 +216,8 @@ def __lt__(self, other):
".swift": FileType.TXT,
".ts": FileType.TXT,
".go": FileType.TXT,
".yaml": FileType.TXT,
".yml": FileType.TXT,
None: FileType.UNK,
}

@@ -349,7 +358,7 @@ def detect_filetype(
         return FileType.EML

     if extension in PLAIN_TEXT_EXTENSIONS:
-        return EXT_TO_FILETYPE.get(extension)
+        return EXT_TO_FILETYPE.get(extension, FileType.UNK)

     # Safety catch
     if mime_type in STR_TO_FILETYPE:
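With the two tables above extended, YAML resolves to `FileType.TXT` whether detection goes by extension or by MIME type. A quick illustration (the path is hypothetical; the file must exist on disk for detection to run):

```python
from unstructured.file_utils.filetype import FileType, detect_filetype

# ".yaml"/".yml" hit EXT_TO_FILETYPE; "text/yaml" and the other YAML MIME
# types hit STR_TO_FILETYPE. Either route now lands on FileType.TXT.
assert detect_filetype(filename="config.yaml") == FileType.TXT  # hypothetical existing file
```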
4 changes: 3 additions & 1 deletion unstructured/ingest/connector/chroma.py
@@ -151,5 +151,7 @@ def normalize_dict(self, element_dict: dict) -> dict:
"id": str(uuid.uuid4()),
"embedding": element_dict.pop("embeddings", None),
"document": element_dict.pop("text", None),
"metadata": flatten_dict(element_dict, separator="-", flatten_lists=True),
"metadata": flatten_dict(
element_dict, separator="-", flatten_lists=True, remove_none=True
),
}
1 change: 1 addition & 0 deletions unstructured/ingest/connector/pinecone.py
@@ -135,6 +135,7 @@ def normalize_dict(self, element_dict: dict) -> dict:
                 element_dict,
                 separator="-",
                 flatten_lists=True,
+                remove_none=True,
             ),
         },
     }
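Both connectors call the same helper, so the effect on uploaded metadata looks like this. A small sketch with a hypothetical element dict (both stores are strict about null metadata values, which is what the `remove_none=True` change addresses):

```python
from unstructured.staging.base import flatten_dict

# Lists are flattened with the "-" separator, and None-valued keys are
# dropped rather than uploaded to Chroma or Pinecone.
element_dict = {"page_number": 1, "languages": ["eng", "deu"], "coordinates": None}
metadata = flatten_dict(element_dict, separator="-", flatten_lists=True, remove_none=True)
assert metadata == {"page_number": 1, "languages-0": "eng", "languages-1": "deu"}
```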
77 changes: 41 additions & 36 deletions unstructured/partition/pdf.py
@@ -303,8 +303,8 @@ def _partition_pdf_or_image_local(

     hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
     if pdf_image_dpi is None:
-        pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200
-    if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"):
+        pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
+    if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
         logger.warning(
             "The Chipper model performs better when images are rendered with DPI >= 300 "
             f"(currently {pdf_image_dpi}).",
@@ -318,32 +318,33 @@
             pdf_image_dpi=pdf_image_dpi,
         )

-        extracted_layout = (
-            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
-            if pdf_text_extractable
-            else []
-        )
+        if hi_res_model_name.startswith("chipper"):
+            # NOTE(alan): We shouldn't do OCR with chipper
+            # NOTE(antonio): We shouldn't do PDFMiner with chipper
+            final_document_layout = inferred_document_layout
+        else:
+            extracted_layout = (
+                process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+                if pdf_text_extractable
+                else []
+            )

-        if analysis:
-            annotate_layout_elements(
-                inferred_document_layout=inferred_document_layout,
-                extracted_layout=extracted_layout,
-                filename=filename,
-                output_dir_path=analyzed_image_output_dir_path,
-                pdf_image_dpi=pdf_image_dpi,
-                is_image=is_image,
-            )
+            if analysis:
+                annotate_layout_elements(
+                    inferred_document_layout=inferred_document_layout,
+                    extracted_layout=extracted_layout,
+                    filename=filename,
+                    output_dir_path=analyzed_image_output_dir_path,
+                    pdf_image_dpi=pdf_image_dpi,
+                    is_image=is_image,
+                )

-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = merge_inferred_with_extracted_layout(
-            inferred_document_layout=inferred_document_layout,
-            extracted_layout=extracted_layout,
-        )
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = merge_inferred_with_extracted_layout(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+            )

-        if hi_res_model_name.startswith("chipper"):
-            # NOTE(alan): We shouldn't do OCR with chipper
-            final_document_layout = merged_document_layout
-        else:
             final_document_layout = process_file_with_ocr(
                 filename,
                 merged_document_layout,
@@ -360,23 +361,27 @@
             model_name=hi_res_model_name,
             pdf_image_dpi=pdf_image_dpi,
         )
-        if hasattr(file, "seek"):
-            file.seek(0)

-        extracted_layout = (
-            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
-        )
-
-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = merge_inferred_with_extracted_layout(
-            inferred_document_layout=inferred_document_layout,
-            extracted_layout=extracted_layout,
-        )
-
         if hi_res_model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
-            final_document_layout = merged_document_layout
+            # NOTE(antonio): We shouldn't do PDFMiner with chipper
+            final_document_layout = inferred_document_layout
         else:
+            if hasattr(file, "seek"):
+                file.seek(0)
+
+            extracted_layout = (
+                process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
+                if pdf_text_extractable
+                else []
+            )
+
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = merge_inferred_with_extracted_layout(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+            )
+
             if hasattr(file, "seek"):
                 file.seek(0)
             final_document_layout = process_data_with_ocr(
@@ -390,7 +395,7 @@
            )

     # NOTE(alan): starting with v2, chipper sorts the elements itself.
-    if hi_res_model_name == "chipper":
+    if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
         kwargs["sort_mode"] = SORT_MODE_DONT

     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
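Taken together, the `pdf.py` changes mean any `chipper*` model now skips both the PDFMiner pass and the OCR pass, using the inferred layout directly as the final layout, with external sorting disabled for Chipper v2 and later. A minimal sketch of exercising this path (the file path is hypothetical):

```python
from unstructured.partition.pdf import partition_pdf

# With any chipper* hi_res model, the inferred layout is used as-is:
# no PDFMiner pass (hence no duplicated text) and no OCR pass.
elements = partition_pdf(
    filename="example-docs/sample.pdf",  # hypothetical path
    strategy="hi_res",
    hi_res_model_name="chipper",
)
```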
17 changes: 14 additions & 3 deletions unstructured/staging/base.py
@@ -177,20 +177,30 @@ def elements_from_json(


 def flatten_dict(
-    dictionary, parent_key="", separator="_", flatten_lists=False, keys_to_omit: List[str] = None
+    dictionary,
+    parent_key="",
+    separator="_",
+    flatten_lists=False,
+    remove_none=False,
+    keys_to_omit: List[str] = None,
 ):
     """Flattens a nested dictionary into a single level dictionary. keys_to_omit is a list of keys
     that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}.
-    If flatten_lists is True, then lists and tuples are flattened as well."""
+    If flatten_lists is True, then lists and tuples are flattened as well.
+    If remove_none is True, then None keys/values are removed from the flattened dictionary."""
     keys_to_omit = keys_to_omit if keys_to_omit else []
     flattened_dict = {}
     for key, value in dictionary.items():
         new_key = f"{parent_key}{separator}{key}" if parent_key else key
         if new_key in keys_to_omit:
             flattened_dict[new_key] = value
+        elif value is None and remove_none:
+            continue
         elif isinstance(value, dict):
             flattened_dict.update(
-                flatten_dict(value, new_key, separator, flatten_lists, keys_to_omit=keys_to_omit),
+                flatten_dict(
+                    value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit
+                ),
             )
         elif isinstance(value, (list, tuple)) and flatten_lists:
             for index, item in enumerate(value):
@@ -200,6 +210,7 @@
"",
separator,
flatten_lists,
remove_none,
keys_to_omit=keys_to_omit,
)
)
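One edge case the new tests pin down: `keys_to_omit` takes precedence over `remove_none`, so an omitted key keeps its value verbatim, contained `None`s included. A quick demonstration:

```python
from unstructured.staging.base import flatten_dict

d = {"a": None, "b": [2, 3], "c": {"d": None}}
out = flatten_dict(d, keys_to_omit=["c"], flatten_lists=True, remove_none=True)
# Top-level None ("a") is dropped; the omitted key "c" is copied through
# untouched, so the None nested under it survives.
assert out == {"b_0": 2, "b_1": 3, "c": {"d": None}}
```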
