Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into feat/table-as-cells
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Apr 18, 2024
2 parents 8b71376 + 001fa17 commit 038dd60
Show file tree
Hide file tree
Showing 52 changed files with 898 additions and 14,979 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
## 0.13.3-dev4
## 0.13.3-dev7

### Enhancements

* **Add support for `start_index` in `html` links extraction**
* **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
* **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
* **Introduce `starting_page_number` parameter to partitioning functions** It applies to those partitioners which support `page_number` in element's metadata: PDF, TIFF, XLSX, DOC, DOCX, PPT, PPTX.
* **Redesign the internal mechanism of assigning element IDs** This allows for further enhancements related to element IDs such as deterministic and document-unique hashes. The way partitioning functions operate hasn't changed, which means `unique_element_ids` continues to be `False` by default, utilizing text hashes.
* **Allow `UnstructuredTableTransformerModel` to return predictions in an unparsed format**

### Features
Expand Down
10 changes: 10 additions & 0 deletions docs/source/introduction/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,16 @@ if you'd like to use the IDs as a primary key in a database, for example.
elements = partition_text(text="Here is some example text.", unique_element_ids=True)
elements[0].id
Element ID Design Principles
""""""""""""""""""""""""""""""""""""

#. A partitioning function can assign only one of two available ID types to a returned element: a hash or a UUID.
#. All elements that are returned come with an ID, which is never None.
#. No matter which type of ID is used, it will always be in string format.
#. Partitioning a document returns elements with hashes as their default IDs.

For creating elements independently of partitioning functions, refer to the ``Element`` class documentation in the source code (``unstructured/documents/elements.py``).


Wrapping it all up
******************
Expand Down
54 changes: 31 additions & 23 deletions test_unstructured/documents/test_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,53 +10,59 @@

import pytest

from test_unstructured.unit_utils import assign_hash_ids
from unstructured.cleaners.core import clean_bullets, clean_prefix
from unstructured.documents.coordinates import (
CoordinateSystem,
Orientation,
RelativeCoordinateSystem,
)
from unstructured.documents.elements import (
UUID,
CheckBox,
ConsolidationStrategy,
CoordinatesMetadata,
DataSourceMetadata,
Element,
ElementMetadata,
NoID,
Points,
RegexMetadata,
Text,
Title,
)


def test_text_id():
text_element = Text(text="hello there!")
assert text_element.id == "c69509590d81db2f37f9d75480c8efed"
def test_Text_is_JSON_serializable():
    """The dict form of a `Text` element created with `element_id=None` can be dumped to JSON."""
    element_dict = Text(text="hello there!", element_id=None).to_dict()
    # -- This should run without an error --
    json.dumps(element_dict)


def test_text_uuid():
text_element = Text(text="hello there!", element_id=UUID())

id = text_element.id

assert isinstance(id, str)
assert len(id) == 36
assert id.count("-") == 4
# -- Test that the element is JSON serializable. This should run without an error --
json.dumps(text_element.to_dict())

@pytest.mark.parametrize(
"element",
[
Element(),
Text(text=""), # -- element_id should be implicitly None --
Text(text="", element_id=None), # -- setting explicitly to None --
CheckBox(),
],
)
def test_Element_autoassigns_a_UUID_then_becomes_an_idempotent_and_deterministic_hash(
element: Element,
):
assert element._element_id is None, "Element should not have an ID yet"

def test_element_defaults_to_blank_id():
element = Element()
assert isinstance(element.id, NoID)
# -- element self-assigns itself a UUID only when the ID is requested --
assert isinstance(element.id, str)
assert len(element.id) == 36
assert element.id.count("-") == 4

expected_hash = "e3b0c44298fc1c149afbf4c8996fb924"
# -- calling `.id_to_hash()` changes the element's id-type to hash --
assert element.id_to_hash() == expected_hash
assert element.id == expected_hash

def test_element_uuid():
element = Element(element_id=UUID())
assert isinstance(element.id, UUID)
# -- `.id_to_hash()` is idempotent --
assert element.id_to_hash() == expected_hash
assert element.id == expected_hash


def test_text_element_apply_cleaners():
Expand Down Expand Up @@ -392,11 +398,13 @@ def and_it_serializes_a_data_source_sub_object_to_a_dict_when_it_is_present(self
}

def and_it_serializes_an_orig_elements_sub_object_to_base64_when_it_is_present(self):
elements = assign_hash_ids([Title("Lorem"), Text("Lorem Ipsum")])
meta = ElementMetadata(
category_depth=1,
orig_elements=[Title("Lorem"), Text("Lorem Ipsum")],
orig_elements=elements,
page_number=2,
)

assert meta.to_dict() == {
"category_depth": 1,
"orig_elements": (
Expand Down
46 changes: 35 additions & 11 deletions test_unstructured/documents/test_email_elements.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,50 @@
import uuid
from functools import partial

import pytest

from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text
from unstructured.documents.email_elements import UUID, EmailElement, Name, NoID
from unstructured.documents.email_elements import EmailElement, Name


def test_text_id():
name_element = Name(name="Example", text="hello there!")
assert name_element.id == "c69509590d81db2f37f9d75480c8efed"
def test_Name_should_assign_a_deterministic_and_an_idempotent_hash():
element = Name(name="Example", text="hello there!")
expected_hash = "c69509590d81db2f37f9d75480c8efed"

assert element._element_id is None, "Element should not have an ID yet"

def test_text_uuid():
name_element = Name(name="Example", text="hello there!", element_id=UUID())
assert isinstance(name_element.id, uuid.UUID)
# -- calculating hash for the first time --
assert element.id_to_hash() == expected_hash
assert element.id == expected_hash

# -- `.id_to_hash()` is idempotent --
assert element.id_to_hash() == expected_hash
assert element.id == expected_hash

def test_element_defaults_to_blank_id():
element = EmailElement()
assert isinstance(element.id, NoID)

@pytest.mark.parametrize(
    "element",
    [
        # -- in the first two cases `element_id` is left at its default of None --
        EmailElement(text=""),
        Name(name="Example", text="hello there!"),
        # -- here None is passed explicitly --
        Name(name="Example", text="hello there!", element_id=None),
    ],
)
def test_EmailElement_should_assign_a_UUID_only_once_and_only_at_the_first_id_request(
    element: EmailElement,
):
    """Reading `.id` on an ID-less element yields a UUID-shaped string that then stays fixed."""
    assert element._element_id is None, "Element should not have an ID yet"

    # -- the first read of `.id` is expected to generate and store a fresh UUID --
    first_id = element.id
    assert element._element_id is not None, "Element should already have an ID"

    # -- a UUID rendered as text is 36 characters long, 4 of which are hyphens --
    assert isinstance(first_id, str)
    assert (len(first_id), first_id.count("-")) == (36, 4)

    # -- a second read must return the very same value --
    assert element.id == first_id, "UUID assignment should happen only once"


def test_text_element_apply_cleaners():
Expand Down
8 changes: 5 additions & 3 deletions test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,11 +377,13 @@ def test_partition_docx_includes_page_numbers_when_page_break_elements_are_suppr


def test_partition_docx_includes_page_break_elements_when_so_instructed():
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
elements = partition_docx(
example_doc_path("handbook-1p.docx"), include_page_breaks=True, starting_page_number=3
)

assert "PageBreak" in [type(e).__name__ for e in elements]
assert elements[1].metadata.page_number == 1
assert elements[-2].metadata.page_number == 2
assert elements[1].metadata.page_number == 3
assert elements[-2].metadata.page_number == 4


# ------------------------------------------------------------------------------------------------
Expand Down
34 changes: 22 additions & 12 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,19 +148,21 @@ def test_partition_pdf_local_raises_with_no_filename():

@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
("strategy", "expected", "origin"),
("strategy", "starting_page_number", "expected_page_numbers", "origin"),
# fast: can't capture the "intentionally left blank page" page
# others: will ignore the actual blank page
[
(PartitionStrategy.FAST, {1, 4}, {"pdfminer"}),
(PartitionStrategy.HI_RES, {1, 3, 4}, {"yolox", "pdfminer"}),
(PartitionStrategy.OCR_ONLY, {1, 3, 4}, {"ocr_tesseract"}),
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
],
)
def test_partition_pdf(
def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
file_mode,
strategy,
expected,
starting_page_number,
expected_page_numbers,
origin,
filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
):
Expand All @@ -169,23 +171,29 @@ def _test(result):
# validate that the result is a non-empty list of dicts
assert len(result) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == expected
assert {element.metadata.page_number for element in result} == expected_page_numbers
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
assert {element.metadata.detection_origin for element in result} == origin

if file_mode == "filename":
result = pdf.partition_pdf(filename=filename, strategy=strategy)
result = pdf.partition_pdf(
filename=filename, strategy=strategy, starting_page_number=starting_page_number
)
_test(result)
elif file_mode == "rb":
with open(filename, "rb") as f:
result = pdf.partition_pdf(file=f, strategy=strategy)
result = pdf.partition_pdf(
file=f, strategy=strategy, starting_page_number=starting_page_number
)
_test(result)
else:
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
result = pdf.partition_pdf(
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
)
_test(result)


Expand Down Expand Up @@ -298,10 +306,12 @@ def test_partition_pdf_with_no_page_breaks(
def test_partition_pdf_with_fast_strategy(
filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
elements = pdf.partition_pdf(
filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
)
assert len(elements) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in elements} == {1, 2}
assert {element.metadata.page_number for element in elements} == {3, 4}
for element in elements:
assert element.metadata.filename == "layout-parser-paper-fast.pdf"

Expand Down
15 changes: 15 additions & 0 deletions test_unstructured/partition/pptx/test_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,21 @@ def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
list(opts.increment_page_number())
assert opts.page_number == 2

def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
    self, opts_args: dict[str, Any]
):
    """Table and text metadata built on the "first" slide report `starting_page_number`."""
    opts = _PptxPartitionerOptions(**opts_args, starting_page_number=3)
    # -- consume the iterator so the options object moves to the "first" slide --
    list(opts.increment_page_number())

    table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
    text_metadata = opts.text_metadata()

    # -- both factories must hand back real metadata objects ... --
    assert all(
        isinstance(metadata, ElementMetadata) for metadata in (table_metadata, text_metadata)
    )
    # -- ... whose page number is the configured start, not 1 --
    assert (text_metadata.page_number, table_metadata.page_number) == (3, 3)

# -- .pptx_file ------------------------------

def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
Expand Down
6 changes: 6 additions & 0 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
extract_image_block_to_payload=False,
hi_res_model_name=None,
date_from_file_object=False,
starting_page_number=1,
)


Expand Down Expand Up @@ -840,6 +841,11 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE


def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
    """The second element of the auto-partitioned workbook reports the requested start page."""
    requested_start = 3
    partitioned = partition("example-docs/stanley-cups.xlsx", starting_page_number=requested_start)
    assert partitioned[1].metadata.page_number == requested_start


EXPECTED_XLS_TEXT_LEN = 550


Expand Down
3 changes: 2 additions & 1 deletion test_unstructured/staging/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
import pytest

from test_unstructured.unit_utils import assign_hash_ids
from unstructured.documents.elements import (
Address,
CheckBox,
Expand Down Expand Up @@ -44,7 +45,7 @@ def test_base64_gzipped_json_to_elements_can_deserialize_compressed_elements_fro


def test_elements_to_base64_gzipped_json_can_serialize_elements_to_a_base64_str():
elements = [Title("Lorem"), Text("Lorem Ipsum")]
elements = assign_hash_ids([Title("Lorem"), Text("Lorem Ipsum")])

assert base.elements_to_base64_gzipped_json(elements) == (
"eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWtJVm5WDoq"
Expand Down
16 changes: 9 additions & 7 deletions test_unstructured/staging/test_baseplate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from test_unstructured.unit_utils import assign_hash_ids
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import (
CoordinatesMetadata,
Expand All @@ -18,13 +19,14 @@ def test_stage_for_baseplate():
system = PixelSpace(width=1700, height=2200)
coordinates_metadata = CoordinatesMetadata(points=points, system=system)
metadata = ElementMetadata(filename="fox.pdf", coordinates=coordinates_metadata)
elements = [
Title("A Wonderful Story About A Fox", metadata=metadata),
NarrativeText(
"A fox ran into the chicken coop and the chickens flew off!",
metadata=metadata,
),
]
elements = assign_hash_ids(
[
Title("A Wonderful Story About A Fox", metadata=metadata),
NarrativeText(
"A fox ran into the chicken coop and the chickens flew off!", metadata=metadata
),
]
)

rows = stage_for_baseplate(elements)
assert rows == {
Expand Down
Loading

0 comments on commit 038dd60

Please sign in to comment.