Merge remote-tracking branch 'origin/main' into feat/table-as-cells

Unstructured-IO · Apr 18, 2024 · 038dd60 · 038dd60
2 parents 8b71376 + 001fa17
commit 038dd60
Show file tree

Hide file tree

Showing 52 changed files with 898 additions and 14,979 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,12 @@
-## 0.13.3-dev4
+## 0.13.3-dev7
 
 ### Enhancements
 
 * **Add support for `start_index` in `html` links extraction**
 * **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
 * **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
+* **Introduce `starting_page_number` parameter to partitioning functions.** It applies to those partitioners which support `page_number` in element metadata: PDF, TIFF, XLSX, DOC, DOCX, PPT, PPTX.
+* **Redesign the internal mechanism of assigning element IDs.** This allows for further enhancements related to element IDs such as deterministic and document-unique hashes. The way partitioning functions operate hasn't changed, which means `unique_element_ids` continues to be `False` by default, utilizing text hashes.
 * **Allow `UnstructuredTableTransformerModel` to return predictions in unparsed format.**
 
 ### Features

diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst
@@ -155,6 +155,16 @@ if you'd like to use the IDs as a primary key in a database, for example.
     elements = partition_text(text="Here is some example text.", unique_element_ids=True)
     elements[0].id
 
+Element ID Design Principles
+""""""""""""""""""""""""""""""""""""
+
+#. A partitioning function can assign only one of two available ID types to a returned element: a hash or a UUID.
+#. All elements that are returned come with an ID, which is never None.
+#. No matter which type of ID is used, it will always be in string format.
+#. Partitioning a document returns elements with hashes as their default IDs.
+
+For creating elements independently of partitioning functions, refer to the ``Element`` class documentation in the source code (``unstructured/documents/elements.py``).
+
 
 Wrapping it all up
 ******************

diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py
@@ -10,53 +10,59 @@
 
 import pytest
 
+from test_unstructured.unit_utils import assign_hash_ids
 from unstructured.cleaners.core import clean_bullets, clean_prefix
 from unstructured.documents.coordinates import (
     CoordinateSystem,
     Orientation,
     RelativeCoordinateSystem,
 )
 from unstructured.documents.elements import (
-    UUID,
     CheckBox,
     ConsolidationStrategy,
     CoordinatesMetadata,
     DataSourceMetadata,
     Element,
     ElementMetadata,
-    NoID,
     Points,
     RegexMetadata,
     Text,
     Title,
 )
 
 
-def test_text_id():
-    text_element = Text(text="hello there!")
-    assert text_element.id == "c69509590d81db2f37f9d75480c8efed"
+def test_Text_is_JSON_serializable():
+    # -- This should run without an error --
+    json.dumps(Text(text="hello there!", element_id=None).to_dict())
 
 
-def test_text_uuid():
-    text_element = Text(text="hello there!", element_id=UUID())
-
-    id = text_element.id
-
-    assert isinstance(id, str)
-    assert len(id) == 36
-    assert id.count("-") == 4
-    # -- Test that the element is JSON serializable. This shold run without an error --
-    json.dumps(text_element.to_dict())
-
+@pytest.mark.parametrize(
+    "element",
+    [
+        Element(),
+        Text(text=""),  # -- element_id should be implicitly None --
+        Text(text="", element_id=None),  # -- setting explicitly to None --
+        CheckBox(),
+    ],
+)
+def test_Element_autoassigns_a_UUID_then_becomes_an_idempotent_and_deterministic_hash(
+    element: Element,
+):
+    assert element._element_id is None, "Element should not have an ID yet"
 
-def test_element_defaults_to_blank_id():
-    element = Element()
-    assert isinstance(element.id, NoID)
+    # -- element self-assigns itself a UUID only when the ID is requested --
+    assert isinstance(element.id, str)
+    assert len(element.id) == 36
+    assert element.id.count("-") == 4
 
+    expected_hash = "e3b0c44298fc1c149afbf4c8996fb924"
+    # -- calling `.id_to_hash()` changes the element's id-type to hash --
+    assert element.id_to_hash() == expected_hash
+    assert element.id == expected_hash
 
-def test_element_uuid():
-    element = Element(element_id=UUID())
-    assert isinstance(element.id, UUID)
+    # -- `.id_to_hash()` is idempotent --
+    assert element.id_to_hash() == expected_hash
+    assert element.id == expected_hash
 
 
 def test_text_element_apply_cleaners():
@@ -392,11 +398,13 @@ def and_it_serializes_a_data_source_sub_object_to_a_dict_when_it_is_present(self
         }
 
     def and_it_serializes_an_orig_elements_sub_object_to_base64_when_it_is_present(self):
+        elements = assign_hash_ids([Title("Lorem"), Text("Lorem Ipsum")])
         meta = ElementMetadata(
             category_depth=1,
-            orig_elements=[Title("Lorem"), Text("Lorem Ipsum")],
+            orig_elements=elements,
             page_number=2,
         )
+
         assert meta.to_dict() == {
             "category_depth": 1,
             "orig_elements": (

diff --git a/test_unstructured/documents/test_email_elements.py b/test_unstructured/documents/test_email_elements.py
@@ -1,26 +1,50 @@
-import uuid
 from functools import partial
 
 import pytest
 
 from unstructured.cleaners.core import clean_prefix
 from unstructured.cleaners.translate import translate_text
-from unstructured.documents.email_elements import UUID, EmailElement, Name, NoID
+from unstructured.documents.email_elements import EmailElement, Name
 
 
-def test_text_id():
-    name_element = Name(name="Example", text="hello there!")
-    assert name_element.id == "c69509590d81db2f37f9d75480c8efed"
+def test_Name_should_assign_a_deterministic_and_an_idempotent_hash():
+    element = Name(name="Example", text="hello there!")
+    expected_hash = "c69509590d81db2f37f9d75480c8efed"
 
+    assert element._element_id is None, "Element should not have an ID yet"
 
-def test_text_uuid():
-    name_element = Name(name="Example", text="hello there!", element_id=UUID())
-    assert isinstance(name_element.id, uuid.UUID)
+    # -- calculating hash for the first time --
+    assert element.id_to_hash() == expected_hash
+    assert element.id == expected_hash
 
+    # -- `.id_to_hash()` is idempotent --
+    assert element.id_to_hash() == expected_hash
+    assert element.id == expected_hash
 
-def test_element_defaults_to_blank_id():
-    element = EmailElement()
-    assert isinstance(element.id, NoID)
+
+@pytest.mark.parametrize(
+    "element",
+    [
+        EmailElement(text=""),  # -- the default `element_id` is None --
+        Name(name="Example", text="hello there!"),  # -- the default `element_id` is None --
+        Name(name="Example", text="hello there!", element_id=None),
+    ],
+)
+def test_EmailElement_should_assign_a_UUID_only_once_and_only_at_the_first_id_request(
+    element: EmailElement,
+):
+    assert element._element_id is None, "Element should not have an ID yet"
+
+    # -- this should generate and assign a fresh UUID --
+    id_value = element.id
+
+    # -- check that the UUID is valid --
+    assert element._element_id is not None, "Element should already have an ID"
+    assert isinstance(id_value, str)
+    assert len(id_value) == 36
+    assert id_value.count("-") == 4
+
+    assert element.id == id_value, "UUID assignment should happen only once"
 
 
 def test_text_element_apply_cleaners():

diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
@@ -377,11 +377,13 @@ def test_partition_docx_includes_page_numbers_when_page_break_elements_are_suppr
 
 
 def test_partition_docx_includes_page_break_elements_when_so_instructed():
-    elements = partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
+    elements = partition_docx(
+        example_doc_path("handbook-1p.docx"), include_page_breaks=True, starting_page_number=3
+    )
 
     assert "PageBreak" in [type(e).__name__ for e in elements]
-    assert elements[1].metadata.page_number == 1
-    assert elements[-2].metadata.page_number == 2
+    assert elements[1].metadata.page_number == 3
+    assert elements[-2].metadata.page_number == 4
 
 
 # ------------------------------------------------------------------------------------------------

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -148,19 +148,21 @@ def test_partition_pdf_local_raises_with_no_filename():
 
 @pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
 @pytest.mark.parametrize(
-    ("strategy", "expected", "origin"),
+    ("strategy", "starting_page_number", "expected_page_numbers", "origin"),
     # fast: can't capture the "intentionally left blank page" page
     # others: will ignore the actual blank page
     [
-        (PartitionStrategy.FAST, {1, 4}, {"pdfminer"}),
-        (PartitionStrategy.HI_RES, {1, 3, 4}, {"yolox", "pdfminer"}),
-        (PartitionStrategy.OCR_ONLY, {1, 3, 4}, {"ocr_tesseract"}),
+        (PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
+        (PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
+        (PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
+        (PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
     ],
 )
-def test_partition_pdf(
+def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
     file_mode,
     strategy,
-    expected,
+    starting_page_number,
+    expected_page_numbers,
     origin,
     filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
 ):
@@ -169,23 +171,29 @@ def _test(result):
         # validate that the result is a non-empty list of elements
         assert len(result) > 10
         # check that the pdf has multiple different page numbers
-        assert {element.metadata.page_number for element in result} == expected
+        assert {element.metadata.page_number for element in result} == expected_page_numbers
         if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
             assert {element.metadata.detection_origin for element in result} == origin
 
     if file_mode == "filename":
-        result = pdf.partition_pdf(filename=filename, strategy=strategy)
+        result = pdf.partition_pdf(
+            filename=filename, strategy=strategy, starting_page_number=starting_page_number
+        )
         _test(result)
     elif file_mode == "rb":
         with open(filename, "rb") as f:
-            result = pdf.partition_pdf(file=f, strategy=strategy)
+            result = pdf.partition_pdf(
+                file=f, strategy=strategy, starting_page_number=starting_page_number
+            )
             _test(result)
     else:
         with open(filename, "rb") as test_file:
             spooled_temp_file = SpooledTemporaryFile()
             spooled_temp_file.write(test_file.read())
             spooled_temp_file.seek(0)
-            result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
+            result = pdf.partition_pdf(
+                file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
+            )
             _test(result)
 
 
@@ -298,10 +306,12 @@ def test_partition_pdf_with_no_page_breaks(
 def test_partition_pdf_with_fast_strategy(
     filename=example_doc_path("layout-parser-paper-fast.pdf"),
 ):
-    elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
+    elements = pdf.partition_pdf(
+        filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
+    )
     assert len(elements) > 10
     # check that the pdf has multiple different page numbers
-    assert {element.metadata.page_number for element in elements} == {1, 2}
+    assert {element.metadata.page_number for element in elements} == {3, 4}
     for element in elements:
         assert element.metadata.filename == "layout-parser-paper-fast.pdf"
 

diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py
@@ -703,6 +703,21 @@ def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
         list(opts.increment_page_number())
         assert opts.page_number == 2
 
+    def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
+        self, opts_args: dict[str, Any]
+    ):
+        opts = _PptxPartitionerOptions(**opts_args, starting_page_number=3)
+        # -- move to the "first" slide --
+        list(opts.increment_page_number())
+
+        table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
+        text_metadata = opts.text_metadata()
+
+        assert isinstance(table_metadata, ElementMetadata)
+        assert isinstance(text_metadata, ElementMetadata)
+        assert text_metadata.page_number == 3
+        assert table_metadata.page_number == 3
+
     # -- .pptx_file ------------------------------
 
     def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -363,6 +363,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
         extract_image_block_to_payload=False,
         hi_res_model_name=None,
         date_from_file_object=False,
+        starting_page_number=1,
     )
 
 
@@ -840,6 +841,11 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
     assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
 
 
+def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
+    elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
+    assert elements[1].metadata.page_number == 3
+
+
 EXPECTED_XLS_TEXT_LEN = 550
 
 

diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import pytest
 
+from test_unstructured.unit_utils import assign_hash_ids
 from unstructured.documents.elements import (
     Address,
     CheckBox,
@@ -44,7 +45,7 @@ def test_base64_gzipped_json_to_elements_can_deserialize_compressed_elements_fro
 
 
 def test_elements_to_base64_gzipped_json_can_serialize_elements_to_a_base64_str():
-    elements = [Title("Lorem"), Text("Lorem Ipsum")]
+    elements = assign_hash_ids([Title("Lorem"), Text("Lorem Ipsum")])
 
     assert base.elements_to_base64_gzipped_json(elements) == (
         "eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWtJVm5WDoq"

diff --git a/test_unstructured/staging/test_baseplate.py b/test_unstructured/staging/test_baseplate.py
@@ -1,3 +1,4 @@
+from test_unstructured.unit_utils import assign_hash_ids
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import (
     CoordinatesMetadata,
@@ -18,13 +19,14 @@ def test_stage_for_baseplate():
     system = PixelSpace(width=1700, height=2200)
     coordinates_metadata = CoordinatesMetadata(points=points, system=system)
     metadata = ElementMetadata(filename="fox.pdf", coordinates=coordinates_metadata)
-    elements = [
-        Title("A Wonderful Story About A Fox", metadata=metadata),
-        NarrativeText(
-            "A fox ran into the chicken coop and the chickens flew off!",
-            metadata=metadata,
-        ),
-    ]
+    elements = assign_hash_ids(
+        [
+            Title("A Wonderful Story About A Fox", metadata=metadata),
+            NarrativeText(
+                "A fox ran into the chicken coop and the chickens flew off!", metadata=metadata
+            ),
+        ]
+    )
 
     rows = stage_for_baseplate(elements)
     assert rows == {