Unstructured-IO · scanny · Apr 16, 2024 · Apr 3, 2024 · Apr 3, 2024 · Apr 3, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.13.3-dev6
+## 0.13.3-dev7
 
 ### Enhancements
 
 * **Add support for `start_index` in `html` links extraction**
 * **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this paritioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
 * **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
 * **Introduce `starting_page_number` parameter to partitioning functions** It applies to those partitioners which support `page_number` in element's metadata: PDF, TIFF, XLSX, DOC, DOCX, PPT, PPTX.
+* **Redesign the internal mechanism of assigning element IDs.** This allows for further enhancements related to element IDs such as deterministic and document-unique hashes. The way partitioning functions operate hasn't changed, which means `unique_element_ids` continues to be `False` by default, utilizing text hashes.
 
 ### Features
 

diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst
@@ -155,6 +155,16 @@ if you'd like to use the IDs as a primary key in a database, for example.
     elements = partition_text(text="Here is some example text.", unique_element_ids=True)
     elements[0].id
 
+Element ID Design Principles
+""""""""""""""""""""""""""""""""""""
+
+#. A partitioning function can assign only one of two available ID types to a returned element: a hash or a UUID.
+#. All elements that are returned come with an ID, which is never None.
+#. No matter which type of ID is used, it will always be in string format.
+#. Partitioning a document returns elements with hashes as their default IDs.
+
+To create elements independently of partitioning functions, refer to the ``Element`` class documentation in the source code (``unstructured/documents/elements.py``).
+
 
 Wrapping it all up
 ******************

diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py
@@ -10,53 +10,59 @@
 
 import pytest
 
+from test_unstructured.unit_utils import assign_hash_ids
 from unstructured.cleaners.core import clean_bullets, clean_prefix
 from unstructured.documents.coordinates import (
     CoordinateSystem,
     Orientation,
     RelativeCoordinateSystem,
 )
 from unstructured.documents.elements import (
-    UUID,
     CheckBox,
     ConsolidationStrategy,
     CoordinatesMetadata,
     DataSourceMetadata,
     Element,
     ElementMetadata,
-    NoID,
     Points,
     RegexMetadata,
     Text,
     Title,
 )
 
 
-def test_text_id():
-    text_element = Text(text="hello there!")
-    assert text_element.id == "c69509590d81db2f37f9d75480c8efed"
+def test_Text_is_JSON_serializable():
+    # -- This should run without an error --
+    json.dumps(Text(text="hello there!", element_id=None).to_dict())
 
 
-def test_text_uuid():
-    text_element = Text(text="hello there!", element_id=UUID())
-
-    id = text_element.id
-
-    assert isinstance(id, str)
-    assert len(id) == 36
-    assert id.count("-") == 4
-    # -- Test that the element is JSON serializable. This shold run without an error --
-    json.dumps(text_element.to_dict())
-
+@pytest.mark.parametrize(
+    "element",
+    [
+        Element(),
+        Text(text=""),  # -- element_id should be implicitly None --
+        Text(text="", element_id=None),  # -- setting explicitly to None --
+        CheckBox(),
+    ],
+)
+def test_Element_autoassigns_a_UUID_then_becomes_an_idempotent_and_deterministic_hash(
+    element: Element,
+):
+    assert element._element_id is None, "Element should not have an ID yet"
 
-def test_element_defaults_to_blank_id():
-    element = Element()
-    assert isinstance(element.id, NoID)
+    # -- element self-assigns itself a UUID only when the ID is requested --
+    assert isinstance(element.id, str)
+    assert len(element.id) == 36
+    assert element.id.count("-") == 4
 
+    expected_hash = "e3b0c44298fc1c149afbf4c8996fb924"
+    # -- calling `.id_to_hash()` changes the element's id-type to hash --
+    assert element.id_to_hash() == expected_hash
+    assert element.id == expected_hash
 
-def test_element_uuid():
-    element = Element(element_id=UUID())
-    assert isinstance(element.id, UUID)
+    # -- `.id_to_hash()` is idempotent --
+    assert element.id_to_hash() == expected_hash
+    assert element.id == expected_hash
 
 
 def test_text_element_apply_cleaners():
@@ -392,11 +398,13 @@ def and_it_serializes_a_data_source_sub_object_to_a_dict_when_it_is_present(self
         }
 
     def and_it_serializes_an_orig_elements_sub_object_to_base64_when_it_is_present(self):
+        elements = assign_hash_ids([Title("Lorem"), Text("Lorem Ipsum")])
         meta = ElementMetadata(
             category_depth=1,
-            orig_elements=[Title("Lorem"), Text("Lorem Ipsum")],
+            orig_elements=elements,
             page_number=2,
         )
+
         assert meta.to_dict() == {
             "category_depth": 1,
             "orig_elements": (

diff --git a/test_unstructured/documents/test_email_elements.py b/test_unstructured/documents/test_email_elements.py
@@ -1,26 +1,50 @@
-import uuid
 from functools import partial
 
 import pytest
 
 from unstructured.cleaners.core import clean_prefix
 from unstructured.cleaners.translate import translate_text
-from unstructured.documents.email_elements import UUID, EmailElement, Name, NoID
+from unstructured.documents.email_elements import EmailElement, Name
 
 
-def test_text_id():
-    name_element = Name(name="Example", text="hello there!")
-    assert name_element.id == "c69509590d81db2f37f9d75480c8efed"
+def test_Name_should_assign_a_deterministic_and_an_idempotent_hash():
+    element = Name(name="Example", text="hello there!")
+    expected_hash = "c69509590d81db2f37f9d75480c8efed"
 
+    assert element._element_id is None, "Element should not have an ID yet"
 
-def test_text_uuid():
-    name_element = Name(name="Example", text="hello there!", element_id=UUID())
-    assert isinstance(name_element.id, uuid.UUID)
+    # -- calculating hash for the first time --
+    assert element.id_to_hash() == expected_hash
+    assert element.id == expected_hash
 
+    # -- `.id_to_hash()` is idempotent --
+    assert element.id_to_hash() == expected_hash
+    assert element.id == expected_hash
 
-def test_element_defaults_to_blank_id():
-    element = EmailElement()
-    assert isinstance(element.id, NoID)
+
+@pytest.mark.parametrize(
+    "element",
+    [
+        EmailElement(text=""),  # -- the default `element_id` is None --
+        Name(name="Example", text="hello there!"),  # -- the default `element_id` is None --
+        Name(name="Example", text="hello there!", element_id=None),
+    ],
+)
+def test_EmailElement_should_assign_a_UUID_only_once_and_only_at_the_first_id_request(
+    element: EmailElement,
+):
+    assert element._element_id is None, "Element should not have an ID yet"
+
+    # -- this should generate and assign a fresh UUID --
+    id_value = element.id
+
+    # -- check that the UUID is valid --
+    assert element._element_id is not None, "Element should already have an ID"
+    assert isinstance(id_value, str)
+    assert len(id_value) == 36
+    assert id_value.count("-") == 4
+
+    assert element.id == id_value, "UUID assignment should happen only once"
 
 
 def test_text_element_apply_cleaners():

diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import pytest
 
+from test_unstructured.unit_utils import assign_hash_ids
 from unstructured.documents.elements import (
     Address,
     CheckBox,
@@ -44,7 +45,7 @@ def test_base64_gzipped_json_to_elements_can_deserialize_compressed_elements_fro
 
 
 def test_elements_to_base64_gzipped_json_can_serialize_elements_to_a_base64_str():
-    elements = [Title("Lorem"), Text("Lorem Ipsum")]
+    elements = assign_hash_ids([Title("Lorem"), Text("Lorem Ipsum")])
 
     assert base.elements_to_base64_gzipped_json(elements) == (
         "eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWtJVm5WDoq"

diff --git a/test_unstructured/staging/test_baseplate.py b/test_unstructured/staging/test_baseplate.py
@@ -1,3 +1,4 @@
+from test_unstructured.unit_utils import assign_hash_ids
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import (
     CoordinatesMetadata,
@@ -18,13 +19,14 @@ def test_stage_for_baseplate():
     system = PixelSpace(width=1700, height=2200)
     coordinates_metadata = CoordinatesMetadata(points=points, system=system)
     metadata = ElementMetadata(filename="fox.pdf", coordinates=coordinates_metadata)
-    elements = [
-        Title("A Wonderful Story About A Fox", metadata=metadata),
-        NarrativeText(
-            "A fox ran into the chicken coop and the chickens flew off!",
-            metadata=metadata,
-        ),
-    ]
+    elements = assign_hash_ids(
+        [
+            Title("A Wonderful Story About A Fox", metadata=metadata),
+            NarrativeText(
+                "A fox ran into the chicken coop and the chickens flew off!", metadata=metadata
+            ),
+        ]
+    )
 
     rows = stage_for_baseplate(elements)
     assert rows == {

diff --git a/test_unstructured/staging/test_label_studio.py b/test_unstructured/staging/test_label_studio.py
@@ -5,6 +5,7 @@
 import vcr
 from label_studio_sdk.client import Client
 
+from test_unstructured.unit_utils import assign_hash_ids
 from unstructured.documents.elements import NarrativeText, Title
 from unstructured.staging import label_studio
 
@@ -161,7 +162,7 @@ def test_init_prediction(score, raises, exception):
 
 
 def test_stage_with_annotation():
-    element = NarrativeText(text="A big brown bear")
+    elements = assign_hash_ids([NarrativeText(text="A big brown bear")])
     annotations = [
         label_studio.LabelStudioAnnotation(
             result=[
@@ -174,7 +175,7 @@ def test_stage_with_annotation():
             ],
         ),
     ]
-    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    label_studio_data = label_studio.stage_for_label_studio(elements, [annotations])
     assert label_studio_data == [
         {
             "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
@@ -199,7 +200,8 @@ def test_stage_with_annotation():
 
 
 def test_stage_with_prediction():
-    element = NarrativeText(text="A big brown bear")
+    elements = assign_hash_ids([NarrativeText(text="A big brown bear")])
+
     prediction = [
         label_studio.LabelStudioPrediction(
             result=[
@@ -213,7 +215,7 @@ def test_stage_with_prediction():
             score=0.98,
         ),
     ]
-    label_studio_data = label_studio.stage_for_label_studio([element], predictions=[prediction])
+    label_studio_data = label_studio.stage_for_label_studio(elements, predictions=[prediction])
     assert label_studio_data == [
         {
             "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
@@ -239,7 +241,8 @@ def test_stage_with_prediction():
 
 
 def test_stage_with_annotation_for_ner():
-    element = NarrativeText(text="A big brown bear")
+    elements = assign_hash_ids([NarrativeText(text="A big brown bear")])
+
     annotations = [
         label_studio.LabelStudioAnnotation(
             result=[
@@ -252,7 +255,7 @@ def test_stage_with_annotation_for_ner():
             ],
         ),
     ]
-    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    label_studio_data = label_studio.stage_for_label_studio(elements, [annotations])
     assert label_studio_data == [
         {
             "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},
@@ -324,7 +327,7 @@ def test_stage_with_annotation_raises_with_invalid_type():
 
 
 def test_stage_with_reviewed_annotation():
-    element = NarrativeText(text="A big brown bear")
+    elements = assign_hash_ids([NarrativeText(text="A big brown bear")])
     annotations = [
         label_studio.LabelStudioAnnotation(
             result=[
@@ -338,7 +341,7 @@ def test_stage_with_reviewed_annotation():
             reviews=[label_studio.LabelStudioReview(created_by={"user_id": 1}, accepted=True)],
         ),
     ]
-    label_studio_data = label_studio.stage_for_label_studio([element], [annotations])
+    label_studio_data = label_studio.stage_for_label_studio(elements, [annotations])
     assert label_studio_data == [
         {
             "data": {"text": "A big brown bear", "ref_id": "8f458d5d0635df3975ceb9109cef9e12"},

diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import json
 import os
 import re

diff --git a/test_unstructured/unit_utils.py b/test_unstructured/unit_utils.py
@@ -62,6 +62,13 @@ def assert_round_trips_through_JSON(elements: List[Element]) -> None:
     )
 
 
+def assign_hash_ids(elements: list[Element]) -> list[Element]:
+    """Updates the `id` attribute of each element to a hash."""
+    for element in elements:
+        element.id_to_hash()
+    return elements
+
+
 def _diff(heading: str, actual: str, expected: str):
     """Diff of actual compared to expected.
 

diff --git a/...ted-structured-output/Sharepoint-with-permissions/Shared Documents/stanley-cups.xlsx.json b/...ted-structured-output/Sharepoint-with-permissions/Shared Documents/stanley-cups.xlsx.json
@@ -18,8 +18,7 @@
         "eng"
       ],
       "page_name": "Stanley Cups",
-      "page_number": 1,
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_number": 1
     },
     "text": "Stanley Cups",
     "type": "Title"
@@ -68,8 +67,7 @@
         "eng"
       ],
       "page_name": "Stanley Cups Since 67",
-      "page_number": 2,
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_number": 2
     },
     "text": "Stanley Cups Since 67",
     "type": "Title"

diff --git a/...ured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json b/...ured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json
@@ -18,8 +18,7 @@
         "eng"
       ],
       "page_name": "Stanley Cups",
-      "page_number": 1,
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_number": 1
     },
     "text": "Stanley Cups",
     "type": "Title"
@@ -68,8 +67,7 @@
         "eng"
       ],
       "page_name": "Stanley Cups Since 67",
-      "page_number": 2,
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_number": 2
     },
     "text": "Stanley Cups Since 67",
     "type": "Title"