Merge remote-tracking branch 'origin/main' into fix/pdfminer-returns-cid-code
Unstructured-IO · Dec 13, 2023 · a4fd3f3 · a4fd3f3
2 parents a7fc2e0 + d3a404c
commit a4fd3f3
Show file tree

Hide file tree

Showing 10 changed files with 145 additions and 102 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.11.4-dev7
+## 0.11.4-dev9
 
 ### Enhancements
 
@@ -16,6 +16,7 @@
 
 ### Fixes
 
+* **Fix pdf `hi_res` partitioning failure when pdfminer fails.** Implemented logic to fall back to the "inferred_layout + OCR" if pdfminer fails in the `hi_res` strategy.
 * **partition returning cid code in `hi_res`** occasionally pdfminer can fail to decode the text in a pdf file and return cid code as text. Now when this happens the text from OCR is used.
 
 ## 0.11.3

diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py
@@ -25,6 +25,7 @@
     Element,
     ElementMetadata,
     NoID,
+    Points,
     RegexMetadata,
     Text,
 )
@@ -37,9 +38,13 @@ def test_text_id():
 
 def test_text_uuid():
     text_element = Text(text="hello there!", element_id=UUID())
-    assert len(text_element.id) == 36
-    assert text_element.id.count("-") == 4
-    # Test that the element is JSON serializable. This shold run without an error
+
+    id = text_element.id
+
+    assert isinstance(id, str)
+    assert len(id) == 36
+    assert id.count("-") == 4
+    # -- Test that the element is JSON serializable. This shold run without an error --
     json.dumps(text_element.to_dict())
 
 
@@ -71,9 +76,13 @@ def test_text_element_apply_multiple_cleaners():
 
 
 def test_apply_raises_if_func_does_not_produce_string():
+    def bad_cleaner(s: str):
+        return 1
+
     text_element = Text(text="[1] A Textbook on Crocodile Habitats")
-    with pytest.raises(ValueError):
-        text_element.apply(lambda s: 1)
+
+    with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
+        text_element.apply(bad_cleaner)  # pyright: ignore[reportGeneralTypeIssues]
 
 
 @pytest.mark.parametrize(
@@ -106,22 +115,27 @@ def test_apply_raises_if_func_does_not_produce_string():
     ],
 )
 def test_convert_coordinates_to_new_system(
-    coordinates,
-    orientation1,
-    orientation2,
-    expected_coords,
+    coordinates: Points,
+    orientation1: Orientation,
+    orientation2: Orientation,
+    expected_coords: Points,
 ):
     coord1 = CoordinateSystem(100, 200)
     coord1.orientation = orientation1
     coord2 = CoordinateSystem(1000, 2000)
     coord2.orientation = orientation2
     element = Element(coordinates=coordinates, coordinate_system=coord1)
+
     new_coords = element.convert_coordinates_to_new_system(coord2)
-    for new_coord, expected_coord in zip(new_coords, expected_coords):
-        new_coord == pytest.approx(expected_coord)
+
+    assert new_coords is not None
+    for new_coord, expected in zip(new_coords, expected_coords):
+        assert new_coord == pytest.approx(expected)  # pyright: ignore[reportUnknownMemberType]
     element.convert_coordinates_to_new_system(coord2, in_place=True)
-    for new_coord, expected_coord in zip(element.metadata.coordinates.points, expected_coords):
-        assert new_coord == pytest.approx(expected_coord)
+    assert element.metadata.coordinates is not None
+    assert element.metadata.coordinates.points is not None
+    for new_coord, expected in zip(element.metadata.coordinates.points, expected_coords):
+        assert new_coord == pytest.approx(expected)  # pyright: ignore[reportUnknownMemberType]
     assert element.metadata.coordinates.system == coord2
 
 

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1055,7 +1055,6 @@ def test_partition_pdf_with_bad_color_profile():
     [
         ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
         ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
-        ("failure-after-repair.pdf", "PDFMiner failed to process PDF page 26 after repairing it."),
     ],
 )
 def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.4-dev7"  # pragma: no cover
+__version__ = "0.11.4-dev9"  # pragma: no cover
diff --git a/unstructured/cleaners/translate.py b/unstructured/cleaners/translate.py
@@ -21,7 +21,7 @@ def _validate_language_code(language_code: str):
         )
 
 
-def translate_text(text, source_lang: Optional[str] = None, target_lang: str = "en") -> str:
+def translate_text(text: str, source_lang: Optional[str] = None, target_lang: str = "en") -> str:
     """Translates the foreign language text. If the source language is not specified, the
     function will attempt to detect it using langdetect.
 

diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
@@ -12,9 +12,9 @@
 import re
 import uuid
 from types import MappingProxyType
-from typing import Any, Callable, Dict, FrozenSet, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, FrozenSet, List, Optional, Sequence, Tuple, Union, cast
 
-from typing_extensions import ParamSpec, TypedDict
+from typing_extensions import ParamSpec, TypeAlias, TypedDict
 
 from unstructured.documents.coordinates import (
     TYPE_TO_COORDINATE_SYSTEM_MAP,
@@ -24,6 +24,9 @@
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.utils import lazyproperty
 
+Point: TypeAlias = Tuple[float, float]
+Points: TypeAlias = Tuple[Point, ...]
+
 
 class NoID(abc.ABC):
     """Class to indicate that an element do not have an ID."""
@@ -61,10 +64,10 @@ def from_dict(cls, input_dict: Dict[str, Any]):
 class CoordinatesMetadata:
     """Metadata fields that pertain to the coordinates of the element."""
 
-    points: Tuple[Tuple[float, float], ...]
-    system: CoordinateSystem
+    points: Optional[Points]
+    system: Optional[CoordinateSystem]
 
-    def __init__(self, points, system):
+    def __init__(self, points: Optional[Points], system: Optional[CoordinateSystem]):
         # Both `points` and `system` must be present; one is not meaningful without the other.
         if (points is None and system is not None) or (points is not None and system is None):
             raise ValueError(
@@ -94,30 +97,38 @@ def to_dict(self):
     @classmethod
     def from_dict(cls, input_dict: Dict[str, Any]):
         # `input_dict` may contain a tuple of tuples or a list of lists
-        def convert_to_tuple_of_tuples(sequence_of_sequences):
-            subsequences = []
+        def convert_to_points(sequence_of_sequences: Sequence[Sequence[float]]) -> Points:
+            points: List[Point] = []
             for seq in sequence_of_sequences:
                 if isinstance(seq, list):
-                    subsequences.append(tuple(seq))
+                    points.append(cast(Point, tuple(seq)))
                 elif isinstance(seq, tuple):
-                    subsequences.append(seq)
-            return tuple(subsequences)
-
-        input_points = input_dict.get("points", None)
-        points = convert_to_tuple_of_tuples(input_points) if input_points is not None else None
-        width = input_dict.get("layout_width", None)
-        height = input_dict.get("layout_height", None)
-        system = None
-        if input_dict.get("system", None) == "RelativeCoordinateSystem":
-            system = RelativeCoordinateSystem()
-        elif (
-            width is not None
-            and height is not None
-            and input_dict.get("system", None) in TYPE_TO_COORDINATE_SYSTEM_MAP
-        ):
-            system = TYPE_TO_COORDINATE_SYSTEM_MAP[input_dict["system"]](width, height)
-        constructor_args = {"points": points, "system": system}
-        return cls(**constructor_args)
+                    points.append(cast(Point, seq))
+            return tuple(points)
+
+        # -- parse points --
+        input_points = input_dict.get("points")
+        points = convert_to_points(input_points) if input_points is not None else None
+
+        # -- parse system --
+        system_name = input_dict.get("system")
+        width = input_dict.get("layout_width")
+        height = input_dict.get("layout_height")
+        system = (
+            None
+            if system_name is None
+            else RelativeCoordinateSystem()
+            if system_name == "RelativeCoordinateSystem"
+            else TYPE_TO_COORDINATE_SYSTEM_MAP[system_name](width, height)
+            if (
+                width is not None
+                and height is not None
+                and system_name in TYPE_TO_COORDINATE_SYSTEM_MAP
+            )
+            else None
+        )
+
+        return cls(points=points, system=system)
 
 
 class RegexMetadata(TypedDict):
@@ -637,14 +648,19 @@ def to_dict(self) -> Dict[str, Any]:
         }
 
     def convert_coordinates_to_new_system(
-        self,
-        new_system: CoordinateSystem,
-        in_place=True,
-    ) -> Optional[Tuple[Tuple[Union[int, float], Union[int, float]], ...]]:
-        """Converts the element location coordinates to a new coordinate system. If inplace is true,
-        changes the coordinates in place and updates the coordinate system."""
-        if self.metadata.coordinates is None:
+        self, new_system: CoordinateSystem, in_place: bool = True
+    ) -> Optional[Points]:
+        """Converts the element location coordinates to a new coordinate system.
+
+        If inplace is true, changes the coordinates in place and updates the coordinate system.
+        """
+        if (
+            self.metadata.coordinates is None
+            or self.metadata.coordinates.system is None
+            or self.metadata.coordinates.points is None
+        ):
             return None
+
         new_coordinates = tuple(
             self.metadata.coordinates.system.convert_coordinates_to_new_system(
                 new_system=new_system,
@@ -653,15 +669,19 @@ def convert_coordinates_to_new_system(
             )
             for x, y in self.metadata.coordinates.points
         )
+
         if in_place:
             self.metadata.coordinates.points = new_coordinates
             self.metadata.coordinates.system = new_system
+
         return new_coordinates
 
 
 class CheckBox(Element):
-    """A checkbox with an attribute indicating whether its checked or not. Primarily used
-    in documents that are forms"""
+    """A checkbox with an attribute indicating whether its checked or not.
+
+    Primarily used in documents that are forms.
+    """
 
     def __init__(
         self,
@@ -682,12 +702,18 @@ def __init__(
         )
         self.checked: bool = checked
 
-    def __eq__(self, other):
-        return (self.checked == other.checked) and (
-            self.metadata.coordinates == other.metadata.coordinates
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, CheckBox):
+            return False
+        return all(
+            (
+                self.checked == other.checked,
+                self.metadata.coordinates == other.metadata.coordinates,
+            )
         )
 
-    def to_dict(self) -> dict:
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize to JSON-compatible (str keys) dict."""
         out = super().to_dict()
         out["type"] = "CheckBox"
         out["checked"] = self.checked
@@ -729,20 +755,23 @@ def __init__(
             detection_origin=detection_origin,
         )
 
-    def __str__(self):
-        return self.text
-
-    def __eq__(self, other):
+    def __eq__(self, other: object):
+        if not isinstance(other, Text):
+            return False
         return all(
-            [
-                (self.text == other.text),
-                (self.metadata.coordinates == other.metadata.coordinates),
-                (self.category == other.category),
-                (self.embeddings == other.embeddings),
-            ],
+            (
+                self.text == other.text,
+                self.metadata.coordinates == other.metadata.coordinates,
+                self.category == other.category,
+                self.embeddings == other.embeddings,
+            ),
         )
 
-    def to_dict(self) -> dict:
+    def __str__(self):
+        return self.text
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize to JSON-compatible (str keys) dict."""
         out = super().to_dict()
         out["element_id"] = self.id
         out["type"] = self.category
@@ -751,14 +780,17 @@ def to_dict(self) -> dict:
             out["embeddings"] = self.embeddings
         return out
 
-    def apply(self, *cleaners: Callable):
-        """Applies a cleaning brick to the text element. The function that's passed in
-        should take a string as input and produce a string as output."""
+    def apply(self, *cleaners: Callable[[str], str]):
+        """Applies a cleaning brick to the text element.
+
+        The function that's passed in should take a string as input and produce a string as
+        output.
+        """
         cleaned_text = self.text
         for cleaner in cleaners:
             cleaned_text = cleaner(cleaned_text)
 
-        if not isinstance(cleaned_text, str):
+        if not isinstance(cleaned_text, str):  # pyright: ignore[reportUnnecessaryIsInstance]
             raise ValueError("Cleaner produced a non-string output.")
 
         self.text = cleaned_text

diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -198,7 +198,7 @@ def supplement_page_layout_with_ocr(
         )
     elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
         for element in page_layout.elements:
-            if element.text == "":
+            if not element.text:
                 padding = env_config.IMAGE_CROP_PAD
                 padded_element = pad_element_bboxes(element, padding=padding)
                 cropped_image = image.crop(
Original file line number	Diff line number	Diff line change
@@ -1 +1 @@
-__version__ = "0.11.4-dev7" # pragma: no cover
+__version__ = "0.11.4-dev9" # pragma: no cover