Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into fix/pdfminer-returns-…
Browse files Browse the repository at this point in the history
…cid-code
  • Loading branch information
badGarnet committed Dec 13, 2023
2 parents a7fc2e0 + d3a404c commit a4fd3f3
Show file tree
Hide file tree
Showing 10 changed files with 145 additions and 102 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.11.4-dev7
## 0.11.4-dev9

### Enhancements

Expand All @@ -16,6 +16,7 @@

### Fixes

* **Fix pdf `hi_res` partitioning failure when pdfminer fails.** Implemented logic to fall back to the "inferred_layout + OCR" if pdfminer fails in the `hi_res` strategy.
* **partition returning cid code in `hi_res`** Occasionally pdfminer can fail to decode the text in a pdf file and return cid code as text. Now when this happens the text from OCR is used.

## 0.11.3
Expand Down
40 changes: 27 additions & 13 deletions test_unstructured/documents/test_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Element,
ElementMetadata,
NoID,
Points,
RegexMetadata,
Text,
)
Expand All @@ -37,9 +38,13 @@ def test_text_id():

def test_text_uuid():
text_element = Text(text="hello there!", element_id=UUID())
assert len(text_element.id) == 36
assert text_element.id.count("-") == 4
# Test that the element is JSON serializable. This should run without an error

id = text_element.id

assert isinstance(id, str)
assert len(id) == 36
assert id.count("-") == 4
# -- Test that the element is JSON serializable. This should run without an error --
json.dumps(text_element.to_dict())


Expand Down Expand Up @@ -71,9 +76,13 @@ def test_text_element_apply_multiple_cleaners():


def test_apply_raises_if_func_does_not_produce_string():
def bad_cleaner(s: str):
return 1

text_element = Text(text="[1] A Textbook on Crocodile Habitats")
with pytest.raises(ValueError):
text_element.apply(lambda s: 1)

with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
text_element.apply(bad_cleaner) # pyright: ignore[reportGeneralTypeIssues]


@pytest.mark.parametrize(
Expand Down Expand Up @@ -106,22 +115,27 @@ def test_apply_raises_if_func_does_not_produce_string():
],
)
def test_convert_coordinates_to_new_system(
coordinates,
orientation1,
orientation2,
expected_coords,
coordinates: Points,
orientation1: Orientation,
orientation2: Orientation,
expected_coords: Points,
):
coord1 = CoordinateSystem(100, 200)
coord1.orientation = orientation1
coord2 = CoordinateSystem(1000, 2000)
coord2.orientation = orientation2
element = Element(coordinates=coordinates, coordinate_system=coord1)

new_coords = element.convert_coordinates_to_new_system(coord2)
for new_coord, expected_coord in zip(new_coords, expected_coords):
new_coord == pytest.approx(expected_coord)

assert new_coords is not None
for new_coord, expected in zip(new_coords, expected_coords):
assert new_coord == pytest.approx(expected) # pyright: ignore[reportUnknownMemberType]
element.convert_coordinates_to_new_system(coord2, in_place=True)
for new_coord, expected_coord in zip(element.metadata.coordinates.points, expected_coords):
assert new_coord == pytest.approx(expected_coord)
assert element.metadata.coordinates is not None
assert element.metadata.coordinates.points is not None
for new_coord, expected in zip(element.metadata.coordinates.points, expected_coords):
assert new_coord == pytest.approx(expected) # pyright: ignore[reportUnknownMemberType]
assert element.metadata.coordinates.system == coord2


Expand Down
1 change: 0 additions & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1055,7 +1055,6 @@ def test_partition_pdf_with_bad_color_profile():
[
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
("failure-after-repair.pdf", "PDFMiner failed to process PDF page 26 after repairing it."),
],
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.11.4-dev7" # pragma: no cover
__version__ = "0.11.4-dev9" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/cleaners/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def _validate_language_code(language_code: str):
)


def translate_text(text, source_lang: Optional[str] = None, target_lang: str = "en") -> str:
def translate_text(text: str, source_lang: Optional[str] = None, target_lang: str = "en") -> str:
"""Translates the foreign language text. If the source language is not specified, the
function will attempt to detect it using langdetect.
Expand Down
140 changes: 86 additions & 54 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
import re
import uuid
from types import MappingProxyType
from typing import Any, Callable, Dict, FrozenSet, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, FrozenSet, List, Optional, Sequence, Tuple, Union, cast

from typing_extensions import ParamSpec, TypedDict
from typing_extensions import ParamSpec, TypeAlias, TypedDict

from unstructured.documents.coordinates import (
TYPE_TO_COORDINATE_SYSTEM_MAP,
Expand All @@ -24,6 +24,9 @@
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.utils import lazyproperty

Point: TypeAlias = Tuple[float, float]
Points: TypeAlias = Tuple[Point, ...]


class NoID(abc.ABC):
"""Class to indicate that an element does not have an ID."""
Expand Down Expand Up @@ -61,10 +64,10 @@ def from_dict(cls, input_dict: Dict[str, Any]):
class CoordinatesMetadata:
"""Metadata fields that pertain to the coordinates of the element."""

points: Tuple[Tuple[float, float], ...]
system: CoordinateSystem
points: Optional[Points]
system: Optional[CoordinateSystem]

def __init__(self, points, system):
def __init__(self, points: Optional[Points], system: Optional[CoordinateSystem]):
# Both `points` and `system` must be present; one is not meaningful without the other.
if (points is None and system is not None) or (points is not None and system is None):
raise ValueError(
Expand Down Expand Up @@ -94,30 +97,38 @@ def to_dict(self):
@classmethod
def from_dict(cls, input_dict: Dict[str, Any]):
# `input_dict` may contain a tuple of tuples or a list of lists
def convert_to_tuple_of_tuples(sequence_of_sequences):
subsequences = []
def convert_to_points(sequence_of_sequences: Sequence[Sequence[float]]) -> Points:
points: List[Point] = []
for seq in sequence_of_sequences:
if isinstance(seq, list):
subsequences.append(tuple(seq))
points.append(cast(Point, tuple(seq)))
elif isinstance(seq, tuple):
subsequences.append(seq)
return tuple(subsequences)

input_points = input_dict.get("points", None)
points = convert_to_tuple_of_tuples(input_points) if input_points is not None else None
width = input_dict.get("layout_width", None)
height = input_dict.get("layout_height", None)
system = None
if input_dict.get("system", None) == "RelativeCoordinateSystem":
system = RelativeCoordinateSystem()
elif (
width is not None
and height is not None
and input_dict.get("system", None) in TYPE_TO_COORDINATE_SYSTEM_MAP
):
system = TYPE_TO_COORDINATE_SYSTEM_MAP[input_dict["system"]](width, height)
constructor_args = {"points": points, "system": system}
return cls(**constructor_args)
points.append(cast(Point, seq))
return tuple(points)

# -- parse points --
input_points = input_dict.get("points")
points = convert_to_points(input_points) if input_points is not None else None

# -- parse system --
system_name = input_dict.get("system")
width = input_dict.get("layout_width")
height = input_dict.get("layout_height")
system = (
None
if system_name is None
else RelativeCoordinateSystem()
if system_name == "RelativeCoordinateSystem"
else TYPE_TO_COORDINATE_SYSTEM_MAP[system_name](width, height)
if (
width is not None
and height is not None
and system_name in TYPE_TO_COORDINATE_SYSTEM_MAP
)
else None
)

return cls(points=points, system=system)


class RegexMetadata(TypedDict):
Expand Down Expand Up @@ -637,14 +648,19 @@ def to_dict(self) -> Dict[str, Any]:
}

def convert_coordinates_to_new_system(
self,
new_system: CoordinateSystem,
in_place=True,
) -> Optional[Tuple[Tuple[Union[int, float], Union[int, float]], ...]]:
"""Converts the element location coordinates to a new coordinate system. If `in_place` is true,
changes the coordinates in place and updates the coordinate system."""
if self.metadata.coordinates is None:
self, new_system: CoordinateSystem, in_place: bool = True
) -> Optional[Points]:
"""Converts the element location coordinates to a new coordinate system.
If `in_place` is true, changes the coordinates in place and updates the coordinate system.
"""
if (
self.metadata.coordinates is None
or self.metadata.coordinates.system is None
or self.metadata.coordinates.points is None
):
return None

new_coordinates = tuple(
self.metadata.coordinates.system.convert_coordinates_to_new_system(
new_system=new_system,
Expand All @@ -653,15 +669,19 @@ def convert_coordinates_to_new_system(
)
for x, y in self.metadata.coordinates.points
)

if in_place:
self.metadata.coordinates.points = new_coordinates
self.metadata.coordinates.system = new_system

return new_coordinates


class CheckBox(Element):
"""A checkbox with an attribute indicating whether it's checked or not. Primarily used
in documents that are forms"""
"""A checkbox with an attribute indicating whether it's checked or not.
Primarily used in documents that are forms.
"""

def __init__(
self,
Expand All @@ -682,12 +702,18 @@ def __init__(
)
self.checked: bool = checked

def __eq__(self, other):
return (self.checked == other.checked) and (
self.metadata.coordinates == other.metadata.coordinates
def __eq__(self, other: object) -> bool:
if not isinstance(other, CheckBox):
return False
return all(
(
self.checked == other.checked,
self.metadata.coordinates == other.metadata.coordinates,
)
)

def to_dict(self) -> dict:
def to_dict(self) -> Dict[str, Any]:
"""Serialize to JSON-compatible (str keys) dict."""
out = super().to_dict()
out["type"] = "CheckBox"
out["checked"] = self.checked
Expand Down Expand Up @@ -729,20 +755,23 @@ def __init__(
detection_origin=detection_origin,
)

def __str__(self):
return self.text

def __eq__(self, other):
def __eq__(self, other: object):
if not isinstance(other, Text):
return False
return all(
[
(self.text == other.text),
(self.metadata.coordinates == other.metadata.coordinates),
(self.category == other.category),
(self.embeddings == other.embeddings),
],
(
self.text == other.text,
self.metadata.coordinates == other.metadata.coordinates,
self.category == other.category,
self.embeddings == other.embeddings,
),
)

def to_dict(self) -> dict:
def __str__(self):
return self.text

def to_dict(self) -> Dict[str, Any]:
"""Serialize to JSON-compatible (str keys) dict."""
out = super().to_dict()
out["element_id"] = self.id
out["type"] = self.category
Expand All @@ -751,14 +780,17 @@ def to_dict(self) -> dict:
out["embeddings"] = self.embeddings
return out

def apply(self, *cleaners: Callable):
"""Applies a cleaning brick to the text element. The function that's passed in
should take a string as input and produce a string as output."""
def apply(self, *cleaners: Callable[[str], str]):
"""Applies a cleaning brick to the text element.
The function that's passed in should take a string as input and produce a string as
output.
"""
cleaned_text = self.text
for cleaner in cleaners:
cleaned_text = cleaner(cleaned_text)

if not isinstance(cleaned_text, str):
if not isinstance(cleaned_text, str): # pyright: ignore[reportUnnecessaryIsInstance]
raise ValueError("Cleaner produced a non-string output.")

self.text = cleaned_text
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def supplement_page_layout_with_ocr(
)
elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
for element in page_layout.elements:
if element.text == "":
if not element.text:
padding = env_config.IMAGE_CROP_PAD
padded_element = pad_element_bboxes(element, padding=padding)
cropped_image = image.crop(
Expand Down
Loading

0 comments on commit a4fd3f3

Please sign in to comment.