diff --git a/CHANGELOG.md b/CHANGELOG.md index c70f82bdaf..0bf06d23c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.11.4-dev7 +## 0.11.4-dev9 ### Enhancements @@ -16,6 +16,7 @@ ### Fixes +* **Fix pdf `hi_res` partitioning failure when pdfminer fails.** Implemented logic to fall back to the "inferred_layout + OCR" if pdfminer fails in the `hi_res` strategy. * **partition returning cid code in `hi_res`** occasaionally pdfminer can fail to decode the text in an pdf file and return cid code as text. Now when this happens the text from OCR is used. ## 0.11.3 diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index 2b2d659be8..bf8bc7a146 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -25,6 +25,7 @@ Element, ElementMetadata, NoID, + Points, RegexMetadata, Text, ) @@ -37,9 +38,13 @@ def test_text_id(): def test_text_uuid(): text_element = Text(text="hello there!", element_id=UUID()) - assert len(text_element.id) == 36 - assert text_element.id.count("-") == 4 - # Test that the element is JSON serializable. This shold run without an error + + id = text_element.id + + assert isinstance(id, str) + assert len(id) == 36 + assert id.count("-") == 4 + # -- Test that the element is JSON serializable. 
This should run without an error -- json.dumps(text_element.to_dict()) @@ -71,9 +76,13 @@ def test_text_element_apply_multiple_cleaners(): def test_apply_raises_if_func_does_not_produce_string(): + def bad_cleaner(s: str): + return 1 + text_element = Text(text="[1] A Textbook on Crocodile Habitats") - with pytest.raises(ValueError): - text_element.apply(lambda s: 1) + + with pytest.raises(ValueError, match="Cleaner produced a non-string output."): + text_element.apply(bad_cleaner) # pyright: ignore[reportGeneralTypeIssues] @pytest.mark.parametrize( @@ -106,22 +115,27 @@ def test_apply_raises_if_func_does_not_produce_string(): ], ) def test_convert_coordinates_to_new_system( - coordinates, - orientation1, - orientation2, - expected_coords, + coordinates: Points, + orientation1: Orientation, + orientation2: Orientation, + expected_coords: Points, ): coord1 = CoordinateSystem(100, 200) coord1.orientation = orientation1 coord2 = CoordinateSystem(1000, 2000) coord2.orientation = orientation2 element = Element(coordinates=coordinates, coordinate_system=coord1) + new_coords = element.convert_coordinates_to_new_system(coord2) - for new_coord, expected_coord in zip(new_coords, expected_coords): - new_coord == pytest.approx(expected_coord) + + assert new_coords is not None + for new_coord, expected in zip(new_coords, expected_coords): + assert new_coord == pytest.approx(expected) # pyright: ignore[reportUnknownMemberType] element.convert_coordinates_to_new_system(coord2, in_place=True) - for new_coord, expected_coord in zip(element.metadata.coordinates.points, expected_coords): - assert new_coord == pytest.approx(expected_coord) + assert element.metadata.coordinates is not None + assert element.metadata.coordinates.points is not None + for new_coord, expected in zip(element.metadata.coordinates.points, expected_coords): + assert new_coord == pytest.approx(expected) # pyright: ignore[reportUnknownMemberType] assert element.metadata.coordinates.system == coord2 diff --git 
a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index efd4a9b3a2..3c38ae1bdf 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1055,7 +1055,6 @@ def test_partition_pdf_with_bad_color_profile(): [ ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."), ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."), - ("failure-after-repair.pdf", "PDFMiner failed to process PDF page 26 after repairing it."), ], ) def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog): diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4fc43477d0..e0544a08fd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.4-dev7" # pragma: no cover +__version__ = "0.11.4-dev9" # pragma: no cover diff --git a/unstructured/cleaners/translate.py b/unstructured/cleaners/translate.py index 8cb16a17bb..0e38106d3c 100644 --- a/unstructured/cleaners/translate.py +++ b/unstructured/cleaners/translate.py @@ -21,7 +21,7 @@ def _validate_language_code(language_code: str): ) -def translate_text(text, source_lang: Optional[str] = None, target_lang: str = "en") -> str: +def translate_text(text: str, source_lang: Optional[str] = None, target_lang: str = "en") -> str: """Translates the foreign language text. If the source language is not specified, the function will attempt to detect it using langdetect. 
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 365e2888f1..a5370a2e16 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -12,9 +12,9 @@ import re import uuid from types import MappingProxyType -from typing import Any, Callable, Dict, FrozenSet, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, FrozenSet, List, Optional, Sequence, Tuple, Union, cast -from typing_extensions import ParamSpec, TypedDict +from typing_extensions import ParamSpec, TypeAlias, TypedDict from unstructured.documents.coordinates import ( TYPE_TO_COORDINATE_SYSTEM_MAP, @@ -24,6 +24,9 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.utils import lazyproperty +Point: TypeAlias = Tuple[float, float] +Points: TypeAlias = Tuple[Point, ...] + class NoID(abc.ABC): """Class to indicate that an element do not have an ID.""" @@ -61,10 +64,10 @@ def from_dict(cls, input_dict: Dict[str, Any]): class CoordinatesMetadata: """Metadata fields that pertain to the coordinates of the element.""" - points: Tuple[Tuple[float, float], ...] - system: CoordinateSystem + points: Optional[Points] + system: Optional[CoordinateSystem] - def __init__(self, points, system): + def __init__(self, points: Optional[Points], system: Optional[CoordinateSystem]): # Both `points` and `system` must be present; one is not meaningful without the other. 
if (points is None and system is not None) or (points is not None and system is None): raise ValueError( @@ -94,30 +97,38 @@ def to_dict(self): @classmethod def from_dict(cls, input_dict: Dict[str, Any]): # `input_dict` may contain a tuple of tuples or a list of lists - def convert_to_tuple_of_tuples(sequence_of_sequences): - subsequences = [] + def convert_to_points(sequence_of_sequences: Sequence[Sequence[float]]) -> Points: + points: List[Point] = [] for seq in sequence_of_sequences: if isinstance(seq, list): - subsequences.append(tuple(seq)) + points.append(cast(Point, tuple(seq))) elif isinstance(seq, tuple): - subsequences.append(seq) - return tuple(subsequences) - - input_points = input_dict.get("points", None) - points = convert_to_tuple_of_tuples(input_points) if input_points is not None else None - width = input_dict.get("layout_width", None) - height = input_dict.get("layout_height", None) - system = None - if input_dict.get("system", None) == "RelativeCoordinateSystem": - system = RelativeCoordinateSystem() - elif ( - width is not None - and height is not None - and input_dict.get("system", None) in TYPE_TO_COORDINATE_SYSTEM_MAP - ): - system = TYPE_TO_COORDINATE_SYSTEM_MAP[input_dict["system"]](width, height) - constructor_args = {"points": points, "system": system} - return cls(**constructor_args) + points.append(cast(Point, seq)) + return tuple(points) + + # -- parse points -- + input_points = input_dict.get("points") + points = convert_to_points(input_points) if input_points is not None else None + + # -- parse system -- + system_name = input_dict.get("system") + width = input_dict.get("layout_width") + height = input_dict.get("layout_height") + system = ( + None + if system_name is None + else RelativeCoordinateSystem() + if system_name == "RelativeCoordinateSystem" + else TYPE_TO_COORDINATE_SYSTEM_MAP[system_name](width, height) + if ( + width is not None + and height is not None + and system_name in TYPE_TO_COORDINATE_SYSTEM_MAP + ) + else None + 
) + + return cls(points=points, system=system) class RegexMetadata(TypedDict): @@ -637,14 +648,19 @@ def to_dict(self) -> Dict[str, Any]: } def convert_coordinates_to_new_system( - self, - new_system: CoordinateSystem, - in_place=True, - ) -> Optional[Tuple[Tuple[Union[int, float], Union[int, float]], ...]]: - """Converts the element location coordinates to a new coordinate system. If inplace is true, - changes the coordinates in place and updates the coordinate system.""" - if self.metadata.coordinates is None: + self, new_system: CoordinateSystem, in_place: bool = True + ) -> Optional[Points]: + """Converts the element location coordinates to a new coordinate system. + + If `in_place` is true, changes the coordinates in place and updates the coordinate system. + """ + if ( + self.metadata.coordinates is None + or self.metadata.coordinates.system is None + or self.metadata.coordinates.points is None + ): return None + new_coordinates = tuple( self.metadata.coordinates.system.convert_coordinates_to_new_system( new_system=new_system, @@ -653,15 +669,19 @@ def convert_coordinates_to_new_system( ) for x, y in self.metadata.coordinates.points ) + if in_place: self.metadata.coordinates.points = new_coordinates self.metadata.coordinates.system = new_system + return new_coordinates class CheckBox(Element): - """A checkbox with an attribute indicating whether its checked or not. Primarily used - in documents that are forms""" + """A checkbox with an attribute indicating whether it's checked or not. + + Primarily used in documents that are forms. 
+ """ def __init__( self, @@ -682,12 +702,18 @@ def __init__( ) self.checked: bool = checked - def __eq__(self, other): - return (self.checked == other.checked) and ( - self.metadata.coordinates == other.metadata.coordinates + def __eq__(self, other: object) -> bool: + if not isinstance(other, CheckBox): + return False + return all( + ( + self.checked == other.checked, + self.metadata.coordinates == other.metadata.coordinates, + ) ) - def to_dict(self) -> dict: + def to_dict(self) -> Dict[str, Any]: + """Serialize to JSON-compatible (str keys) dict.""" out = super().to_dict() out["type"] = "CheckBox" out["checked"] = self.checked @@ -729,20 +755,23 @@ def __init__( detection_origin=detection_origin, ) - def __str__(self): - return self.text - - def __eq__(self, other): + def __eq__(self, other: object): + if not isinstance(other, Text): + return False return all( - [ - (self.text == other.text), - (self.metadata.coordinates == other.metadata.coordinates), - (self.category == other.category), - (self.embeddings == other.embeddings), - ], + ( + self.text == other.text, + self.metadata.coordinates == other.metadata.coordinates, + self.category == other.category, + self.embeddings == other.embeddings, + ), ) - def to_dict(self) -> dict: + def __str__(self): + return self.text + + def to_dict(self) -> Dict[str, Any]: + """Serialize to JSON-compatible (str keys) dict.""" out = super().to_dict() out["element_id"] = self.id out["type"] = self.category @@ -751,14 +780,17 @@ def to_dict(self) -> dict: out["embeddings"] = self.embeddings return out - def apply(self, *cleaners: Callable): - """Applies a cleaning brick to the text element. The function that's passed in - should take a string as input and produce a string as output.""" + def apply(self, *cleaners: Callable[[str], str]): + """Applies a cleaning brick to the text element. + + The function that's passed in should take a string as input and produce a string as + output. 
+ """ cleaned_text = self.text for cleaner in cleaners: cleaned_text = cleaner(cleaned_text) - if not isinstance(cleaned_text, str): + if not isinstance(cleaned_text, str): # pyright: ignore[reportUnnecessaryIsInstance] raise ValueError("Cleaner produced a non-string output.") self.text = cleaned_text diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 7e5daa77d0..d4acbb42ac 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -198,7 +198,7 @@ def supplement_page_layout_with_ocr( ) elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: for element in page_layout.elements: - if element.text == "": + if not element.text: padding = env_config.IMAGE_CROP_PAD padded_element = pad_element_bboxes(element, padding=padding) cropped_image = image.crop( diff --git a/unstructured/partition/pdf_image/pdf.py b/unstructured/partition/pdf_image/pdf.py index 01ff361782..dbd42f1e0d 100644 --- a/unstructured/partition/pdf_image/pdf.py +++ b/unstructured/partition/pdf_image/pdf.py @@ -238,6 +238,7 @@ def _partition_pdf_or_image_local( ocr_mode: str = OCRMode.FULL_PAGE.value, model_name: Optional[str] = None, metadata_last_modified: Optional[str] = None, + pdf_text_extractable: bool = False, extract_images_in_pdf: bool = False, extract_element_types: Optional[List[str]] = None, image_output_dir_path: Optional[str] = None, @@ -281,12 +282,14 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, ) - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = process_file_with_pdfminer( - inferred_document_layout, - filename, - is_image, - ) + if pdf_text_extractable is True: + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = process_file_with_pdfminer( + inferred_document_layout, + filename, + ) + else: + merged_document_layout = inferred_document_layout if 
model_name.startswith("chipper"): # NOTE(alan): We shouldn't do OCR with chipper @@ -310,13 +313,14 @@ def _partition_pdf_or_image_local( ) if hasattr(file, "seek"): file.seek(0) - - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = process_data_with_pdfminer( - inferred_document_layout, - file, - is_image, - ) + if pdf_text_extractable is True: + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = process_data_with_pdfminer( + inferred_document_layout, + file, + ) + else: + merged_document_layout = inferred_document_layout if model_name.startswith("chipper"): # NOTE(alan): We shouldn't do OCR with chipper @@ -339,6 +343,11 @@ def _partition_pdf_or_image_local( kwargs["sort_mode"] = SORT_MODE_DONT final_document_layout = clean_pdfminer_inner_elements(final_document_layout) + + for page in final_document_layout.pages: + for el in page.elements: + el.text = el.text or "" + elements = document_to_element_list( final_document_layout, sortable=True, @@ -452,7 +461,7 @@ def partition_pdf_or_image( isinstance(el, Text) and el.text.strip() for el in extracted_elements ) except Exception as e: - logger.error(e, exc_info=True) + logger.error(e) logger.warning("PDF text extraction failed, skip text extraction...") strategy = determine_pdf_or_image_strategy( @@ -476,6 +485,7 @@ def partition_pdf_or_image( include_page_breaks=include_page_breaks, languages=languages, metadata_last_modified=metadata_last_modified or last_modification_date, + pdf_text_extractable=pdf_text_extractable, extract_images_in_pdf=extract_images_in_pdf, extract_element_types=extract_element_types, image_output_dir_path=image_output_dir_path, diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 1cbbf6c0c4..8cffdbeaa7 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ 
b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -27,14 +27,12 @@ def process_file_with_pdfminer( inferred_document_layout: "DocumentLayout", filename: str = "", - is_image: bool = False, ) -> "DocumentLayout": with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) inferred_document_layout = process_data_with_pdfminer( inferred_document_layout=inferred_document_layout, file=fp, - is_image=is_image, ) return inferred_document_layout @@ -42,13 +40,8 @@ def process_file_with_pdfminer( def process_data_with_pdfminer( inferred_document_layout: "DocumentLayout", file: Optional[Union[bytes, BinaryIO]] = None, - is_image: bool = False, ) -> "DocumentLayout": - if is_image: - for page in inferred_document_layout.pages: - for el in page.elements: - el.text = el.text or "" - return inferred_document_layout + """Process document data using PDFMiner to extract layout information.""" extracted_layouts = get_regions_by_pdfminer(file) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 5c50bff88c..564d9a15fd 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -104,14 +104,8 @@ def open_pdfminer_pages_generator( with pikepdf.Pdf.open(error_page_data) as pdf: pdf.save(tmp.name) page = next(PDFPage.get_pages(open(tmp.name, "rb"))) # noqa: SIM115 - try: - interpreter.process_page(page) - page_layout = device.get_result() - except Exception: - logger.warning( - f"PDFMiner failed to process PDF page {i+1} after repairing it." - ) - break + interpreter.process_page(page) + page_layout = device.get_result() i += 1 yield page, page_layout except PSSyntaxError: