From 34af41fe9cbb85fbdac799c773f27862246b5f8f Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 27 Sep 2023 16:47:35 -0400 Subject: [PATCH 01/26] move func merge_inferred_layout_with_ocr_layout --- .../inference/test_layout_element.py | 47 ++++++++-------- unstructured_inference/inference/layout.py | 34 +++++------ .../inference/layoutelement.py | 56 +++++++++---------- 3 files changed, 70 insertions(+), 67 deletions(-) diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 9dfdb3d9..4636c916 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -8,7 +8,8 @@ LayoutElement, aggregate_ocr_text_by_block, get_elements_from_ocr_regions, - merge_inferred_layout_with_ocr_layout, + # move to unst + # merge_inferred_layout_with_ocr_layout, merge_text_regions, supplement_layout_with_ocr_elements, ) @@ -85,28 +86,28 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): if ocr_element.is_almost_subregion_of(element, SUBREGION_THRESHOLD_FOR_OCR): assert ocr_element not in final_layout - -def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions): - ocr_elements = [ - LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, - text=r.text, - type="UncategorizedText", - ) - for r in mock_ocr_regions - ] - - final_layout = merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions) - - # Check if the inferred layout's text attribute is updated with aggregated OCR text - assert final_layout[0].text == mock_ocr_regions[2].text - - # Check if the final layout contains both original elements and OCR-derived elements - assert all(element in final_layout for element in mock_inferred_layout) - assert any(element in final_layout for element in ocr_elements) +# move to unst +# def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions): +# ocr_elements = [ +# LayoutElement( +# r.x1, +# r.y1, +# r.x2, +# r.y2, +# text=r.text, +# type="UncategorizedText", +# ) +# for r in mock_ocr_regions +# ] + +# final_layout = merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions) + +# # Check if the inferred layout's text attribute is updated with aggregated OCR text +# assert final_layout[0].text == mock_ocr_regions[2].text + +# # Check if the final layout contains both original elements and OCR-derived elements +# assert all(element in final_layout for element in mock_inferred_layout) +# assert any(element in final_layout for element in ocr_elements) @pytest.mark.parametrize("is_table", [False, True]) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 5768e2a4..727d6d40 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -24,7 +24,8 @@ LayoutElement, LocationlessLayoutElement, merge_inferred_layout_with_extracted_layout, - merge_inferred_layout_with_ocr_layout, + # move to unst + # merge_inferred_layout_with_ocr_layout, ) from unstructured_inference.inference.ordering import order_layout from unstructured_inference.inference.pdf import get_images_from_pdf_element @@ -312,21 +313,22 @@ def get_elements_with_detection_model( supplement_with_ocr_elements=self.supplement_with_ocr_elements, **threshold_kwargs, ) - elif ocr_layout is not None: - threshold_kwargs = {} - # NOTE(Benjamin): With this the thresholds are only 
changed for detextron2_mask_rcnn - # In other case the default values for the functions are used - if ( - isinstance(self.detection_model, UnstructuredDetectronONNXModel) - and "R_50" not in self.detection_model.model_path - ): - threshold_kwargs = {"subregion_threshold": 0.3} - merged_layout = merge_inferred_layout_with_ocr_layout( - inferred_layout=inferred_layout, - ocr_layout=ocr_layout, - supplement_with_ocr_elements=self.supplement_with_ocr_elements, - **threshold_kwargs, - ) + # move to unst + # elif ocr_layout is not None: + # threshold_kwargs = {} + # # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn + # # In other case the default values for the functions are used + # if ( + # isinstance(self.detection_model, UnstructuredDetectronONNXModel) + # and "R_50" not in self.detection_model.model_path + # ): + # threshold_kwargs = {"subregion_threshold": 0.3} + # merged_layout = merge_inferred_layout_with_ocr_layout( + # inferred_layout=inferred_layout, + # ocr_layout=ocr_layout, + # supplement_with_ocr_elements=self.supplement_with_ocr_elements, + # **threshold_kwargs, + # ) else: merged_layout = inferred_layout diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 5e00388d..1f2f23c2 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -185,34 +185,34 @@ def merge_inferred_layout_with_extracted_layout( return final_layout - -def merge_inferred_layout_with_ocr_layout( - inferred_layout: List[LayoutElement], - ocr_layout: List[TextRegion], - supplement_with_ocr_elements: bool = True, -) -> List[LayoutElement]: - """ - Merge the inferred layout with the OCR-detected text regions. - - This function iterates over each inferred layout element and aggregates the - associated text from the OCR layout using the specified threshold. The inferred - layout's text attribute is then updated with this aggregated text. - """ - - for inferred_region in inferred_layout: - inferred_region.text = aggregate_ocr_text_by_block( - ocr_layout, - inferred_region, - SUBREGION_THRESHOLD_FOR_OCR, - ) - - final_layout = ( - supplement_layout_with_ocr_elements(inferred_layout, ocr_layout) - if supplement_with_ocr_elements - else inferred_layout - ) - - return final_layout +# move to unst +# def merge_inferred_layout_with_ocr_layout( +# inferred_layout: List[LayoutElement], +# ocr_layout: List[TextRegion], +# supplement_with_ocr_elements: bool = True, +# ) -> List[LayoutElement]: +# """ +# Merge the inferred layout with the OCR-detected text regions. + +# This function iterates over each inferred layout element and aggregates the +# associated text from the OCR layout using the specified threshold. The inferred +# layout's text attribute is then updated with this aggregated text. 
+# """ + +# for inferred_region in inferred_layout: +# inferred_region.text = aggregate_ocr_text_by_block( +# ocr_layout, +# inferred_region, +# SUBREGION_THRESHOLD_FOR_OCR, +# ) + +# final_layout = ( +# supplement_layout_with_ocr_elements(inferred_layout, ocr_layout) +# if supplement_with_ocr_elements +# else inferred_layout +# ) + +# return final_layout def aggregate_ocr_text_by_block( From fc98bd6ccc03bfea2f9d51c76af51f0c6b607cfa Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 27 Sep 2023 16:51:22 -0400 Subject: [PATCH 02/26] aggregate_ocr_text_by_block --- .../inference/test_layout_element.py | 27 +++---- .../inference/layoutelement.py | 76 ++++++++++--------- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 4636c916..2c8173d8 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -6,7 +6,8 @@ from unstructured_inference.inference.elements import TextRegion from unstructured_inference.inference.layoutelement import ( LayoutElement, - aggregate_ocr_text_by_block, + # move to unst + # aggregate_ocr_text_by_block, get_elements_from_ocr_regions, # move to unst # merge_inferred_layout_with_ocr_layout, @@ -14,19 +15,19 @@ supplement_layout_with_ocr_elements, ) +# move to unst +# def test_aggregate_ocr_text_by_block(): +# expected = "A Unified Toolkit" +# ocr_layout = [ +# TextRegion(0, 0, 20, 20, "A"), +# TextRegion(50, 50, 150, 150, "Unified"), +# TextRegion(150, 150, 300, 250, "Toolkit"), +# TextRegion(200, 250, 300, 350, "Deep"), +# ] +# region = TextRegion(0, 0, 250, 350, "") -def test_aggregate_ocr_text_by_block(): - expected = "A Unified Toolkit" - ocr_layout = [ - TextRegion(0, 0, 20, 20, "A"), - TextRegion(50, 50, 150, 150, "Unified"), - TextRegion(150, 150, 300, 250, "Toolkit"), - TextRegion(200, 250, 300, 350, "Deep"), - ] - region = TextRegion(0, 0, 250, 350, "") - - text = aggregate_ocr_text_by_block(ocr_layout, region, 0.5) - assert text == expected +# text = aggregate_ocr_text_by_block(ocr_layout, region, 0.5) +# assert text == expected def test_merge_text_regions(mock_embedded_text_regions): diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 1f2f23c2..b30a27fc 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -164,24 +164,26 @@ def merge_inferred_layout_with_extracted_layout( inferred_regions_to_add = [ region for region in inferred_layout if region not in inferred_regions_to_remove ] - inferred_regions_to_add_without_text = [ - region for region in inferred_regions_to_add if not region.text - ] - if ocr_layout is not None: - for inferred_region in inferred_regions_to_add_without_text: - inferred_region.text = aggregate_ocr_text_by_block( - ocr_layout, - inferred_region, - SUBREGION_THRESHOLD_FOR_OCR, - ) - out_layout = categorized_extracted_elements_to_add + inferred_regions_to_add - final_layout = ( - supplement_layout_with_ocr_elements(out_layout, ocr_layout) - if supplement_with_ocr_elements - else out_layout - ) - else: - final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add + # inferred_regions_to_add_without_text = [ + # region for region in inferred_regions_to_add if not region.text + # ] + # moved to unst + # if ocr_layout is not None: + # for 
inferred_region in inferred_regions_to_add_without_text: + # inferred_region.text = aggregate_ocr_text_by_block( + # ocr_layout, + # inferred_region, + # SUBREGION_THRESHOLD_FOR_OCR, + # ) + # out_layout = categorized_extracted_elements_to_add + inferred_regions_to_add + # final_layout = ( + # supplement_layout_with_ocr_elements(out_layout, ocr_layout) + # if supplement_with_ocr_elements + # else out_layout + # ) + # else: + # final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add + final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add return final_layout @@ -214,26 +216,26 @@ def merge_inferred_layout_with_extracted_layout( # return final_layout +# move to unst +# def aggregate_ocr_text_by_block( +# ocr_layout: List[TextRegion], +# region: TextRegion, +# subregion_threshold: float, +# ) -> Optional[str]: +# """Extracts the text aggregated from the regions of the ocr layout that lie within the given +# block.""" + +# extracted_texts = [] + +# for ocr_region in ocr_layout: +# ocr_region_is_subregion_of_given_region = ocr_region.is_almost_subregion_of( +# region, +# subregion_threshold=subregion_threshold, +# ) +# if ocr_region_is_subregion_of_given_region and ocr_region.text: +# extracted_texts.append(ocr_region.text) -def aggregate_ocr_text_by_block( - ocr_layout: List[TextRegion], - region: TextRegion, - subregion_threshold: float, -) -> Optional[str]: - """Extracts the text aggregated from the regions of the ocr layout that lie within the given - block.""" - - extracted_texts = [] - - for ocr_region in ocr_layout: - ocr_region_is_subregion_of_given_region = ocr_region.is_almost_subregion_of( - region, - subregion_threshold=subregion_threshold, - ) - if ocr_region_is_subregion_of_given_region and ocr_region.text: - extracted_texts.append(ocr_region.text) - - return " ".join(extracted_texts) if extracted_texts else None +# return " ".join(extracted_texts) if extracted_texts else None def supplement_layout_with_ocr_elements( From 57fb359df67149249bb895acdd35bec6a7575894 Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 27 Sep 2023 16:52:52 -0400 Subject: [PATCH 03/26] supplement_layout_with_ocr_elements --- .../inference/test_layout_element.py | 53 +++++----- .../inference/layoutelement.py | 96 +++++++++---------- 2 files changed, 75 insertions(+), 74 deletions(-) diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 2c8173d8..fe5cee5f 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -8,11 +8,12 @@ LayoutElement, # move to unst # aggregate_ocr_text_by_block, - get_elements_from_ocr_regions, + # get_elements_from_ocr_regions, # move to unst # merge_inferred_layout_with_ocr_layout, merge_text_regions, - supplement_layout_with_ocr_elements, + # move to unst + # supplement_layout_with_ocr_elements, ) # move to unst @@ -58,34 +59,34 @@ def test_get_elements_from_ocr_regions(mock_embedded_text_regions): elements = get_elements_from_ocr_regions(mock_embedded_text_regions) assert elements == expected +# move to unst +# def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): +# ocr_elements = [ +# LayoutElement( +# r.x1, +# r.y1, +# r.x2, +# r.y2, +# text=r.text, +# type="UncategorizedText", +# ) +# for r in mock_ocr_regions +# ] -def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): - ocr_elements = [ - 
LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, - text=r.text, - type="UncategorizedText", - ) - for r in mock_ocr_regions - ] - - final_layout = supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions) +# final_layout = supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions) - # Check if the final layout contains the original layout elements - for element in mock_layout: - assert element in final_layout +# # Check if the final layout contains the original layout elements +# for element in mock_layout: +# assert element in final_layout - # Check if the final layout contains the OCR-derived elements - assert any(ocr_element in final_layout for ocr_element in ocr_elements) +# # Check if the final layout contains the OCR-derived elements +# assert any(ocr_element in final_layout for ocr_element in ocr_elements) - # Check if the OCR-derived elements that are subregions of layout elements are removed - for element in mock_layout: - for ocr_element in ocr_elements: - if ocr_element.is_almost_subregion_of(element, SUBREGION_THRESHOLD_FOR_OCR): - assert ocr_element not in final_layout +# # Check if the OCR-derived elements that are subregions of layout elements are removed +# for element in mock_layout: +# for ocr_element in ocr_elements: +# if ocr_element.is_almost_subregion_of(element, SUBREGION_THRESHOLD_FOR_OCR): +# assert ocr_element not in final_layout # move to unst # def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions): diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index b30a27fc..d9a2124c 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -237,56 +237,56 @@ def merge_inferred_layout_with_extracted_layout( # return " ".join(extracted_texts) if extracted_texts else None +# move to unst +# def supplement_layout_with_ocr_elements( +# layout: List[LayoutElement], +# ocr_layout: List[TextRegion], +# ) -> List[LayoutElement]: +# """ +# Supplement the existing layout with additional OCR-derived elements. + +# This function takes two lists: one list of pre-existing layout elements (`layout`) +# and another list of OCR-detected text regions (`ocr_layout`). It identifies OCR regions +# that are subregions of the elements in the existing layout and removes them from the +# OCR-derived list. Then, it appends the remaining OCR-derived regions to the existing layout. + +# Parameters: +# - layout (List[LayoutElement]): A list of existing layout elements, each of which is +# an instance of `LayoutElement`. +# - ocr_layout (List[TextRegion]): A list of OCR-derived text regions, each of which is +# an instance of `TextRegion`. + +# Returns: +# - List[LayoutElement]: The final combined layout consisting of both the original layout +# elements and the new OCR-derived elements. + +# Note: +# - The function relies on `is_almost_subregion_of()` method to determine if an OCR region +# is a subregion of an existing layout element. +# - It also relies on `get_elements_from_ocr_regions()` to convert OCR regions to layout elements. +# - The `SUBREGION_THRESHOLD_FOR_OCR` constant is used to specify the subregion matching +# threshold. +# """ -def supplement_layout_with_ocr_elements( - layout: List[LayoutElement], - ocr_layout: List[TextRegion], -) -> List[LayoutElement]: - """ - Supplement the existing layout with additional OCR-derived elements. 
- - This function takes two lists: one list of pre-existing layout elements (`layout`) - and another list of OCR-detected text regions (`ocr_layout`). It identifies OCR regions - that are subregions of the elements in the existing layout and removes them from the - OCR-derived list. Then, it appends the remaining OCR-derived regions to the existing layout. - - Parameters: - - layout (List[LayoutElement]): A list of existing layout elements, each of which is - an instance of `LayoutElement`. - - ocr_layout (List[TextRegion]): A list of OCR-derived text regions, each of which is - an instance of `TextRegion`. - - Returns: - - List[LayoutElement]: The final combined layout consisting of both the original layout - elements and the new OCR-derived elements. - - Note: - - The function relies on `is_almost_subregion_of()` method to determine if an OCR region - is a subregion of an existing layout element. - - It also relies on `get_elements_from_ocr_regions()` to convert OCR regions to layout elements. - - The `SUBREGION_THRESHOLD_FOR_OCR` constant is used to specify the subregion matching - threshold. - """ - - ocr_regions_to_remove = [] - for ocr_region in ocr_layout: - for el in layout: - ocr_region_is_subregion_of_out_el = ocr_region.is_almost_subregion_of( - cast(Rectangle, el), - SUBREGION_THRESHOLD_FOR_OCR, - ) - if ocr_region_is_subregion_of_out_el: - ocr_regions_to_remove.append(ocr_region) - break - - ocr_regions_to_add = [region for region in ocr_layout if region not in ocr_regions_to_remove] - if ocr_regions_to_add: - ocr_elements_to_add = get_elements_from_ocr_regions(ocr_regions_to_add) - final_layout = layout + ocr_elements_to_add - else: - final_layout = layout +# ocr_regions_to_remove = [] +# for ocr_region in ocr_layout: +# for el in layout: +# ocr_region_is_subregion_of_out_el = ocr_region.is_almost_subregion_of( +# cast(Rectangle, el), +# SUBREGION_THRESHOLD_FOR_OCR, +# ) +# if ocr_region_is_subregion_of_out_el: +# ocr_regions_to_remove.append(ocr_region) +# break + +# ocr_regions_to_add = [region for region in ocr_layout if region not in ocr_regions_to_remove] +# if ocr_regions_to_add: +# ocr_elements_to_add = get_elements_from_ocr_regions(ocr_regions_to_add) +# final_layout = layout + ocr_elements_to_add +# else: +# final_layout = layout - return final_layout +# return final_layout def merge_text_regions(regions: List[TextRegion]) -> TextRegion: From 08abf44e7a8b172e52c4146c7a4476ff7373fe1d Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 27 Sep 2023 16:53:48 -0400 Subject: [PATCH 04/26] get_elements_from_ocr_regions --- .../inference/test_layout_element.py | 28 ++++++------- .../inference/layoutelement.py | 42 +++++++++---------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index fe5cee5f..50994d90 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -43,21 +43,21 @@ def test_merge_text_regions(mock_embedded_text_regions): merged_text_region = merge_text_regions(mock_embedded_text_regions) assert merged_text_region == expected +# move to unst +# def test_get_elements_from_ocr_regions(mock_embedded_text_regions): +# expected = [ +# LayoutElement( +# x1=437.83888888888885, +# y1=317.319341111111, +# x2=1256.334784222222, +# y2=406.9837855555556, +# text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image", 
+# type="UncategorizedText", +# ), +# ] -def test_get_elements_from_ocr_regions(mock_embedded_text_regions): - expected = [ - LayoutElement( - x1=437.83888888888885, - y1=317.319341111111, - x2=1256.334784222222, - y2=406.9837855555556, - text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image", - type="UncategorizedText", - ), - ] - - elements = get_elements_from_ocr_regions(mock_embedded_text_regions) - assert elements == expected +# elements = get_elements_from_ocr_regions(mock_embedded_text_regions) +# assert elements == expected # move to unst # def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index d9a2124c..d4576a0b 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -309,28 +309,28 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion: return TextRegion(min_x1, min_y1, max_x2, max_y2, merged_text) +# move to unst +# def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: +# """ +# Get layout elements from OCR regions +# """ -def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: - """ - Get layout elements from OCR regions - """ - - grouped_regions = cast( - List[List[TextRegion]], - partition_groups_from_regions(ocr_regions), - ) - merged_regions = [merge_text_regions(group) for group in grouped_regions] - return [ - LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, - text=r.text, - type="UncategorizedText", - ) - for r in merged_regions - ] +# grouped_regions = cast( +# List[List[TextRegion]], +# partition_groups_from_regions(ocr_regions), +# ) +# merged_regions = [merge_text_regions(group) for group in grouped_regions] +# return [ +# LayoutElement( +# r.x1, +# r.y1, +# r.x2, +# r.y2, +# text=r.text, +# type="UncategorizedText", +# ) +# for r in merged_regions +# ] # NOTE(alan): The right way to do this is probably to rewrite LayoutElement as well as the different From 0f38f5ce6e2e3893eed7ed8e72eadba4eac0eb8b Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 27 Sep 2023 16:54:40 -0400 Subject: [PATCH 05/26] merge_text_regions --- .../inference/test_layout_element.py | 28 ++++++++--------- .../inference/layoutelement.py | 30 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 50994d90..1895ecbc 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -2,7 +2,7 @@ from layoutparser.elements import TextBlock from layoutparser.elements.layout_elements import Rectangle as LPRectangle -from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR +# from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR from unstructured_inference.inference.elements import TextRegion from unstructured_inference.inference.layoutelement import ( LayoutElement, @@ -11,7 +11,7 @@ # get_elements_from_ocr_regions, # move to unst # merge_inferred_layout_with_ocr_layout, - merge_text_regions, + # merge_text_regions, # move to unst # supplement_layout_with_ocr_elements, ) @@ -30,18 +30,18 @@ # text = aggregate_ocr_text_by_block(ocr_layout, region, 0.5) # assert text == expected - -def 
test_merge_text_regions(mock_embedded_text_regions): - expected = TextRegion( - x1=437.83888888888885, - y1=317.319341111111, - x2=1256.334784222222, - y2=406.9837855555556, - text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image", - ) - - merged_text_region = merge_text_regions(mock_embedded_text_regions) - assert merged_text_region == expected +# move to unst +# def test_merge_text_regions(mock_embedded_text_regions): +# expected = TextRegion( +# x1=437.83888888888885, +# y1=317.319341111111, +# x2=1256.334784222222, +# y2=406.9837855555556, +# text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image", +# ) + +# merged_text_region = merge_text_regions(mock_embedded_text_regions) +# assert merged_text_region == expected # move to unst # def test_get_elements_from_ocr_regions(mock_embedded_text_regions): diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index d4576a0b..22c12827 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -288,26 +288,26 @@ def merge_inferred_layout_with_extracted_layout( # return final_layout +# move to unst +# def merge_text_regions(regions: List[TextRegion]) -> TextRegion: +# """ +# Merge a list of TextRegion objects into a single TextRegion. -def merge_text_regions(regions: List[TextRegion]) -> TextRegion: - """ - Merge a list of TextRegion objects into a single TextRegion. - - Parameters: - - group (List[TextRegion]): A list of TextRegion objects to be merged. +# Parameters: +# - group (List[TextRegion]): A list of TextRegion objects to be merged. - Returns: - - TextRegion: A single merged TextRegion object. - """ +# Returns: +# - TextRegion: A single merged TextRegion object. +# """ - min_x1 = min([tr.x1 for tr in regions]) - min_y1 = min([tr.y1 for tr in regions]) - max_x2 = max([tr.x2 for tr in regions]) - max_y2 = max([tr.y2 for tr in regions]) +# min_x1 = min([tr.x1 for tr in regions]) +# min_y1 = min([tr.y1 for tr in regions]) +# max_x2 = max([tr.x2 for tr in regions]) +# max_y2 = max([tr.y2 for tr in regions]) - merged_text = " ".join([tr.text for tr in regions if tr.text]) +# merged_text = " ".join([tr.text for tr in regions if tr.text]) - return TextRegion(min_x1, min_y1, max_x2, max_y2, merged_text) +# return TextRegion(min_x1, min_y1, max_x2, max_y2, merged_text) # move to unst # def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: From b8d08f726e494a30e4f7dd8a377e2f682f1b7083 Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 27 Sep 2023 17:06:52 -0400 Subject: [PATCH 06/26] remove ocr_layout --- unstructured_inference/inference/layout.py | 73 +++++++++++----------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 727d6d40..ee3721c7 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -225,7 +225,7 @@ def __init__( self.extract_tables = extract_tables self.analysis = analysis self.inferred_layout: Optional[List[LayoutElement]] = None - self.ocr_layout: Optional[List[TextRegion]] = None + # self.ocr_layout: Optional[List[TextRegion]] = None self.supplement_with_ocr_elements = supplement_with_ocr_elements def __str__(self) -> str: @@ -263,38 +263,39 @@ def get_elements_with_detection_model( # remote call in the future. 
inferred_layout: List[LayoutElement] = self.detection_model(self.image) - if self.ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: - ocr_layout = None - elif self.ocr_mode == OCRMode.FULL_PAGE.value: - ocr_layout = None - entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower() - if entrie_page_ocr not in ["paddle", "tesseract"]: - raise ValueError( - "Environment variable ENTIRE_PAGE_OCR must be set to 'tesseract' or 'paddle'.", - ) - - if entrie_page_ocr == "paddle": - logger.info("Processing entrie page OCR with paddle...") - from unstructured_inference.models import paddle_ocr - - # TODO(yuming): paddle only support one language at once, - # change ocr to tesseract if passed in multilanguages. - ocr_data = paddle_ocr.load_agent(language=self.ocr_languages).ocr( - np.array(self.image), - cls=True, - ) - ocr_layout = parse_ocr_data_paddle(ocr_data) - else: - logger.info("Processing entrie page OCR with tesseract...") - try: - ocr_data = pytesseract.image_to_data( - self.image, - lang=self.ocr_languages, - output_type=Output.DICT, - ) - ocr_layout = parse_ocr_data_tesseract(ocr_data) - except pytesseract.pytesseract.TesseractError: - logger.warning("TesseractError: Skipping page", exc_info=True) + # move to unst + # if self.ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: + # ocr_layout = None + # elif self.ocr_mode == OCRMode.FULL_PAGE.value: + # ocr_layout = None + # entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower() + # if entrie_page_ocr not in ["paddle", "tesseract"]: + # raise ValueError( + # "Environment variable ENTIRE_PAGE_OCR must be set to 'tesseract' or 'paddle'.", + # ) + + # if entrie_page_ocr == "paddle": + # logger.info("Processing entrie page OCR with paddle...") + # from unstructured_inference.models import paddle_ocr + + # # TODO(yuming): paddle only support one language at once, + # # change ocr to tesseract if passed in multilanguages. 
+ # ocr_data = paddle_ocr.load_agent(language=self.ocr_languages).ocr( + # np.array(self.image), + # cls=True, + # ) + # ocr_layout = parse_ocr_data_paddle(ocr_data) + # else: + # logger.info("Processing entrie page OCR with tesseract...") + # try: + # ocr_data = pytesseract.image_to_data( + # self.image, + # lang=self.ocr_languages, + # output_type=Output.DICT, + # ) + # ocr_layout = parse_ocr_data_tesseract(ocr_data) + # except pytesseract.pytesseract.TesseractError: + # logger.warning("TesseractError: Skipping page", exc_info=True) if self.layout is not None: threshold_kwargs = {} @@ -309,8 +310,8 @@ def get_elements_with_detection_model( inferred_layout=inferred_layout, extracted_layout=self.layout, page_image_size=self.image.size, - ocr_layout=ocr_layout, - supplement_with_ocr_elements=self.supplement_with_ocr_elements, + # ocr_layout=ocr_layout, + # supplement_with_ocr_elements=self.supplement_with_ocr_elements, **threshold_kwargs, ) # move to unst @@ -336,7 +337,7 @@ def get_elements_with_detection_model( if self.analysis: self.inferred_layout = inferred_layout - self.ocr_layout = ocr_layout + # self.ocr_layout = ocr_layout if inplace: self.elements = elements From d7025df4e745a6685c90525913fbb7a05bd13f9c Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Thu, 28 Sep 2023 14:50:29 -0400 Subject: [PATCH 07/26] remove text region ocr --- unstructured_inference/inference/elements.py | 84 +++++++++++--------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 69ea4c19..fb60739c 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -218,7 +218,8 @@ def extract_text( text = aggregate_by_block(self, image, objects, ocr_strategy) elif image is not None: # We don't have anything to go on but the image itself, so we use OCR - text = ocr(self, image, languages=ocr_languages) if ocr_strategy != "never" else "" + # text = ocr(self, image, languages=ocr_languages) if ocr_strategy != "never" else "" + text = "" else: raise ValueError( "Got arguments image and layout as None, at least one must be populated to use for " @@ -257,38 +258,39 @@ def extract_text( if ocr_strategy == "never" or image is None: return "" else: - return ocr(self, image, languages=ocr_languages) + # return ocr(self, image, languages=ocr_languages) + return "" else: return super().extract_text(objects, image, extract_tables, ocr_strategy) - -def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> str: - """Runs a cropped text block image through and OCR agent.""" - logger.debug("Running OCR on text block ...") - tesseract.load_agent(languages=languages) - padded_block = text_block.pad(12) - cropped_image = image.crop((padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2)) - entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower() - if entrie_page_ocr == "paddle": - from unstructured_inference.models import paddle_ocr - - paddle_result = paddle_ocr.load_agent().ocr(np.array(cropped_image), cls=True) - recognized_text = "" - for idx in range(len(paddle_result)): - res = paddle_result[idx] - for line in res: - recognized_text += line[1][0] - return recognized_text - else: - agent = tesseract.ocr_agents.get(languages) - if agent is None: - raise RuntimeError("OCR agent is not loaded for {languages}.") - - try: - return agent.detect(cropped_image) - except tesseract.TesseractError: - 
logger.warning("TesseractError: Skipping region", exc_info=True) - return "" +# move to unst for individual_blocks mode +# def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> str: +# """Runs a cropped text block image through and OCR agent.""" +# logger.debug("Running OCR on text block ...") +# tesseract.load_agent(languages=languages) +# padded_block = text_block.pad(12) +# cropped_image = image.crop((padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2)) +# entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower() +# if entrie_page_ocr == "paddle": +# from unstructured_inference.models import paddle_ocr + +# paddle_result = paddle_ocr.load_agent().ocr(np.array(cropped_image), cls=True) +# recognized_text = "" +# for idx in range(len(paddle_result)): +# res = paddle_result[idx] +# for line in res: +# recognized_text += line[1][0] +# return recognized_text +# else: +# agent = tesseract.ocr_agents.get(languages) +# if agent is None: +# raise RuntimeError("OCR agent is not loaded for {languages}.") + +# try: +# return agent.detect(cropped_image) +# except tesseract.TesseractError: +# logger.warning("TesseractError: Skipping region", exc_info=True) +# return "" def needs_ocr( @@ -331,16 +333,20 @@ def aggregate_by_block( ) -> str: """Extracts the text aggregated from the elements of the given layout that lie within the given block.""" - if image is not None and needs_ocr(text_region, pdf_objects, ocr_strategy): - text = ocr(text_region, image, languages=ocr_languages) - else: - filtered_blocks = [obj for obj in pdf_objects if obj.is_in(text_region, error_margin=5)] - for little_block in filtered_blocks: - if image is not None and needs_ocr(little_block, pdf_objects, ocr_strategy): - little_block.text = ocr(little_block, image, languages=ocr_languages) - text = " ".join([x.text for x in filtered_blocks if x.text]) - text = remove_control_characters(text) + filtered_blocks = [obj for obj in pdf_objects if obj.is_in(text_region, error_margin=5)] + text = " ".join([x.text for x in filtered_blocks if x.text]) return text + + # if image is not None and needs_ocr(text_region, pdf_objects, ocr_strategy): + # text = ocr(text_region, image, languages=ocr_languages) + # else: + # filtered_blocks = [obj for obj in pdf_objects if obj.is_in(text_region, error_margin=5)] + # for little_block in filtered_blocks: + # if image is not None and needs_ocr(little_block, pdf_objects, ocr_strategy): + # little_block.text = ocr(little_block, image, languages=ocr_languages) + # text = " ".join([x.text for x in filtered_blocks if x.text]) + # text = remove_control_characters(text) + # return text def cid_ratio(text: str) -> float: From 2f567483b42f1b918541dd827f25c2270b98cea8 Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Thu, 28 Sep 2023 16:08:40 -0400 Subject: [PATCH 08/26] more clean up --- unstructured_inference/inference/elements.py | 95 ++++---- unstructured_inference/inference/layout.py | 203 ++++++------------ .../inference/layoutelement.py | 15 +- 3 files changed, 120 insertions(+), 193 deletions(-) diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index fb60739c..e0e4ee55 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -207,15 +207,16 @@ def extract_text( objects: Optional[Collection[TextRegion]], image: Optional[Image.Image] = None, extract_tables: bool = False, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", + # 
ocr_strategy: str = "auto",
        # ocr_languages: str = "eng",
    ) -> str:
        """Extracts text contained in region."""
        if self.text is not None:
            # If block text is already populated, we'll assume it's correct
            text = self.text
        elif objects is not None:
            # text = aggregate_by_block(self, image, objects, ocr_strategy)
            text = aggregate_by_block(self, image, objects)
@@ -234,8 +235,8 @@ def extract_text(
        objects: Optional[Collection[TextRegion]],
        image: Optional[Image.Image] = None,
        extract_tables: bool = False,
        # ocr_strategy: str = "auto",
        # ocr_languages: str = "eng",
    ) -> str:
        """Extracts text contained in region."""
        if self.text is None:
@@ -250,18 +251,20 @@ def extract_text(
        objects: Optional[Collection[TextRegion]],
        image: Optional[Image.Image] = None,
        extract_tables: bool = False,
        # ocr_strategy: str = "auto",
        # ocr_languages: str = "eng",
    ) -> str:
        """Extracts text contained in region."""
        if self.text is None:
            # if ocr_strategy == "never" or image is None:
            #     return ""
            # else:
            #     return ocr(self, image, languages=ocr_languages)
            return ""
        else:
            # return super().extract_text(objects, image, extract_tables, ocr_strategy)
            return super().extract_text(objects, image, extract_tables)


# move to unst for individual_blocks mode
# def needs_ocr(
#     region: TextRegion,
#     pdf_objects: Collection[TextRegion],
#     ocr_strategy: str,
# ) -> bool:
#     """Logic to determine whether ocr is needed to extract text from given region."""
#     if ocr_strategy == "force":
#         return True
#     elif ocr_strategy == "auto":
#         image_objects = [obj for obj in pdf_objects if isinstance(obj, ImageTextRegion)]
#         word_objects = [obj for obj in pdf_objects if isinstance(obj, EmbeddedTextRegion)]
#         # If any image object overlaps with the region of interest, we have hope of getting some
#         # text from OCR. Otherwise, there's nothing there to find, no need to waste our time with
#         # OCR.
#         image_intersects = any(region.intersects(img_obj) for img_obj in image_objects)
#         if region.text is None:
#             # If the region has no text check if any images overlap with the region that might
#             # contain text.
#             if any(obj.is_in(region) and obj.text is not None for obj in word_objects):
#                 # If there are word objects in the region, we defer to that rather than OCR
#                 return False
#             else:
#                 return image_intersects
#         else:
#             # If the region has text, we should only have to OCR if too much of the text is
#             # uninterpretable. 
- return cid_ratio(region.text) > 0.5 - else: - return False +# move to unst for individual_blocks mode +# def needs_ocr( +# region: TextRegion, +# pdf_objects: Collection[TextRegion], +# ocr_strategy: str, +# ) -> bool: +# """Logic to determine whether ocr is needed to extract text from given region.""" +# if ocr_strategy == "force": +# return True +# elif ocr_strategy == "auto": +# image_objects = [obj for obj in pdf_objects if isinstance(obj, ImageTextRegion)] +# word_objects = [obj for obj in pdf_objects if isinstance(obj, EmbeddedTextRegion)] +# # If any image object overlaps with the region of interest, we have hope of getting some +# # text from OCR. Otherwise, there's nothing there to find, no need to waste our time with +# # OCR. +# image_intersects = any(region.intersects(img_obj) for img_obj in image_objects) +# if region.text is None: +# # If the region has no text check if any images overlap with the region that might +# # contain text. +# if any(obj.is_in(region) and obj.text is not None for obj in word_objects): +# # If there are word objects in the region, we defer to that rather than OCR +# return False +# else: +# return image_intersects +# else: +# # If the region has text, we should only have to OCR if too much of the text is +# # uninterpretable. +# return cid_ratio(region.text) > 0.5 +# else: +# return False def aggregate_by_block( text_region: TextRegion, image: Optional[Image.Image], pdf_objects: Collection[TextRegion], - ocr_strategy: str = "auto", - ocr_languages: str = "eng", + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", ) -> str: """Extracts the text aggregated from the elements of the given layout that lie within the given block.""" filtered_blocks = [obj for obj in pdf_objects if obj.is_in(text_region, error_margin=5)] text = " ".join([x.text for x in filtered_blocks if x.text]) return text - + # if image is not None and needs_ocr(text_region, pdf_objects, ocr_strategy): # text = ocr(text_region, image, languages=ocr_languages) # else: diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index ee3721c7..549d03af 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -48,11 +48,11 @@ import pdfplumber # noqa -VALID_OCR_STRATEGIES = ( - "auto", # Use OCR when it looks like other methods have failed - "force", # Always use OCR - "never", # Never use OCR -) +# VALID_OCR_STRATEGIES = ( +# "auto", # Use OCR when it looks like other methods have failed +# "force", # Always use OCR +# "never", # Never use OCR +# ) class DocumentLayout: @@ -85,9 +85,9 @@ def from_file( detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", - ocr_mode: str = OCRMode.FULL_PAGE.value, + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", + # ocr_mode: str = OCRMode.FULL_PAGE.value, extract_tables: bool = False, pdf_image_dpi: int = 200, **kwargs, @@ -125,9 +125,9 @@ def from_file( detection_model=detection_model, element_extraction_model=element_extraction_model, layout=layout, - ocr_strategy=ocr_strategy, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, + # ocr_strategy=ocr_strategy, + # ocr_languages=ocr_languages, + # ocr_mode=ocr_mode, fixed_layout=fixed_layout, extract_tables=extract_tables, **kwargs, @@ -141,9 +141,9 @@ def 
from_image_file( filename: str, detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", - ocr_mode: str = OCRMode.FULL_PAGE.value, + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", + # ocr_mode: str = OCRMode.FULL_PAGE.value, fixed_layout: Optional[List[TextRegion]] = None, extract_tables: bool = False, **kwargs, @@ -172,9 +172,9 @@ def from_image_file( detection_model=detection_model, element_extraction_model=element_extraction_model, layout=None, - ocr_strategy=ocr_strategy, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, + # ocr_strategy=ocr_strategy, + # ocr_languages=ocr_languages, + # ocr_mode=ocr_mode, fixed_layout=fixed_layout, extract_tables=extract_tables, **kwargs, @@ -196,12 +196,12 @@ def __init__( document_filename: Optional[Union[str, PurePath]] = None, detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", - ocr_mode: str = OCRMode.FULL_PAGE.value, + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", + # ocr_mode: str = OCRMode.FULL_PAGE.value, extract_tables: bool = False, analysis: bool = False, - supplement_with_ocr_elements: bool = True, + # supplement_with_ocr_elements: bool = True, ): if detection_model is not None and element_extraction_model is not None: raise ValueError("Only one of detection_model and extraction_model should be passed.") @@ -217,16 +217,16 @@ def __init__( self.detection_model = detection_model self.element_extraction_model = element_extraction_model self.elements: Collection[Union[LayoutElement, LocationlessLayoutElement]] = [] - if ocr_strategy not in VALID_OCR_STRATEGIES: - raise ValueError(f"ocr_strategy must be one of {VALID_OCR_STRATEGIES}.") - self.ocr_strategy = ocr_strategy - self.ocr_languages = ocr_languages - self.ocr_mode = ocr_mode + # if ocr_strategy not in VALID_OCR_STRATEGIES: + # raise ValueError(f"ocr_strategy must be one of {VALID_OCR_STRATEGIES}.") + # self.ocr_strategy = ocr_strategy + # self.ocr_languages = ocr_languages + # self.ocr_mode = ocr_mode self.extract_tables = extract_tables self.analysis = analysis self.inferred_layout: Optional[List[LayoutElement]] = None # self.ocr_layout: Optional[List[TextRegion]] = None - self.supplement_with_ocr_elements = supplement_with_ocr_elements + # self.supplement_with_ocr_elements = supplement_with_ocr_elements def __str__(self) -> str: return "\n\n".join([str(element) for element in self.elements]) @@ -353,8 +353,8 @@ def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutEleme block=e, image=self.image, pdf_objects=self.layout, - ocr_strategy=self.ocr_strategy, - ocr_languages=self.ocr_languages, + # ocr_strategy=self.ocr_strategy, + # ocr_languages=self.ocr_languages, extract_tables=self.extract_tables, ) for e in layout @@ -466,12 +466,12 @@ def from_image( detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, layout: Optional[List[TextRegion]] = None, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", - ocr_mode: str = OCRMode.FULL_PAGE.value, + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", + # ocr_mode: str = OCRMode.FULL_PAGE.value, extract_tables: bool = False, fixed_layout: Optional[List[TextRegion]] = None, - 
supplement_with_ocr_elements: bool = True, + # supplement_with_ocr_elements: bool = True, extract_images_in_pdf: bool = False, image_output_dir_path: Optional[str] = None, analysis: bool = False, @@ -484,12 +484,12 @@ def from_image( layout=layout, detection_model=detection_model, element_extraction_model=element_extraction_model, - ocr_strategy=ocr_strategy, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, + # ocr_strategy=ocr_strategy, + # ocr_languages=ocr_languages, + # ocr_mode=ocr_mode, extract_tables=extract_tables, analysis=analysis, - supplement_with_ocr_elements=supplement_with_ocr_elements, + # supplement_with_ocr_elements=supplement_with_ocr_elements, ) if page.element_extraction_model is not None: page.get_elements_using_image_extraction() @@ -520,9 +520,9 @@ def process_data_with_model( data: BinaryIO, model_name: Optional[str], is_image: bool = False, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", - ocr_mode: str = OCRMode.FULL_PAGE.value, + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", + # ocr_mode: str = OCRMode.FULL_PAGE.value, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, pdf_image_dpi: Optional[int] = None, @@ -537,9 +537,9 @@ def process_data_with_model( tmp_file.name, model_name, is_image=is_image, - ocr_strategy=ocr_strategy, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, + # ocr_strategy=ocr_strategy, + # ocr_languages=ocr_languages, + # ocr_mode=ocr_mode, fixed_layouts=fixed_layouts, extract_tables=extract_tables, pdf_image_dpi=pdf_image_dpi, @@ -553,9 +553,9 @@ def process_file_with_model( filename: str, model_name: Optional[str], is_image: bool = False, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", - ocr_mode: str = OCRMode.FULL_PAGE.value, + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", + # ocr_mode: str = OCRMode.FULL_PAGE.value, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, pdf_image_dpi: Optional[int] = None, @@ -564,13 +564,13 @@ def process_file_with_model( """Processes pdf file with name filename into a DocumentLayout by using a model identified by model_name.""" - if pdf_image_dpi is None: - pdf_image_dpi = 300 if model_name == "chipper" else 200 - if (pdf_image_dpi < 300) and (model_name == "chipper"): - logger.warning( - "The Chipper model performs better when images are rendered with DPI >= 300 " - f"(currently {pdf_image_dpi}).", - ) + # if pdf_image_dpi is None: + # pdf_image_dpi = 300 if model_name == "chipper" else 200 + # if (pdf_image_dpi < 300) and (model_name == "chipper"): + # logger.warning( + # "The Chipper model performs better when images are rendered with DPI >= 300 " + # f"(currently {pdf_image_dpi}).", + # ) model = get_model(model_name) if isinstance(model, UnstructuredObjectDetectionModel): @@ -586,9 +586,9 @@ def process_file_with_model( filename, detection_model=detection_model, element_extraction_model=element_extraction_model, - ocr_strategy=ocr_strategy, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, + # ocr_strategy=ocr_strategy, + # ocr_languages=ocr_languages, + # ocr_mode=ocr_mode, extract_tables=extract_tables, **kwargs, ) @@ -597,9 +597,9 @@ def process_file_with_model( filename, detection_model=detection_model, element_extraction_model=element_extraction_model, - ocr_strategy=ocr_strategy, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, + # ocr_strategy=ocr_strategy, + # ocr_languages=ocr_languages, + # ocr_mode=ocr_mode, fixed_layouts=fixed_layouts, 
extract_tables=extract_tables, pdf_image_dpi=pdf_image_dpi, @@ -613,8 +613,8 @@ def get_element_from_block( block: TextRegion, image: Optional[Image.Image] = None, pdf_objects: Optional[List[TextRegion]] = None, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", extract_tables: bool = False, ) -> LayoutElement: """Creates a LayoutElement from a given layout or image by finding all the text that lies within @@ -624,8 +624,8 @@ def get_element_from_block( objects=pdf_objects, image=image, extract_tables=extract_tables, - ocr_strategy=ocr_strategy, - ocr_languages=ocr_languages, + # ocr_strategy=ocr_strategy, + # ocr_languages=ocr_languages, ) return element @@ -685,80 +685,3 @@ def load_pdf( ) return layouts, images - - -def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]: - """ - Parse the OCR result data to extract a list of TextRegion objects from - tesseract. - - The function processes the OCR result dictionary, looking for bounding - box information and associated text to create instances of the TextRegion - class, which are then appended to a list. - - Parameters: - - ocr_data (dict): A dictionary containing the OCR result data, expected - to have keys like "level", "left", "top", "width", - "height", and "text". - - Returns: - - List[TextRegion]: A list of TextRegion objects, each representing a - detected text region within the OCR-ed image. - - Note: - - An empty string or a None value for the 'text' key in the input - dictionary will result in its associated bounding box being ignored. - """ - - levels = ocr_data["level"] - text_regions = [] - for i, level in enumerate(levels): - (l, t, w, h) = ( - ocr_data["left"][i], - ocr_data["top"][i], - ocr_data["width"][i], - ocr_data["height"][i], - ) - (x1, y1, x2, y2) = l, t, l + w, t + h - text = ocr_data["text"][i] - if text: - text_region = TextRegion(x1, y1, x2, y2, text) - text_regions.append(text_region) - - return text_regions - - -def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]: - """ - Parse the OCR result data to extract a list of TextRegion objects from - paddle. - - The function processes the OCR result dictionary, looking for bounding - box information and associated text to create instances of the TextRegion - class, which are then appended to a list. - - Parameters: - - ocr_data (list): A list containing the OCR result data - - Returns: - - List[TextRegion]: A list of TextRegion objects, each representing a - detected text region within the OCR-ed image. - - Note: - - An empty string or a None value for the 'text' key in the input - dictionary will result in its associated bounding box being ignored. 
- """ - text_regions = [] - for idx in range(len(ocr_data)): - res = ocr_data[idx] - for line in res: - x1 = min([i[0] for i in line[0]]) - y1 = min([i[1] for i in line[0]]) - x2 = max([i[0] for i in line[0]]) - y2 = max([i[1] for i in line[0]]) - text = line[1][0] - if text: - text_region = TextRegion(x1, y1, x2, y2, text) - text_regions.append(text_region) - - return text_regions diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 22c12827..b596249a 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -31,16 +31,16 @@ def extract_text( objects: Optional[Collection[TextRegion]], image: Optional[Image.Image] = None, extract_tables: bool = False, - ocr_strategy: str = "auto", - ocr_languages: str = "eng", + # ocr_strategy: str = "auto", + # ocr_languages: str = "eng", ): """Extracts text contained in region""" text = super().extract_text( objects=objects, image=image, extract_tables=extract_tables, - ocr_strategy=ocr_strategy, - ocr_languages=ocr_languages, + # ocr_strategy=ocr_strategy, + # ocr_languages=ocr_languages, ) if extract_tables and self.type == "Table": self.text_as_html = interpret_table_block(self, image) @@ -88,8 +88,8 @@ def merge_inferred_layout_with_extracted_layout( inferred_layout: Collection[LayoutElement], extracted_layout: Collection[TextRegion], page_image_size: tuple, - ocr_layout: Optional[List[TextRegion]] = None, - supplement_with_ocr_elements: bool = True, + # ocr_layout: Optional[List[TextRegion]] = None, + # supplement_with_ocr_elements: bool = True, same_region_threshold: float = 0.75, subregion_threshold: float = 0.75, ) -> List[LayoutElement]: @@ -182,11 +182,12 @@ def merge_inferred_layout_with_extracted_layout( # else out_layout # ) # else: - # final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add + # final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add return final_layout + # move to unst # def merge_inferred_layout_with_ocr_layout( # inferred_layout: List[LayoutElement], From 97c6e02baf591d9359da1fe7aeed330857685ffc Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Thu, 28 Sep 2023 18:11:22 -0400 Subject: [PATCH 09/26] add back --- unstructured_inference/inference/elements.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index e0e4ee55..291f3f84 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -338,6 +338,7 @@ def aggregate_by_block( block.""" filtered_blocks = [obj for obj in pdf_objects if obj.is_in(text_region, error_margin=5)] text = " ".join([x.text for x in filtered_blocks if x.text]) + text = remove_control_characters(text) return text # if image is not None and needs_ocr(text_region, pdf_objects, ocr_strategy): From 365f3f87efd4f7e9cbaa4d1930738080af096dbf Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Fri, 29 Sep 2023 11:37:58 -0400 Subject: [PATCH 10/26] remove tests clean up --- .../inference/test_layout.py | 537 +++++++++--------- .../inference/test_layout_element.py | 10 +- test_unstructured_inference/test_elements.py | 26 +- unstructured_inference/inference/elements.py | 12 +- unstructured_inference/inference/layout.py | 19 +- .../inference/layoutelement.py | 9 +- 
unstructured_inference/models/paddle_ocr.py | 1 + 7 files changed, 309 insertions(+), 305 deletions(-) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index bd5aa3e8..0f37286a 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -2,7 +2,8 @@ import os.path import tempfile from functools import partial -from itertools import product + +# from itertools import product from unittest.mock import mock_open, patch import numpy as np @@ -10,9 +11,10 @@ from PIL import Image import unstructured_inference.models.base as models -from unstructured_inference.constants import OCRMode + +# from unstructured_inference.constants import OCRMode from unstructured_inference.inference import elements, layout, layoutelement -from unstructured_inference.models import chipper, detectron2, tesseract +from unstructured_inference.models import detectron2 from unstructured_inference.models.unstructuredmodel import ( UnstructuredElementExtractionModel, UnstructuredObjectDetectionModel, @@ -70,35 +72,35 @@ def verify_image_array(): verify_image_array() -def test_ocr(monkeypatch): - mock_text = "The parrot flies high in the air!" +# def test_ocr(monkeypatch): +# mock_text = "The parrot flies high in the air!" - class MockOCRAgent: - def detect(self, *args): - return mock_text +# class MockOCRAgent: +# def detect(self, *args): +# return mock_text - monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent}) - monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True) +# monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent}) +# monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True) - image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB") - text_block = layout.TextRegion(1, 2, 3, 4, text=None) +# image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB") +# text_block = layout.TextRegion(1, 2, 3, 4, text=None) - assert elements.ocr(text_block, image=image) == mock_text +# assert elements.ocr(text_block, image=image) == mock_text -def test_ocr_with_error(monkeypatch): - class MockOCRAgent: - def detect(self, *args): - # We sometimes get this error on very small images - raise tesseract.TesseractError(-8, "Estimating resolution as 1023") +# def test_ocr_with_error(monkeypatch): +# class MockOCRAgent: +# def detect(self, *args): +# # We sometimes get this error on very small images +# raise tesseract.TesseractError(-8, "Estimating resolution as 1023") - monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent}) - monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True) +# monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent}) +# monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True) - image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB") - text_block = layout.TextRegion(1, 2, 3, 4, text=None) +# image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB") +# text_block = layout.TextRegion(1, 2, 3, 4, text=None) - assert elements.ocr(text_block, image=image) == "" +# assert elements.ocr(text_block, image=image) == "" class MockLayoutModel: @@ -130,24 +132,24 @@ def test_get_page_elements(monkeypatch, mock_final_layout): assert elements == page.elements -def test_get_page_elements_with_tesseract_error(monkeypatch, mock_final_layout): - def mock_image_to_data(*args, **kwargs): - raise 
tesseract.TesseractError(-2, "Estimating resolution as 1023") +# def test_get_page_elements_with_tesseract_error(monkeypatch, mock_final_layout): +# def mock_image_to_data(*args, **kwargs): +# raise tesseract.TesseractError(-2, "Estimating resolution as 1023") - monkeypatch.setattr(layout.pytesseract, "image_to_data", mock_image_to_data) +# monkeypatch.setattr(layout.pytesseract, "image_to_data", mock_image_to_data) - image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") - page = layout.PageLayout( - number=0, - image=image, - layout=mock_final_layout, - detection_model=MockLayoutModel(mock_final_layout), - ) +# image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") +# page = layout.PageLayout( +# number=0, +# image=image, +# layout=mock_final_layout, +# detection_model=MockLayoutModel(mock_final_layout), +# ) - elements = page.get_elements_with_detection_model(inplace=False) +# elements = page.get_elements_with_detection_model(inplace=False) - assert str(elements[0]) == "A Catchy Title" - assert str(elements[1]).startswith("A very repetitive narrative.") +# assert str(elements[0]) == "A Catchy Title" +# assert str(elements[1]).startswith("A very repetitive narrative.") class MockPool: @@ -161,100 +163,100 @@ def join(self): pass -@pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") -def test_get_page_elements_with_paddle_ocr(monkeypatch): - monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle") - text_block = layout.TextRegion(2, 4, 6, 8, text=None) - image_block = layout.ImageTextRegion(8, 14, 16, 18) - doc_initial_layout = [text_block, image_block] - text_layoutelement = layoutelement.LayoutElement( - 2, - 4, - 6, - 8, - text=None, - type="UncategorizedText", - ) - image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") - doc_final_layout = [text_layoutelement, image_layoutelement] - - monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) - monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") - - image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") - page = layout.PageLayout( - number=0, - image=image, - layout=doc_initial_layout, - detection_model=MockLayoutModel(doc_final_layout), - # Note(yuming): there are differnt language codes for same language - # between paddle and tesseract - ocr_languages="en", - ) - page.get_elements_with_detection_model() - - assert str(page) == "\n\nAn Even Catchier Title" - - -def test_get_page_elements_with_tesseract_ocr(monkeypatch): - monkeypatch.setenv("ENTIRE_PAGE_OCR", "tesseract") - text_block = layout.TextRegion(2, 4, 6, 8, text=None) - image_block = layout.ImageTextRegion(8, 14, 16, 18) - doc_initial_layout = [text_block, image_block] - text_layoutelement = layoutelement.LayoutElement( - 2, - 4, - 6, - 8, - text=None, - type="UncategorizedText", - ) - image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") - doc_final_layout = [text_layoutelement, image_layoutelement] - - monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) - monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") - - image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") - page = layout.PageLayout( - number=0, - image=image, - layout=doc_initial_layout, - detection_model=MockLayoutModel(doc_final_layout), - ) - page.get_elements_with_detection_model() - - assert 
str(page) == "\n\nAn Even Catchier Title" - - -def test_get_page_elements_with_ocr_invalid_entrie_page_ocr(monkeypatch): - monkeypatch.setenv("ENTIRE_PAGE_OCR", "invalid_entire_page_ocr") - text_block = layout.TextRegion(2, 4, 6, 8, text=None) - image_block = layout.ImageTextRegion(8, 14, 16, 18) - doc_initial_layout = [text_block, image_block] - text_layoutelement = layoutelement.LayoutElement( - 2, - 4, - 6, - 8, - text=None, - type="UncategorizedText", - ) - image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") - doc_final_layout = [text_layoutelement, image_layoutelement] - - monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) - monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") - - image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") - page = layout.PageLayout( - number=0, - image=image, - layout=doc_initial_layout, - detection_model=MockLayoutModel(doc_final_layout), - ) - with pytest.raises(ValueError): - page.get_elements_with_detection_model() +# @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") +# def test_get_page_elements_with_paddle_ocr(monkeypatch): +# monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle") +# text_block = layout.TextRegion(2, 4, 6, 8, text=None) +# image_block = layout.ImageTextRegion(8, 14, 16, 18) +# doc_initial_layout = [text_block, image_block] +# text_layoutelement = layoutelement.LayoutElement( +# 2, +# 4, +# 6, +# 8, +# text=None, +# type="UncategorizedText", +# ) +# image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") +# doc_final_layout = [text_layoutelement, image_layoutelement] + +# monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) +# monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") + +# image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") +# page = layout.PageLayout( +# number=0, +# image=image, +# layout=doc_initial_layout, +# detection_model=MockLayoutModel(doc_final_layout), +# # Note(yuming): there are differnt language codes for same language +# # between paddle and tesseract +# ocr_languages="en", +# ) +# page.get_elements_with_detection_model() + +# assert str(page) == "\n\nAn Even Catchier Title" + + +# def test_get_page_elements_with_tesseract_ocr(monkeypatch): +# monkeypatch.setenv("ENTIRE_PAGE_OCR", "tesseract") +# text_block = layout.TextRegion(2, 4, 6, 8, text=None) +# image_block = layout.ImageTextRegion(8, 14, 16, 18) +# doc_initial_layout = [text_block, image_block] +# text_layoutelement = layoutelement.LayoutElement( +# 2, +# 4, +# 6, +# 8, +# text=None, +# type="UncategorizedText", +# ) +# image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") +# doc_final_layout = [text_layoutelement, image_layoutelement] + +# monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) +# monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") + +# image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") +# page = layout.PageLayout( +# number=0, +# image=image, +# layout=doc_initial_layout, +# detection_model=MockLayoutModel(doc_final_layout), +# ) +# page.get_elements_with_detection_model() + +# assert str(page) == "\n\nAn Even Catchier Title" + + +# def test_get_page_elements_with_ocr_invalid_entrie_page_ocr(monkeypatch): +# 
monkeypatch.setenv("ENTIRE_PAGE_OCR", "invalid_entire_page_ocr") +# text_block = layout.TextRegion(2, 4, 6, 8, text=None) +# image_block = layout.ImageTextRegion(8, 14, 16, 18) +# doc_initial_layout = [text_block, image_block] +# text_layoutelement = layoutelement.LayoutElement( +# 2, +# 4, +# 6, +# 8, +# text=None, +# type="UncategorizedText", +# ) +# image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") +# doc_final_layout = [text_layoutelement, image_layoutelement] + +# monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) +# monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") + +# image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") +# page = layout.PageLayout( +# number=0, +# image=image, +# layout=doc_initial_layout, +# detection_model=MockLayoutModel(doc_final_layout), +# ) +# with pytest.raises(ValueError): +# page.get_elements_with_detection_model() def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout, mock_image): @@ -412,23 +414,24 @@ def filter_by(self, *args, **kwargs): return MockLayout() -@pytest.mark.parametrize( - ("block_text", "layout_texts", "expected_text"), - [ - ("no ocr", ["pieced", "together", "group"], "no ocr"), - (None, ["pieced", "together", "group"], "pieced together group"), - ], -) -def test_get_element_from_block(block_text, layout_texts, mock_image, expected_text): - with patch("unstructured_inference.inference.elements.ocr", return_value="ocr"): - block = layout.TextRegion(0, 0, 10, 10, text=block_text) - captured_layout = [ - layout.TextRegion(i + 1, i + 1, i + 2, i + 2, text=text) - for i, text in enumerate(layout_texts) - ] - assert ( - layout.get_element_from_block(block, mock_image, captured_layout).text == expected_text - ) +# @pytest.mark.parametrize( +# ("block_text", "layout_texts", "expected_text"), +# [ +# ("no ocr", ["pieced", "together", "group"], "no ocr"), +# (None, ["pieced", "together", "group"], "pieced together group"), +# ], +# ) +# def test_get_element_from_block(block_text, layout_texts, mock_image, expected_text): +# with patch("unstructured_inference.inference.elements.ocr", return_value="ocr"): +# block = layout.TextRegion(0, 0, 10, 10, text=block_text) +# captured_layout = [ +# layout.TextRegion(i + 1, i + 1, i + 2, i + 2, text=text) +# for i, text in enumerate(layout_texts) +# ] +# assert ( +# layout.get_element_from_block(block, mock_image, +# captured_layout).text == expected_text +# ) def test_get_elements_from_block_raises(): @@ -544,9 +547,9 @@ def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method) getattr(layout.PageLayout, not_called_method).assert_not_called() -def test_invalid_ocr_strategy_raises(mock_image): - with pytest.raises(ValueError): - layout.PageLayout(0, mock_image, MockLayout(), ocr_strategy="fake_strategy") +# def test_invalid_ocr_strategy_raises(mock_image): +# with pytest.raises(ValueError): +# layout.PageLayout(0, mock_image, MockLayout(), ocr_strategy="fake_strategy") @pytest.mark.parametrize( @@ -572,91 +575,91 @@ def test_remove_control_characters(text, expected): unpopulated_text_region = layout.EmbeddedTextRegion(50, 50, 60, 60, text=None) -@pytest.mark.parametrize( - ("region", "objects", "ocr_strategy", "expected"), - [ - (no_text_region, [nonoverlapping_rect], "auto", False), - (no_text_region, [overlapping_rect], "auto", True), - (no_text_region, [], "auto", False), - (no_text_region, [populated_text_region, nonoverlapping_rect], 
"auto", False), - (no_text_region, [populated_text_region, overlapping_rect], "auto", False), - (no_text_region, [populated_text_region], "auto", False), - (no_text_region, [unpopulated_text_region, nonoverlapping_rect], "auto", False), - (no_text_region, [unpopulated_text_region, overlapping_rect], "auto", True), - (no_text_region, [unpopulated_text_region], "auto", False), - *list( - product( - [text_region], - [ - [], - [populated_text_region], - [unpopulated_text_region], - [nonoverlapping_rect], - [overlapping_rect], - [populated_text_region, nonoverlapping_rect], - [populated_text_region, overlapping_rect], - [unpopulated_text_region, nonoverlapping_rect], - [unpopulated_text_region, overlapping_rect], - ], - ["auto"], - [False], - ), - ), - *list( - product( - [cid_text_region], - [ - [], - [populated_text_region], - [unpopulated_text_region], - [overlapping_rect], - [populated_text_region, overlapping_rect], - [unpopulated_text_region, overlapping_rect], - ], - ["auto"], - [True], - ), - ), - *list( - product( - [no_text_region, text_region, cid_text_region], - [ - [], - [populated_text_region], - [unpopulated_text_region], - [nonoverlapping_rect], - [overlapping_rect], - [populated_text_region, nonoverlapping_rect], - [populated_text_region, overlapping_rect], - [unpopulated_text_region, nonoverlapping_rect], - [unpopulated_text_region, overlapping_rect], - ], - ["force"], - [True], - ), - ), - *list( - product( - [no_text_region, text_region, cid_text_region], - [ - [], - [populated_text_region], - [unpopulated_text_region], - [nonoverlapping_rect], - [overlapping_rect], - [populated_text_region, nonoverlapping_rect], - [populated_text_region, overlapping_rect], - [unpopulated_text_region, nonoverlapping_rect], - [unpopulated_text_region, overlapping_rect], - ], - ["never"], - [False], - ), - ), - ], -) -def test_ocr_image(region, objects, ocr_strategy, expected): - assert elements.needs_ocr(region, objects, ocr_strategy) is expected +# @pytest.mark.parametrize( +# ("region", "objects", "ocr_strategy", "expected"), +# [ +# (no_text_region, [nonoverlapping_rect], "auto", False), +# (no_text_region, [overlapping_rect], "auto", True), +# (no_text_region, [], "auto", False), +# (no_text_region, [populated_text_region, nonoverlapping_rect], "auto", False), +# (no_text_region, [populated_text_region, overlapping_rect], "auto", False), +# (no_text_region, [populated_text_region], "auto", False), +# (no_text_region, [unpopulated_text_region, nonoverlapping_rect], "auto", False), +# (no_text_region, [unpopulated_text_region, overlapping_rect], "auto", True), +# (no_text_region, [unpopulated_text_region], "auto", False), +# *list( +# product( +# [text_region], +# [ +# [], +# [populated_text_region], +# [unpopulated_text_region], +# [nonoverlapping_rect], +# [overlapping_rect], +# [populated_text_region, nonoverlapping_rect], +# [populated_text_region, overlapping_rect], +# [unpopulated_text_region, nonoverlapping_rect], +# [unpopulated_text_region, overlapping_rect], +# ], +# ["auto"], +# [False], +# ), +# ), +# *list( +# product( +# [cid_text_region], +# [ +# [], +# [populated_text_region], +# [unpopulated_text_region], +# [overlapping_rect], +# [populated_text_region, overlapping_rect], +# [unpopulated_text_region, overlapping_rect], +# ], +# ["auto"], +# [True], +# ), +# ), +# *list( +# product( +# [no_text_region, text_region, cid_text_region], +# [ +# [], +# [populated_text_region], +# [unpopulated_text_region], +# [nonoverlapping_rect], +# [overlapping_rect], +# 
[populated_text_region, nonoverlapping_rect], +# [populated_text_region, overlapping_rect], +# [unpopulated_text_region, nonoverlapping_rect], +# [unpopulated_text_region, overlapping_rect], +# ], +# ["force"], +# [True], +# ), +# ), +# *list( +# product( +# [no_text_region, text_region, cid_text_region], +# [ +# [], +# [populated_text_region], +# [unpopulated_text_region], +# [nonoverlapping_rect], +# [overlapping_rect], +# [populated_text_region, nonoverlapping_rect], +# [populated_text_region, overlapping_rect], +# [unpopulated_text_region, nonoverlapping_rect], +# [unpopulated_text_region, overlapping_rect], +# ], +# ["never"], +# [False], +# ), +# ), +# ], +# ) +# def test_ocr_image(region, objects, ocr_strategy, expected): +# assert elements.needs_ocr(region, objects, ocr_strategy) is expected @pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"]) @@ -747,9 +750,9 @@ def check_annotated_image(): check_annotated_image() -def test_textregion_returns_empty_ocr_never(mock_image): - tr = elements.TextRegion(0, 0, 24, 24) - assert tr.extract_text(objects=None, image=mock_image, ocr_strategy="never") == "" +# def test_textregion_returns_empty_ocr_never(mock_image): +# tr = elements.TextRegion(0, 0, 24, 24) +# assert tr.extract_text(objects=None, image=mock_image, ocr_strategy="never") == "" @pytest.mark.parametrize(("text", "expected"), [("asdf", "asdf"), (None, "")]) @@ -758,21 +761,22 @@ def test_embedded_text_region(text, expected): assert etr.extract_text(objects=None) == expected -@pytest.mark.parametrize( - ("text", "ocr_strategy", "expected"), - [ - (None, "never", ""), - (None, "always", "asdf"), - ("i have text", "never", "i have text"), - ("i have text", "always", "i have text"), - ], -) -def test_image_text_region(text, ocr_strategy, expected, mock_image): - itr = elements.ImageTextRegion(0, 0, 24, 24, text=text) - with patch.object(elements, "ocr", return_value="asdf"): - assert ( - itr.extract_text(objects=None, image=mock_image, ocr_strategy=ocr_strategy) == expected - ) +# @pytest.mark.parametrize( +# ("text", "ocr_strategy", "expected"), +# [ +# (None, "never", ""), +# (None, "always", "asdf"), +# ("i have text", "never", "i have text"), +# ("i have text", "always", "i have text"), +# ], +# ) +# def test_image_text_region(text, ocr_strategy, expected, mock_image): +# itr = elements.ImageTextRegion(0, 0, 24, 24, text=text) +# with patch.object(elements, "ocr", return_value="asdf"): +# assert ( +# itr.extract_text(objects=None, image=moc +# k_image, ocr_strategy=ocr_strategy) == expected +# ) @pytest.fixture() @@ -935,9 +939,9 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m "asdf", detection_model=detection_model, element_extraction_model=element_extraction_model, - ocr_strategy="auto", - ocr_languages="eng", - ocr_mode=OCRMode.FULL_PAGE.value, + # ocr_strategy="auto", + # ocr_languages="eng", + # ocr_mode=OCRMode.FULL_PAGE.value, fixed_layouts=None, extract_tables=False, pdf_image_dpi=200, @@ -951,12 +955,13 @@ def test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch): assert mock_from_image.call_args[0][0].height == expected -def test_warning_if_chipper_and_low_dpi(caplog): - with patch.object(layout.DocumentLayout, "from_file") as mock_from_file, patch.object( - chipper.UnstructuredChipperModel, - "initialize", - ): - layout.process_file_with_model("asdf", model_name="chipper", pdf_image_dpi=299) - mock_from_file.assert_called_once() - assert caplog.records[0].levelname == "WARNING" - assert "DPI >= 300" 
in caplog.records[0].msg +# dpi check moved to unst +# def test_warning_if_chipper_and_low_dpi(caplog): +# with patch.object(layout.DocumentLayout, "from_file") as mock_from_file, patch.object( +# chipper.UnstructuredChipperModel, +# "initialize", +# ): +# layout.process_file_with_model("asdf", model_name="chipper", pdf_image_dpi=299) +# mock_from_file.assert_called_once() +# assert caplog.records[0].levelname == "WARNING" +# assert "DPI >= 300" in caplog.records[0].msg diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 1895ecbc..cb70924a 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -3,17 +3,9 @@ from layoutparser.elements.layout_elements import Rectangle as LPRectangle # from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR -from unstructured_inference.inference.elements import TextRegion +# from unstructured_inference.inference.elements import TextRegion from unstructured_inference.inference.layoutelement import ( LayoutElement, - # move to unst - # aggregate_ocr_text_by_block, - # get_elements_from_ocr_regions, - # move to unst - # merge_inferred_layout_with_ocr_layout, - # merge_text_regions, - # move to unst - # supplement_layout_with_ocr_elements, ) # move to unst diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 56a35905..33a0fef1 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -1,11 +1,11 @@ -import logging +# import logging import os from random import randint from unittest.mock import PropertyMock, patch import pytest -from PIL import Image +# from PIL import Image from unstructured_inference.inference import elements skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"} @@ -191,14 +191,14 @@ def test_intersection_over_min( ) -@pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") -def test_ocr_paddle(monkeypatch, caplog): - monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle") - image = Image.new("RGB", (100, 100), (255, 255, 255)) - text_block = elements.TextRegion(0, 0, 50, 50) - # Note(yuming): paddle result is currently non-deterministic on ci - # so don't check result like `assert result == ""` - # use logger info to confirm we are using paddle instead - with caplog.at_level(logging.INFO): - _ = elements.ocr(text_block, image, languages="en") - assert "paddle" in caplog.text +# @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") +# def test_ocr_paddle(monkeypatch, caplog): +# monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle") +# image = Image.new("RGB", (100, 100), (255, 255, 255)) +# text_block = elements.TextRegion(0, 0, 50, 50) +# # Note(yuming): paddle result is currently non-deterministic on ci +# # so don't check result like `assert result == ""` +# # use logger info to confirm we are using paddle instead +# with caplog.at_level(logging.INFO): +# _ = elements.ocr(text_block, image, languages="en") +# assert "paddle" in caplog.text diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 291f3f84..dde58376 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -1,6 +1,6 @@ from __future__ import annotations -import os +# import os import re import 
unicodedata from copy import deepcopy @@ -11,9 +11,10 @@ from PIL import Image from scipy.sparse.csgraph import connected_components -from unstructured_inference.logger import logger +# from unstructured_inference.logger import logger from unstructured_inference.math import safe_division -from unstructured_inference.models import tesseract + +# from unstructured_inference.models import tesseract # When extending the boundaries of a PDF object for the purpose of determining which other elements # should be considered in the same text region, we use a relative distance based on some fraction of @@ -263,7 +264,7 @@ def extract_text( return "" else: # return super().extract_text(objects, image, extract_tables, ocr_strategy) - return super().extract_text(objects, image, extract_table) + return super().extract_text(objects, image, extract_tables) # move to unst for individual_blocks mode @@ -272,7 +273,8 @@ def extract_text( # logger.debug("Running OCR on text block ...") # tesseract.load_agent(languages=languages) # padded_block = text_block.pad(12) -# cropped_image = image.crop((padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2)) +# cropped_image = image.crop( +# (padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2)) # entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower() # if entrie_page_ocr == "paddle": # from unstructured_inference.models import paddle_ocr diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 549d03af..8e1be945 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -7,13 +7,14 @@ import numpy as np import pdf2image -import pytesseract + +# import pytesseract from pdfminer import psparser from pdfminer.high_level import extract_pages from PIL import Image, ImageSequence -from pytesseract import Output -from unstructured_inference.constants import OCRMode +# from pytesseract import Output +# from unstructured_inference.constants import OCRMode from unstructured_inference.inference.elements import ( EmbeddedTextRegion, ImageTextRegion, @@ -24,9 +25,10 @@ LayoutElement, LocationlessLayoutElement, merge_inferred_layout_with_extracted_layout, - # move to unst - # merge_inferred_layout_with_ocr_layout, ) + +# move to unst +# merge_inferred_layout_with_ocr_layout, from unstructured_inference.inference.ordering import order_layout from unstructured_inference.inference.pdf import get_images_from_pdf_element from unstructured_inference.logger import logger @@ -271,7 +273,8 @@ def get_elements_with_detection_model( # entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower() # if entrie_page_ocr not in ["paddle", "tesseract"]: # raise ValueError( - # "Environment variable ENTIRE_PAGE_OCR must be set to 'tesseract' or 'paddle'.", + # "Environment variable ENTIRE_PAGE_OCR + # must be set to 'tesseract' or 'paddle'.", # ) # if entrie_page_ocr == "paddle": @@ -525,7 +528,7 @@ def process_data_with_model( # ocr_mode: str = OCRMode.FULL_PAGE.value, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, - pdf_image_dpi: Optional[int] = None, + pdf_image_dpi: int = 200, **kwargs, ) -> DocumentLayout: """Processes pdf file in the form of a file handler (supporting a read method) into a @@ -558,7 +561,7 @@ def process_file_with_model( # ocr_mode: str = OCRMode.FULL_PAGE.value, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, - pdf_image_dpi: 
Optional[int] = None, + pdf_image_dpi: int = 200, **kwargs, ) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index b596249a..30d32380 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -1,20 +1,20 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Collection, List, Optional, cast +from typing import Collection, List, Optional import numpy as np from layoutparser.elements.layout import TextBlock from pandas import DataFrame from PIL import Image -from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR +from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD from unstructured_inference.inference.elements import ( ImageTextRegion, Rectangle, TextRegion, grow_region_to_match_region, - partition_groups_from_regions, + # partition_groups_from_regions, region_bounding_boxes_are_almost_the_same, ) from unstructured_inference.models import tables @@ -264,7 +264,8 @@ def merge_inferred_layout_with_extracted_layout( # Note: # - The function relies on `is_almost_subregion_of()` method to determine if an OCR region # is a subregion of an existing layout element. -# - It also relies on `get_elements_from_ocr_regions()` to convert OCR regions to layout elements. +# - It also relies on `get_elements_from_ocr_regions()` to convert OCR +# regions to layout elements. # - The `SUBREGION_THRESHOLD_FOR_OCR` constant is used to specify the subregion matching # threshold. # """ diff --git a/unstructured_inference/models/paddle_ocr.py b/unstructured_inference/models/paddle_ocr.py index b4d6d38c..03d2d5cd 100644 --- a/unstructured_inference/models/paddle_ocr.py +++ b/unstructured_inference/models/paddle_ocr.py @@ -1,3 +1,4 @@ +"""This OCR module is used in table models only and will be removed after table OCR refactoring""" import functools import paddle From b64e546e5e8294ca2ca76ab98b3fca24b7641fbb Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Fri, 29 Sep 2023 12:13:28 -0400 Subject: [PATCH 11/26] disable tesseract --- .../models/test_tesseract.py | 32 +++++------ unstructured_inference/models/tesseract.py | 56 +++++++++---------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/test_unstructured_inference/models/test_tesseract.py b/test_unstructured_inference/models/test_tesseract.py index 475cba08..873f2fab 100644 --- a/test_unstructured_inference/models/test_tesseract.py +++ b/test_unstructured_inference/models/test_tesseract.py @@ -1,26 +1,26 @@ -from unittest.mock import patch +# from unittest.mock import patch -import pytest +# import pytest -from unstructured_inference.models import tesseract +# from unstructured_inference.models import tesseract -class MockTesseractAgent: - def __init__(self, languages): - pass +# class MockTesseractAgent: +# def __init__(self, languages): +# pass -def test_load_agent(monkeypatch): - monkeypatch.setattr(tesseract, "TesseractAgent", MockTesseractAgent) - monkeypatch.setattr(tesseract, "ocr_agents", {}) +# def test_load_agent(monkeypatch): +# monkeypatch.setattr(tesseract, "TesseractAgent", MockTesseractAgent) +# monkeypatch.setattr(tesseract, "ocr_agents", {}) - with patch.object(tesseract, "is_pytesseract_available", return_value=True): - tesseract.load_agent(languages="eng+swe") +# with 
patch.object(tesseract, "is_pytesseract_available", return_value=True): +# tesseract.load_agent(languages="eng+swe") - assert isinstance(tesseract.ocr_agents["eng+swe"], MockTesseractAgent) +# assert isinstance(tesseract.ocr_agents["eng+swe"], MockTesseractAgent) -def test_load_agent_raises_when_not_available(): - with patch.object(tesseract, "is_pytesseract_available", return_value=False): - with pytest.raises(ImportError): - tesseract.load_agent() +# def test_load_agent_raises_when_not_available(): +# with patch.object(tesseract, "is_pytesseract_available", return_value=False): +# with pytest.raises(ImportError): +# tesseract.load_agent() diff --git a/unstructured_inference/models/tesseract.py b/unstructured_inference/models/tesseract.py index 56bf8e5a..97f55001 100644 --- a/unstructured_inference/models/tesseract.py +++ b/unstructured_inference/models/tesseract.py @@ -1,38 +1,38 @@ -import os -from typing import Dict +# import os +# from typing import Dict -import pytesseract -from layoutparser.ocr.tesseract_agent import TesseractAgent, is_pytesseract_available +# import pytesseract +# from layoutparser.ocr.tesseract_agent import TesseractAgent, is_pytesseract_available -from unstructured_inference.logger import logger +# from unstructured_inference.logger import logger -ocr_agents: Dict[str, TesseractAgent] = {} +# ocr_agents: Dict[str, TesseractAgent] = {} -TesseractError = pytesseract.pytesseract.TesseractError +# TesseractError = pytesseract.pytesseract.TesseractError -# Force tesseract to be single threaded, -# otherwise we see major performance problems -if "OMP_THREAD_LIMIT" not in os.environ: - os.environ["OMP_THREAD_LIMIT"] = "1" +# # Force tesseract to be single threaded, +# # otherwise we see major performance problems +# if "OMP_THREAD_LIMIT" not in os.environ: +# os.environ["OMP_THREAD_LIMIT"] = "1" -def load_agent(languages: str = "eng"): - """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once. +# def load_agent(languages: str = "eng"): +# """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once. - Parameters - ---------- - languages - The languages to use for the Tesseract agent. To use a langauge, you'll first need - to isntall the appropriate Tesseract language pack. - """ - global ocr_agents +# Parameters +# ---------- +# languages +# The languages to use for the Tesseract agent. To use a langauge, you'll first need +# to isntall the appropriate Tesseract language pack. +# """ +# global ocr_agents - if not is_pytesseract_available(): - raise ImportError( - "Failed to load Tesseract. Ensure that Tesseract is installed. Example command: \n" - " >>> sudo apt install -y tesseract-ocr", - ) +# if not is_pytesseract_available(): +# raise ImportError( +# "Failed to load Tesseract. Ensure that Tesseract is installed. 
Example command: \n" +# " >>> sudo apt install -y tesseract-ocr", +# ) - if languages not in ocr_agents: - logger.info(f"Loading the Tesseract OCR agent for {languages} ...") - ocr_agents[languages] = TesseractAgent(languages=languages) +# if languages not in ocr_agents: +# logger.info(f"Loading the Tesseract OCR agent for {languages} ...") +# ocr_agents[languages] = TesseractAgent(languages=languages) From bc89ffa9a0956a55d209e9da0af0f0864eb56c3b Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Fri, 29 Sep 2023 12:49:20 -0400 Subject: [PATCH 12/26] move test fixture --- test_unstructured_inference/conftest.py | 40 ++++++++++++------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/test_unstructured_inference/conftest.py b/test_unstructured_inference/conftest.py index 761a0492..ef095c38 100644 --- a/test_unstructured_inference/conftest.py +++ b/test_unstructured_inference/conftest.py @@ -107,13 +107,13 @@ def mock_embedded_text_regions(): ] -@pytest.fixture() -def mock_ocr_regions(): - return [ - EmbeddedTextRegion(10, 10, 90, 90, "0"), - EmbeddedTextRegion(200, 200, 300, 300, "1"), - EmbeddedTextRegion(500, 320, 600, 350, "3"), - ] +# @pytest.fixture() +# def mock_ocr_regions(): +# return [ +# EmbeddedTextRegion(10, 10, 90, 90, "0"), +# EmbeddedTextRegion(200, 200, 300, 300, "1"), +# EmbeddedTextRegion(500, 320, 600, 350, "3"), +# ] # TODO(alan): Make a better test layout @@ -132,16 +132,16 @@ def mock_layout(mock_embedded_text_regions): ] -@pytest.fixture() -def mock_inferred_layout(mock_embedded_text_regions): - return [ - LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, - text=None, - type="Text", - ) - for r in mock_embedded_text_regions - ] +# @pytest.fixture() +# def mock_inferred_layout(mock_embedded_text_regions): +# return [ +# LayoutElement( +# r.x1, +# r.y1, +# r.x2, +# r.y2, +# text=None, +# type="Text", +# ) +# for r in mock_embedded_text_regions +# ] From a5fc90d8cd20a033f1116d242c1c2f30bc82db5c Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Fri, 29 Sep 2023 12:50:21 -0400 Subject: [PATCH 13/26] remove paddle install in docker --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ebcf0da7..366cffc3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,6 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \ pip install --no-cache -r requirements/base.txt && \ pip install --no-cache -r requirements/test.txt && \ pip install --no-cache -r requirements/dev.txt && \ - pip install "unstructured.PaddleOCR" && \ dnf -y groupremove "Development Tools" && \ dnf clean all From abd95d1aec4df3940c3b1a4a34bfd1335af0a551 Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Fri, 29 Sep 2023 13:09:49 -0400 Subject: [PATCH 14/26] empty From 5726d2f2c30b3b76fd183785b52cac914f46a0a0 Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Fri, 29 Sep 2023 15:52:32 -0400 Subject: [PATCH 15/26] ocr param in new test --- test_unstructured_inference/models/test_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured_inference/models/test_model.py b/test_unstructured_inference/models/test_model.py index 4ae6c08a..8f158e26 100644 --- a/test_unstructured_inference/models/test_model.py +++ b/test_unstructured_inference/models/test_model.py @@ -87,8 +87,8 @@ def test_deduplicate_detected_elements(): doc = DocumentLayout.from_image_file( file, model, - ocr_strategy="never", - supplement_with_ocr_elements=False, + # ocr_strategy="never", + # 
supplement_with_ocr_elements=False,
     )
     known_elements = [e for e in doc.pages[0].elements if e.type != "UncategorizedText"]
     # Compute intersection matrix

From 1a968c4dec1ccde80c1604f35f4f3b7639916f43 Mon Sep 17 00:00:00 2001
From: yuming <305248291@qq.com>
Date: Fri, 29 Sep 2023 16:32:16 -0400
Subject: [PATCH 16/26] update changelog and version

---
 CHANGELOG.md | 4 ++++
 unstructured_inference/__version__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2277974d..17340799 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.6.7
+
+* Remove all OCR-related code except the table OCR code
+
 ## 0.6.6
 
 * Stop passing ocr_languages parameter into paddle to avoid invalid paddle language code error, this will be fixed until
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index 37b46218..ba99456d 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.6"  # pragma: no cover
+__version__ = "0.6.7"  # pragma: no cover

From 28e1bc769e8597164931d32efb532e1643dddc21 Mon Sep 17 00:00:00 2001
From: yuming <305248291@qq.com>
Date: Tue, 3 Oct 2023 12:12:41 -0400
Subject: [PATCH 17/26] remove OCR constants

---
 unstructured_inference/constants.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py
index 78c46379..572a1ff3 100644
--- a/unstructured_inference/constants.py
+++ b/unstructured_inference/constants.py
@@ -1,9 +1,9 @@
 from enum import Enum
 
 
-class OCRMode(Enum):
-    INDIVIDUAL_BLOCKS = "individual_blocks"
-    FULL_PAGE = "entire_page"
+# class OCRMode(Enum):
+#     INDIVIDUAL_BLOCKS = "individual_blocks"
+#     FULL_PAGE = "entire_page"
 
 
 class AnnotationResult(Enum):
@@ -15,11 +15,11 @@ class Source(Enum):
     YOLOX = "yolox"
     DETECTRON2_ONNX = "detectron2_onnx"
    DETECTRON2_LP = "detectron2_lp"
-    OCR_TESSERACT = "OCR-tesseract"
-    OCR_PADDLE = "OCR-paddle"
+    # OCR_TESSERACT = "OCR-tesseract"
+    # OCR_PADDLE = "OCR-paddle"
     PDFMINER = "pdfminer"
     MERGED = "merged"
 
-SUBREGION_THRESHOLD_FOR_OCR = 0.5
+# SUBREGION_THRESHOLD_FOR_OCR = 0.5
 FULL_PAGE_REGION_THRESHOLD = 0.99

From 3f43f06e3ef5a7e059f3e7cc42a5fc90 Mon Sep 17 00:00:00 2001
From: yuming <305248291@qq.com>
Date: Tue, 3 Oct 2023 18:28:45 -0400
Subject: [PATCH 18/26] remove all commented-out OCR code

---
 test_unstructured_inference/conftest.py | 25 --
 .../inference/test_layout.py | 326 +-----------------
 .../inference/test_layout_element.py | 98 ------
 .../models/test_model.py | 2 -
 test_unstructured_inference/test_elements.py | 15 -
 unstructured_inference/constants.py | 8 -
 unstructured_inference/inference/elements.py | 94 -----
 unstructured_inference/inference/layout.py | 128 +------
 .../inference/layoutelement.py | 175 +---------
 9 files changed, 5 insertions(+), 866 deletions(-)

diff --git a/test_unstructured_inference/conftest.py b/test_unstructured_inference/conftest.py
index 051880af..097464fa 100644
--- a/test_unstructured_inference/conftest.py
+++ b/test_unstructured_inference/conftest.py
@@ -107,15 +107,6 @@ def mock_embedded_text_regions():
     ]
 
 
-# @pytest.fixture()
-# def mock_ocr_regions():
-#     return [
-#         EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None),
-#         EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None),
-#         EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None),
-#     ]
-
-
 # TODO(alan): Make a better test layout
 @pytest.fixture()
 def mock_layout(mock_embedded_text_regions):
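Note: the commented-out OCR fixtures and tests deleted by this patch are intended to be recreated in the `unstructured` repo (see the "move to unst" markers earlier in the series). Below is a minimal sketch of how the deleted `mock_ocr_regions` fixture could be rebuilt there, reusing the exact regions removed in the hunk above; the destination conftest module is an assumption and not part of these patches:

    import pytest

    from unstructured_inference.inference.elements import EmbeddedTextRegion


    @pytest.fixture()
    def mock_ocr_regions():
        # Same three regions as the fixture deleted above; `text` holds the
        # fake OCR output and `source` is left unset, mirroring the original.
        return [
            EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None),
            EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None),
            EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None),
        ]

Tests for the relocated merge helpers (e.g. `merge_inferred_layout_with_ocr_layout`) could then consume this fixture much as the tests removed in this patch did.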
@@ -130,19 +121,3 @@ def mock_layout(mock_embedded_text_regions): ) for r in mock_embedded_text_regions ] - - -# @pytest.fixture() -# def mock_inferred_layout(mock_embedded_text_regions): -# return [ -# LayoutElement( -# r.x1, -# r.y1, -# r.x2, -# r.y2, -# text=None, -# source=None, -# type="Text", -# ) -# for r in mock_embedded_text_regions -# ] diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 048f1690..d1ac32b1 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -2,8 +2,6 @@ import os.path import tempfile from functools import partial - -# from itertools import product from unittest.mock import mock_open, patch import numpy as np @@ -14,8 +12,6 @@ from unstructured_inference.constants import Source from unstructured_inference.inference import elements, layout, layoutelement from unstructured_inference.models import detectron2 - -# from unstructured_inference.models.base import get_model from unstructured_inference.models.unstructuredmodel import ( UnstructuredElementExtractionModel, UnstructuredObjectDetectionModel, @@ -89,50 +85,6 @@ def verify_image_array(): verify_image_array() -# def test_ocr(monkeypatch): -# mock_text = "The parrot flies high in the air!" - -# class MockOCRAgent: -# def detect(self, *args): -# return mock_text - -# monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent}) -# monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True) - -# image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB") -# text_block = layout.TextRegion(1, 2, 3, 4, text=None) - -# assert elements.ocr(text_block, image=image) == mock_text - - -# def test_ocr_with_error(monkeypatch): -# class MockOCRAgent: -# def detect(self, *args): -# # We sometimes get this error on very small images -# raise tesseract.TesseractError(-8, "Estimating resolution as 1023") - -# monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent}) -# monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True) - -# image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB") -# text_block = layout.TextRegion(1, 2, 3, 4, text=None) - -# assert elements.ocr(text_block, image=image) == "" - -# TODO(yuming): move source test -# def test_ocr_source(): -# file = "sample-docs/loremipsum-flat.pdf" -# model = get_model("yolox_tiny") -# doc = layout.DocumentLayout.from_file( -# file, -# model, -# ocr_mode=OCRMode.FULL_PAGE.value, -# supplement_with_ocr_elements=True, -# ocr_strategy="force", -# ) -# assert Source.OCR_TESSERACT in {e.source for e in doc.pages[0].elements} - - class MockLayoutModel: def __init__(self, layout): self.layout_return = layout @@ -162,26 +114,6 @@ def test_get_page_elements(monkeypatch, mock_final_layout): assert elements == page.elements -# def test_get_page_elements_with_tesseract_error(monkeypatch, mock_final_layout): -# def mock_image_to_data(*args, **kwargs): -# raise tesseract.TesseractError(-2, "Estimating resolution as 1023") - -# monkeypatch.setattr(layout.pytesseract, "image_to_data", mock_image_to_data) - -# image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") -# page = layout.PageLayout( -# number=0, -# image=image, -# layout=mock_final_layout, -# detection_model=MockLayoutModel(mock_final_layout), -# ) - -# elements = page.get_elements_with_detection_model(inplace=False) - -# assert str(elements[0]) == "A Catchy Title" -# assert 
str(elements[1]).startswith("A very repetitive narrative.") - - class MockPool: def map(self, f, xs): return [f(x) for x in xs] @@ -193,102 +125,6 @@ def join(self): pass -# @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") -# def test_get_page_elements_with_paddle_ocr(monkeypatch): -# monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle") -# text_block = layout.TextRegion(2, 4, 6, 8, text=None) -# image_block = layout.ImageTextRegion(8, 14, 16, 18) -# doc_initial_layout = [text_block, image_block] -# text_layoutelement = layoutelement.LayoutElement( -# 2, -# 4, -# 6, -# 8, -# text=None, -# type="UncategorizedText", -# ) -# image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") -# doc_final_layout = [text_layoutelement, image_layoutelement] - -# monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) -# monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") - -# image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") -# page = layout.PageLayout( -# number=0, -# image=image, -# layout=doc_initial_layout, -# detection_model=MockLayoutModel(doc_final_layout), -# # Note(yuming): there are differnt language codes for same language -# # between paddle and tesseract -# ocr_languages="en", -# ) -# page.get_elements_with_detection_model() - -# assert str(page) == "\n\nAn Even Catchier Title" - - -# def test_get_page_elements_with_tesseract_ocr(monkeypatch): -# monkeypatch.setenv("ENTIRE_PAGE_OCR", "tesseract") -# text_block = layout.TextRegion(2, 4, 6, 8, text=None) -# image_block = layout.ImageTextRegion(8, 14, 16, 18) -# doc_initial_layout = [text_block, image_block] -# text_layoutelement = layoutelement.LayoutElement( -# 2, -# 4, -# 6, -# 8, -# text=None, -# type="UncategorizedText", -# ) -# image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") -# doc_final_layout = [text_layoutelement, image_layoutelement] - -# monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) -# monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") - -# image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") -# page = layout.PageLayout( -# number=0, -# image=image, -# layout=doc_initial_layout, -# detection_model=MockLayoutModel(doc_final_layout), -# ) -# page.get_elements_with_detection_model() - -# assert str(page) == "\n\nAn Even Catchier Title" - - -# def test_get_page_elements_with_ocr_invalid_entrie_page_ocr(monkeypatch): -# monkeypatch.setenv("ENTIRE_PAGE_OCR", "invalid_entire_page_ocr") -# text_block = layout.TextRegion(2, 4, 6, 8, text=None) -# image_block = layout.ImageTextRegion(8, 14, 16, 18) -# doc_initial_layout = [text_block, image_block] -# text_layoutelement = layoutelement.LayoutElement( -# 2, -# 4, -# 6, -# 8, -# text=None, -# type="UncategorizedText", -# ) -# image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image") -# doc_final_layout = [text_layoutelement, image_layoutelement] - -# monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) -# monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title") - -# image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB") -# page = layout.PageLayout( -# number=0, -# image=image, -# layout=doc_initial_layout, -# detection_model=MockLayoutModel(doc_final_layout), -# ) -# with pytest.raises(ValueError): 
-# page.get_elements_with_detection_model() - - def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout, mock_image): with tempfile.TemporaryDirectory() as tmpdir: image_path1 = os.path.join(tmpdir, "mock1.jpg") @@ -375,10 +211,9 @@ def tolist(self): class MockEmbeddedTextRegion(layout.EmbeddedTextRegion): - def __init__(self, type=None, text=None, ocr_text=None): + def __init__(self, type=None, text=None): self.type = type self.text = text - self.ocr_text = ocr_text @property def points(self): @@ -392,21 +227,14 @@ def __init__( image=None, layout=None, model=None, - ocr_strategy="auto", - ocr_languages="eng", extract_tables=False, ): self.image = image self.layout = layout self.model = model - self.ocr_strategy = ocr_strategy - self.ocr_languages = ocr_languages self.extract_tables = extract_tables self.number = number - def ocr(self, text_block: MockEmbeddedTextRegion): - return text_block.ocr_text - @pytest.mark.parametrize( ("text", "expected"), @@ -444,26 +272,6 @@ def filter_by(self, *args, **kwargs): return MockLayout() -# @pytest.mark.parametrize( -# ("block_text", "layout_texts", "expected_text"), -# [ -# ("no ocr", ["pieced", "together", "group"], "no ocr"), -# (None, ["pieced", "together", "group"], "pieced together group"), -# ], -# ) -# def test_get_element_from_block(block_text, layout_texts, mock_image, expected_text): -# with patch("unstructured_inference.inference.elements.ocr", return_value="ocr"): -# block = layout.TextRegion(0, 0, 10, 10, text=block_text) -# captured_layout = [ -# layout.TextRegion(i + 1, i + 1, i + 2, i + 2, text=text) -# for i, text in enumerate(layout_texts) -# ] -# assert ( -# layout.get_element_from_block(block, mock_image, -# captured_layout).text == expected_text -# ) - - def test_get_elements_from_block_raises(): with pytest.raises(ValueError): block = layout.TextRegion(0, 0, 10, 10, text=None) @@ -577,11 +385,6 @@ def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method) getattr(layout.PageLayout, not_called_method).assert_not_called() -# def test_invalid_ocr_strategy_raises(mock_image): -# with pytest.raises(ValueError): -# layout.PageLayout(0, mock_image, MockLayout(), ocr_strategy="fake_strategy") - - @pytest.mark.parametrize( ("text", "expected"), [("a\ts\x0cd\nfas\fd\rf\b", "asdfasdf"), ("\"'\\", "\"'\\")], @@ -605,93 +408,6 @@ def test_remove_control_characters(text, expected): unpopulated_text_region = layout.EmbeddedTextRegion(50, 50, 60, 60, text=None) -# @pytest.mark.parametrize( -# ("region", "objects", "ocr_strategy", "expected"), -# [ -# (no_text_region, [nonoverlapping_rect], "auto", False), -# (no_text_region, [overlapping_rect], "auto", True), -# (no_text_region, [], "auto", False), -# (no_text_region, [populated_text_region, nonoverlapping_rect], "auto", False), -# (no_text_region, [populated_text_region, overlapping_rect], "auto", False), -# (no_text_region, [populated_text_region], "auto", False), -# (no_text_region, [unpopulated_text_region, nonoverlapping_rect], "auto", False), -# (no_text_region, [unpopulated_text_region, overlapping_rect], "auto", True), -# (no_text_region, [unpopulated_text_region], "auto", False), -# *list( -# product( -# [text_region], -# [ -# [], -# [populated_text_region], -# [unpopulated_text_region], -# [nonoverlapping_rect], -# [overlapping_rect], -# [populated_text_region, nonoverlapping_rect], -# [populated_text_region, overlapping_rect], -# [unpopulated_text_region, nonoverlapping_rect], -# [unpopulated_text_region, overlapping_rect], -# ], -# ["auto"], -# 
[False], -# ), -# ), -# *list( -# product( -# [cid_text_region], -# [ -# [], -# [populated_text_region], -# [unpopulated_text_region], -# [overlapping_rect], -# [populated_text_region, overlapping_rect], -# [unpopulated_text_region, overlapping_rect], -# ], -# ["auto"], -# [True], -# ), -# ), -# *list( -# product( -# [no_text_region, text_region, cid_text_region], -# [ -# [], -# [populated_text_region], -# [unpopulated_text_region], -# [nonoverlapping_rect], -# [overlapping_rect], -# [populated_text_region, nonoverlapping_rect], -# [populated_text_region, overlapping_rect], -# [unpopulated_text_region, nonoverlapping_rect], -# [unpopulated_text_region, overlapping_rect], -# ], -# ["force"], -# [True], -# ), -# ), -# *list( -# product( -# [no_text_region, text_region, cid_text_region], -# [ -# [], -# [populated_text_region], -# [unpopulated_text_region], -# [nonoverlapping_rect], -# [overlapping_rect], -# [populated_text_region, nonoverlapping_rect], -# [populated_text_region, overlapping_rect], -# [unpopulated_text_region, nonoverlapping_rect], -# [unpopulated_text_region, overlapping_rect], -# ], -# ["never"], -# [False], -# ), -# ), -# ], -# ) -# def test_ocr_image(region, objects, ocr_strategy, expected): -# assert elements.needs_ocr(region, objects, ocr_strategy) is expected - - @pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"]) def test_load_pdf(filename): layouts, images = layout.load_pdf(f"sample-docs/{filename}") @@ -728,7 +444,7 @@ def test_load_pdf_raises_with_path_only_no_output_folder(): @pytest.mark.skip("Temporarily removed multicolumn to fix ordering") -def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-thinking.pdf"): +def test_load_pdf_with_multicolumn_layout(filename="sample-docs/design-thinking.pdf"): layouts, images = layout.load_pdf(filename) doc = layout.process_file_with_model(filename=filename, model_name=None) test_snippets = ["Key to design thinking", "Design thinking also", "But in recent years"] @@ -787,35 +503,12 @@ def check_annotated_image(): check_annotated_image() -# def test_textregion_returns_empty_ocr_never(mock_image): -# tr = elements.TextRegion(0, 0, 24, 24) -# assert tr.extract_text(objects=None, image=mock_image, ocr_strategy="never") == "" - - @pytest.mark.parametrize(("text", "expected"), [("asdf", "asdf"), (None, "")]) def test_embedded_text_region(text, expected): etr = elements.EmbeddedTextRegion(0, 0, 24, 24, text=text) assert etr.extract_text(objects=None) == expected -# @pytest.mark.parametrize( -# ("text", "ocr_strategy", "expected"), -# [ -# (None, "never", ""), -# (None, "always", "asdf"), -# ("i have text", "never", "i have text"), -# ("i have text", "always", "i have text"), -# ], -# ) -# def test_image_text_region(text, ocr_strategy, expected, mock_image): -# itr = elements.ImageTextRegion(0, 0, 24, 24, text=text) -# with patch.object(elements, "ocr", return_value="asdf"): -# assert ( -# itr.extract_text(objects=None, image=moc -# k_image, ocr_strategy=ocr_strategy) == expected -# ) - - class MockDetectionModel(layout.UnstructuredObjectDetectionModel): def initialize(self, *args, **kwargs): pass @@ -974,9 +667,6 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m "asdf", detection_model=detection_model, element_extraction_model=element_extraction_model, - # ocr_strategy="auto", - # ocr_languages="eng", - # ocr_mode=OCRMode.FULL_PAGE.value, fixed_layouts=None, extract_tables=False, pdf_image_dpi=200, @@ -990,18 +680,6 @@ def 
test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch): assert mock_from_image.call_args[0][0].height == expected -# dpi check moved to unst -# def test_warning_if_chipper_and_low_dpi(caplog): -# with patch.object(layout.DocumentLayout, "from_file") as mock_from_file, patch.object( -# chipper.UnstructuredChipperModel, -# "initialize", -# ): -# layout.process_file_with_model("asdf", model_name="chipper", pdf_image_dpi=299) -# mock_from_file.assert_called_once() -# assert caplog.records[0].levelname == "WARNING" -# assert "DPI >= 300" in caplog.records[0].msg - - @pytest.mark.parametrize( ("filename", "img_num", "should_complete"), [("sample-docs/empty-document.pdf", 0, True), ("sample-docs/empty-document.pdf", 10, False)], diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 5285f73a..c037b4ad 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -3,108 +3,10 @@ from layoutparser.elements.layout_elements import Rectangle as LPRectangle from unstructured_inference.constants import Source - -# from unstructured_inference.inference.elements import TextRegion from unstructured_inference.inference.layoutelement import ( LayoutElement, ) -# def test_aggregate_ocr_text_by_block(): -# expected = "A Unified Toolkit" -# ocr_layout = [ -# TextRegion(0, 0, 20, 20, source="OCR", text="A"), -# TextRegion(50, 50, 150, 150, source="OCR", text="Unified"), -# TextRegion(150, 150, 300, 250, source="OCR", text="Toolkit"), -# TextRegion(200, 250, 300, 350, source="OCR", text="Deep"), -# ] -# region = TextRegion(0, 0, 250, 350, text="") - -# text = aggregate_ocr_text_by_block(ocr_layout, region, 0.5) -# assert text == expected - - -# def test_merge_text_regions(mock_embedded_text_regions): -# expected = TextRegion( -# x1=437.83888888888885, -# y1=317.319341111111, -# x2=1256.334784222222, -# y2=406.9837855555556, -# text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image", -# ) - -# merged_text_region = merge_text_regions(mock_embedded_text_regions) -# assert merged_text_region == expected - - -# def test_get_elements_from_ocr_regions(mock_embedded_text_regions): -# expected = [ -# LayoutElement( -# x1=437.83888888888885, -# y1=317.319341111111, -# x2=1256.334784222222, -# y2=406.9837855555556, -# text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image", -# type="UncategorizedText", -# ), -# ] - -# elements = get_elements_from_ocr_regions(mock_embedded_text_regions) -# assert elements == expected - - -# def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): -# ocr_elements = [ -# LayoutElement( -# r.x1, -# r.y1, -# r.x2, -# r.y2, -# text=r.text, -# source=None, -# type="UncategorizedText", -# ) -# for r in mock_ocr_regions -# ] - -# final_layout = supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions) - -# # Check if the final layout contains the original layout elements -# for element in mock_layout: -# assert element in final_layout - -# # Check if the final layout contains the OCR-derived elements -# assert any(ocr_element in final_layout for ocr_element in ocr_elements) - -# # Check if the OCR-derived elements that are subregions of layout elements are removed -# for element in mock_layout: -# for ocr_element in ocr_elements: -# if ocr_element.is_almost_subregion_of(element, SUBREGION_THRESHOLD_FOR_OCR): -# assert ocr_element not in final_layout - 
- -# def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions): -# ocr_elements = [ -# LayoutElement( -# r.x1, -# r.y1, -# r.x2, -# r.y2, -# text=r.text, -# source=None, -# type="UncategorizedText", -# ) -# for r in mock_ocr_regions -# ] - -# final_layout = merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions) - -# # Check if the inferred layout's text attribute is updated with aggregated OCR text -# assert final_layout[0].text == mock_ocr_regions[2].text - -# # Check if the final layout contains both original elements and OCR-derived elements -# assert all(element in final_layout for element in mock_inferred_layout) -# assert any(element in final_layout for element in ocr_elements) - @pytest.mark.parametrize("is_table", [False, True]) def test_layout_element_extract_text( diff --git a/test_unstructured_inference/models/test_model.py b/test_unstructured_inference/models/test_model.py index 8f158e26..e05fbf5b 100644 --- a/test_unstructured_inference/models/test_model.py +++ b/test_unstructured_inference/models/test_model.py @@ -87,8 +87,6 @@ def test_deduplicate_detected_elements(): doc = DocumentLayout.from_image_file( file, model, - # ocr_strategy="never", - # supplement_with_ocr_elements=False, ) known_elements = [e for e in doc.pages[0].elements if e.type != "UncategorizedText"] # Compute intersection matrix diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 702f19b4..1a68fa57 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -1,11 +1,9 @@ -# import logging import os from random import randint from unittest.mock import PropertyMock, patch import pytest -# from PIL import Image from unstructured_inference.inference import elements from unstructured_inference.inference.layoutelement import ( LocationlessLayoutElement, @@ -195,19 +193,6 @@ def test_intersection_over_min( ) -# @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") -# def test_ocr_paddle(monkeypatch, caplog): -# monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle") -# image = Image.new("RGB", (100, 100), (255, 255, 255)) -# text_block = elements.TextRegion(0, 0, 50, 50) -# # Note(yuming): paddle result is currently non-deterministic on ci -# # so don't check result like `assert result == ""` -# # use logger info to confirm we are using paddle instead -# with caplog.at_level(logging.INFO): -# _ = elements.ocr(text_block, image, languages="en") -# assert "paddle" in caplog.text - - def test_grow_region_to_match_region(): from unstructured_inference.inference.elements import Rectangle, grow_region_to_match_region diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py index 572a1ff3..436e538e 100644 --- a/unstructured_inference/constants.py +++ b/unstructured_inference/constants.py @@ -1,11 +1,6 @@ from enum import Enum -# class OCRMode(Enum): -# INDIVIDUAL_BLOCKS = "individual_blocks" -# FULL_PAGE = "entire_page" - - class AnnotationResult(Enum): IMAGE = "image" PLOT = "plot" @@ -15,11 +10,8 @@ class Source(Enum): YOLOX = "yolox" DETECTRON2_ONNX = "detectron2_onnx" DETECTRON2_LP = "detectron2_lp" - # OCR_TESSERACT = "OCR-tesseract" - # OCR_PADDLE = "OCR-paddle" PDFMINER = "pdfminer" MERGED = "merged" -# SUBREGION_THRESHOLD_FOR_OCR = 0.5 FULL_PAGE_REGION_THRESHOLD = 0.99 diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index b8d644bc..262431fe 
diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
index b8d644bc..262431fe 100644
--- a/unstructured_inference/inference/elements.py
+++ b/unstructured_inference/inference/elements.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-# import os
 import re
 import unicodedata
 from copy import deepcopy
@@ -13,12 +12,8 @@
 
 from unstructured_inference.config import inference_config
 from unstructured_inference.constants import Source
-
-# from unstructured_inference.logger import logger
 from unstructured_inference.math import safe_division
 
-# from unstructured_inference.models import tesseract
-
 
 @dataclass
 class Rectangle:
@@ -210,19 +205,14 @@ def extract_text(
         objects: Optional[Collection[TextRegion]],
         image: Optional[Image.Image] = None,
         extract_tables: bool = False,
-        # ocr_strategy: str = "auto",
-        # ocr_languages: str = "eng",
     ) -> str:
         """Extracts text contained in region."""
         if self.text is not None:
             # If block text is already populated, we'll assume it's correct
             text = self.text
         elif objects is not None:
-            # text = aggregate_by_block(self, image, objects, ocr_strategy)
             text = aggregate_by_block(self, image, objects)
         elif image is not None:
-            # We don't have anything to go on but the image itself, so we use OCR
-            # text = ocr(self, image, languages=ocr_languages) if ocr_strategy != "never" else ""
             text = ""
         else:
             raise ValueError(
@@ -238,8 +228,6 @@ def extract_text(
         objects: Optional[Collection[TextRegion]],
         image: Optional[Image.Image] = None,
         extract_tables: bool = False,
-        # ocr_strategy: str = "auto",
-        # ocr_languages: str = "eng",
     ) -> str:
         """Extracts text contained in region."""
         if self.text is None:
@@ -254,89 +242,18 @@ def extract_text(
         objects: Optional[Collection[TextRegion]],
         image: Optional[Image.Image] = None,
         extract_tables: bool = False,
-        # ocr_strategy: str = "auto",
-        # ocr_languages: str = "eng",
     ) -> str:
         """Extracts text contained in region."""
         if self.text is None:
-            # if ocr_strategy == "never" or image is None:
-            #     return ""
-            # else:
-            #     return ocr(self, image, languages=ocr_languages)
             return ""
         else:
-            # return super().extract_text(objects, image, extract_tables, ocr_strategy)
             return super().extract_text(objects, image, extract_tables)
 
 
-# move to unst for individual_blocks mode
-# def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> str:
-#     """Runs a cropped text block image through an OCR agent."""
-#     logger.debug("Running OCR on text block ...")
-#     tesseract.load_agent(languages=languages)
-#     padded_block = text_block.pad(12)
-#     cropped_image = image.crop(
-#         (padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2))
-#     entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower()
-#     if entrie_page_ocr == "paddle":
-#         from unstructured_inference.models import paddle_ocr
-
-#         paddle_result = paddle_ocr.load_agent().ocr(np.array(cropped_image), cls=True)
-#         recognized_text = ""
-#         for idx in range(len(paddle_result)):
-#             res = paddle_result[idx]
-#             for line in res:
-#                 recognized_text += line[1][0]
-#         return recognized_text
-#     else:
-#         agent = tesseract.ocr_agents.get(languages)
-#         if agent is None:
-#             raise RuntimeError("OCR agent is not loaded for {languages}.")
-
-#         try:
-#             return agent.detect(cropped_image)
-#         except tesseract.TesseractError:
-#             logger.warning("TesseractError: Skipping region", exc_info=True)
-#             return ""
-
== "force": -# return True -# elif ocr_strategy == "auto": -# image_objects = [obj for obj in pdf_objects if isinstance(obj, ImageTextRegion)] -# word_objects = [obj for obj in pdf_objects if isinstance(obj, EmbeddedTextRegion)] -# # If any image object overlaps with the region of interest, we have hope of getting some -# # text from OCR. Otherwise, there's nothing there to find, no need to waste our time with -# # OCR. -# image_intersects = any(region.intersects(img_obj) for img_obj in image_objects) -# if region.text is None: -# # If the region has no text check if any images overlap with the region that might -# # contain text. -# if any(obj.is_in(region) and obj.text is not None for obj in word_objects): -# # If there are word objects in the region, we defer to that rather than OCR -# return False -# else: -# return image_intersects -# else: -# # If the region has text, we should only have to OCR if too much of the text is -# # uninterpretable. -# return cid_ratio(region.text) > 0.5 -# else: -# return False - - def aggregate_by_block( text_region: TextRegion, image: Optional[Image.Image], pdf_objects: Collection[TextRegion], - # ocr_strategy: str = "auto", - # ocr_languages: str = "eng", ) -> str: """Extracts the text aggregated from the elements of the given layout that lie within the given block.""" @@ -345,17 +262,6 @@ def aggregate_by_block( text = remove_control_characters(text) return text - # if image is not None and needs_ocr(text_region, pdf_objects, ocr_strategy): - # text = ocr(text_region, image, languages=ocr_languages) - # else: - # filtered_blocks = [obj for obj in pdf_objects if obj.is_in(text_region, error_margin=5)] - # for little_block in filtered_blocks: - # if image is not None and needs_ocr(little_block, pdf_objects, ocr_strategy): - # little_block.text = ocr(little_block, image, languages=ocr_languages) - # text = " ".join([x.text for x in filtered_blocks if x.text]) - # text = remove_control_characters(text) - # return text - def cid_ratio(text: str) -> float: """Gets ratio of unknown 'cid' characters extracted from text to all characters.""" diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 39c5c950..bc21fafd 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -7,8 +7,6 @@ import numpy as np import pdf2image - -# import pytesseract from pdfminer import psparser from pdfminer.high_level import extract_pages from PIL import Image, ImageSequence @@ -25,9 +23,6 @@ LocationlessLayoutElement, merge_inferred_layout_with_extracted_layout, ) - -# move to unst -# merge_inferred_layout_with_ocr_layout, from unstructured_inference.inference.ordering import order_layout from unstructured_inference.inference.pdf import get_images_from_pdf_element from unstructured_inference.logger import logger @@ -49,12 +44,6 @@ import pdfplumber # noqa -# VALID_OCR_STRATEGIES = ( -# "auto", # Use OCR when it looks like other methods have failed -# "force", # Always use OCR -# "never", # Never use OCR -# ) - class DocumentLayout: """Class for handling documents that are saved as .pdf files. 
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index 39c5c950..bc21fafd 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -7,8 +7,6 @@
 
 import numpy as np
 import pdf2image
-
-# import pytesseract
 from pdfminer import psparser
 from pdfminer.high_level import extract_pages
 from PIL import Image, ImageSequence
@@ -25,9 +23,6 @@
     LocationlessLayoutElement,
     merge_inferred_layout_with_extracted_layout,
 )
-
-# move to unst
-# merge_inferred_layout_with_ocr_layout,
 from unstructured_inference.inference.ordering import order_layout
 from unstructured_inference.inference.pdf import get_images_from_pdf_element
 from unstructured_inference.logger import logger
@@ -49,12 +44,6 @@
     import pdfplumber  # noqa
 
 
-# VALID_OCR_STRATEGIES = (
-#     "auto",  # Use OCR when it looks like other methods have failed
-#     "force",  # Always use OCR
-#     "never",  # Never use OCR
-# )
-
 
 class DocumentLayout:
     """Class for handling documents that are saved as .pdf files. For .pdf files, a
@@ -86,9 +75,6 @@ def from_file(
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
        element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
         fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
-        # ocr_strategy: str = "auto",
-        # ocr_languages: str = "eng",
-        # ocr_mode: str = OCRMode.FULL_PAGE.value,
         extract_tables: bool = False,
         pdf_image_dpi: int = 200,
         **kwargs,
@@ -126,9 +112,6 @@ def from_file(
                 detection_model=detection_model,
                 element_extraction_model=element_extraction_model,
                 layout=layout,
-                # ocr_strategy=ocr_strategy,
-                # ocr_languages=ocr_languages,
-                # ocr_mode=ocr_mode,
                 fixed_layout=fixed_layout,
                 extract_tables=extract_tables,
                 **kwargs,
@@ -142,9 +125,6 @@ def from_image_file(
         filename: str,
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
-        # ocr_strategy: str = "auto",
-        # ocr_languages: str = "eng",
-        # ocr_mode: str = OCRMode.FULL_PAGE.value,
         fixed_layout: Optional[List[TextRegion]] = None,
         extract_tables: bool = False,
         **kwargs,
@@ -173,9 +153,6 @@ def from_image_file(
             detection_model=detection_model,
             element_extraction_model=element_extraction_model,
             layout=None,
-            # ocr_strategy=ocr_strategy,
-            # ocr_languages=ocr_languages,
-            # ocr_mode=ocr_mode,
             fixed_layout=fixed_layout,
             extract_tables=extract_tables,
             **kwargs,
@@ -197,12 +174,8 @@ def __init__(
         document_filename: Optional[Union[str, PurePath]] = None,
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
-        # ocr_strategy: str = "auto",
-        # ocr_languages: str = "eng",
-        # ocr_mode: str = OCRMode.FULL_PAGE.value,
         extract_tables: bool = False,
         analysis: bool = False,
-        # supplement_with_ocr_elements: bool = True,
     ):
         if detection_model is not None and element_extraction_model is not None:
             raise ValueError("Only one of detection_model and extraction_model should be passed.")
@@ -218,16 +191,9 @@
         self.detection_model = detection_model
         self.element_extraction_model = element_extraction_model
         self.elements: Collection[Union[LayoutElement, LocationlessLayoutElement]] = []
-        # if ocr_strategy not in VALID_OCR_STRATEGIES:
-        #     raise ValueError(f"ocr_strategy must be one of {VALID_OCR_STRATEGIES}.")
-        # self.ocr_strategy = ocr_strategy
-        # self.ocr_languages = ocr_languages
-        # self.ocr_mode = ocr_mode
         self.extract_tables = extract_tables
         self.analysis = analysis
         self.inferred_layout: Optional[List[LayoutElement]] = None
-        # self.ocr_layout: Optional[List[TextRegion]] = None
-        # self.supplement_with_ocr_elements = supplement_with_ocr_elements
 
     def __str__(self) -> str:
         return "\n\n".join([str(element) for element in self.elements])
@@ -264,41 +230,6 @@ def get_elements_with_detection_model(
         # remote call in the future.
         inferred_layout: List[LayoutElement] = self.detection_model(self.image)
 
-        # move to unst
-        # if self.ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
-        #     ocr_layout = None
-        # elif self.ocr_mode == OCRMode.FULL_PAGE.value:
-        #     ocr_layout = None
-        #     entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower()
-        #     if entrie_page_ocr not in ["paddle", "tesseract"]:
-        #         raise ValueError(
-        #             "Environment variable ENTIRE_PAGE_OCR
-        #             must be set to 'tesseract' or 'paddle'.",
-        #         )
-
-        #     if entrie_page_ocr == "paddle":
-        #         logger.info("Processing entrie page OCR with paddle...")
-        #         from unstructured_inference.models import paddle_ocr
-
-        #         # TODO(yuming): paddle only support one language at once,
-        #         # change ocr to tesseract if passed in multilanguages.
-        #         ocr_data = paddle_ocr.load_agent(language=self.ocr_languages).ocr(
-        #             np.array(self.image),
-        #             cls=True,
-        #         )
-        #         ocr_layout = parse_ocr_data_paddle(ocr_data)
-        #     else:
-        #         logger.info("Processing entrie page OCR with tesseract...")
-        #         try:
-        #             ocr_data = pytesseract.image_to_data(
-        #                 self.image,
-        #                 lang=self.ocr_languages,
-        #                 output_type=Output.DICT,
-        #             )
-        #             ocr_layout = parse_ocr_data_tesseract(ocr_data)
-        #         except pytesseract.pytesseract.TesseractError:
-        #             logger.warning("TesseractError: Skipping page", exc_info=True)
-
         if self.layout is not None:
             threshold_kwargs = {}
             # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn
@@ -312,26 +243,9 @@ def get_elements_with_detection_model(
                 inferred_layout=inferred_layout,
                 extracted_layout=self.layout,
                 page_image_size=self.image.size,
-                # ocr_layout=ocr_layout,
-                # supplement_with_ocr_elements=self.supplement_with_ocr_elements,
                 **threshold_kwargs,
             )
-        # move to unst
-        # elif ocr_layout is not None:
-        #     threshold_kwargs = {}
-        #     # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn
-        #     # In other case the default values for the functions are used
-        #     if (
-        #         isinstance(self.detection_model, UnstructuredDetectronONNXModel)
-        #         and "R_50" not in self.detection_model.model_path
-        #     ):
-        #         threshold_kwargs = {"subregion_threshold": 0.3}
-        #     merged_layout = merge_inferred_layout_with_ocr_layout(
-        #         inferred_layout=inferred_layout,
-        #         ocr_layout=ocr_layout,
-        #         supplement_with_ocr_elements=self.supplement_with_ocr_elements,
-        #         **threshold_kwargs,
-        #     )
+
         else:
             merged_layout = inferred_layout
 
@@ -339,7 +253,6 @@
 
         if self.analysis:
             self.inferred_layout = inferred_layout
-            # self.ocr_layout = ocr_layout
 
         if inplace:
             self.elements = elements
@@ -349,15 +262,13 @@
 
     def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutElement]:
         """Uses the given Layout to separate the page text into elements, either extracting the
-        text from the discovered layout blocks or from the image using OCR."""
+        text from the discovered layout blocks."""
         layout = order_layout(layout)
         elements = [
             get_element_from_block(
                 block=e,
                 image=self.image,
                 pdf_objects=self.layout,
-                # ocr_strategy=self.ocr_strategy,
-                # ocr_languages=self.ocr_languages,
                 extract_tables=self.extract_tables,
             )
             for e in layout
@@ -485,12 +396,8 @@ def from_image(
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
         layout: Optional[List[TextRegion]] = None,
-        # ocr_strategy: str = "auto",
-        # ocr_languages: str = "eng",
-        # ocr_mode: str = OCRMode.FULL_PAGE.value,
         extract_tables: bool = False,
         fixed_layout: Optional[List[TextRegion]] = None,
-        # supplement_with_ocr_elements: bool = True,
         extract_images_in_pdf: bool = False,
         image_output_dir_path: Optional[str] = None,
         analysis: bool = False,
@@ -503,12 +410,8 @@ def from_image(
             layout=layout,
             detection_model=detection_model,
             element_extraction_model=element_extraction_model,
-            # ocr_strategy=ocr_strategy,
-            # ocr_languages=ocr_languages,
-            # ocr_mode=ocr_mode,
             extract_tables=extract_tables,
             analysis=analysis,
-            # supplement_with_ocr_elements=supplement_with_ocr_elements,
         )
         if page.element_extraction_model is not None:
             page.get_elements_using_image_extraction()
@@ -539,9 +442,6 @@ def process_data_with_model(
     data: BinaryIO,
     model_name: Optional[str],
     is_image: bool = False,
-    # ocr_strategy: str = "auto",
-    # ocr_languages: str = "eng",
-    # ocr_mode: str = OCRMode.FULL_PAGE.value,
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     extract_tables: bool = False,
     pdf_image_dpi: int = 200,
@@ -556,9 +456,6 @@ def process_data_with_model(
         tmp_file.name,
         model_name,
         is_image=is_image,
-        # ocr_strategy=ocr_strategy,
-        # ocr_languages=ocr_languages,
-        # ocr_mode=ocr_mode,
         fixed_layouts=fixed_layouts,
         extract_tables=extract_tables,
         pdf_image_dpi=pdf_image_dpi,
@@ -572,9 +469,6 @@ def process_file_with_model(
     filename: str,
     model_name: Optional[str],
     is_image: bool = False,
-    # ocr_strategy: str = "auto",
-    # ocr_languages: str = "eng",
-    # ocr_mode: str = OCRMode.FULL_PAGE.value,
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     extract_tables: bool = False,
     pdf_image_dpi: int = 200,
@@ -583,14 +477,6 @@ def process_file_with_model(
     """Processes pdf file with name filename into a DocumentLayout by using a model identified by
     model_name."""
 
-    # if pdf_image_dpi is None:
-    #     pdf_image_dpi = 300 if model_name == "chipper" else 200
-    # if (pdf_image_dpi < 300) and (model_name == "chipper"):
-    #     logger.warning(
-    #         "The Chipper model performs better when images are rendered with DPI >= 300 "
-    #         f"(currently {pdf_image_dpi}).",
-    #     )
-
     model = get_model(model_name)
     if isinstance(model, UnstructuredObjectDetectionModel):
         detection_model = model
@@ -605,9 +491,6 @@ def process_file_with_model(
             filename,
             detection_model=detection_model,
             element_extraction_model=element_extraction_model,
-            # ocr_strategy=ocr_strategy,
-            # ocr_languages=ocr_languages,
-            # ocr_mode=ocr_mode,
             extract_tables=extract_tables,
             **kwargs,
         )
@@ -616,9 +499,6 @@ def process_file_with_model(
             filename,
             detection_model=detection_model,
             element_extraction_model=element_extraction_model,
-            # ocr_strategy=ocr_strategy,
-            # ocr_languages=ocr_languages,
-            # ocr_mode=ocr_mode,
             fixed_layouts=fixed_layouts,
             extract_tables=extract_tables,
             pdf_image_dpi=pdf_image_dpi,
@@ -632,8 +512,6 @@ def get_element_from_block(
     block: TextRegion,
     image: Optional[Image.Image] = None,
     pdf_objects: Optional[List[TextRegion]] = None,
-    # ocr_strategy: str = "auto",
-    # ocr_languages: str = "eng",
     extract_tables: bool = False,
 ) -> LayoutElement:
     """Creates a LayoutElement from a given layout or image by finding all the text that lies within
     element = block.extract_text(
         objects=pdf_objects,
         image=image,
         extract_tables=extract_tables,
-        # ocr_strategy=ocr_strategy,
-        # ocr_languages=ocr_languages,
     )
     return element
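For API consumers, the net effect of this file's changes is a smaller public surface. A hedged usage sketch of the entry point once the OCR keywords are dropped (the sample path is one already used by the tests in this series; model_name=None is assumed to select the default detection model):

    from unstructured_inference.inference.layout import process_file_with_model

    # Illustrative call shape only; no ocr_strategy / ocr_languages / ocr_mode.
    doc = process_file_with_model(
        "sample-docs/empty-document.pdf",  # placeholder document
        model_name=None,                   # assumed default model selection
        extract_tables=False,
        pdf_image_dpi=200,
    )
    print(doc.pages[0].elements)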
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
index b3a1ed31..b771509d 100644
--- a/unstructured_inference/inference/layoutelement.py
+++ b/unstructured_inference/inference/layoutelement.py
@@ -11,7 +11,6 @@
 from unstructured_inference.config import inference_config
 from unstructured_inference.constants import (
     FULL_PAGE_REGION_THRESHOLD,
-    # SUBREGION_THRESHOLD_FOR_OCR,
     Source,
 )
 from unstructured_inference.inference.elements import (
@@ -19,7 +18,6 @@
     Rectangle,
     TextRegion,
     grow_region_to_match_region,
-    # partition_groups_from_regions,
     region_bounding_boxes_are_almost_the_same,
 )
 
@@ -35,16 +33,12 @@ def extract_text(
         objects: Optional[Collection[TextRegion]],
         image: Optional[Image.Image] = None,
         extract_tables: bool = False,
-        # ocr_strategy: str = "auto",
-        # ocr_languages: str = "eng",
     ):
         """Extracts text contained in region"""
         text = super().extract_text(
             objects=objects,
             image=image,
             extract_tables=extract_tables,
-            # ocr_strategy=ocr_strategy,
-            # ocr_languages=ocr_languages,
         )
         if extract_tables and self.type == "Table":
             self.text_as_html = interpret_table_block(self, image)
@@ -97,8 +91,6 @@ def merge_inferred_layout_with_extracted_layout(
     inferred_layout: Collection[LayoutElement],
     extracted_layout: Collection[TextRegion],
     page_image_size: tuple,
-    # ocr_layout: Optional[List[TextRegion]] = None,
-    # supplement_with_ocr_elements: bool = True,
     same_region_threshold: float = inference_config.LAYOUT_SAME_REGION_THRESHOLD,
     subregion_threshold: float = inference_config.LAYOUT_SUBREGION_THRESHOLD,
 ) -> List[LayoutElement]:
@@ -185,177 +177,12 @@ def merge_inferred_layout_with_extracted_layout(
     inferred_regions_to_add = [
         region for region in inferred_layout if region not in inferred_regions_to_remove
     ]
-    # inferred_regions_to_add_without_text = [
-    #     region for region in inferred_regions_to_add if not region.text
-    # ]
-    # moved to unst
-    # if ocr_layout is not None:
-    #     for inferred_region in inferred_regions_to_add_without_text:
-    #         inferred_region.text = aggregate_ocr_text_by_block(
-    #             ocr_layout,
-    #             inferred_region,
-    #             SUBREGION_THRESHOLD_FOR_OCR,
-    #         )
-    #     out_layout = categorized_extracted_elements_to_add + inferred_regions_to_add
-    #     final_layout = (
-    #         supplement_layout_with_ocr_elements(out_layout, ocr_layout)
-    #         if supplement_with_ocr_elements
-    #         else out_layout
-    #     )
-    # else:
-    #     final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add
+    final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add
 
     return final_layout
 
 
-# move to unst
-# def merge_inferred_layout_with_ocr_layout(
-#     inferred_layout: List[LayoutElement],
-#     ocr_layout: List[TextRegion],
-#     supplement_with_ocr_elements: bool = True,
-# ) -> List[LayoutElement]:
-#     """
-#     Merge the inferred layout with the OCR-detected text regions.
-
-#     This function iterates over each inferred layout element and aggregates the
-#     associated text from the OCR layout using the specified threshold. The inferred
-#     layout's text attribute is then updated with this aggregated text.
-# """ - -# for inferred_region in inferred_layout: -# inferred_region.text = aggregate_ocr_text_by_block( -# ocr_layout, -# inferred_region, -# SUBREGION_THRESHOLD_FOR_OCR, -# ) - -# final_layout = ( -# supplement_layout_with_ocr_elements(inferred_layout, ocr_layout) -# if supplement_with_ocr_elements -# else inferred_layout -# ) - -# return final_layout - -# move to unst -# def aggregate_ocr_text_by_block( -# ocr_layout: List[TextRegion], -# region: TextRegion, -# subregion_threshold: float, -# ) -> Optional[str]: -# """Extracts the text aggregated from the regions of the ocr layout that lie within the given -# block.""" - -# extracted_texts = [] - -# for ocr_region in ocr_layout: -# ocr_region_is_subregion_of_given_region = ocr_region.is_almost_subregion_of( -# region, -# subregion_threshold=subregion_threshold, -# ) -# if ocr_region_is_subregion_of_given_region and ocr_region.text: -# extracted_texts.append(ocr_region.text) - -# return " ".join(extracted_texts) if extracted_texts else None - -# move to unst -# def supplement_layout_with_ocr_elements( -# layout: List[LayoutElement], -# ocr_layout: List[TextRegion], -# ) -> List[LayoutElement]: -# """ -# Supplement the existing layout with additional OCR-derived elements. - -# This function takes two lists: one list of pre-existing layout elements (`layout`) -# and another list of OCR-detected text regions (`ocr_layout`). It identifies OCR regions -# that are subregions of the elements in the existing layout and removes them from the -# OCR-derived list. Then, it appends the remaining OCR-derived regions to the existing layout. - -# Parameters: -# - layout (List[LayoutElement]): A list of existing layout elements, each of which is -# an instance of `LayoutElement`. -# - ocr_layout (List[TextRegion]): A list of OCR-derived text regions, each of which is -# an instance of `TextRegion`. - -# Returns: -# - List[LayoutElement]: The final combined layout consisting of both the original layout -# elements and the new OCR-derived elements. - -# Note: -# - The function relies on `is_almost_subregion_of()` method to determine if an OCR region -# is a subregion of an existing layout element. -# - It also relies on `get_elements_from_ocr_regions()` to convert OCR -# regions to layout elements. -# - The `SUBREGION_THRESHOLD_FOR_OCR` constant is used to specify the subregion matching -# threshold. -# """ - -# ocr_regions_to_remove = [] -# for ocr_region in ocr_layout: -# for el in layout: -# ocr_region_is_subregion_of_out_el = ocr_region.is_almost_subregion_of( -# cast(Rectangle, el), -# SUBREGION_THRESHOLD_FOR_OCR, -# ) -# if ocr_region_is_subregion_of_out_el: -# ocr_regions_to_remove.append(ocr_region) -# break - -# ocr_regions_to_add = [region for region in ocr_layout if region not in ocr_regions_to_remove] -# if ocr_regions_to_add: -# ocr_elements_to_add = get_elements_from_ocr_regions(ocr_regions_to_add) -# final_layout = layout + ocr_elements_to_add -# else: -# final_layout = layout - -# return final_layout - -# move to unst -# def merge_text_regions(regions: List[TextRegion]) -> TextRegion: -# """ -# Merge a list of TextRegion objects into a single TextRegion. - -# Parameters: -# - group (List[TextRegion]): A list of TextRegion objects to be merged. - -# Returns: -# - TextRegion: A single merged TextRegion object. 
-# """ - -# min_x1 = min([tr.x1 for tr in regions]) -# min_y1 = min([tr.y1 for tr in regions]) -# max_x2 = max([tr.x2 for tr in regions]) -# max_y2 = max([tr.y2 for tr in regions]) - -# merged_text = " ".join([tr.text for tr in regions if tr.text]) - -# return TextRegion(min_x1, min_y1, max_x2, max_y2, merged_text) - -# move to unst -# def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: -# """ -# Get layout elements from OCR regions -# """ - -# grouped_regions = cast( -# List[List[TextRegion]], -# partition_groups_from_regions(ocr_regions), -# ) -# merged_regions = [merge_text_regions(group) for group in grouped_regions] -# return [ -# LayoutElement( -# r.x1, -# r.y1, -# r.x2, -# r.y2, -# text=r.text, -# type="UncategorizedText", -# ) -# for r in merged_regions -# ] - - def separate(region_a: Union[LayoutElement, Rectangle], region_b: Union[LayoutElement, Rectangle]): """Reduce leftmost rectangle to don't overlap with the other""" From 4768a8e90b04277e3c3b72f18493f11392566442 Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Tue, 3 Oct 2023 20:03:24 -0400 Subject: [PATCH 19/26] add deduplicate_detected_elements back --- unstructured_inference/inference/layout.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index bc21fafd..7e6811c6 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -229,6 +229,9 @@ def get_elements_with_detection_model( # NOTE(mrobinson) - We'll want make this model inference step some kind of # remote call in the future. inferred_layout: List[LayoutElement] = self.detection_model(self.image) + inferred_layout = UnstructuredObjectDetectionModel.deduplicate_detected_elements( + inferred_layout, + ) if self.layout is not None: threshold_kwargs = {} From 5f0bbffb716d7a7b4d0285adfb673a75cd4ff69e Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 4 Oct 2023 11:10:49 -0700 Subject: [PATCH 20/26] refactor: remove ocr layout visualization --- examples/layout_analysis/visualization.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/layout_analysis/visualization.py b/examples/layout_analysis/visualization.py index fe4b497e..221d5301 100644 --- a/examples/layout_analysis/visualization.py +++ b/examples/layout_analysis/visualization.py @@ -14,7 +14,6 @@ def run(f_path, scope): "final": None, "extracted": {"layout": {"color": "green", "width": 2}}, "inferred": {"inferred_layout": {"color": "blue", "width": 2}}, - "ocr": {"ocr_layout": {"color": "yellow", "width": 2}}, } f_basename = os.path.splitext(os.path.basename(f_path))[0] @@ -47,8 +46,7 @@ def run(f_path, scope): write_image(img, output_f_path) print(f"page_num: {idx+1} - n_total_elements: {len(page.elements)} - n_extracted_elements: " - f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)} - " - f"n_ocr_elements: {len(page.ocr_layout)}") + f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)}") if __name__ == '__main__': From 2b7a2fc93c31f614f9c58c997afb6ec5bebfc94a Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 4 Oct 2023 16:16:52 -0400 Subject: [PATCH 21/26] remove teseract module since table won't use it --- .../models/test_tesseract.py | 26 ------------ unstructured_inference/constants.py | 3 ++ unstructured_inference/models/tables.py | 6 +-- unstructured_inference/models/tesseract.py | 42 ------------------- 4 files changed, 6 
From 5f0bbffb716d7a7b4d0285adfb673a75cd4ff69e Mon Sep 17 00:00:00 2001
From: christinestraub
Date: Wed, 4 Oct 2023 11:10:49 -0700
Subject: [PATCH 20/26] refactor: remove ocr layout visualization

---
 examples/layout_analysis/visualization.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/layout_analysis/visualization.py b/examples/layout_analysis/visualization.py
index fe4b497e..221d5301 100644
--- a/examples/layout_analysis/visualization.py
+++ b/examples/layout_analysis/visualization.py
@@ -14,7 +14,6 @@ def run(f_path, scope):
         "final": None,
         "extracted": {"layout": {"color": "green", "width": 2}},
         "inferred": {"inferred_layout": {"color": "blue", "width": 2}},
-        "ocr": {"ocr_layout": {"color": "yellow", "width": 2}},
     }
 
     f_basename = os.path.splitext(os.path.basename(f_path))[0]
@@ -47,8 +46,7 @@ def run(f_path, scope):
             write_image(img, output_f_path)
 
         print(f"page_num: {idx+1} - n_total_elements: {len(page.elements)} - n_extracted_elements: "
-              f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)} - "
-              f"n_ocr_elements: {len(page.ocr_layout)}")
+              f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)}")
 
 
 if __name__ == '__main__':

From 2b7a2fc93c31f614f9c58c997afb6ec5bebfc94a Mon Sep 17 00:00:00 2001
From: yuming <305248291@qq.com>
Date: Wed, 4 Oct 2023 16:16:52 -0400
Subject: [PATCH 21/26] remove tesseract module since table won't use it

---
 .../models/test_tesseract.py               | 26 ------------
 unstructured_inference/constants.py        |  3 ++
 unstructured_inference/models/tables.py    |  6 +--
 unstructured_inference/models/tesseract.py | 42 -------------------
 4 files changed, 6 insertions(+), 71 deletions(-)
 delete mode 100644 test_unstructured_inference/models/test_tesseract.py
 delete mode 100644 unstructured_inference/models/tesseract.py

diff --git a/test_unstructured_inference/models/test_tesseract.py b/test_unstructured_inference/models/test_tesseract.py
deleted file mode 100644
index 475cba08..00000000
--- a/test_unstructured_inference/models/test_tesseract.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from unittest.mock import patch
-
-import pytest
-
-from unstructured_inference.models import tesseract
-
-
-class MockTesseractAgent:
-    def __init__(self, languages):
-        pass
-
-
-def test_load_agent(monkeypatch):
-    monkeypatch.setattr(tesseract, "TesseractAgent", MockTesseractAgent)
-    monkeypatch.setattr(tesseract, "ocr_agents", {})
-
-    with patch.object(tesseract, "is_pytesseract_available", return_value=True):
-        tesseract.load_agent(languages="eng+swe")
-
-    assert isinstance(tesseract.ocr_agents["eng+swe"], MockTesseractAgent)
-
-
-def test_load_agent_raises_when_not_available():
-    with patch.object(tesseract, "is_pytesseract_available", return_value=False):
-        with pytest.raises(ImportError):
-            tesseract.load_agent()
diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py
index 436e538e..8fa622df 100644
--- a/unstructured_inference/constants.py
+++ b/unstructured_inference/constants.py
@@ -15,3 +15,6 @@ class Source(Enum):
 
 
 FULL_PAGE_REGION_THRESHOLD = 0.99
+
+# this field is defined by pytesseract/unstructured.pytesseract
+TESSERACT_TEXT_HEIGHT = "height"
diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py
index 4a68e3d2..6b29fe78 100644
--- a/unstructured_inference/models/tables.py
+++ b/unstructured_inference/models/tables.py
@@ -16,11 +16,11 @@
 from transformers import DetrImageProcessor, TableTransformerForObjectDetection
 
 from unstructured_inference.config import inference_config
-from unstructured_inference.logger import logger
-from unstructured_inference.models.table_postprocess import Rect
-from unstructured_inference.models.tesseract import (
+from unstructured_inference.constants import (
     TESSERACT_TEXT_HEIGHT,
 )
+from unstructured_inference.logger import logger
+from unstructured_inference.models.table_postprocess import Rect
 from unstructured_inference.models.unstructuredmodel import UnstructuredModel
 from unstructured_inference.utils import pad_image_with_background_color
 
diff --git a/unstructured_inference/models/tesseract.py b/unstructured_inference/models/tesseract.py
deleted file mode 100644
index e6f599cc..00000000
--- a/unstructured_inference/models/tesseract.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os
-from typing import Dict
-
-import pytesseract
-from layoutparser.ocr.tesseract_agent import TesseractAgent, is_pytesseract_available
-
-from unstructured_inference.logger import logger
-
-ocr_agents: Dict[str, TesseractAgent] = {}
-
-TesseractError = pytesseract.pytesseract.TesseractError
-
-# Force tesseract to be single threaded,
-# otherwise we see major performance problems
-if "OMP_THREAD_LIMIT" not in os.environ:
-    os.environ["OMP_THREAD_LIMIT"] = "1"
-
-
-# this field is defined by pytesseract/unstructured.pytesseract
-TESSERACT_TEXT_HEIGHT = "height"
-
-
-def load_agent(languages: str = "eng"):
-    """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once.
-
-    Parameters
-    ----------
-    languages
-        The languages to use for the Tesseract agent. To use a language, you'll first need
-        to install the appropriate Tesseract language pack.
- """ - global ocr_agents - - if not is_pytesseract_available(): - raise ImportError( - "Failed to load Tesseract. Ensure that Tesseract is installed. Example command: \n" - " >>> sudo apt install -y tesseract-ocr", - ) - - if languages not in ocr_agents: - logger.info(f"Loading the Tesseract OCR agent for {languages} ...") - ocr_agents[languages] = TesseractAgent(languages=languages) From d319c9d08e8a51ed4dab11e7a61ca490445ab99d Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 4 Oct 2023 16:35:50 -0400 Subject: [PATCH 22/26] remove using image in extract_text --- unstructured_inference/inference/elements.py | 10 +++------- unstructured_inference/inference/layoutelement.py | 1 - 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 262431fe..69a7c831 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -211,13 +211,10 @@ def extract_text( # If block text is already populated, we'll assume it's correct text = self.text elif objects is not None: - text = aggregate_by_block(self, image, objects) - elif image is not None: - text = "" + text = aggregate_by_block(self, objects) else: raise ValueError( - "Got arguments image and layout as None, at least one must be populated to use for " - "text extraction.", + "Got layout as None, expected be populated to use for text extraction.", ) return text @@ -247,12 +244,11 @@ def extract_text( if self.text is None: return "" else: - return super().extract_text(objects, image, extract_tables) + return super().extract_text(objects, extract_tables) def aggregate_by_block( text_region: TextRegion, - image: Optional[Image.Image], pdf_objects: Collection[TextRegion], ) -> str: """Extracts the text aggregated from the elements of the given layout that lie within the given diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index b771509d..b1fbef3e 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -37,7 +37,6 @@ def extract_text( """Extracts text contained in region""" text = super().extract_text( objects=objects, - image=image, extract_tables=extract_tables, ) if extract_tables and self.type == "Table": From e6bb6a31a465cf90a0918aa8b557332726c7ed5e Mon Sep 17 00:00:00 2001 From: Yuming Long <63475068+yuming-long@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:37:40 -0400 Subject: [PATCH 23/26] Update CHANGELOG.md Co-authored-by: cragwolfe --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 17340799..4fe3117f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.6.7 +## 0.7.0 * Remove all OCR related code expect the table OCR code From 4de3ff3eb6688bd38a7f48fa85cbdd2f645e2d04 Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Wed, 4 Oct 2023 17:07:50 -0400 Subject: [PATCH 24/26] version bump --- unstructured_inference/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index ba99456d..8909b1e7 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.7" # pragma: no cover +__version__ = "0.7.0" # pragma: no cover From 2f3c0db107f3c7132c1a128926612564413f2074 Mon Sep 17 00:00:00 2001 
From e6bb6a31a465cf90a0918aa8b557332726c7ed5e Mon Sep 17 00:00:00 2001
From: Yuming Long <63475068+yuming-long@users.noreply.github.com>
Date: Wed, 4 Oct 2023 16:37:40 -0400
Subject: [PATCH 23/26] Update CHANGELOG.md

Co-authored-by: cragwolfe
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 17340799..4fe3117f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.6.7
+## 0.7.0
 
 * Remove all OCR related code except the table OCR code

From 4de3ff3eb6688bd38a7f48fa85cbdd2f645e2d04 Mon Sep 17 00:00:00 2001
From: yuming <305248291@qq.com>
Date: Wed, 4 Oct 2023 17:07:50 -0400
Subject: [PATCH 24/26] version bump

---
 unstructured_inference/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index ba99456d..8909b1e7 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.7"  # pragma: no cover
+__version__ = "0.7.0"  # pragma: no cover

From 2f3c0db107f3c7132c1a128926612564413f2074 Mon Sep 17 00:00:00 2001
From: yuming <305248291@qq.com>
Date: Wed, 4 Oct 2023 17:13:24 -0400
Subject: [PATCH 25/26] fix: remove value error in extract text

---
 unstructured_inference/inference/elements.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
index 69a7c831..8ca22415 100644
--- a/unstructured_inference/inference/elements.py
+++ b/unstructured_inference/inference/elements.py
@@ -213,9 +213,7 @@ def extract_text(
         elif objects is not None:
             text = aggregate_by_block(self, objects)
         else:
-            raise ValueError(
-                "Got layout as None, expected it to be populated for text extraction.",
-            )
+            text = ""
         return text
 
 

From 6a4677e8be21446ded8ab7dbbe6915e3c29734bd Mon Sep 17 00:00:00 2001
From: yuming <305248291@qq.com>
Date: Wed, 4 Oct 2023 17:44:00 -0400
Subject: [PATCH 26/26] remove test since it won't raise an error

---
 test_unstructured_inference/inference/test_layout.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
index d1ac32b1..17d29bde 100644
--- a/test_unstructured_inference/inference/test_layout.py
+++ b/test_unstructured_inference/inference/test_layout.py
@@ -272,12 +272,6 @@ def filter_by(self, *args, **kwargs):
         return MockLayout()
 
 
-def test_get_elements_from_block_raises():
-    with pytest.raises(ValueError):
-        block = layout.TextRegion(0, 0, 10, 10, text=None)
-        layout.get_element_from_block(block, None, None)
-
-
 @pytest.mark.parametrize("filetype", ["png", "jpg", "tiff"])
 def test_from_image_file(monkeypatch, mock_final_layout, filetype):
     def mock_get_elements(self, *args, **kwargs):