From 73f6c3989199061845dcfd7f5d8a183b52c035c9 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 17 Dec 2024 14:55:08 -0500 Subject: [PATCH] feat: allow disabling OCR in hi_res mode (fixes: #2467) --- .../partition/pdf_image/test_pdf.py | 16 ++++++ unstructured/partition/pdf.py | 50 +++++++++++-------- unstructured/partition/utils/constants.py | 1 + 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 9b1b8de6e1..0eeebe768a 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -602,6 +602,22 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode): assert "Layouts of scanned US newspapers from the 20th century" in table[0] +def test_partition_pdf_hi_res_ocr_mode_none(): + filename = example_doc_path("pdf/layout-parser-paper.pdf") + elements = pdf.partition_pdf( + filename=filename, + ocr_mode="none", + strategy=PartitionStrategy.HI_RES, + # FIXME: table structure still requires OCR for no good reason + infer_table_structure=False, + ) + fast_elements = pdf.partition_pdf( + filename=filename, + strategy=PartitionStrategy.FAST, + ) + assert elements != fast_elements + + def test_partition_pdf_with_copy_protection(): filename = example_doc_path("pdf/copy-protected.pdf") elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f87812d40b..c3f41242a5 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -623,17 +623,20 @@ def _partition_pdf_or_image_local( hi_res_model_name=hi_res_model_name, ) - final_document_layout = process_file_with_ocr( - filename, - merged_document_layout, - extracted_layout=extracted_layout, - is_image=is_image, - infer_table_structure=infer_table_structure, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, - pdf_image_dpi=pdf_image_dpi, - ocr_layout_dumper=ocr_layout_dumper, - ) + if ocr_mode == OCRMode.NONE.value: + final_document_layout = merged_document_layout + else: + final_document_layout = process_file_with_ocr( + filename, + merged_document_layout, + extracted_layout=extracted_layout, + is_image=is_image, + infer_table_structure=infer_table_structure, + ocr_languages=ocr_languages, + ocr_mode=ocr_mode, + pdf_image_dpi=pdf_image_dpi, + ocr_layout_dumper=ocr_layout_dumper, + ) else: inferred_document_layout = process_data_with_model( file, @@ -678,17 +681,20 @@ def _partition_pdf_or_image_local( if hasattr(file, "seek"): file.seek(0) - final_document_layout = process_data_with_ocr( - file, - merged_document_layout, - extracted_layout=extracted_layout, - is_image=is_image, - infer_table_structure=infer_table_structure, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, - pdf_image_dpi=pdf_image_dpi, - ocr_layout_dumper=ocr_layout_dumper, - ) + if ocr_mode == OCRMode.NONE.value: + final_document_layout = merged_document_layout + else: + final_document_layout = process_data_with_ocr( + file, + merged_document_layout, + extracted_layout=extracted_layout, + is_image=is_image, + infer_table_structure=infer_table_structure, + ocr_languages=ocr_languages, + ocr_mode=ocr_mode, + pdf_image_dpi=pdf_image_dpi, + ocr_layout_dumper=ocr_layout_dumper, + ) final_document_layout = clean_pdfminer_inner_elements(final_document_layout) diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index 4b4dadeaa1..225583c39e 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -12,6 +12,7 @@ class Source(Enum): class OCRMode(Enum): INDIVIDUAL_BLOCKS = "individual_blocks" FULL_PAGE = "entire_page" + NONE = "none" class PartitionStrategy: