feat: allow disabling OCR in hi_res mode (fixes: #2467)

Unstructured-IO · Dec 17, 2024 · 73f6c39 · 73f6c39
1 parent 9a9bf4c
commit 73f6c39
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 22 deletions.
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -602,6 +602,22 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
     assert "Layouts of scanned US newspapers from the 20th century" in table[0]
 
 
+def test_partition_pdf_hi_res_ocr_mode_none():
+    filename = example_doc_path("pdf/layout-parser-paper.pdf")
+    elements = pdf.partition_pdf(
+        filename=filename,
+        ocr_mode="none",
+        strategy=PartitionStrategy.HI_RES,
+        # FIXME: table structure inference still requires OCR, so it is disabled here
+        infer_table_structure=False,
+    )
+    fast_elements = pdf.partition_pdf(
+        filename=filename,
+        strategy=PartitionStrategy.FAST,
+    )
+    assert elements != fast_elements
+
+
 def test_partition_pdf_with_copy_protection():
     filename = example_doc_path("pdf/copy-protected.pdf")
     elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -623,17 +623,20 @@ def _partition_pdf_or_image_local(
             hi_res_model_name=hi_res_model_name,
         )
 
-        final_document_layout = process_file_with_ocr(
-            filename,
-            merged_document_layout,
-            extracted_layout=extracted_layout,
-            is_image=is_image,
-            infer_table_structure=infer_table_structure,
-            ocr_languages=ocr_languages,
-            ocr_mode=ocr_mode,
-            pdf_image_dpi=pdf_image_dpi,
-            ocr_layout_dumper=ocr_layout_dumper,
-        )
+        if ocr_mode == OCRMode.NONE.value:
+            final_document_layout = merged_document_layout
+        else:
+            final_document_layout = process_file_with_ocr(
+                filename,
+                merged_document_layout,
+                extracted_layout=extracted_layout,
+                is_image=is_image,
+                infer_table_structure=infer_table_structure,
+                ocr_languages=ocr_languages,
+                ocr_mode=ocr_mode,
+                pdf_image_dpi=pdf_image_dpi,
+                ocr_layout_dumper=ocr_layout_dumper,
+            )
     else:
         inferred_document_layout = process_data_with_model(
             file,
@@ -678,17 +681,20 @@ def _partition_pdf_or_image_local(
 
         if hasattr(file, "seek"):
             file.seek(0)
-        final_document_layout = process_data_with_ocr(
-            file,
-            merged_document_layout,
-            extracted_layout=extracted_layout,
-            is_image=is_image,
-            infer_table_structure=infer_table_structure,
-            ocr_languages=ocr_languages,
-            ocr_mode=ocr_mode,
-            pdf_image_dpi=pdf_image_dpi,
-            ocr_layout_dumper=ocr_layout_dumper,
-        )
+        if ocr_mode == OCRMode.NONE.value:
+            final_document_layout = merged_document_layout
+        else:
+            final_document_layout = process_data_with_ocr(
+                file,
+                merged_document_layout,
+                extracted_layout=extracted_layout,
+                is_image=is_image,
+                infer_table_structure=infer_table_structure,
+                ocr_languages=ocr_languages,
+                ocr_mode=ocr_mode,
+                pdf_image_dpi=pdf_image_dpi,
+                ocr_layout_dumper=ocr_layout_dumper,
+            )
 
     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
 

diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
@@ -12,6 +12,7 @@ class Source(Enum):
 class OCRMode(Enum):
     INDIVIDUAL_BLOCKS = "individual_blocks"
     FULL_PAGE = "entire_page"
+    NONE = "none"
 
 
 class PartitionStrategy: