From 73f6c3989199061845dcfd7f5d8a183b52c035c9 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhdaines@logisphere.ca>
Date: Tue, 17 Dec 2024 14:55:08 -0500
Subject: [PATCH] feat: allow disabling OCR in hi_res mode (fixes: #2467)

---
 .../partition/pdf_image/test_pdf.py           | 16 ++++++
 unstructured/partition/pdf.py                 | 50 +++++++++++--------
 unstructured/partition/utils/constants.py     |  1 +
 3 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 9b1b8de6e1..0eeebe768a 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -602,6 +602,22 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
     assert "Layouts of scanned US newspapers from the 20th century" in table[0]
 
 
+def test_partition_pdf_hi_res_ocr_mode_none():
+    """Check hi_res with ocr_mode="none" returns elements that differ from fast."""
+    filename = example_doc_path("pdf/layout-parser-paper.pdf")
+    elements = pdf.partition_pdf(
+        filename=filename,
+        ocr_mode="none",
+        strategy=PartitionStrategy.HI_RES,
+        # FIXME: table structure still requires OCR for no good reason
+        infer_table_structure=False,
+    )
+    baseline = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
+    # Guard against a vacuous pass: an empty result also differs from fast output.
+    assert elements
+    assert elements != baseline
+
+
 def test_partition_pdf_with_copy_protection():
     filename = example_doc_path("pdf/copy-protected.pdf")
     elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index f87812d40b..c3f41242a5 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -623,17 +623,20 @@ def _partition_pdf_or_image_local(
             hi_res_model_name=hi_res_model_name,
         )
 
-        final_document_layout = process_file_with_ocr(
-            filename,
-            merged_document_layout,
-            extracted_layout=extracted_layout,
-            is_image=is_image,
-            infer_table_structure=infer_table_structure,
-            ocr_languages=ocr_languages,
-            ocr_mode=ocr_mode,
-            pdf_image_dpi=pdf_image_dpi,
-            ocr_layout_dumper=ocr_layout_dumper,
-        )
+        if ocr_mode == OCRMode.NONE.value:
+            final_document_layout = merged_document_layout
+        else:
+            final_document_layout = process_file_with_ocr(
+                filename,
+                merged_document_layout,
+                extracted_layout=extracted_layout,
+                is_image=is_image,
+                infer_table_structure=infer_table_structure,
+                ocr_languages=ocr_languages,
+                ocr_mode=ocr_mode,
+                pdf_image_dpi=pdf_image_dpi,
+                ocr_layout_dumper=ocr_layout_dumper,
+            )
     else:
         inferred_document_layout = process_data_with_model(
             file,
@@ -678,17 +681,20 @@ def _partition_pdf_or_image_local(
 
         if hasattr(file, "seek"):
             file.seek(0)
-        final_document_layout = process_data_with_ocr(
-            file,
-            merged_document_layout,
-            extracted_layout=extracted_layout,
-            is_image=is_image,
-            infer_table_structure=infer_table_structure,
-            ocr_languages=ocr_languages,
-            ocr_mode=ocr_mode,
-            pdf_image_dpi=pdf_image_dpi,
-            ocr_layout_dumper=ocr_layout_dumper,
-        )
+        if ocr_mode == OCRMode.NONE.value:
+            final_document_layout = merged_document_layout
+        else:
+            final_document_layout = process_data_with_ocr(
+                file,
+                merged_document_layout,
+                extracted_layout=extracted_layout,
+                is_image=is_image,
+                infer_table_structure=infer_table_structure,
+                ocr_languages=ocr_languages,
+                ocr_mode=ocr_mode,
+                pdf_image_dpi=pdf_image_dpi,
+                ocr_layout_dumper=ocr_layout_dumper,
+            )
 
     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
 
diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
index 4b4dadeaa1..225583c39e 100644
--- a/unstructured/partition/utils/constants.py
+++ b/unstructured/partition/utils/constants.py
@@ -12,6 +12,7 @@ class Source(Enum):
 class OCRMode(Enum):
     INDIVIDUAL_BLOCKS = "individual_blocks"
     FULL_PAGE = "entire_page"
+    NONE = "none"  # hi_res skips the OCR pass and keeps the merged layout as-is
 
 
 class PartitionStrategy: