Skip to content

Commit

Permalink
feat: allow disabling OCR in hi_res mode (fixes: #2467)
Browse files — browse the repository at this point in the history
  • Loading branch information
David Huggins-Daines committed Dec 17, 2024
1 parent 9a9bf4c commit 73f6c39
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 22 deletions.
16 changes: 16 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,22 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
assert "Layouts of scanned US newspapers from the 20th century" in table[0]


def test_partition_pdf_hi_res_ocr_mode_none():
    """Partition a PDF with the hi_res strategy and ``ocr_mode="none"``.

    The same document is also partitioned with the fast strategy, and the two
    element lists are required to be different.
    """
    filename = example_doc_path("pdf/layout-parser-paper.pdf")
    elements = pdf.partition_pdf(
        filename=filename,
        ocr_mode="none",
        strategy=PartitionStrategy.HI_RES,
        # FIXME: table structure still requires OCR for no good reason
        infer_table_structure=False,
    )
    fast_elements = pdf.partition_pdf(
        filename=filename,
        strategy=PartitionStrategy.FAST,
    )
    # Inequality only: this shows hi_res with OCR disabled does not produce
    # the same output as the fast strategy; it does not pin the exact elements.
    assert elements != fast_elements


def test_partition_pdf_with_copy_protection():
filename = example_doc_path("pdf/copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
Expand Down
50 changes: 28 additions & 22 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,17 +623,20 @@ def _partition_pdf_or_image_local(
hi_res_model_name=hi_res_model_name,
)

final_document_layout = process_file_with_ocr(
filename,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
if ocr_mode == OCRMode.NONE.value:
final_document_layout = merged_document_layout
else:
final_document_layout = process_file_with_ocr(
filename,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
else:
inferred_document_layout = process_data_with_model(
file,
Expand Down Expand Up @@ -678,17 +681,20 @@ def _partition_pdf_or_image_local(

if hasattr(file, "seek"):
file.seek(0)
final_document_layout = process_data_with_ocr(
file,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
if ocr_mode == OCRMode.NONE.value:
final_document_layout = merged_document_layout
else:
final_document_layout = process_data_with_ocr(
file,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)

final_document_layout = clean_pdfminer_inner_elements(final_document_layout)

Expand Down
1 change: 1 addition & 0 deletions unstructured/partition/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class Source(Enum):
class OCRMode(Enum):
    """OCR modes accepted by the ``ocr_mode`` partitioning argument.

    Callers pass the member's string value (for example ``ocr_mode="none"``),
    and the partitioning code compares against ``OCRMode.<MEMBER>.value``.
    """

    INDIVIDUAL_BLOCKS = "individual_blocks"
    # NOTE: the value is "entire_page", not the lower-cased member name.
    FULL_PAGE = "entire_page"
    # Disables the OCR pass: the hi_res PDF path uses the merged layout
    # directly instead of calling the process-with-OCR helpers.
    NONE = "none"


class PartitionStrategy:
Expand Down

0 comments on commit 73f6c39

Please sign in to comment.