Skip to content

Commit

Permalink
test: Add test to ensure languages trickle down to ocr (#1857)
Browse files Browse the repository at this point in the history
Closes [#93](https://github.com/Unstructured-IO/unstructured-inference/issues/93).

Adds a test to ensure language parameters are passed all the way from
`partition_pdf` down to the OCR calls.

#### Testing:

CI should pass.
  • Loading branch information
qued authored Oct 24, 2023
1 parent b530e0a commit 44cef80
Showing 1 changed file with 32 additions and 0 deletions.
32 changes: 32 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,3 +1057,35 @@ def test_partition_model_name_default_to_None():
)
except AttributeError:
pytest.fail("partition_pdf() raised AttributeError unexpectedly!")


@pytest.mark.parametrize(
    ("strategy", "ocr_func"),
    [
        ("hi_res", "unstructured_pytesseract.image_to_data"),
        ("ocr_only", "unstructured_pytesseract.run_and_get_multiple_output"),
    ],
)
def test_ocr_language_passes_through(strategy, ocr_func):
    """Check that `ocr_languages` given to `partition_pdf` arrives at the OCR call as `lang`.

    The OCR entry point named by `ocr_func` is replaced with a mock that raises
    on its first call, so partitioning stops as soon as OCR is invoked and the
    recorded call can be inspected.
    """

    class OcrReached(Exception):
        """Sentinel raised by the patched OCR function to abort partitioning."""

    ocr_spy = mock.Mock(side_effect=OcrReached("Function called!"))

    # The spy records its arguments and then raises; `pytest.raises` both
    # swallows that sentinel and fails the test if OCR was never reached.
    with mock.patch(ocr_func, ocr_spy):
        with pytest.raises(OcrReached):
            pdf.partition_pdf(
                "example-docs/layout-parser-paper-fast.pdf",
                strategy=strategy,
                ocr_languages="kor",
            )

    # Inspect the keyword arguments of the most recent call to the OCR function.
    _, passed_kwargs = ocr_spy.call_args
    assert "lang" in passed_kwargs
    assert passed_kwargs["lang"] == "kor"

0 comments on commit 44cef80

Please sign in to comment.