diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 4eb493cf41e935..254849df802738 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -466,7 +466,7 @@ class PyMuPDFParser(BaseBlobParser):
             parser = PyMuPDFParser(
                 # password = None,
                 mode = "single",
-                pages_delimitor = "\n\f",
+                pages_delimiter = "\n\f",
                 # extract_images = True,
                 # images_parser = TesseractBlobParser(),
                 # extract_tables="markdown",
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index bd98d72db922d3..3c5f2ca9b6357b 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -495,7 +495,7 @@ def __init__(
         pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         extract_images: bool = False,
         images_parser: Optional[BaseImageBlobParser] = None,
-        images_inner_format:str="text",
+        images_inner_format: str = "text",
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         headers: Optional[dict] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
index ee0fe365885bb5..44cc8294643f1b 100644
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -143,13 +143,31 @@ def _analyze_image(self, img: "Image") -> str:
 
 
 @pytest.mark.parametrize(
-    "mode",
-    ["single", "page"],
+    "mode,image_parser",
+    [("single", EmptyImageBlobParser()), ("page", None)],
 )
 @pytest.mark.parametrize(
-    "image_parser",
-    [EmptyImageBlobParser(), None],
+    "parser_factory,params",
+    [
+        ("PyMuPDFParser", {}),
+    ],
 )
+@pytest.mark.requires("pillow")
+def test_mode_and_extract_images_variations(
+    parser_factory: str,
+    params: dict,
+    mode: str,
+    image_parser: BaseImageBlobParser,
+) -> None:
+    _test_matrix(
+        parser_factory,
+        params,
+        mode,
+        image_parser,
+        images_inner_format="text",
+    )
+
+
 @pytest.mark.parametrize(
     "images_inner_format",
     ["text", "markdown-img", "html-img"],
@@ -161,7 +179,24 @@ def _analyze_image(self, img: "Image") -> str:
     ],
 )
 @pytest.mark.requires("pillow")
-def test_mode_and_extract_images_variations(
+def test_mode_and_image_formats_variations(
+    parser_factory: str,
+    params: dict,
+    images_inner_format: str,
+) -> None:
+    mode = "single"
+    image_parser = EmptyImageBlobParser()
+
+    _test_matrix(
+        parser_factory,
+        params,
+        mode,
+        image_parser,
+        images_inner_format,
+    )
+
+
+def _test_matrix(
     parser_factory: str,
     params: dict,
     mode: str,
diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
index a681dce8c59c01..32fce9e4259ce7 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -226,7 +226,16 @@ def test_standard_parameters(
     assert len(docs) == 1
 
     file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = loader_class(file_path, mode="page")
+    loader = loader_class(
+        file_path,
+        mode="page",
+        pages_delimiter="---",
+        images_parser=None,
+        images_inner_format="text",
+        password=None,
+        extract_tables=None,
+        extract_tables_settings=None,
+    )
     docs = loader.load()
     assert len(docs) == 16
     assert loader.web_path is None