Skip to content

Commit

Permalink
Optimise tests
Browse files Browse the repository at this point in the history
  • Loading branch information
pprados committed Jan 17, 2025
1 parent 23a73a9 commit 830e10d
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ class PyMuPDFParser(BaseBlobParser):
parser = PyMuPDFParser(
# password = None,
mode = "single",
pages_delimitor = "\n\f",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(),
# extract_tables="markdown",
Expand Down
2 changes: 1 addition & 1 deletion libs/community/langchain_community/document_loaders/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ def __init__(
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
extract_images: bool = False,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format:str="text",
images_inner_format: str = "text",
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
headers: Optional[dict] = None,
extract_tables_settings: Optional[dict[str, Any]] = None,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,31 @@ def _analyze_image(self, img: "Image") -> str:


@pytest.mark.parametrize(
"mode",
["single", "page"],
"mode,image_parser",
[("single", EmptyImageBlobParser()), ("page", None)],
)
@pytest.mark.parametrize(
"image_parser",
[EmptyImageBlobParser(), None],
"parser_factory,params",
[
("PyMuPDFParser", {}),
],
)
@pytest.mark.requires("pillow")
def test_mode_and_extract_images_variations(
    parser_factory: str,
    params: dict,
    mode: str,
    image_parser: BaseImageBlobParser,
) -> None:
    """Run the shared matrix check for one (mode, image_parser) pairing.

    All arguments are forwarded unchanged to ``_test_matrix``; only the
    images inner format is pinned, to ``"text"``.

    Args:
        parser_factory: Name of the parser under test, passed through as-is.
        params: Extra parameters for that parser, passed through as-is.
        mode: Extraction mode supplied by the parametrization.
        image_parser: Image parser supplied by the parametrization.
            NOTE(review): one parametrized case passes ``None`` here, so the
            annotation is narrower than the actual values.
    """
    # This test varies mode and image parser only, so the inner format for
    # extracted images is held constant.
    inner_format = "text"
    _test_matrix(
        parser_factory, params, mode, image_parser,
        images_inner_format=inner_format,
    )


@pytest.mark.parametrize(
"images_inner_format",
["text", "markdown-img", "html-img"],
Expand All @@ -161,7 +179,24 @@ def _analyze_image(self, img: "Image") -> str:
],
)
@pytest.mark.requires("pillow")
def test_mode_and_extract_images_variations(
def test_mode_and_image_formats_variations(
    parser_factory: str,
    params: dict,
    images_inner_format: str,
) -> None:
    """Run the shared matrix check for one images inner format.

    The mode is fixed to ``"single"`` and a fresh ``EmptyImageBlobParser`` is
    used as the image parser, so only ``images_inner_format`` varies between
    parametrized cases.

    Args:
        parser_factory: Name of the parser under test, passed through as-is.
        params: Extra parameters for that parser, passed through as-is.
        images_inner_format: Inner format value supplied by the
            parametrization and forwarded to ``_test_matrix``.
    """
    _test_matrix(
        parser_factory,
        params,
        "single",  # mode is held constant in this test
        EmptyImageBlobParser(),  # image parser is held constant as well
        images_inner_format,
    )


def _test_matrix(
parser_factory: str,
params: dict,
mode: str,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,15 @@ def test_standard_parameters(
assert len(docs) == 1

file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = loader_class(file_path, mode="page")
# Build the loader with every standard parameter spelled out explicitly, so
# the test exercises the common keyword interface shared by the loaders.
loader = loader_class(
    file_path,
    mode="page",
    # The keyword is plural (`pages_delimiter`), matching the constructor
    # signature in pdf.py; the singular spelling names no declared parameter.
    pages_delimiter="---",
    images_parser=None,
    images_inner_format="text",
    password=None,
    extract_tables=None,
    extract_tables_settings=None,
)
docs = loader.load()
assert len(docs) == 16
assert loader.web_path is None
Expand Down

0 comments on commit 830e10d

Please sign in to comment.