From 59f1e182db4aec9985a94b8e05117bc54a3dc0ea Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Tue, 26 Nov 2024 10:48:55 +0100 Subject: [PATCH 1/5] feat: Add variable to specify inputs as optional to ConditionalRouter (#8568) * Add optional_variables in ConditionalRouter * Add reno note * Add more unit test with various complex scenarios * Add more unit tests * Add pylint disable=too-many-positional-arguments * PR feedback from @sjrl --- .../components/routers/conditional_router.py | 70 ++++++++- ...-optional-parameters-f02c598d7c751868.yaml | 4 + .../routers/test_conditional_router.py | 145 ++++++++++++++++++ 3 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml diff --git a/haystack/components/routers/conditional_router.py b/haystack/components/routers/conditional_router.py index 27312f43a1..5fbe399248 100644 --- a/haystack/components/routers/conditional_router.py +++ b/haystack/components/routers/conditional_router.py @@ -107,12 +107,13 @@ class ConditionalRouter: ``` """ - def __init__( + def __init__( # pylint: disable=too-many-positional-arguments self, routes: List[Dict], custom_filters: Optional[Dict[str, Callable]] = None, unsafe: bool = False, validate_output_type: bool = False, + optional_variables: Optional[List[str]] = None, ): """ Initializes the `ConditionalRouter` with a list of routes detailing the conditions for routing. @@ -136,11 +137,54 @@ def __init__( :param validate_output_type: Enable validation of routes' output. If a route output doesn't match the declared type a ValueError is raised running. + :param optional_variables: + A list of variable names that are optional in your route conditions and outputs. + If these variables are not provided at runtime, they will be set to `None`. + This allows you to write routes that can handle missing inputs gracefully without raising errors. + + Example usage with a default fallback route in a Pipeline: + ```python + from haystack import Pipeline + from haystack.components.routers import ConditionalRouter + + routes = [ + { + "condition": '{{ path == "rag" }}', + "output": "{{ question }}", + "output_name": "rag_route", + "output_type": str + }, + { + "condition": "{{ True }}", # fallback route + "output": "{{ question }}", + "output_name": "default_route", + "output_type": str + } + ] + + router = ConditionalRouter(routes, optional_variables=["path"]) + pipe = Pipeline() + pipe.add_component("router", router) + + # When 'path' is provided in the pipeline: + result = pipe.run(data={"router": {"question": "What?", "path": "rag"}}) + assert result["router"] == {"rag_route": "What?"} + + # When 'path' is not provided, fallback route is taken: + result = pipe.run(data={"router": {"question": "What?"}}) + assert result["router"] == {"default_route": "What?"} + ``` + + This pattern is particularly useful when: + - You want to provide default/fallback behavior when certain inputs are missing + - Some variables are only needed for specific routing conditions + - You're building flexible pipelines where not all inputs are guaranteed to be present """ self.routes: List[dict] = routes self.custom_filters = custom_filters or {} self._unsafe = unsafe self._validate_output_type = validate_output_type + self.optional_variables = optional_variables or [] # Create a Jinja environment to inspect variables in the condition templates if self._unsafe: @@ -166,7 +210,28 @@ def __init__( # extract outputs output_types.update({route["output_name"]: route["output_type"]}) - component.set_input_types(self, **{var: Any for var in input_types}) + # remove optional variables from mandatory input types + mandatory_input_types = input_types - set(self.optional_variables) + + # warn about unused optional variables + unused_optional_vars = set(self.optional_variables) - input_types if self.optional_variables else None + if unused_optional_vars: + msg = ( + f"The following optional variables are specified but not used in any route: {unused_optional_vars}. " + "Check if there's a typo in variable names." + ) + # intentionally using both warn and logger + warn(msg, UserWarning) + logger.warning(msg) + + # add mandatory input types + component.set_input_types(self, **{var: Any for var in mandatory_input_types}) + + # now add optional input types + for optional_var_name in self.optional_variables: + component.set_input_type(self, name=optional_var_name, type=Any, default=None) + + # set output types component.set_output_types(self, **output_types) def to_dict(self) -> Dict[str, Any]: @@ -186,6 +251,7 @@ def to_dict(self) -> Dict[str, Any]: custom_filters=se_filters, unsafe=self._unsafe, validate_output_type=self._validate_output_type, + optional_variables=self.optional_variables, ) @classmethod diff --git a/releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml b/releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml new file mode 100644 index 0000000000..99056a24da --- /dev/null +++ b/releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Introduces optional parameters in the ConditionalRouter component, enabling default/fallback routing behavior when certain inputs are not provided at runtime. This enhancement allows for more flexible pipeline configurations with graceful handling of missing parameters. diff --git a/test/components/routers/test_conditional_router.py b/test/components/routers/test_conditional_router.py index 8ea3f86d92..e0f3552319 100644 --- a/test/components/routers/test_conditional_router.py +++ b/test/components/routers/test_conditional_router.py @@ -7,6 +7,7 @@ import pytest +from haystack import Pipeline from haystack.components.routers import ConditionalRouter from haystack.components.routers.conditional_router import NoRouteSelectedException from haystack.dataclasses import ChatMessage @@ -397,3 +398,147 @@ def test_validate_output_type_with_unsafe(self): streams = ["1", "2", "3", "4"] with pytest.raises(ValueError, match="Route 'streams' type doesn't match expected type"): router.run(streams=streams, message=message) + + def test_router_with_optional_parameters(self): + """ + Test that the router works with optional parameters, particularly testing the default/fallback route + when an expected parameter is not provided. + """ + routes = [ + {"condition": '{{path == "rag"}}', "output": "{{question}}", "output_name": "normal", "output_type": str}, + { + "condition": '{{path == "followup_short"}}', + "output": "{{question}}", + "output_name": "followup_short", + "output_type": str, + }, + { + "condition": '{{path == "followup_elaborate"}}', + "output": "{{question}}", + "output_name": "followup_elaborate", + "output_type": str, + }, + {"condition": "{{ True }}", "output": "{{ question }}", "output_name": "fallback", "output_type": str}, + ] + + router = ConditionalRouter(routes, optional_variables=["path"]) + + # Test direct component usage + result = router.run(question="What?") + assert result == {"fallback": "What?"}, "Default route should be taken when 'path' is not provided" + + # Test with path parameter + result = router.run(question="What?", path="rag") + assert result == {"normal": "What?"}, "Specific route should be taken when 'path' is provided" + + pipe = Pipeline() + pipe.add_component("router", router) + + # Test pipeline without path parameter + result = pipe.run(data={"router": {"question": "What?"}}) + assert result["router"] == { + "fallback": "What?" + }, "Default route should work in pipeline when 'path' is not provided" + + # Test pipeline with path parameter + result = pipe.run(data={"router": {"question": "What?", "path": "followup_short"}}) + assert result["router"] == {"followup_short": "What?"}, "Specific route should work in pipeline" + + def test_router_with_multiple_optional_parameters(self): + """ + Test ConditionalRouter with a mix of mandatory and optional parameters, + exploring various combinations of provided/missing optional variables. + """ + routes = [ + { + "condition": '{{mode == "chat" and language == "en" and source == "doc"}}', + "output": "{{question}}", + "output_name": "en_doc_chat", + "output_type": str, + }, + { + "condition": '{{mode == "qa" and source == "web"}}', + "output": "{{question}}", + "output_name": "web_qa", + "output_type": str, + }, + { + "condition": '{{mode == "qa" and source == "doc"}}', + "output": "{{question}}", + "output_name": "doc_qa", + "output_type": str, + }, + { + "condition": '{{mode == "chat" and language == "en"}}', + "output": "{{question}}", + "output_name": "en_chat", + "output_type": str, + }, + { + "condition": '{{mode == "chat"}}', # fallback for chat without language + "output": "{{question}}", + "output_name": "default_chat", + "output_type": str, + }, + { + "condition": "{{ True }}", # global fallback + "output": "{{question}}", + "output_name": "fallback", + "output_type": str, + }, + ] + + # There are four variables in the routes: + # - mandatory: mode, question (always must be provided) or we'll route to fallback + # - optional: source, language + router = ConditionalRouter(routes, optional_variables=["source", "language"]) + + # Test with mandatory parameter only + result = router.run(question="What?", mode="chat") + assert result == {"default_chat": "What?"}, "Should use chat fallback when language not provided" + + # Test with all parameters provided + result = router.run(question="What?", mode="chat", language="en", source="doc") + assert result == {"en_doc_chat": "What?"}, "Should use specific route when all params provided" + + # Test with different mandatory value and one optional + result = router.run(question="What?", mode="qa", source="web") + assert result == {"web_qa": "What?"}, "Should route qa with source correctly" + + # Test with mandatory the routes to fallback + result = router.run(question="What?", mode="qa") + assert result == {"fallback": "What?"}, "Should use global fallback for qa without source" + + # Test in pipeline + pipe = Pipeline() + pipe.add_component("router", router) + + # Test pipeline with mandatory only + result = pipe.run(data={"router": {"question": "What?", "mode": "chat"}}) + assert result["router"] == {"default_chat": "What?"}, "Pipeline should handle missing optionals" + + # Test pipeline with mandatory and one optional + result = pipe.run(data={"router": {"question": "What?", "mode": "qa", "source": "doc"}}) + assert result["router"] == {"doc_qa": "What?"}, "Pipeline should handle all parameters" + + # Test pipeline with mandatory and both optionals + result = pipe.run(data={"router": {"question": "What?", "mode": "chat", "language": "en", "source": "doc"}}) + assert result["router"] == {"en_doc_chat": "What?"}, "Pipeline should handle all parameters" + + def test_warns_on_unused_optional_variables(self): + """ + Test that a warning is raised when optional_variables contains variables + that are not used in any route conditions or outputs. + """ + routes = [ + {"condition": '{{mode == "chat"}}', "output": "{{question}}", "output_name": "chat", "output_type": str}, + {"condition": "{{ True }}", "output": "{{question}}", "output_name": "fallback", "output_type": str}, + ] + + # Initialize with unused optional variables and capture warning + with pytest.warns(UserWarning, match="optional variables"): + router = ConditionalRouter(routes=routes, optional_variables=["unused_var1", "unused_var2"]) + + # Verify router still works normally + result = router.run(question="What?", mode="chat") + assert result == {"chat": "What?"} From f0c3692cf2a86c69de8738d53af925500e8a5126 Mon Sep 17 00:00:00 2001 From: Michele Pangrazzi Date: Tue, 26 Nov 2024 11:44:50 +0100 Subject: [PATCH 2/5] Remove `is_greedy` deprecated argument from `@component` decorator (#8580) * Remove 'is_greedy' deprecated argument from @component decorator * Remove unused import --- haystack/core/component/component.py | 16 +++------------- ...rom-component-decorator-9af6940bc60795d0.yaml | 4 ++++ 2 files changed, 7 insertions(+), 13 deletions(-) create mode 100644 releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml diff --git a/haystack/core/component/component.py b/haystack/core/component/component.py index e1dbf2c5f0..567faa4871 100644 --- a/haystack/core/component/component.py +++ b/haystack/core/component/component.py @@ -71,7 +71,6 @@ import inspect import sys -import warnings from collections.abc import Callable from contextlib import contextmanager from contextvars import ContextVar @@ -487,21 +486,12 @@ class available here, we temporarily store the output types as an attribute of return output_types_decorator - def _component(self, cls, is_greedy: Optional[bool] = None): + def _component(self, cls: Any): """ Decorator validating the structure of the component and registering it in the components registry. """ logger.debug("Registering {component} as a component", component=cls) - if is_greedy is not None: - msg = ( - "The 'is_greedy' argument is deprecated and will be removed in version '2.7.0'. " - "Change the 'Variadic' input of your Component to 'GreedyVariadic' instead." - ) - warnings.warn(msg, DeprecationWarning) - else: - is_greedy = False - # Check for required methods and fail as soon as possible if not hasattr(cls, "run"): raise ComponentError(f"{cls.__name__} must have a 'run()' method. See the docs for more information.") @@ -543,11 +533,11 @@ def copy_class_namespace(namespace): return cls - def __call__(self, cls: Optional[type] = None, is_greedy: Optional[bool] = None): + def __call__(self, cls: Optional[type] = None): # We must wrap the call to the decorator in a function for it to work # correctly with or without parens def wrap(cls): - return self._component(cls, is_greedy=is_greedy) + return self._component(cls) if cls: # Decorator is called without parens diff --git a/releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml b/releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml new file mode 100644 index 0000000000..d1b21febb2 --- /dev/null +++ b/releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml @@ -0,0 +1,4 @@ +--- +upgrade: + - | + Remove 'is_greedy' deprecated argument from `@component` decorator. Change the 'Variadic' input of your Component to 'GreedyVariadic' instead. From 3d95e067221521860270e6f774caf202ca3436f6 Mon Sep 17 00:00:00 2001 From: Haystack Bot <73523382+HaystackBot@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:42:12 +0100 Subject: [PATCH 3/5] chore: Update unstable version to 2.9.0-rc0 (#8582) Co-authored-by: github-actions[bot] --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 32a386165c..da852fef72 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -2.8.0-rc0 +2.9.0-rc0 From 2440a5ee17248ef556324da69a9d15b62b023580 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Tue, 26 Nov 2024 14:47:04 +0100 Subject: [PATCH 4/5] chore:`PyPDFToDocument` - deprecate `converter` init parameter (#8569) * deprecat converter in pypdf * fix linting of MetaFieldGroupingRanker * linting --- haystack/components/converters/pypdf.py | 17 ++++++++++++++--- ...te-converter-parameter-d0cc04def6c3a293.yaml | 6 ++++++ 2 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index 72fbcdc161..a17abd34d9 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import io +import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Protocol, Union @@ -22,6 +23,9 @@ class PyPDFConverter(Protocol): """ A protocol that defines a converter which takes a PdfReader object and converts it into a Document object. + + This is deprecated and will be removed in Haystack 2.9.0. + For in-depth customization of the conversion process, consider implementing a custom component. """ def convert(self, reader: "PdfReader") -> Document: # noqa: D102 @@ -40,8 +44,7 @@ class PyPDFToDocument: """ Converts PDF files to documents your pipeline can query. - This component uses converters compatible with the PyPDF library. - If no converter is provided, uses a default text extraction converter. + This component uses the PyPDF library. You can attach metadata to the resulting documents. ### Usage example @@ -62,10 +65,18 @@ def __init__(self, converter: Optional[PyPDFConverter] = None): Create an PyPDFToDocument component. :param converter: - An instance of a PyPDFConverter compatible class. + An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0. + For in-depth customization of the conversion process, consider implementing a custom component. """ pypdf_import.check() + if converter is not None: + msg = ( + "The `converter` parameter is deprecated and will be removed in Haystack 2.9.0. " + "For in-depth customization of the conversion process, consider implementing a custom component." + ) + warnings.warn(msg, DeprecationWarning) + self.converter = converter def to_dict(self): diff --git a/releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml b/releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml new file mode 100644 index 0000000000..6f59d3a1c4 --- /dev/null +++ b/releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml @@ -0,0 +1,6 @@ +--- +deprecations: + - | + The `converter` parameter in the `PyPDFToDocument` component is deprecated and will be removed in Haystack 2.9.0. + For in-depth customization of the conversion process, consider implementing a custom component. + Additional high-level customization options will be added in the future. From fb42c035c554d0477a9bcbf4ff890545da9c3874 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Tue, 26 Nov 2024 16:37:59 +0100 Subject: [PATCH 5/5] feat: `PyPDFToDocument` - add new customization parameters (#8574) * deprecat converter in pypdf * fix linting of MetaFieldGroupingRanker * linting * pypdftodocument: add customization params * fix mypy * incorporate feedback --- haystack/components/converters/pypdf.py | 101 +++++++++++- ...customization-params-3da578deff7f83a5.yaml | 5 + .../converters/test_pypdf_to_document.py | 149 +++++++++++++++--- 3 files changed, 227 insertions(+), 28 deletions(-) create mode 100644 releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index a17abd34d9..555cf1df58 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -4,6 +4,7 @@ import io import warnings +from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional, Protocol, Union @@ -39,6 +40,33 @@ def from_dict(cls, data): # noqa: D102 ... +class PyPDFExtractionMode(Enum): + """ + The mode to use for extracting text from a PDF. + """ + + PLAIN = "plain" + LAYOUT = "layout" + + def __str__(self) -> str: + """ + Convert a PyPDFExtractionMode enum to a string. + """ + return self.value + + @staticmethod + def from_str(string: str) -> "PyPDFExtractionMode": + """ + Convert a string to a PyPDFExtractionMode enum. + """ + enum_map = {e.value: e for e in PyPDFExtractionMode} + mode = enum_map.get(string) + if mode is None: + msg = f"Unknown extraction mode '{string}'. Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return mode + + @component class PyPDFToDocument: """ @@ -60,13 +88,49 @@ class PyPDFToDocument: ``` """ - def __init__(self, converter: Optional[PyPDFConverter] = None): + def __init__( + self, + converter: Optional[PyPDFConverter] = None, + *, + extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN, + plain_mode_orientations: tuple = (0, 90, 180, 270), + plain_mode_space_width: float = 200.0, + layout_mode_space_vertically: bool = True, + layout_mode_scale_weight: float = 1.25, + layout_mode_strip_rotated: bool = True, + layout_mode_font_height_weight: float = 1.0, + ): """ Create an PyPDFToDocument component. :param converter: An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0. For in-depth customization of the conversion process, consider implementing a custom component. + + All the following parameters are applied only if `converter` is None. + + :param extraction_mode: + The mode to use for extracting text from a PDF. + Layout mode is an experimental mode that adheres to the rendered layout of the PDF. + :param plain_mode_orientations: + Tuple of orientations to look for when extracting text from a PDF in plain mode. + Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`. + :param plain_mode_space_width: + Forces default space width if not extracted from font. + Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`. + :param layout_mode_space_vertically: + Whether to include blank lines inferred from y distance + font height. + Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`. + :param layout_mode_scale_weight: + Multiplier for string length when calculating weighted average character width. + Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`. + :param layout_mode_strip_rotated: + Layout mode does not support rotated text. Set to `False` to include rotated text anyway. + If rotated text is discovered, layout will be degraded and a warning will be logged. + Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`. + :param layout_mode_font_height_weight: + Multiplier for font height when calculating blank line height. + Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`. """ pypdf_import.check() @@ -79,6 +143,16 @@ def __init__(self, converter: Optional[PyPDFConverter] = None): self.converter = converter + if isinstance(extraction_mode, str): + extraction_mode = PyPDFExtractionMode.from_str(extraction_mode) + self.extraction_mode = extraction_mode + self.plain_mode_orientations = plain_mode_orientations + self.plain_mode_space_width = plain_mode_space_width + self.layout_mode_space_vertically = layout_mode_space_vertically + self.layout_mode_scale_weight = layout_mode_scale_weight + self.layout_mode_strip_rotated = layout_mode_strip_rotated + self.layout_mode_font_height_weight = layout_mode_font_height_weight + def to_dict(self): """ Serializes the component to a dictionary. @@ -87,7 +161,15 @@ def to_dict(self): Dictionary with serialized data. """ return default_to_dict( - self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None) + self, + converter=(serialize_class_instance(self.converter) if self.converter else None), + extraction_mode=str(self.extraction_mode), + plain_mode_orientations=self.plain_mode_orientations, + plain_mode_space_width=self.plain_mode_space_width, + layout_mode_space_vertically=self.layout_mode_space_vertically, + layout_mode_scale_weight=self.layout_mode_scale_weight, + layout_mode_strip_rotated=self.layout_mode_strip_rotated, + layout_mode_font_height_weight=self.layout_mode_font_height_weight, ) @classmethod @@ -108,7 +190,20 @@ def from_dict(cls, data): return default_from_dict(cls, data) def _default_convert(self, reader: "PdfReader") -> Document: - text = "\f".join(page.extract_text() for page in reader.pages) + texts = [] + for page in reader.pages: + texts.append( + page.extract_text( + orientations=self.plain_mode_orientations, + extraction_mode=self.extraction_mode.value, + space_width=self.plain_mode_space_width, + layout_mode_space_vertically=self.layout_mode_space_vertically, + layout_mode_scale_weight=self.layout_mode_scale_weight, + layout_mode_strip_rotated=self.layout_mode_strip_rotated, + layout_mode_font_height_weight=self.layout_mode_font_height_weight, + ) + ) + text = "\f".join(texts) return Document(content=text) @component.output_types(documents=List[Document]) diff --git a/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml new file mode 100644 index 0000000000..4e3f5176ac --- /dev/null +++ b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process + from PDF files. diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index 234b545eac..d9aee0020f 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -2,17 +2,17 @@ # # SPDX-License-Identifier: Apache-2.0 import logging -from unittest.mock import patch +from unittest.mock import patch, Mock import pytest from haystack import Document, default_from_dict, default_to_dict -from haystack.components.converters.pypdf import PyPDFToDocument +from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode from haystack.dataclasses import ByteStream @pytest.fixture -def pypdf_converter(): +def pypdf_component(): return PyPDFToDocument() @@ -30,38 +30,104 @@ def from_dict(cls, data): class TestPyPDFToDocument: - def test_init(self, pypdf_converter): - assert pypdf_converter.converter is None + def test_init(self, pypdf_component): + assert pypdf_component.converter is None + assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN + assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270) + assert pypdf_component.plain_mode_space_width == 200.0 + assert pypdf_component.layout_mode_space_vertically is True + assert pypdf_component.layout_mode_scale_weight == 1.25 + assert pypdf_component.layout_mode_strip_rotated is True + assert pypdf_component.layout_mode_font_height_weight == 1.0 - def test_init_params(self): - pypdf_converter = PyPDFToDocument(converter=CustomConverter()) - assert isinstance(pypdf_converter.converter, CustomConverter) + def test_init_converter(self): + pypdf_component = PyPDFToDocument(converter=CustomConverter()) + assert isinstance(pypdf_component.converter, CustomConverter) - def test_to_dict(self, pypdf_converter): - data = pypdf_converter.to_dict() + def test_init_custom_params(self): + pypdf_component = PyPDFToDocument( + extraction_mode="layout", + plain_mode_orientations=(0, 90), + plain_mode_space_width=150.0, + layout_mode_space_vertically=False, + layout_mode_scale_weight=2.0, + layout_mode_strip_rotated=False, + layout_mode_font_height_weight=0.5, + ) + + assert pypdf_component.extraction_mode == PyPDFExtractionMode.LAYOUT + assert pypdf_component.plain_mode_orientations == (0, 90) + assert pypdf_component.plain_mode_space_width == 150.0 + assert pypdf_component.layout_mode_space_vertically is False + assert pypdf_component.layout_mode_scale_weight == 2.0 + assert pypdf_component.layout_mode_strip_rotated is False + assert pypdf_component.layout_mode_font_height_weight == 0.5 + + def test_init_invalid_extraction_mode(self): + with pytest.raises(ValueError): + PyPDFToDocument(extraction_mode="invalid") + + def test_to_dict(self, pypdf_component): + data = pypdf_component.to_dict() assert data == { "type": "haystack.components.converters.pypdf.PyPDFToDocument", - "init_parameters": {"converter": None}, + "init_parameters": { + "converter": None, + "extraction_mode": "plain", + "plain_mode_orientations": (0, 90, 180, 270), + "plain_mode_space_width": 200.0, + "layout_mode_space_vertically": True, + "layout_mode_scale_weight": 1.25, + "layout_mode_strip_rotated": True, + "layout_mode_font_height_weight": 1.0, + }, } def test_to_dict_custom_converter(self): - pypdf_converter = PyPDFToDocument(converter=CustomConverter()) - data = pypdf_converter.to_dict() + pypdf_component = PyPDFToDocument(converter=CustomConverter()) + data = pypdf_component.to_dict() assert data == { "type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": { "converter": { "data": {"key": "value", "more": False}, "type": "converters.test_pypdf_to_document.CustomConverter", - } + }, + "extraction_mode": "plain", + "plain_mode_orientations": (0, 90, 180, 270), + "plain_mode_space_width": 200.0, + "layout_mode_space_vertically": True, + "layout_mode_scale_weight": 1.25, + "layout_mode_strip_rotated": True, + "layout_mode_font_height_weight": 1.0, }, } def test_from_dict(self): - data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}} + data = { + "type": "haystack.components.converters.pypdf.PyPDFToDocument", + "init_parameters": { + "converter": None, + "extraction_mode": "plain", + "plain_mode_orientations": (0, 90, 180, 270), + "plain_mode_space_width": 200.0, + "layout_mode_space_vertically": True, + "layout_mode_scale_weight": 1.25, + "layout_mode_strip_rotated": True, + "layout_mode_font_height_weight": 1.0, + }, + } + instance = PyPDFToDocument.from_dict(data) assert isinstance(instance, PyPDFToDocument) assert instance.converter is None + assert instance.extraction_mode == PyPDFExtractionMode.PLAIN + assert instance.plain_mode_orientations == (0, 90, 180, 270) + assert instance.plain_mode_space_width == 200.0 + assert instance.layout_mode_space_vertically is True + assert instance.layout_mode_scale_weight == 1.25 + assert instance.layout_mode_strip_rotated is True + assert instance.layout_mode_font_height_weight == 1.0 def test_from_dict_defaults(self): data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}} @@ -83,30 +149,63 @@ def test_from_dict_custom_converter(self): assert isinstance(instance, PyPDFToDocument) assert isinstance(instance.converter, CustomConverter) + def test_default_convert(self): + mock_page1 = Mock() + mock_page2 = Mock() + mock_page1.extract_text.return_value = "Page 1 content" + mock_page2.extract_text.return_value = "Page 2 content" + mock_reader = Mock() + mock_reader.pages = [mock_page1, mock_page2] + + converter = PyPDFToDocument( + extraction_mode="layout", + plain_mode_orientations=(0, 90), + plain_mode_space_width=150.0, + layout_mode_space_vertically=False, + layout_mode_scale_weight=2.0, + layout_mode_strip_rotated=False, + layout_mode_font_height_weight=1.5, + ) + + doc = converter._default_convert(mock_reader) + assert doc.content == "Page 1 content\fPage 2 content" + + expected_params = { + "extraction_mode": "layout", + "orientations": (0, 90), + "space_width": 150.0, + "layout_mode_space_vertically": False, + "layout_mode_scale_weight": 2.0, + "layout_mode_strip_rotated": False, + "layout_mode_font_height_weight": 1.5, + } + for mock_page in mock_reader.pages: + mock_page.extract_text.assert_called_once_with(**expected_params) + @pytest.mark.integration - def test_run(self, test_files_path, pypdf_converter): + def test_run(self, test_files_path, pypdf_component): """ Test if the component runs correctly. """ paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"] - output = pypdf_converter.run(sources=paths) + output = pypdf_component.run(sources=paths) docs = output["documents"] assert len(docs) == 1 assert "History" in docs[0].content @pytest.mark.integration - def test_page_breaks_added(self, test_files_path, pypdf_converter): + def test_page_breaks_added(self, test_files_path, pypdf_component): paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"] - output = pypdf_converter.run(sources=paths) + output = pypdf_component.run(sources=paths) docs = output["documents"] assert len(docs) == 1 assert docs[0].content.count("\f") == 3 - def test_run_with_meta(self, test_files_path, pypdf_converter): + def test_run_with_meta(self, test_files_path, pypdf_component): bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"}) with patch("haystack.components.converters.pypdf.PdfReader"): - output = pypdf_converter.run( + output = pypdf_component.run( sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"} ) @@ -115,17 +214,17 @@ def test_run_with_meta(self, test_files_path, pypdf_converter): assert output["documents"][0].meta["language"] == "it" assert output["documents"][1].meta["language"] == "it" - def test_run_error_handling(self, test_files_path, pypdf_converter, caplog): + def test_run_error_handling(self, test_files_path, pypdf_component, caplog): """ Test if the component correctly handles errors. """ paths = ["non_existing_file.pdf"] with caplog.at_level(logging.WARNING): - pypdf_converter.run(sources=paths) + pypdf_component.run(sources=paths) assert "Could not read non_existing_file.pdf" in caplog.text @pytest.mark.integration - def test_mixed_sources_run(self, test_files_path, pypdf_converter): + def test_mixed_sources_run(self, test_files_path, pypdf_component): """ Test if the component runs correctly when mixed sources are provided. """ @@ -133,7 +232,7 @@ def test_mixed_sources_run(self, test_files_path, pypdf_converter): with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f: paths.append(ByteStream(f.read())) - output = pypdf_converter.run(sources=paths) + output = pypdf_component.run(sources=paths) docs = output["documents"] assert len(docs) == 2 assert "History and standardization" in docs[0].content