From 59f1e182db4aec9985a94b8e05117bc54a3dc0ea Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic <dovlex@gmail.com>
Date: Tue, 26 Nov 2024 10:48:55 +0100
Subject: [PATCH 1/5] feat: Add variable to specify inputs as optional to
 ConditionalRouter (#8568)

* Add optional_variables in ConditionalRouter

* Add reno note

* Add more unit test with various complex scenarios

* Add more unit tests

* Add pylint disable=too-many-positional-arguments

* PR feedback from @sjrl
---
 .../components/routers/conditional_router.py  |  70 ++++++++-
 ...-optional-parameters-f02c598d7c751868.yaml |   4 +
 .../routers/test_conditional_router.py        | 145 ++++++++++++++++++
 3 files changed, 217 insertions(+), 2 deletions(-)
 create mode 100644 releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml

diff --git a/haystack/components/routers/conditional_router.py b/haystack/components/routers/conditional_router.py
index 27312f43a1..5fbe399248 100644
--- a/haystack/components/routers/conditional_router.py
+++ b/haystack/components/routers/conditional_router.py
@@ -107,12 +107,13 @@ class ConditionalRouter:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         routes: List[Dict],
         custom_filters: Optional[Dict[str, Callable]] = None,
         unsafe: bool = False,
         validate_output_type: bool = False,
+        optional_variables: Optional[List[str]] = None,
     ):
         """
         Initializes the `ConditionalRouter` with a list of routes detailing the conditions for routing.
@@ -136,11 +137,54 @@ def __init__(
         :param validate_output_type:
             Enable validation of routes' output.
             If a route output doesn't match the declared type a ValueError is raised running.
+        :param optional_variables:
+            A list of variable names that are optional in your route conditions and outputs.
+            If these variables are not provided at runtime, they will be set to `None`.
+            This allows you to write routes that can handle missing inputs gracefully without raising errors.
+
+            Example usage with a default fallback route in a Pipeline:
+            ```python
+            from haystack import Pipeline
+            from haystack.components.routers import ConditionalRouter
+
+            routes = [
+                {
+                    "condition": '{{ path == "rag" }}',
+                    "output": "{{ question }}",
+                    "output_name": "rag_route",
+                    "output_type": str
+                },
+                {
+                    "condition": "{{ True }}",  # fallback route
+                    "output": "{{ question }}",
+                    "output_name": "default_route",
+                    "output_type": str
+                }
+            ]
+
+            router = ConditionalRouter(routes, optional_variables=["path"])
+            pipe = Pipeline()
+            pipe.add_component("router", router)
+
+            # When 'path' is provided in the pipeline:
+            result = pipe.run(data={"router": {"question": "What?", "path": "rag"}})
+            assert result["router"] == {"rag_route": "What?"}
+
+            # When 'path' is not provided, fallback route is taken:
+            result = pipe.run(data={"router": {"question": "What?"}})
+            assert result["router"] == {"default_route": "What?"}
+            ```
+
+            This pattern is particularly useful when:
+            - You want to provide default/fallback behavior when certain inputs are missing
+            - Some variables are only needed for specific routing conditions
+            - You're building flexible pipelines where not all inputs are guaranteed to be present
         """
         self.routes: List[dict] = routes
         self.custom_filters = custom_filters or {}
         self._unsafe = unsafe
         self._validate_output_type = validate_output_type
+        self.optional_variables = optional_variables or []
 
         # Create a Jinja environment to inspect variables in the condition templates
         if self._unsafe:
@@ -166,7 +210,28 @@ def __init__(
             # extract outputs
             output_types.update({route["output_name"]: route["output_type"]})
 
-        component.set_input_types(self, **{var: Any for var in input_types})
+        # remove optional variables from mandatory input types
+        mandatory_input_types = input_types - set(self.optional_variables)
+
+        # warn about unused optional variables
+        unused_optional_vars = set(self.optional_variables) - input_types if self.optional_variables else None
+        if unused_optional_vars:
+            msg = (
+                f"The following optional variables are specified but not used in any route: {unused_optional_vars}. "
+                "Check if there's a typo in variable names."
+            )
+            # intentionally using both warn and logger
+            warn(msg, UserWarning)
+            logger.warning(msg)
+
+        # add mandatory input types
+        component.set_input_types(self, **{var: Any for var in mandatory_input_types})
+
+        # now add optional input types
+        for optional_var_name in self.optional_variables:
+            component.set_input_type(self, name=optional_var_name, type=Any, default=None)
+
+        # set output types
         component.set_output_types(self, **output_types)
 
     def to_dict(self) -> Dict[str, Any]:
@@ -186,6 +251,7 @@ def to_dict(self) -> Dict[str, Any]:
             custom_filters=se_filters,
             unsafe=self._unsafe,
             validate_output_type=self._validate_output_type,
+            optional_variables=self.optional_variables,
         )
 
     @classmethod
diff --git a/releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml b/releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml
new file mode 100644
index 0000000000..99056a24da
--- /dev/null
+++ b/releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Introduces optional parameters in the ConditionalRouter component, enabling default/fallback routing behavior when certain inputs are not provided at runtime. This enhancement allows for more flexible pipeline configurations with graceful handling of missing parameters.
diff --git a/test/components/routers/test_conditional_router.py b/test/components/routers/test_conditional_router.py
index 8ea3f86d92..e0f3552319 100644
--- a/test/components/routers/test_conditional_router.py
+++ b/test/components/routers/test_conditional_router.py
@@ -7,6 +7,7 @@
 
 import pytest
 
+from haystack import Pipeline
 from haystack.components.routers import ConditionalRouter
 from haystack.components.routers.conditional_router import NoRouteSelectedException
 from haystack.dataclasses import ChatMessage
@@ -397,3 +398,147 @@ def test_validate_output_type_with_unsafe(self):
         streams = ["1", "2", "3", "4"]
         with pytest.raises(ValueError, match="Route 'streams' type doesn't match expected type"):
             router.run(streams=streams, message=message)
+
+    def test_router_with_optional_parameters(self):
+        """
+        Test that the router works with optional parameters, particularly testing the default/fallback route
+        when an expected parameter is not provided.
+        """
+        routes = [
+            {"condition": '{{path == "rag"}}', "output": "{{question}}", "output_name": "normal", "output_type": str},
+            {
+                "condition": '{{path == "followup_short"}}',
+                "output": "{{question}}",
+                "output_name": "followup_short",
+                "output_type": str,
+            },
+            {
+                "condition": '{{path == "followup_elaborate"}}',
+                "output": "{{question}}",
+                "output_name": "followup_elaborate",
+                "output_type": str,
+            },
+            {"condition": "{{ True }}", "output": "{{ question }}", "output_name": "fallback", "output_type": str},
+        ]
+
+        router = ConditionalRouter(routes, optional_variables=["path"])
+
+        # Test direct component usage
+        result = router.run(question="What?")
+        assert result == {"fallback": "What?"}, "Default route should be taken when 'path' is not provided"
+
+        # Test with path parameter
+        result = router.run(question="What?", path="rag")
+        assert result == {"normal": "What?"}, "Specific route should be taken when 'path' is provided"
+
+        pipe = Pipeline()
+        pipe.add_component("router", router)
+
+        # Test pipeline without path parameter
+        result = pipe.run(data={"router": {"question": "What?"}})
+        assert result["router"] == {
+            "fallback": "What?"
+        }, "Default route should work in pipeline when 'path' is not provided"
+
+        # Test pipeline with path parameter
+        result = pipe.run(data={"router": {"question": "What?", "path": "followup_short"}})
+        assert result["router"] == {"followup_short": "What?"}, "Specific route should work in pipeline"
+
+    def test_router_with_multiple_optional_parameters(self):
+        """
+        Test ConditionalRouter with a mix of mandatory and optional parameters,
+        exploring various combinations of provided/missing optional variables.
+        """
+        routes = [
+            {
+                "condition": '{{mode == "chat" and language == "en" and source == "doc"}}',
+                "output": "{{question}}",
+                "output_name": "en_doc_chat",
+                "output_type": str,
+            },
+            {
+                "condition": '{{mode == "qa" and source == "web"}}',
+                "output": "{{question}}",
+                "output_name": "web_qa",
+                "output_type": str,
+            },
+            {
+                "condition": '{{mode == "qa" and source == "doc"}}',
+                "output": "{{question}}",
+                "output_name": "doc_qa",
+                "output_type": str,
+            },
+            {
+                "condition": '{{mode == "chat" and language == "en"}}',
+                "output": "{{question}}",
+                "output_name": "en_chat",
+                "output_type": str,
+            },
+            {
+                "condition": '{{mode == "chat"}}',  # fallback for chat without language
+                "output": "{{question}}",
+                "output_name": "default_chat",
+                "output_type": str,
+            },
+            {
+                "condition": "{{ True }}",  # global fallback
+                "output": "{{question}}",
+                "output_name": "fallback",
+                "output_type": str,
+            },
+        ]
+
+        # There are four variables in the routes:
+        # - mandatory: mode, question (always must be provided) or we'll route to fallback
+        # - optional: source, language
+        router = ConditionalRouter(routes, optional_variables=["source", "language"])
+
+        # Test with mandatory parameter only
+        result = router.run(question="What?", mode="chat")
+        assert result == {"default_chat": "What?"}, "Should use chat fallback when language not provided"
+
+        # Test with all parameters provided
+        result = router.run(question="What?", mode="chat", language="en", source="doc")
+        assert result == {"en_doc_chat": "What?"}, "Should use specific route when all params provided"
+
+        # Test with different mandatory value and one optional
+        result = router.run(question="What?", mode="qa", source="web")
+        assert result == {"web_qa": "What?"}, "Should route qa with source correctly"
+
+        # Test with mandatory the routes to fallback
+        result = router.run(question="What?", mode="qa")
+        assert result == {"fallback": "What?"}, "Should use global fallback for qa without source"
+
+        # Test in pipeline
+        pipe = Pipeline()
+        pipe.add_component("router", router)
+
+        # Test pipeline with mandatory only
+        result = pipe.run(data={"router": {"question": "What?", "mode": "chat"}})
+        assert result["router"] == {"default_chat": "What?"}, "Pipeline should handle missing optionals"
+
+        # Test pipeline with mandatory and one optional
+        result = pipe.run(data={"router": {"question": "What?", "mode": "qa", "source": "doc"}})
+        assert result["router"] == {"doc_qa": "What?"}, "Pipeline should handle all parameters"
+
+        # Test pipeline with mandatory and both optionals
+        result = pipe.run(data={"router": {"question": "What?", "mode": "chat", "language": "en", "source": "doc"}})
+        assert result["router"] == {"en_doc_chat": "What?"}, "Pipeline should handle all parameters"
+
+    def test_warns_on_unused_optional_variables(self):
+        """
+        Test that a warning is raised when optional_variables contains variables
+        that are not used in any route conditions or outputs.
+        """
+        routes = [
+            {"condition": '{{mode == "chat"}}', "output": "{{question}}", "output_name": "chat", "output_type": str},
+            {"condition": "{{ True }}", "output": "{{question}}", "output_name": "fallback", "output_type": str},
+        ]
+
+        # Initialize with unused optional variables and capture warning
+        with pytest.warns(UserWarning, match="optional variables"):
+            router = ConditionalRouter(routes=routes, optional_variables=["unused_var1", "unused_var2"])
+
+        # Verify router still works normally
+        result = router.run(question="What?", mode="chat")
+        assert result == {"chat": "What?"}

From f0c3692cf2a86c69de8738d53af925500e8a5126 Mon Sep 17 00:00:00 2001
From: Michele Pangrazzi <xmikex83@gmail.com>
Date: Tue, 26 Nov 2024 11:44:50 +0100
Subject: [PATCH 2/5] Remove `is_greedy` deprecated argument from `@component`
 decorator (#8580)

* Remove 'is_greedy' deprecated argument from @component decorator

* Remove unused import
---
 haystack/core/component/component.py             | 16 +++-------------
 ...rom-component-decorator-9af6940bc60795d0.yaml |  4 ++++
 2 files changed, 7 insertions(+), 13 deletions(-)
 create mode 100644 releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml

diff --git a/haystack/core/component/component.py b/haystack/core/component/component.py
index e1dbf2c5f0..567faa4871 100644
--- a/haystack/core/component/component.py
+++ b/haystack/core/component/component.py
@@ -71,7 +71,6 @@
 
 import inspect
 import sys
-import warnings
 from collections.abc import Callable
 from contextlib import contextmanager
 from contextvars import ContextVar
@@ -487,21 +486,12 @@ class available here, we temporarily store the output types as an attribute of
 
         return output_types_decorator
 
-    def _component(self, cls, is_greedy: Optional[bool] = None):
+    def _component(self, cls: Any):
         """
         Decorator validating the structure of the component and registering it in the components registry.
         """
         logger.debug("Registering {component} as a component", component=cls)
 
-        if is_greedy is not None:
-            msg = (
-                "The 'is_greedy' argument is deprecated and will be removed in version '2.7.0'. "
-                "Change the 'Variadic' input of your Component to 'GreedyVariadic' instead."
-            )
-            warnings.warn(msg, DeprecationWarning)
-        else:
-            is_greedy = False
-
         # Check for required methods and fail as soon as possible
         if not hasattr(cls, "run"):
             raise ComponentError(f"{cls.__name__} must have a 'run()' method. See the docs for more information.")
@@ -543,11 +533,11 @@ def copy_class_namespace(namespace):
 
         return cls
 
-    def __call__(self, cls: Optional[type] = None, is_greedy: Optional[bool] = None):
+    def __call__(self, cls: Optional[type] = None):
         # We must wrap the call to the decorator in a function for it to work
         # correctly with or without parens
         def wrap(cls):
-            return self._component(cls, is_greedy=is_greedy)
+            return self._component(cls)
 
         if cls:
             # Decorator is called without parens
diff --git a/releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml b/releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml
new file mode 100644
index 0000000000..d1b21febb2
--- /dev/null
+++ b/releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml
@@ -0,0 +1,4 @@
+---
+upgrade:
+  - |
+    Remove 'is_greedy' deprecated argument from `@component` decorator. Change the 'Variadic' input of your Component to 'GreedyVariadic' instead.

From 3d95e067221521860270e6f774caf202ca3436f6 Mon Sep 17 00:00:00 2001
From: Haystack Bot <73523382+HaystackBot@users.noreply.github.com>
Date: Tue, 26 Nov 2024 12:42:12 +0100
Subject: [PATCH 3/5] chore: Update unstable version to 2.9.0-rc0 (#8582)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION.txt b/VERSION.txt
index 32a386165c..da852fef72 100644
--- a/VERSION.txt
+++ b/VERSION.txt
@@ -1 +1 @@
-2.8.0-rc0
+2.9.0-rc0

From 2440a5ee17248ef556324da69a9d15b62b023580 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci <stefanofiorucci@gmail.com>
Date: Tue, 26 Nov 2024 14:47:04 +0100
Subject: [PATCH 4/5] chore:`PyPDFToDocument` - deprecate `converter` init
 parameter (#8569)

* deprecat converter in pypdf

* fix linting of MetaFieldGroupingRanker

* linting
---
 haystack/components/converters/pypdf.py         | 17 ++++++++++++++---
 ...te-converter-parameter-d0cc04def6c3a293.yaml |  6 ++++++
 2 files changed, 20 insertions(+), 3 deletions(-)
 create mode 100644 releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml

diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
index 72fbcdc161..a17abd34d9 100644
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+import warnings
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Protocol, Union
 
@@ -22,6 +23,9 @@
 class PyPDFConverter(Protocol):
     """
     A protocol that defines a converter which takes a PdfReader object and converts it into a Document object.
+
+    This is deprecated and will be removed in Haystack 2.9.0.
+    For in-depth customization of the conversion process, consider implementing a custom component.
     """
 
     def convert(self, reader: "PdfReader") -> Document:  # noqa: D102
@@ -40,8 +44,7 @@ class PyPDFToDocument:
     """
     Converts PDF files to documents your pipeline can query.
 
-    This component uses converters compatible with the PyPDF library.
-    If no converter is provided, uses a default text extraction converter.
+    This component uses the PyPDF library.
     You can attach metadata to the resulting documents.
 
     ### Usage example
@@ -62,10 +65,18 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):
         Create an PyPDFToDocument component.
 
         :param converter:
-            An instance of a PyPDFConverter compatible class.
+            An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
+            For in-depth customization of the conversion process, consider implementing a custom component.
         """
         pypdf_import.check()
 
+        if converter is not None:
+            msg = (
+                "The `converter` parameter is deprecated and will be removed in Haystack 2.9.0. "
+                "For in-depth customization of the conversion process, consider implementing a custom component."
+            )
+            warnings.warn(msg, DeprecationWarning)
+
         self.converter = converter
 
     def to_dict(self):
diff --git a/releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml b/releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml
new file mode 100644
index 0000000000..6f59d3a1c4
--- /dev/null
+++ b/releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml
@@ -0,0 +1,6 @@
+---
+deprecations:
+  - |
+    The `converter` parameter in the `PyPDFToDocument` component is deprecated and will be removed in Haystack 2.9.0.
+    For in-depth customization of the conversion process, consider implementing a custom component.
+    Additional high-level customization options will be added in the future.

From fb42c035c554d0477a9bcbf4ff890545da9c3874 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci <stefanofiorucci@gmail.com>
Date: Tue, 26 Nov 2024 16:37:59 +0100
Subject: [PATCH 5/5] feat: `PyPDFToDocument` - add new customization
 parameters (#8574)

* deprecat converter in pypdf

* fix linting of MetaFieldGroupingRanker

* linting

* pypdftodocument: add customization params

* fix mypy

* incorporate feedback
---
 haystack/components/converters/pypdf.py       | 101 +++++++++++-
 ...customization-params-3da578deff7f83a5.yaml |   5 +
 .../converters/test_pypdf_to_document.py      | 149 +++++++++++++++---
 3 files changed, 227 insertions(+), 28 deletions(-)
 create mode 100644 releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml

diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
index a17abd34d9..555cf1df58 100644
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@@ -4,6 +4,7 @@
 
 import io
 import warnings
+from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Protocol, Union
 
@@ -39,6 +40,33 @@ def from_dict(cls, data):  # noqa: D102
         ...
 
 
+class PyPDFExtractionMode(Enum):
+    """
+    The mode to use for extracting text from a PDF.
+    """
+
+    PLAIN = "plain"
+    LAYOUT = "layout"
+
+    def __str__(self) -> str:
+        """
+        Convert a PyPDFExtractionMode enum to a string.
+        """
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "PyPDFExtractionMode":
+        """
+        Convert a string to a PyPDFExtractionMode enum.
+        """
+        enum_map = {e.value: e for e in PyPDFExtractionMode}
+        mode = enum_map.get(string)
+        if mode is None:
+            msg = f"Unknown extraction mode '{string}'. Supported modes are: {list(enum_map.keys())}"
+            raise ValueError(msg)
+        return mode
+
+
 @component
 class PyPDFToDocument:
     """
@@ -60,13 +88,49 @@ class PyPDFToDocument:
     ```
     """
 
-    def __init__(self, converter: Optional[PyPDFConverter] = None):
+    def __init__(
+        self,
+        converter: Optional[PyPDFConverter] = None,
+        *,
+        extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
+        plain_mode_orientations: tuple = (0, 90, 180, 270),
+        plain_mode_space_width: float = 200.0,
+        layout_mode_space_vertically: bool = True,
+        layout_mode_scale_weight: float = 1.25,
+        layout_mode_strip_rotated: bool = True,
+        layout_mode_font_height_weight: float = 1.0,
+    ):
         """
         Create an PyPDFToDocument component.
 
         :param converter:
             An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
             For in-depth customization of the conversion process, consider implementing a custom component.
+
+        All the following parameters are applied only if `converter` is None.
+
+        :param extraction_mode:
+            The mode to use for extracting text from a PDF.
+            Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
+        :param plain_mode_orientations:
+            Tuple of orientations to look for when extracting text from a PDF in plain mode.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param plain_mode_space_width:
+            Forces default space width if not extracted from font.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param layout_mode_space_vertically:
+            Whether to include blank lines inferred from y distance + font height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_scale_weight:
+            Multiplier for string length when calculating weighted average character width.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_strip_rotated:
+            Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
+            If rotated text is discovered, layout will be degraded and a warning will be logged.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_font_height_weight:
+            Multiplier for font height when calculating blank line height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
         """
         pypdf_import.check()
 
@@ -79,6 +143,16 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):
 
         self.converter = converter
 
+        if isinstance(extraction_mode, str):
+            extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
+        self.extraction_mode = extraction_mode
+        self.plain_mode_orientations = plain_mode_orientations
+        self.plain_mode_space_width = plain_mode_space_width
+        self.layout_mode_space_vertically = layout_mode_space_vertically
+        self.layout_mode_scale_weight = layout_mode_scale_weight
+        self.layout_mode_strip_rotated = layout_mode_strip_rotated
+        self.layout_mode_font_height_weight = layout_mode_font_height_weight
+
     def to_dict(self):
         """
         Serializes the component to a dictionary.
@@ -87,7 +161,15 @@ def to_dict(self):
             Dictionary with serialized data.
         """
         return default_to_dict(
-            self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
+            self,
+            converter=(serialize_class_instance(self.converter) if self.converter else None),
+            extraction_mode=str(self.extraction_mode),
+            plain_mode_orientations=self.plain_mode_orientations,
+            plain_mode_space_width=self.plain_mode_space_width,
+            layout_mode_space_vertically=self.layout_mode_space_vertically,
+            layout_mode_scale_weight=self.layout_mode_scale_weight,
+            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
         )
 
     @classmethod
@@ -108,7 +190,20 @@ def from_dict(cls, data):
         return default_from_dict(cls, data)
 
     def _default_convert(self, reader: "PdfReader") -> Document:
-        text = "\f".join(page.extract_text() for page in reader.pages)
+        texts = []
+        for page in reader.pages:
+            texts.append(
+                page.extract_text(
+                    orientations=self.plain_mode_orientations,
+                    extraction_mode=self.extraction_mode.value,
+                    space_width=self.plain_mode_space_width,
+                    layout_mode_space_vertically=self.layout_mode_space_vertically,
+                    layout_mode_scale_weight=self.layout_mode_scale_weight,
+                    layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+                    layout_mode_font_height_weight=self.layout_mode_font_height_weight,
+                )
+            )
+        text = "\f".join(texts)
         return Document(content=text)
 
     @component.output_types(documents=List[Document])
diff --git a/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml
new file mode 100644
index 0000000000..4e3f5176ac
--- /dev/null
+++ b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
+    from PDF files.
diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py
index 234b545eac..d9aee0020f 100644
--- a/test/components/converters/test_pypdf_to_document.py
+++ b/test/components/converters/test_pypdf_to_document.py
@@ -2,17 +2,17 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
-from unittest.mock import patch
+from unittest.mock import patch, Mock
 
 import pytest
 
 from haystack import Document, default_from_dict, default_to_dict
-from haystack.components.converters.pypdf import PyPDFToDocument
+from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
 from haystack.dataclasses import ByteStream
 
 
 @pytest.fixture
-def pypdf_converter():
+def pypdf_component():
     return PyPDFToDocument()
 
 
@@ -30,38 +30,104 @@ def from_dict(cls, data):
 
 
 class TestPyPDFToDocument:
-    def test_init(self, pypdf_converter):
-        assert pypdf_converter.converter is None
+    def test_init(self, pypdf_component):
+        assert pypdf_component.converter is None
+        assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
+        assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
+        assert pypdf_component.plain_mode_space_width == 200.0
+        assert pypdf_component.layout_mode_space_vertically is True
+        assert pypdf_component.layout_mode_scale_weight == 1.25
+        assert pypdf_component.layout_mode_strip_rotated is True
+        assert pypdf_component.layout_mode_font_height_weight == 1.0
 
-    def test_init_params(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        assert isinstance(pypdf_converter.converter, CustomConverter)
+    def test_init_converter(self):
+        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        assert isinstance(pypdf_component.converter, CustomConverter)
 
-    def test_to_dict(self, pypdf_converter):
-        data = pypdf_converter.to_dict()
+    def test_init_custom_params(self):
+        pypdf_component = PyPDFToDocument(
+            extraction_mode="layout",
+            plain_mode_orientations=(0, 90),
+            plain_mode_space_width=150.0,
+            layout_mode_space_vertically=False,
+            layout_mode_scale_weight=2.0,
+            layout_mode_strip_rotated=False,
+            layout_mode_font_height_weight=0.5,
+        )
+
+        assert pypdf_component.extraction_mode == PyPDFExtractionMode.LAYOUT
+        assert pypdf_component.plain_mode_orientations == (0, 90)
+        assert pypdf_component.plain_mode_space_width == 150.0
+        assert pypdf_component.layout_mode_space_vertically is False
+        assert pypdf_component.layout_mode_scale_weight == 2.0
+        assert pypdf_component.layout_mode_strip_rotated is False
+        assert pypdf_component.layout_mode_font_height_weight == 0.5
+
+    def test_init_invalid_extraction_mode(self):
+        with pytest.raises(ValueError):
+            PyPDFToDocument(extraction_mode="invalid")
+
+    def test_to_dict(self, pypdf_component):
+        data = pypdf_component.to_dict()
         assert data == {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
-            "init_parameters": {"converter": None},
+            "init_parameters": {
+                "converter": None,
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
+            },
         }
 
     def test_to_dict_custom_converter(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        data = pypdf_converter.to_dict()
+        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        data = pypdf_component.to_dict()
         assert data == {
             "type": "haystack.components.converters.pypdf.PyPDFToDocument",
             "init_parameters": {
                 "converter": {
                     "data": {"key": "value", "more": False},
                     "type": "converters.test_pypdf_to_document.CustomConverter",
-                }
+                },
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
             },
         }
 
     def test_from_dict(self):
-        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
+        data = {
+            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
+            "init_parameters": {
+                "converter": None,
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
+            },
+        }
+
         instance = PyPDFToDocument.from_dict(data)
         assert isinstance(instance, PyPDFToDocument)
         assert instance.converter is None
+        assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
+        assert instance.plain_mode_orientations == (0, 90, 180, 270)
+        assert instance.plain_mode_space_width == 200.0
+        assert instance.layout_mode_space_vertically is True
+        assert instance.layout_mode_scale_weight == 1.25
+        assert instance.layout_mode_strip_rotated is True
+        assert instance.layout_mode_font_height_weight == 1.0
 
     def test_from_dict_defaults(self):
         data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
@@ -83,30 +149,63 @@ def test_from_dict_custom_converter(self):
         assert isinstance(instance, PyPDFToDocument)
         assert isinstance(instance.converter, CustomConverter)
 
+    def test_default_convert(self):
+        mock_page1 = Mock()
+        mock_page2 = Mock()
+        mock_page1.extract_text.return_value = "Page 1 content"
+        mock_page2.extract_text.return_value = "Page 2 content"
+        mock_reader = Mock()
+        mock_reader.pages = [mock_page1, mock_page2]
+
+        converter = PyPDFToDocument(
+            extraction_mode="layout",
+            plain_mode_orientations=(0, 90),
+            plain_mode_space_width=150.0,
+            layout_mode_space_vertically=False,
+            layout_mode_scale_weight=2.0,
+            layout_mode_strip_rotated=False,
+            layout_mode_font_height_weight=1.5,
+        )
+
+        doc = converter._default_convert(mock_reader)
+        assert doc.content == "Page 1 content\fPage 2 content"
+
+        expected_params = {
+            "extraction_mode": "layout",
+            "orientations": (0, 90),
+            "space_width": 150.0,
+            "layout_mode_space_vertically": False,
+            "layout_mode_scale_weight": 2.0,
+            "layout_mode_strip_rotated": False,
+            "layout_mode_font_height_weight": 1.5,
+        }
+        for mock_page in mock_reader.pages:
+            mock_page.extract_text.assert_called_once_with(**expected_params)
+
     @pytest.mark.integration
-    def test_run(self, test_files_path, pypdf_converter):
+    def test_run(self, test_files_path, pypdf_component):
         """
         Test if the component runs correctly.
         """
         paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 1
         assert "History" in docs[0].content
 
     @pytest.mark.integration
-    def test_page_breaks_added(self, test_files_path, pypdf_converter):
+    def test_page_breaks_added(self, test_files_path, pypdf_component):
         paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 1
         assert docs[0].content.count("\f") == 3
 
-    def test_run_with_meta(self, test_files_path, pypdf_converter):
+    def test_run_with_meta(self, test_files_path, pypdf_component):
         bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
         with patch("haystack.components.converters.pypdf.PdfReader"):
-            output = pypdf_converter.run(
+            output = pypdf_component.run(
                 sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
             )
 
@@ -115,17 +214,17 @@ def test_run_with_meta(self, test_files_path, pypdf_converter):
         assert output["documents"][0].meta["language"] == "it"
         assert output["documents"][1].meta["language"] == "it"
 
-    def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
+    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
         """
         Test if the component correctly handles errors.
         """
         paths = ["non_existing_file.pdf"]
         with caplog.at_level(logging.WARNING):
-            pypdf_converter.run(sources=paths)
+            pypdf_component.run(sources=paths)
             assert "Could not read non_existing_file.pdf" in caplog.text
 
     @pytest.mark.integration
-    def test_mixed_sources_run(self, test_files_path, pypdf_converter):
+    def test_mixed_sources_run(self, test_files_path, pypdf_component):
         """
         Test if the component runs correctly when mixed sources are provided.
         """
@@ -133,7 +232,7 @@ def test_mixed_sources_run(self, test_files_path, pypdf_converter):
         with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
             paths.append(ByteStream(f.read()))
 
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 2
         assert "History and standardization" in docs[0].content