Merge branch 'main' into fix-sentence-window

deepset-ai · Nov 27, 2024 · b2662d7 · b2662d7
2 parents 0e13644 + fb42c03
commit b2662d7
Show file tree

Hide file tree

Showing 10 changed files with 472 additions and 47 deletions.
diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-2.8.0-rc0
+2.9.0-rc0
diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -3,6 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+import warnings
+from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Protocol, Union
 
@@ -22,6 +24,9 @@
 class PyPDFConverter(Protocol):
     """
     A protocol that defines a converter which takes a PdfReader object and converts it into a Document object.
+
+    This is deprecated and will be removed in Haystack 2.9.0.
+    For in-depth customization of the conversion process, consider implementing a custom component.
     """
 
     def convert(self, reader: "PdfReader") -> Document:  # noqa: D102
@@ -35,13 +40,39 @@ def from_dict(cls, data):  # noqa: D102
         ...
 
 
+class PyPDFExtractionMode(Enum):
+    """
+    The mode to use for extracting text from a PDF.
+    """
+
+    PLAIN = "plain"
+    LAYOUT = "layout"
+
+    def __str__(self) -> str:
+        """
+        Convert a PyPDFExtractionMode enum to a string.
+        """
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "PyPDFExtractionMode":
+        """
+        Convert a string to a PyPDFExtractionMode enum.
+        """
+        enum_map = {e.value: e for e in PyPDFExtractionMode}
+        mode = enum_map.get(string)
+        if mode is None:
+            msg = f"Unknown extraction mode '{string}'. Supported modes are: {list(enum_map.keys())}"
+            raise ValueError(msg)
+        return mode
+
+
 @component
 class PyPDFToDocument:
     """
     Converts PDF files to documents your pipeline can query.
 
-    This component uses converters compatible with the PyPDF library.
-    If no converter is provided, uses a default text extraction converter.
+    This component uses the PyPDF library.
     You can attach metadata to the resulting documents.
 
     ### Usage example
@@ -57,17 +88,71 @@ class PyPDFToDocument:
     ```
     """
 
-    def __init__(self, converter: Optional[PyPDFConverter] = None):
+    def __init__(
+        self,
+        converter: Optional[PyPDFConverter] = None,
+        *,
+        extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
+        plain_mode_orientations: tuple = (0, 90, 180, 270),
+        plain_mode_space_width: float = 200.0,
+        layout_mode_space_vertically: bool = True,
+        layout_mode_scale_weight: float = 1.25,
+        layout_mode_strip_rotated: bool = True,
+        layout_mode_font_height_weight: float = 1.0,
+    ):
         """
         Create an PyPDFToDocument component.
 
         :param converter:
-            An instance of a PyPDFConverter compatible class.
+            An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
+            For in-depth customization of the conversion process, consider implementing a custom component.
+
+        All the following parameters are applied only if `converter` is None.
+
+        :param extraction_mode:
+            The mode to use for extracting text from a PDF.
+            Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
+        :param plain_mode_orientations:
+            Tuple of orientations to look for when extracting text from a PDF in plain mode.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param plain_mode_space_width:
+            Forces default space width if not extracted from font.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param layout_mode_space_vertically:
+            Whether to include blank lines inferred from y distance + font height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_scale_weight:
+            Multiplier for string length when calculating weighted average character width.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_strip_rotated:
+            Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
+            If rotated text is discovered, layout will be degraded and a warning will be logged.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_font_height_weight:
+            Multiplier for font height when calculating blank line height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
         """
         pypdf_import.check()
 
+        if converter is not None:
+            msg = (
+                "The `converter` parameter is deprecated and will be removed in Haystack 2.9.0. "
+                "For in-depth customization of the conversion process, consider implementing a custom component."
+            )
+            warnings.warn(msg, DeprecationWarning)
+
         self.converter = converter
 
+        if isinstance(extraction_mode, str):
+            extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
+        self.extraction_mode = extraction_mode
+        self.plain_mode_orientations = plain_mode_orientations
+        self.plain_mode_space_width = plain_mode_space_width
+        self.layout_mode_space_vertically = layout_mode_space_vertically
+        self.layout_mode_scale_weight = layout_mode_scale_weight
+        self.layout_mode_strip_rotated = layout_mode_strip_rotated
+        self.layout_mode_font_height_weight = layout_mode_font_height_weight
+
     def to_dict(self):
         """
         Serializes the component to a dictionary.
@@ -76,7 +161,15 @@ def to_dict(self):
             Dictionary with serialized data.
         """
         return default_to_dict(
-            self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
+            self,
+            converter=(serialize_class_instance(self.converter) if self.converter else None),
+            extraction_mode=str(self.extraction_mode),
+            plain_mode_orientations=self.plain_mode_orientations,
+            plain_mode_space_width=self.plain_mode_space_width,
+            layout_mode_space_vertically=self.layout_mode_space_vertically,
+            layout_mode_scale_weight=self.layout_mode_scale_weight,
+            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
         )
 
     @classmethod
@@ -97,7 +190,20 @@ def from_dict(cls, data):
         return default_from_dict(cls, data)
 
     def _default_convert(self, reader: "PdfReader") -> Document:
-        text = "\f".join(page.extract_text() for page in reader.pages)
+        texts = []
+        for page in reader.pages:
+            texts.append(
+                page.extract_text(
+                    orientations=self.plain_mode_orientations,
+                    extraction_mode=self.extraction_mode.value,
+                    space_width=self.plain_mode_space_width,
+                    layout_mode_space_vertically=self.layout_mode_space_vertically,
+                    layout_mode_scale_weight=self.layout_mode_scale_weight,
+                    layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+                    layout_mode_font_height_weight=self.layout_mode_font_height_weight,
+                )
+            )
+        text = "\f".join(texts)
         return Document(content=text)
 
     @component.output_types(documents=List[Document])

diff --git a/haystack/components/routers/conditional_router.py b/haystack/components/routers/conditional_router.py
@@ -107,12 +107,13 @@ class ConditionalRouter:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         routes: List[Dict],
         custom_filters: Optional[Dict[str, Callable]] = None,
         unsafe: bool = False,
         validate_output_type: bool = False,
+        optional_variables: Optional[List[str]] = None,
     ):
         """
         Initializes the `ConditionalRouter` with a list of routes detailing the conditions for routing.
@@ -136,11 +137,54 @@ def __init__(
         :param validate_output_type:
             Enable validation of routes' output.
             If a route output doesn't match the declared type a ValueError is raised running.
+        :param optional_variables:
+            A list of variable names that are optional in your route conditions and outputs.
+            If these variables are not provided at runtime, they will be set to `None`.
+            This allows you to write routes that can handle missing inputs gracefully without raising errors.
+
+            Example usage with a default fallback route in a Pipeline:
+            ```python
+            from haystack import Pipeline
+            from haystack.components.routers import ConditionalRouter
+
+            routes = [
+                {
+                    "condition": '{{ path == "rag" }}',
+                    "output": "{{ question }}",
+                    "output_name": "rag_route",
+                    "output_type": str
+                },
+                {
+                    "condition": "{{ True }}",  # fallback route
+                    "output": "{{ question }}",
+                    "output_name": "default_route",
+                    "output_type": str
+                }
+            ]
+
+            router = ConditionalRouter(routes, optional_variables=["path"])
+            pipe = Pipeline()
+            pipe.add_component("router", router)
+
+            # When 'path' is provided in the pipeline:
+            result = pipe.run(data={"router": {"question": "What?", "path": "rag"}})
+            assert result["router"] == {"rag_route": "What?"}
+
+            # When 'path' is not provided, fallback route is taken:
+            result = pipe.run(data={"router": {"question": "What?"}})
+            assert result["router"] == {"default_route": "What?"}
+            ```
+
+            This pattern is particularly useful when:
+            - You want to provide default/fallback behavior when certain inputs are missing
+            - Some variables are only needed for specific routing conditions
+            - You're building flexible pipelines where not all inputs are guaranteed to be present
         """
         self.routes: List[dict] = routes
         self.custom_filters = custom_filters or {}
         self._unsafe = unsafe
         self._validate_output_type = validate_output_type
+        self.optional_variables = optional_variables or []
 
         # Create a Jinja environment to inspect variables in the condition templates
         if self._unsafe:
@@ -166,7 +210,28 @@ def __init__(
             # extract outputs
             output_types.update({route["output_name"]: route["output_type"]})
 
-        component.set_input_types(self, **{var: Any for var in input_types})
+        # remove optional variables from mandatory input types
+        mandatory_input_types = input_types - set(self.optional_variables)
+
+        # warn about unused optional variables
+        unused_optional_vars = set(self.optional_variables) - input_types if self.optional_variables else None
+        if unused_optional_vars:
+            msg = (
+                f"The following optional variables are specified but not used in any route: {unused_optional_vars}. "
+                "Check if there's a typo in variable names."
+            )
+            # intentionally using both warn and logger
+            warn(msg, UserWarning)
+            logger.warning(msg)
+
+        # add mandatory input types
+        component.set_input_types(self, **{var: Any for var in mandatory_input_types})
+
+        # now add optional input types
+        for optional_var_name in self.optional_variables:
+            component.set_input_type(self, name=optional_var_name, type=Any, default=None)
+
+        # set output types
         component.set_output_types(self, **output_types)
 
     def to_dict(self) -> Dict[str, Any]:
@@ -186,6 +251,7 @@ def to_dict(self) -> Dict[str, Any]:
             custom_filters=se_filters,
             unsafe=self._unsafe,
             validate_output_type=self._validate_output_type,
+            optional_variables=self.optional_variables,
         )
 
     @classmethod

diff --git a/haystack/core/component/component.py b/haystack/core/component/component.py
@@ -71,7 +71,6 @@
 
 import inspect
 import sys
-import warnings
 from collections.abc import Callable
 from contextlib import contextmanager
 from contextvars import ContextVar
@@ -487,21 +486,12 @@ class available here, we temporarily store the output types as an attribute of
 
         return output_types_decorator
 
-    def _component(self, cls, is_greedy: Optional[bool] = None):
+    def _component(self, cls: Any):
         """
         Decorator validating the structure of the component and registering it in the components registry.
         """
         logger.debug("Registering {component} as a component", component=cls)
 
-        if is_greedy is not None:
-            msg = (
-                "The 'is_greedy' argument is deprecated and will be removed in version '2.7.0'. "
-                "Change the 'Variadic' input of your Component to 'GreedyVariadic' instead."
-            )
-            warnings.warn(msg, DeprecationWarning)
-        else:
-            is_greedy = False
-
         # Check for required methods and fail as soon as possible
         if not hasattr(cls, "run"):
             raise ComponentError(f"{cls.__name__} must have a 'run()' method. See the docs for more information.")
@@ -543,11 +533,11 @@ def copy_class_namespace(namespace):
 
         return cls
 
-    def __call__(self, cls: Optional[type] = None, is_greedy: Optional[bool] = None):
+    def __call__(self, cls: Optional[type] = None):
         # We must wrap the call to the decorator in a function for it to work
         # correctly with or without parens
         def wrap(cls):
-            return self._component(cls, is_greedy=is_greedy)
+            return self._component(cls)
 
         if cls:
             # Decorator is called without parens

diff --git a/releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml b/releasenotes/notes/conditional-router-optional-parameters-f02c598d7c751868.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Adds the `optional_variables` parameter to the `ConditionalRouter` component. Variables listed there may be omitted at runtime and are then set to `None`, enabling default/fallback routing behavior when certain inputs are not provided. This enhancement allows for more flexible pipeline configurations with graceful handling of missing inputs.
diff --git a/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
+    from PDF files: `extraction_mode`, plus the `plain_mode_*` and `layout_mode_*` options.
diff --git a/releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml b/releasenotes/notes/pypdf-deprecate-converter-parameter-d0cc04def6c3a293.yaml
@@ -0,0 +1,6 @@
+---
+deprecations:
+  - |
+    The `converter` parameter in the `PyPDFToDocument` component is deprecated and will be removed in Haystack 2.9.0.
+    For in-depth customization of the conversion process, consider implementing a custom component.
+    Additional high-level customization options will be added in the future.
diff --git a/releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml b/releasenotes/notes/remove-deprecated-argument-from-component-decorator-9af6940bc60795d0.yaml
@@ -0,0 +1,4 @@
+---
+upgrade:
+  - |
+    Removed the deprecated 'is_greedy' argument from the `@component` decorator. Change the 'Variadic' input of your Component to 'GreedyVariadic' instead.