Skip to content

Commit

Permalink
Merge branch 'main' into fix-sentence-window
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsbatista authored Nov 27, 2024
2 parents 0e13644 + fb42c03 commit b2662d7
Show file tree
Hide file tree
Showing 10 changed files with 472 additions and 47 deletions.
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.8.0-rc0
2.9.0-rc0
118 changes: 112 additions & 6 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import warnings
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol, Union

Expand All @@ -22,6 +24,9 @@
class PyPDFConverter(Protocol):
"""
A protocol that defines a converter which takes a PdfReader object and converts it into a Document object.
This is deprecated and will be removed in Haystack 2.9.0.
For in-depth customization of the conversion process, consider implementing a custom component.
"""

def convert(self, reader: "PdfReader") -> Document: # noqa: D102
Expand All @@ -35,13 +40,39 @@ def from_dict(cls, data): # noqa: D102
...


class PyPDFExtractionMode(Enum):
    """
    Text extraction strategies available when reading a PDF.

    `PLAIN` maps to the string "plain" and `LAYOUT` to the string "layout".
    """

    PLAIN = "plain"
    LAYOUT = "layout"

    def __str__(self) -> str:
        """
        Return the string value of this extraction mode.
        """
        return self.value

    @staticmethod
    def from_str(string: str) -> "PyPDFExtractionMode":
        """
        Look up the extraction mode whose value equals the given string.

        :param string:
            The string value of the mode to look up.
        :returns:
            The matching PyPDFExtractionMode member.
        :raises ValueError:
            If no member has the given string as its value.
        """
        # Index the members by their string value so the lookup is a plain dict access.
        modes_by_value = {member.value: member for member in PyPDFExtractionMode}
        if string in modes_by_value:
            return modes_by_value[string]

        supported = list(modes_by_value)
        msg = f"Unknown extraction mode '{string}'. Supported modes are: {supported}"
        raise ValueError(msg)


@component
class PyPDFToDocument:
"""
Converts PDF files to documents your pipeline can query.
This component uses converters compatible with the PyPDF library.
If no converter is provided, uses a default text extraction converter.
This component uses the PyPDF library.
You can attach metadata to the resulting documents.
### Usage example
Expand All @@ -57,17 +88,71 @@ class PyPDFToDocument:
```
"""

def __init__(self, converter: Optional[PyPDFConverter] = None):
def __init__(
self,
converter: Optional[PyPDFConverter] = None,
*,
extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
plain_mode_orientations: tuple = (0, 90, 180, 270),
plain_mode_space_width: float = 200.0,
layout_mode_space_vertically: bool = True,
layout_mode_scale_weight: float = 1.25,
layout_mode_strip_rotated: bool = True,
layout_mode_font_height_weight: float = 1.0,
):
"""
Create a PyPDFToDocument component.
:param converter:
An instance of a PyPDFConverter compatible class.
An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
For in-depth customization of the conversion process, consider implementing a custom component.
All the following parameters are applied only if `converter` is None.
:param extraction_mode:
The mode to use for extracting text from a PDF.
Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
:param plain_mode_orientations:
Tuple of orientations to look for when extracting text from a PDF in plain mode.
Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
:param plain_mode_space_width:
Forces default space width if not extracted from font.
Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
:param layout_mode_space_vertically:
Whether to include blank lines inferred from y distance + font height.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
:param layout_mode_scale_weight:
Multiplier for string length when calculating weighted average character width.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
:param layout_mode_strip_rotated:
Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
If rotated text is discovered, layout will be degraded and a warning will be logged.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
:param layout_mode_font_height_weight:
Multiplier for font height when calculating blank line height.
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
"""
pypdf_import.check()

if converter is not None:
msg = (
"The `converter` parameter is deprecated and will be removed in Haystack 2.9.0. "
"For in-depth customization of the conversion process, consider implementing a custom component."
)
warnings.warn(msg, DeprecationWarning)

self.converter = converter

if isinstance(extraction_mode, str):
extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
self.extraction_mode = extraction_mode
self.plain_mode_orientations = plain_mode_orientations
self.plain_mode_space_width = plain_mode_space_width
self.layout_mode_space_vertically = layout_mode_space_vertically
self.layout_mode_scale_weight = layout_mode_scale_weight
self.layout_mode_strip_rotated = layout_mode_strip_rotated
self.layout_mode_font_height_weight = layout_mode_font_height_weight

def to_dict(self):
"""
Serializes the component to a dictionary.
Expand All @@ -76,7 +161,15 @@ def to_dict(self):
Dictionary with serialized data.
"""
return default_to_dict(
self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
self,
converter=(serialize_class_instance(self.converter) if self.converter else None),
extraction_mode=str(self.extraction_mode),
plain_mode_orientations=self.plain_mode_orientations,
plain_mode_space_width=self.plain_mode_space_width,
layout_mode_space_vertically=self.layout_mode_space_vertically,
layout_mode_scale_weight=self.layout_mode_scale_weight,
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
)

@classmethod
Expand All @@ -97,7 +190,20 @@ def from_dict(cls, data):
return default_from_dict(cls, data)

def _default_convert(self, reader: "PdfReader") -> Document:
text = "\f".join(page.extract_text() for page in reader.pages)
texts = []
for page in reader.pages:
texts.append(
page.extract_text(
orientations=self.plain_mode_orientations,
extraction_mode=self.extraction_mode.value,
space_width=self.plain_mode_space_width,
layout_mode_space_vertically=self.layout_mode_space_vertically,
layout_mode_scale_weight=self.layout_mode_scale_weight,
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
)
)
text = "\f".join(texts)
return Document(content=text)

@component.output_types(documents=List[Document])
Expand Down
70 changes: 68 additions & 2 deletions haystack/components/routers/conditional_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,13 @@ class ConditionalRouter:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
routes: List[Dict],
custom_filters: Optional[Dict[str, Callable]] = None,
unsafe: bool = False,
validate_output_type: bool = False,
optional_variables: Optional[List[str]] = None,
):
"""
Initializes the `ConditionalRouter` with a list of routes detailing the conditions for routing.
Expand All @@ -136,11 +137,54 @@ def __init__(
:param validate_output_type:
Enable validation of routes' output.
If a route output doesn't match the declared type, a ValueError is raised at run time.
:param optional_variables:
A list of variable names that are optional in your route conditions and outputs.
If these variables are not provided at runtime, they will be set to `None`.
This allows you to write routes that can handle missing inputs gracefully without raising errors.
Example usage with a default fallback route in a Pipeline:
```python
from haystack import Pipeline
from haystack.components.routers import ConditionalRouter
routes = [
{
"condition": '{{ path == "rag" }}',
"output": "{{ question }}",
"output_name": "rag_route",
"output_type": str
},
{
"condition": "{{ True }}", # fallback route
"output": "{{ question }}",
"output_name": "default_route",
"output_type": str
}
]
router = ConditionalRouter(routes, optional_variables=["path"])
pipe = Pipeline()
pipe.add_component("router", router)
# When 'path' is provided in the pipeline:
result = pipe.run(data={"router": {"question": "What?", "path": "rag"}})
assert result["router"] == {"rag_route": "What?"}
# When 'path' is not provided, fallback route is taken:
result = pipe.run(data={"router": {"question": "What?"}})
assert result["router"] == {"default_route": "What?"}
```
This pattern is particularly useful when:
- You want to provide default/fallback behavior when certain inputs are missing
- Some variables are only needed for specific routing conditions
- You're building flexible pipelines where not all inputs are guaranteed to be present
"""
self.routes: List[dict] = routes
self.custom_filters = custom_filters or {}
self._unsafe = unsafe
self._validate_output_type = validate_output_type
self.optional_variables = optional_variables or []

# Create a Jinja environment to inspect variables in the condition templates
if self._unsafe:
Expand All @@ -166,7 +210,28 @@ def __init__(
# extract outputs
output_types.update({route["output_name"]: route["output_type"]})

component.set_input_types(self, **{var: Any for var in input_types})
# remove optional variables from mandatory input types
mandatory_input_types = input_types - set(self.optional_variables)

# warn about unused optional variables
unused_optional_vars = set(self.optional_variables) - input_types if self.optional_variables else None
if unused_optional_vars:
msg = (
f"The following optional variables are specified but not used in any route: {unused_optional_vars}. "
"Check if there's a typo in variable names."
)
# intentionally using both warn and logger
warn(msg, UserWarning)
logger.warning(msg)

# add mandatory input types
component.set_input_types(self, **{var: Any for var in mandatory_input_types})

# now add optional input types
for optional_var_name in self.optional_variables:
component.set_input_type(self, name=optional_var_name, type=Any, default=None)

# set output types
component.set_output_types(self, **output_types)

def to_dict(self) -> Dict[str, Any]:
Expand All @@ -186,6 +251,7 @@ def to_dict(self) -> Dict[str, Any]:
custom_filters=se_filters,
unsafe=self._unsafe,
validate_output_type=self._validate_output_type,
optional_variables=self.optional_variables,
)

@classmethod
Expand Down
16 changes: 3 additions & 13 deletions haystack/core/component/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@

import inspect
import sys
import warnings
from collections.abc import Callable
from contextlib import contextmanager
from contextvars import ContextVar
Expand Down Expand Up @@ -487,21 +486,12 @@ class available here, we temporarily store the output types as an attribute of

return output_types_decorator

def _component(self, cls, is_greedy: Optional[bool] = None):
def _component(self, cls: Any):
"""
Decorator validating the structure of the component and registering it in the components registry.
"""
logger.debug("Registering {component} as a component", component=cls)

if is_greedy is not None:
msg = (
"The 'is_greedy' argument is deprecated and will be removed in version '2.7.0'. "
"Change the 'Variadic' input of your Component to 'GreedyVariadic' instead."
)
warnings.warn(msg, DeprecationWarning)
else:
is_greedy = False

# Check for required methods and fail as soon as possible
if not hasattr(cls, "run"):
raise ComponentError(f"{cls.__name__} must have a 'run()' method. See the docs for more information.")
Expand Down Expand Up @@ -543,11 +533,11 @@ def copy_class_namespace(namespace):

return cls

def __call__(self, cls: Optional[type] = None, is_greedy: Optional[bool] = None):
def __call__(self, cls: Optional[type] = None):
# We must wrap the call to the decorator in a function for it to work
# correctly with or without parens
def wrap(cls):
return self._component(cls, is_greedy=is_greedy)
return self._component(cls)

if cls:
# Decorator is called without parens
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Introduces optional parameters in the ConditionalRouter component, enabling default/fallback routing behavior when certain inputs are not provided at runtime. This enhancement allows for more flexible pipeline configurations with graceful handling of missing parameters.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
enhancements:
- |
Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
from PDF files.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
deprecations:
- |
The `converter` parameter in the `PyPDFToDocument` component is deprecated and will be removed in Haystack 2.9.0.
For in-depth customization of the conversion process, consider implementing a custom component.
Additional high-level customization options will be added in the future.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
upgrade:
- |
Removed the deprecated 'is_greedy' argument from the `@component` decorator. Change the 'Variadic' input of your Component to 'GreedyVariadic' instead.
Loading

0 comments on commit b2662d7

Please sign in to comment.