diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py
index edc028e439..0815198ed2 100644
--- a/haystack/components/converters/azure.py
+++ b/haystack/components/converters/azure.py
@@ -21,17 +21,17 @@ class AzureOCRDocumentConverter:
     Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.

     In order to be able to use this component, you need an active Azure account
-    and a Document Intelligence or Cognitive Services resource. Please follow the steps described in the
+    and a Document Intelligence or Cognitive Services resource. Follow the steps described in the
     [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
     to set up your resource.

     Usage example:
     ```python
-    from haystack.components.converters.azure import AzureOCRDocumentConverter
+    from haystack.components.converters import AzureOCRDocumentConverter
     from haystack.utils import Secret

     converter = AzureOCRDocumentConverter(endpoint="", api_key=Secret.from_token(""))
-    results = converter.run(sources=["image-based-document.pdf"], meta={"date_added": datetime.now().isoformat()})
+    results = converter.run(sources=["path/to/document_with_images.pdf"], meta={"date_added": datetime.now().isoformat()})
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the PDF file.'
@@ -44,9 +44,12 @@ def __init__(
         """
         Create an AzureOCRDocumentConverter component.

-        :param endpoint: The endpoint of your Azure resource.
-        :param api_key: The key of your Azure resource.
-        :param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
+        :param endpoint:
+            The endpoint of your Azure resource.
+        :param api_key:
+            The key of your Azure resource.
+        :param model_id:
+            The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
             for a list of available models. Default: `"prebuilt-read"`.
         """
         azure_import.check()
@@ -59,20 +62,21 @@ def __init__(
     @component.output_types(documents=List[Document], raw_azure_response=List[Dict])
     def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
         """
-        Convert files to Documents using Azure's Document Intelligence service.
-
-        This component creates two outputs: `documents` and `raw_azure_response`. The `documents` output contains
-        a list of Documents that were created from the files. The `raw_azure_response` output contains a list of
-        the raw responses from Azure's Document Intelligence service.
-
-        :param sources: List of file paths or ByteStream objects.
-        :param meta: Optional metadata to attach to the Documents.
-            This value can be either a list of dictionaries or a single dictionary.
-            If it's a single dictionary, its content is added to the metadata of all produced Documents.
-            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-            Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key
-            and the raw Azure response under the 'raw_azure_response' key.
+        Convert a list of files to Documents using Azure's Document Intelligence service.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: List of created Documents
+            - `raw_azure_response`: List of raw Azure responses used to create the Documents
         """
         documents = []
         azure_output = []
@@ -104,14 +108,22 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D

     def to_dict(self) -> Dict[str, Any]:
         """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
         """
         return default_to_dict(self, api_key=self.api_key.to_dict(), endpoint=self.endpoint, model_id=self.model_id)

     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter":
         """
-        Deserialize this component from a dictionary.
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
         """
         deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
         return default_from_dict(cls, data)
diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py
index fa9135b51d..e362ea1154 100644
--- a/haystack/components/converters/html.py
+++ b/haystack/components/converters/html.py
@@ -17,15 +17,14 @@ class HTMLToDocument:

     Usage example:
     ```python
-    from haystack.components.converters.html import HTMLToDocument
+    from haystack.components.converters import HTMLToDocument

     converter = HTMLToDocument()
-    results = converter.run(sources=["sample.html"])
+    results = converter.run(sources=["path/to/sample.html"])
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the HTML file.'
     ```
-
     """

     def __init__(
@@ -43,17 +42,32 @@ def __init__(
         """
         Create an HTMLToDocument component.

-        :param extractor_type: The type of boilerpy3 extractor to use. Defaults to `DefaultExtractor`.
-            For more information on the different types of extractors,
-            see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
+        :param extractor_type:
+            Name of the extractor class to use. Defaults to `DefaultExtractor`.
+            For more information on the different types of extractors,
+            see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
         """
         self.extractor_type = extractor_type

     def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         return default_to_dict(self, extractor_type=self.extractor_type)

     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
+        """
         return default_from_dict(cls, data)

     @component.output_types(documents=List[Document])
     def run(
@@ -65,13 +79,18 @@ def run(
         """
         Converts a list of HTML files to Documents.

-        :param sources: List of HTML file paths or ByteStream objects.
-        :param meta: Optional metadata to attach to the Documents.
-            This value can be either a list of dictionaries or a single dictionary.
-            If it's a single dictionary, its content is added to the metadata of all produced Documents.
-            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-            Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key.
+        :param sources:
+            List of HTML file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents
         """
         documents = []
diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py
index 12a44dec84..feb8c88e05 100644
--- a/haystack/components/converters/markdown.py
+++ b/haystack/components/converters/markdown.py
@@ -23,10 +23,10 @@ class MarkdownToDocument:

     Usage example:
     ```python
-    from haystack.components.converters.markdown import MarkdownToDocument
+    from haystack.components.converters import MarkdownToDocument

     converter = MarkdownToDocument()
-    results = converter.run(sources=["sample.md"], meta={"date_added": datetime.now().isoformat()})
+    results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the markdown file.'
@@ -35,8 +35,12 @@ class MarkdownToDocument:

     def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True):
         """
-        :param table_to_single_line: Convert contents of the table into a single line. Defaults to False.
-        :param progress_bar: Show a progress bar for the conversion. Defaults to True.
+        Create a MarkdownToDocument component.
+
+        :param table_to_single_line:
+            If True, converts table contents into a single line.
+        :param progress_bar:
+            If True, shows a progress bar when running.
         """
         markdown_conversion_imports.check()
@@ -50,15 +54,20 @@ def run(
         meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     ):
         """
-        Reads text from a markdown file and executes optional preprocessing steps.
-
-        :param sources: A list of markdown data sources (file paths or binary objects)
-        :param meta: Optional metadata to attach to the Documents.
-            This value can be either a list of dictionaries or a single dictionary.
-            If it's a single dictionary, its content is added to the metadata of all produced Documents.
-            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-            Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key.
+        Converts a list of Markdown files to Documents.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: List of created Documents
         """
         parser = MarkdownIt(renderer_cls=RendererPlain)
         if self.table_to_single_line:
diff --git a/haystack/components/converters/openapi_functions.py b/haystack/components/converters/openapi_functions.py
index 8d4e31dc92..b25cd9c922 100644
--- a/haystack/components/converters/openapi_functions.py
+++ b/haystack/components/converters/openapi_functions.py
@@ -20,30 +20,33 @@
 @component
 class OpenAPIServiceToFunctions:
     """
-    OpenAPIServiceToFunctions is responsible for converting an OpenAPI service specification into a format suitable
-    for OpenAI function calling, based on the provided OpenAPI specification. Given an OpenAPI specification,
-    OpenAPIServiceToFunctions processes it, and extracts function definitions that can be invoked via OpenAI's
-    function calling mechanism. The format of the extracted functions is compatible with OpenAI's function calling
-    JSON format.
-
-    Minimal requirements for OpenAPI specification:
-        - OpenAPI version 3.0.0 or higher
-        - Each path must have:
-            - a unique operationId
-            - a description
-            - a requestBody or parameters or both
-            - a schema for the requestBody and/or parameters
-
-
-    See https://github.com/OAI/OpenAPI-Specification for more details on OpenAPI specification.
-    See https://platform.openai.com/docs/guides/function-calling for more details on OpenAI function calling.
+    Converts OpenAPI service definitions to a format suitable for OpenAI function calling.
+
+    The definition must respect OpenAPI specification 3.0.0 or higher.
+    It can be specified in JSON or YAML format.
+    Each function must have:
+    - unique operationId
+    - description
+    - requestBody and/or parameters
+    - schema for the requestBody and/or parameters
+    For more details on the OpenAPI specification, see the [official documentation](https://github.com/OAI/OpenAPI-Specification).
+    For more details on OpenAI function calling, see the [official documentation](https://platform.openai.com/docs/guides/function-calling).
+
+    Usage example:
+    ```python
+    from haystack.components.converters import OpenAPIServiceToFunctions
+
+    converter = OpenAPIServiceToFunctions()
+    result = converter.run(sources=["path/to/openapi_definition.yaml"])
+    assert result["documents"]
+    ```
     """

     MIN_REQUIRED_OPENAPI_SPEC_VERSION = 3

     def __init__(self):
         """
-        Initializes the OpenAPIServiceToFunctions instance
+        Create an OpenAPIServiceToFunctions component.
         """
         openapi_imports.check()
@@ -52,19 +55,21 @@ def run(
         self, sources: List[Union[str, Path, ByteStream]], system_messages: Optional[List[str]] = None
     ) -> Dict[str, Any]:
         """
-        Processes OpenAPI specification URLs or files to extract functions that can be invoked via OpenAI function
-        calling mechanism. Each source is paired with an optional system message. The system message can be potentially
-        used in LLM response generation.
-
-        :param sources: A list of OpenAPI specification sources, which can be URLs, file paths, or ByteStream objects.
-        :type sources: List[Union[str, Path, ByteStream]]
-        :param system_messages: A list of optional system messages corresponding to each source.
-        :type system_messages: Optional[List[str]]
-        :return: A dictionary with a key 'documents' containing a list of Document objects. Each Document object
-            encapsulates a function definition and relevant metadata.
-        :rtype: Dict[str, Any]
-        :raises RuntimeError: If the OpenAPI specification cannot be downloaded or processed.
-        :raises ValueError: If the source type is not recognized or no functions are found in the OpenAPI specification.
+        Converts OpenAPI definitions into OpenAI function calling format.
+
+        :param sources:
+            File paths, URLs or ByteStream objects of OpenAPI definitions.
+        :param system_messages:
+            Optional system messages for each source.
+
+        :returns:
+            A dictionary with the following keys:
+            - documents: Documents containing a function definition and relevant metadata
+
+        :raises RuntimeError:
+            If the OpenAPI definitions cannot be downloaded or processed.
+        :raises ValueError:
+            If the source type is not recognized or no functions are found in the OpenAPI definitions.
         """
         documents: List[Document] = []
         system_messages = system_messages or [""] * len(sources)
diff --git a/haystack/components/converters/output_adapter.py b/haystack/components/converters/output_adapter.py
index 21d9ef4288..50e3406473 100644
--- a/haystack/components/converters/output_adapter.py
+++ b/haystack/components/converters/output_adapter.py
@@ -16,61 +16,38 @@ class OutputAdaptationException(Exception):
 @component
 class OutputAdapter:
     """
-    OutputAdapter in Haystack 2.x pipelines is designed to adapt the output of one component
-    to be compatible with the input of another component using Jinja2 template expressions.
-
-    The component configuration requires specifying the adaptation rules. Each rule comprises:
-    - 'template': A Jinja2 template string that defines how to adapt the input data.
-    - 'output_type': The type of the output data (e.g., str, List[int]).
-    - 'custom_filters': A dictionary of custom Jinja2 filters to be used in the template.
-
-    Example configuration:
+    Adapts output of a Component using Jinja templates.
+    Usage example:
     ```python
+    from haystack import Document
     from haystack.components.converters import OutputAdapter
-    adapter = OutputAdapter(template="{{ documents[0].content }}", output_type=str)
-    input_data = {"documents": [{"content": "Test content"}]}
-    expected_output = {"output": "Test content"}
-
-    assert adapter.run(**input_data) == expected_output
-    ```
-
-    In the pipeline setup, the adapter is placed between components that require output/input adaptation.
-    The name under which the adapted value is published is `output`. Use this name to connect the OutputAdapter
-    to downstream components in the pipeline.
-
-    Example pipeline setup:
-
-    ```python
-    from haystack import Pipeline, component
-    from haystack.components.converters import OutputAdapter
+    adapter = OutputAdapter(template="{{ documents[0].content }}", output_type=str)
+    documents = [Document(content="Test content")]
+    result = adapter.run(documents=documents)

-    @component
-    class DocumentProducer:
-        @component.output_types(documents=dict)
-        def run(self):
-            return {"documents": [{"content": '{"framework": "Haystack"}'}]}
-
-    pipe = Pipeline()
-    pipe.add_component(
-        name="output_adapter",
-        instance=OutputAdapter(template="{{ documents[0].content | json_loads}}", output_type=str),
-    )
-    pipe.add_component(name="document_producer", instance=DocumentProducer())
-    pipe.connect("document_producer", "output_adapter")
-    result = pipe.run(data={})
-    assert result["output_adapter"]["output"] == {"framework": "Haystack"}
+    assert result["output"] == "Test content"
     ```
     """

     def __init__(self, template: str, output_type: TypeAlias, custom_filters: Optional[Dict[str, Callable]] = None):
         """
-        Initializes the OutputAdapter with a set of adaptation rules.
-        :param template: A Jinja2 template string that defines how to adapt the output data to the input of the
-            downstream component.
-        :param output_type: The type of the output data (e.g., str, List[int]).
-        :param custom_filters: A dictionary of custom Jinja2 filters to be used in the template.
+        Create an OutputAdapter component.
+
+        :param template:
+            A Jinja template that defines how to adapt the input data.
+            The variables in the template define the input of this instance.
+            e.g.
+            With this template:
+            ```
+            {{ documents[0].content }}
+            ```
+            The Component input will be `documents`.
+        :param output_type:
+            The type of output this instance will return.
+        :param custom_filters:
+            A dictionary of custom Jinja filters used in the template.
         """
         self.custom_filters = {**(custom_filters or {})}
         input_types: Set[str] = set()
@@ -98,12 +75,15 @@ def __init__(self, template: str, output_type: TypeAlias, custom_filters: Option

     def run(self, **kwargs):
         """
-        Executes the output adaptation logic by applying the specified Jinja template expressions
-        to adapt the incoming data to a format suitable for downstream components.
+        Renders the Jinja template with the provided inputs.
+
+        :param kwargs:
+            Must contain all variables used in the `template` string.
+        :returns:
+            A dictionary with the following keys:
+            - `output`: Rendered Jinja template.

-        :param kwargs: A dictionary containing the pipeline variables, which are inputs to the adaptation templates.
-        :return: A dictionary containing the adapted outputs, based on the adaptation rules.
-        :raises OutputAdaptationException: If there's an error during the adaptation process.
+        :raises OutputAdaptationException: If template rendering fails.
         """
         # check if kwargs are empty
         if not kwargs:
@@ -124,6 +104,12 @@ def run(self, **kwargs):
         return adapted_outputs

     def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         se_filters = {name: serialize_callable(filter_func) for name, filter_func in self.custom_filters.items()}
         return default_to_dict(
             self, template=self.template, output_type=serialize_type(self.output_type), custom_filters=se_filters
         )

     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "OutputAdapter":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
+        """
         init_params = data.get("init_parameters", {})
         init_params["output_type"] = deserialize_type(init_params["output_type"])
         for name, filter_func in init_params.get("custom_filters", {}).items():
diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
index fb325970f5..f6ed4faf3f 100644
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@@ -42,9 +42,10 @@ def convert(self, reader: "PdfReader") -> Document:
 @component
 class PyPDFToDocument:
     """
-    Converts PDF files to Document objects.
-    It uses a converter that follows the PyPDFConverter protocol to perform the conversion.
-    A default text extraction converter is used if no custom converter is provided.
+    Converts PDF files to Documents.
+
+    Uses `pypdf` compatible converters to convert PDF files to Documents.
+    A default text extraction converter is used if one is not provided.

     Usage example:
     ```python
@@ -60,9 +61,10 @@ class PyPDFToDocument:

     def __init__(self, converter_name: str = "default"):
         """
-        Initializes the PyPDFToDocument component with an optional custom converter.
-        :param converter_name: A converter name that is registered in the CONVERTERS_REGISTRY.
-            Defaults to 'default'.
+        Create a PyPDFToDocument component.
+
+        :param converter_name:
+            Name of the registered converter to use.
         """
         pypdf_import.check()
@@ -77,6 +79,12 @@ def __init__(self, converter_name: str = "default"):
         self._converter: PyPDFConverter = converter

     def to_dict(self):
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         # do not serialize the _converter instance
         return default_to_dict(self, converter_name=self.converter_name)
@@ -87,15 +95,20 @@ def run(
         meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     ):
         """
-        Converts a list of PDF sources into Document objects using the configured converter.
-
-        :param sources: A list of PDF data sources, which can be file paths or ByteStream objects.
-        :param meta: Optional metadata to attach to the Documents.
-            This value can be either a list of dictionaries or a single dictionary.
-            If it's a single dictionary, its content is added to the metadata of all produced Documents.
-            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-            Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key.
+        Converts PDF files to Documents.
+
+        :param sources:
+            List of PDF file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents
         """
         documents = []
         meta_list = normalize_metadata(meta, sources_count=len(sources))
diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py
index 909945b57b..1afcd9cfc8 100644
--- a/haystack/components/converters/tika.py
+++ b/haystack/components/converters/tika.py
@@ -16,13 +16,12 @@
 @component
 class TikaDocumentConverter:
     """
-    A component for converting files of different types (pdf, docx, html, etc.) to Documents.
+    Converts files of different types to Documents.
+
     This component uses [Apache Tika](https://tika.apache.org/) for parsing the files and, therefore,
     requires a running Tika server.
-
-    The easiest way to run Tika is to use Docker: `docker run -d -p 127.0.0.1:9998:9998 apache/tika:latest`.
-    For more options on running Tika on Docker,
-    see the [documentation](https://github.com/apache/tika-docker/blob/main/README.md#usage).
+    For more options on running Tika,
+    see the [official documentation](https://github.com/apache/tika-docker/blob/main/README.md#usage).

     Usage example:
     ```python
@@ -43,7 +42,8 @@ def __init__(self, tika_url: str = "http://localhost:9998/tika"):
         """
         Create a TikaDocumentConverter component.

-        :param tika_url: URL of the Tika server. Default: `"http://localhost:9998/tika"`
+        :param tika_url:
+            Tika server URL.
         """
         tika_import.check()
         self.tika_url = tika_url
@@ -55,15 +55,20 @@ def run(
         meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     ):
         """
-        Convert files to Documents.
+        Converts files to Documents.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

-        :param sources: List of file paths or ByteStream objects.
-        :param meta: Optional metadata to attach to the Documents.
-            This value can be either a list of dictionaries or a single dictionary.
-            If it's a single dictionary, its content is added to the metadata of all produced Documents.
-            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-            Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key.
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents
         """
         documents = []
         meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
diff --git a/haystack/components/converters/txt.py b/haystack/components/converters/txt.py
index bb986fab28..00d39b3762 100644
--- a/haystack/components/converters/txt.py
+++ b/haystack/components/converters/txt.py
@@ -11,7 +11,7 @@
 @component
 class TextFileToDocument:
     """
-    A component for converting a text file to a Document.
+    Converts text files to Documents.

     Usage example:
     ```python
@@ -29,9 +29,10 @@ def __init__(self, encoding: str = "utf-8"):
         """
         Create a TextFileToDocument component.

-        :param encoding: The default encoding of the text files. Default: `"utf-8"`.
-            Note that if the encoding is specified in the metadata of a ByteStream,
-            it will override this default.
+        :param encoding:
+            The encoding of the text files.
+            Note that if the encoding is specified in the metadata of a source ByteStream,
+            it will override this value.
         """
         self.encoding = encoding
@@ -42,17 +43,20 @@ def run(
         meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     ):
         """
-        Convert text files to Documents.
+        Converts text files to Documents.

-        :param sources: A list of paths to text files or ByteStream objects.
-            Note that if an encoding is specified in the metadata of a ByteStream,
-            it will override the component's default.
-        :param meta: Optional metadata to attach to the Documents.
-            This value can be either a list of dictionaries or a single dictionary.
-            If it's a single dictionary, its content is added to the metadata of all produced Documents.
-            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-            Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key.
+        :param sources:
+            List of text file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents
         """
         documents = []
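As a minimal usage sketch of the `meta` handling these docstrings describe (a single dict is copied to every Document, a list is zipped pairwise with `sources`), assuming only the `TextFileToDocument` API shown above; the file paths and metadata values are illustrative:

```python
from haystack.components.converters import TextFileToDocument

converter = TextFileToDocument()

# One meta dict per source; the two lists are zipped pairwise,
# so the first dict is attached to the Document built from the first file.
results = converter.run(
    sources=["path/to/first.txt", "path/to/second.txt"],
    meta=[{"language": "en"}, {"language": "de"}],
)
documents = results["documents"]
# documents[0].meta["language"] == "en"
```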