deepset-ai · silvanocerza · Feb 29, 2024 · Feb 28, 2024
@@ -21,17 +21,17 @@ class AzureOCRDocumentConverter:
     Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.
 
     In order to be able to use this component, you need an active Azure account
-    and a Document Intelligence or Cognitive Services resource. Please follow the steps described in the
+    and a Document Intelligence or Cognitive Services resource. Follow the steps described in the
     [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
     to set up your resource.
 
     Usage example:
     ```python
-    from haystack.components.converters.azure import AzureOCRDocumentConverter
+    from haystack.components.converters import AzureOCRDocumentConverter
     from haystack.utils import Secret
 
     converter = AzureOCRDocumentConverter(endpoint="<url>", api_key=Secret.from_token("<your-api-key>"))
-    results = converter.run(sources=["image-based-document.pdf"], meta={"date_added": datetime.now().isoformat()})
+    results = converter.run(sources=["path/to/document_with_images.pdf"], meta={"date_added": datetime.now().isoformat()})
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the PDF file.'
@@ -44,9 +44,12 @@ def __init__(
         """
         Create an AzureOCRDocumentConverter component.
 
-        :param endpoint: The endpoint of your Azure resource.
-        :param api_key: The key of your Azure resource.
-        :param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
+        :param endpoint:
+            The endpoint of your Azure resource.
+        :param api_key:
+            The key of your Azure resource.
+        :param model_id:
+            The model ID of the model you want to use. Refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
             for a list of available models. Default: `"prebuilt-read"`.
         """
         azure_import.check()
@@ -59,20 +62,21 @@ def __init__(
     @component.output_types(documents=List[Document], raw_azure_response=List[Dict])
     def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
         """
-        Convert files to Documents using Azure's Document Intelligence service.
-
-        This component creates two outputs: `documents` and `raw_azure_response`. The `documents` output contains
-        a list of Documents that were created from the files. The `raw_azure_response` output contains a list of
-        the raw responses from Azure's Document Intelligence service.
-
-        :param sources: List of file paths or ByteStream objects.
-        :param meta: Optional metadata to attach to the Documents.
-          This value can be either a list of dictionaries or a single dictionary.
-          If it's a single dictionary, its content is added to the metadata of all produced Documents.
-          If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-          Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key
-          and the raw Azure response under the 'raw_azure_response' key.
+        Convert a list of files to Documents using Azure's Document Intelligence service.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: List of created Documents
+            - `raw_azure_response`: List of raw Azure responses used to create the Documents
         """
         documents = []
         azure_output = []
@@ -104,14 +108,22 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
 
     def to_dict(self) -> Dict[str, Any]:
         """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
         """
         return default_to_dict(self, api_key=self.api_key.to_dict(), endpoint=self.endpoint, model_id=self.model_id)
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter":
         """
-        Deserialize this component from a dictionary.
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
         """
         deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
         return default_from_dict(cls, data)

@@ -17,15 +17,14 @@ class HTMLToDocument:
 
     Usage example:
     ```python
-    from haystack.components.converters.html import HTMLToDocument
+    from haystack.components.converters import HTMLToDocument
 
     converter = HTMLToDocument()
-    results = converter.run(sources=["sample.html"])
+    results = converter.run(sources=["path/to/sample.html"])
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the HTML file.'
     ```
-
     """
 
     def __init__(
@@ -43,17 +42,32 @@ def __init__(
         """
         Create an HTMLToDocument component.
 
-        :param extractor_type: The type of boilerpy3 extractor to use. Defaults to `DefaultExtractor`.
-          For more information on the different types of extractors,
-          see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
+        :param extractor_type:
+            Name of the extractor class to use. Defaults to `DefaultExtractor`.
+            For more information on the different types of extractors,
+            see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
         """
         self.extractor_type = extractor_type
 
     def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
         return default_to_dict(self, extractor_type=self.extractor_type)
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
+        """
         return default_from_dict(cls, data)
 
     @component.output_types(documents=List[Document])
@@ -65,13 +79,18 @@ def run(
         """
         Converts a list of HTML files to Documents.
 
-        :param sources: List of HTML file paths or ByteStream objects.
-        :param meta: Optional metadata to attach to the Documents.
-          This value can be either a list of dictionaries or a single dictionary.
-          If it's a single dictionary, its content is added to the metadata of all produced Documents.
-          If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-          Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key.
+        :param sources:
+            List of HTML file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: List of created Documents
         """
 
         documents = []

@@ -23,10 +23,10 @@ class MarkdownToDocument:
 
     Usage example:
     ```python
-    from haystack.components.converters.markdown import MarkdownToDocument
+    from haystack.components.converters import MarkdownToDocument
 
     converter = MarkdownToDocument()
-    results = converter.run(sources=["sample.md"], meta={"date_added": datetime.now().isoformat()})
+    results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the markdown file.'
@@ -35,8 +35,12 @@ class MarkdownToDocument:
 
     def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True):
         """
-        :param table_to_single_line: Convert contents of the table into a single line. Defaults to False.
-        :param progress_bar: Show a progress bar for the conversion. Defaults to True.
+        Create a MarkdownToDocument component.
+
+        :param table_to_single_line:
+            If True, converts table contents into a single line.
+        :param progress_bar:
+            If True, shows a progress bar when running.
         """
         markdown_conversion_imports.check()
 
@@ -50,15 +54,20 @@ def run(
         meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     ):
         """
-        Reads text from a markdown file and executes optional preprocessing steps.
-
-        :param sources: A list of markdown data sources (file paths or binary objects)
-        :param meta: Optional metadata to attach to the Documents.
-          This value can be either a list of dictionaries or a single dictionary.
-          If it's a single dictionary, its content is added to the metadata of all produced Documents.
-          If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
-          Defaults to `None`.
-        :return: A dictionary containing a list of Document objects under the 'documents' key.
+        Converts a list of Markdown files to Documents.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: List of created Documents
         """
         parser = MarkdownIt(renderer_cls=RendererPlain)
         if self.table_to_single_line:

@@ -20,30 +20,33 @@
 @component
 class OpenAPIServiceToFunctions:
     """
-    OpenAPIServiceToFunctions is responsible for converting an OpenAPI service specification into a format suitable
-    for OpenAI function calling, based on the provided OpenAPI specification. Given an OpenAPI specification,
-    OpenAPIServiceToFunctions processes it, and extracts function definitions that can be invoked via OpenAI's
-    function calling mechanism. The format of the extracted functions is compatible with OpenAI's function calling
-    JSON format.
-
-    Minimal requirements for OpenAPI specification:
-    - OpenAPI version 3.0.0 or higher
-    - Each path must have:
-        - a unique operationId
-        - a description
-        - a requestBody or parameters or both
-        - a schema for the requestBody and/or parameters
-
-
-    See https://github.com/OAI/OpenAPI-Specification for more details on OpenAPI specification.
-    See https://platform.openai.com/docs/guides/function-calling for more details on OpenAI function calling.
+    Converts OpenAPI service definitions to a format suitable for OpenAI function calling.
+
+    The definition must respect OpenAPI specification 3.0.0 or higher.
+    It can be specified in JSON or YAML format.
+    Each function must have:
+        - unique operationId
+        - description
+        - requestBody and/or parameters
+        - schema for the requestBody and/or parameters
+    For more details on OpenAPI specification see the [official documentation](https://github.com/OAI/OpenAPI-Specification).
+    For more details on OpenAI function calling see the [official documentation](https://platform.openai.com/docs/guides/function-calling).
+
+    Usage example:
+    ```python
+    from haystack.components.converters import OpenAPIServiceToFunctions
+
+    converter = OpenAPIServiceToFunctions()
+    result = converter.run(sources=["path/to/openapi_definition.yaml"])
+    assert result["documents"]
+    ```
     """
 
     MIN_REQUIRED_OPENAPI_SPEC_VERSION = 3
 
     def __init__(self):
         """
-        Initializes the OpenAPIServiceToFunctions instance
+        Create an OpenAPIServiceToFunctions component.
         """
         openapi_imports.check()
 
@@ -52,19 +55,21 @@ def run(
         self, sources: List[Union[str, Path, ByteStream]], system_messages: Optional[List[str]] = None
     ) -> Dict[str, Any]:
         """
-        Processes OpenAPI specification URLs or files to extract functions that can be invoked via OpenAI function
-        calling mechanism. Each source is paired with an optional system message. The system message can be potentially
-        used in LLM response generation.
-
-        :param sources: A list of OpenAPI specification sources, which can be URLs, file paths, or ByteStream objects.
-        :type sources: List[Union[str, Path, ByteStream]]
-        :param system_messages: A list of optional system messages corresponding to each source.
-        :type system_messages: Optional[List[str]]
-        :return: A dictionary with a key 'documents' containing a list of Document objects. Each Document object
-                 encapsulates a function definition and relevant metadata.
-        :rtype: Dict[str, Any]
-        :raises RuntimeError: If the OpenAPI specification cannot be downloaded or processed.
-        :raises ValueError: If the source type is not recognized or no functions are found in the OpenAPI specification.
+        Converts OpenAPI definitions into the OpenAI function calling format.
+
+        :param sources:
+            File paths, URLs or ByteStream objects of OpenAPI definitions.
+        :param system_messages:
+            Optional system messages for each source.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Documents containing a function definition and relevant metadata
+
+        :raises RuntimeError:
+            If the OpenAPI definitions cannot be downloaded or processed.
+        :raises ValueError:
+            If the source type is not recognized or no functions are found in the OpenAPI definitions.
         """
         documents: List[Document] = []
         system_messages = system_messages or [""] * len(sources)