Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: Update converters docstrings #7250

Merged
merged 1 commit into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 34 additions & 22 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@ class AzureOCRDocumentConverter:
Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.

In order to be able to use this component, you need an active Azure account
and a Document Intelligence or Cognitive Services resource. Please follow the steps described in the
and a Document Intelligence or Cognitive Services resource. Follow the steps described in the
[Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
to set up your resource.

Usage example:
```python
from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters import AzureOCRDocumentConverter
from haystack.utils import Secret

converter = AzureOCRDocumentConverter(endpoint="<url>", api_key=Secret.from_token("<your-api-key>"))
results = converter.run(sources=["image-based-document.pdf"], meta={"date_added": datetime.now().isoformat()})
results = converter.run(sources=["path/to/document_with_images.pdf"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the PDF file.'
Expand All @@ -44,9 +44,12 @@ def __init__(
"""
Create an AzureOCRDocumentConverter component.

:param endpoint: The endpoint of your Azure resource.
:param api_key: The key of your Azure resource.
:param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
:param endpoint:
The endpoint of your Azure resource.
:param api_key:
The key of your Azure resource.
:param model_id:
The model ID of the model you want to use. Refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
for a list of available models. Default: `"prebuilt-read"`.
"""
azure_import.check()
Expand All @@ -59,20 +62,21 @@ def __init__(
@component.output_types(documents=List[Document], raw_azure_response=List[Dict])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Convert files to Documents using Azure's Document Intelligence service.

This component creates two outputs: `documents` and `raw_azure_response`. The `documents` output contains
a list of Documents that were created from the files. The `raw_azure_response` output contains a list of
the raw responses from Azure's Document Intelligence service.

:param sources: List of file paths or ByteStream objects.
:param meta: Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key
and the raw Azure response under the 'raw_azure_response' key.
Convert a list of files to Documents using Azure's Document Intelligence service.

:param sources:
List of file paths or ByteStream objects.
:param meta:
Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

:returns:
A dictionary with the following keys:
- `documents`: List of created Documents
- `raw_azure_response`: List of raw Azure responses used to create the Documents
"""
documents = []
azure_output = []
Expand Down Expand Up @@ -104,14 +108,22 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.

:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, api_key=self.api_key.to_dict(), endpoint=self.endpoint, model_id=self.model_id)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter":
"""
Deserialize this component from a dictionary.
Deserializes the component from a dictionary.

:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)
Expand Down
45 changes: 32 additions & 13 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,14 @@ class HTMLToDocument:

Usage example:
```python
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters import HTMLToDocument

converter = HTMLToDocument()
results = converter.run(sources=["sample.html"])
results = converter.run(sources=["path/to/sample.html"])
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the HTML file.'
```

"""

def __init__(
Expand All @@ -43,17 +42,32 @@ def __init__(
"""
Create an HTMLToDocument component.

:param extractor_type: The type of boilerpy3 extractor to use. Defaults to `DefaultExtractor`.
For more information on the different types of extractors,
see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
:param extractor_type:
Name of the extractor class to use. Defaults to `DefaultExtractor`.
For more information on the different types of extractors,
see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
"""
self.extractor_type = extractor_type

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.

:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, extractor_type=self.extractor_type)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
"""
Deserializes the component from a dictionary.

:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
Expand All @@ -65,13 +79,18 @@ def run(
"""
Converts a list of HTML files to Documents.

:param sources: List of HTML file paths or ByteStream objects.
:param meta: Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
:param sources:
List of HTML file paths or ByteStream objects.
:param meta:
Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

:returns:
A dictionary with the following keys:
- `documents`: Created Documents
"""

documents = []
Expand Down
35 changes: 22 additions & 13 deletions haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ class MarkdownToDocument:

Usage example:
```python
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.components.converters import MarkdownToDocument

converter = MarkdownToDocument()
results = converter.run(sources=["sample.md"], meta={"date_added": datetime.now().isoformat()})
results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the markdown file.'
Expand All @@ -35,8 +35,12 @@ class MarkdownToDocument:

def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True):
"""
:param table_to_single_line: Convert contents of the table into a single line. Defaults to False.
:param progress_bar: Show a progress bar for the conversion. Defaults to True.
Create a MarkdownToDocument component.

:param table_to_single_line:
If True, converts table contents into a single line.
:param progress_bar:
If True, shows a progress bar when running.
"""
markdown_conversion_imports.check()

Expand All @@ -50,15 +54,20 @@ def run(
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
):
"""
Reads text from a markdown file and executes optional preprocessing steps.

:param sources: A list of markdown data sources (file paths or binary objects)
:param meta: Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
Converts a list of Markdown files to Documents.

:param sources:
List of file paths or ByteStream objects.
:param meta:
Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

:returns:
A dictionary with the following keys:
- `documents`: List of created Documents
"""
parser = MarkdownIt(renderer_cls=RendererPlain)
if self.table_to_single_line:
Expand Down
67 changes: 36 additions & 31 deletions haystack/components/converters/openapi_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,33 @@
@component
class OpenAPIServiceToFunctions:
"""
OpenAPIServiceToFunctions is responsible for converting an OpenAPI service specification into a format suitable
for OpenAI function calling, based on the provided OpenAPI specification. Given an OpenAPI specification,
OpenAPIServiceToFunctions processes it, and extracts function definitions that can be invoked via OpenAI's
function calling mechanism. The format of the extracted functions is compatible with OpenAI's function calling
JSON format.

Minimal requirements for OpenAPI specification:
- OpenAPI version 3.0.0 or higher
- Each path must have:
- a unique operationId
- a description
- a requestBody or parameters or both
- a schema for the requestBody and/or parameters


See https://github.com/OAI/OpenAPI-Specification for more details on OpenAPI specification.
See https://platform.openai.com/docs/guides/function-calling for more details on OpenAI function calling.
Converts OpenAPI service definitions to a format suitable for OpenAI function calling.

The definition must respect OpenAPI specification 3.0.0 or higher.
It can be specified in JSON or YAML format.
Each function must have:
- unique operationId
- description
- requestBody and/or parameters
- schema for the requestBody and/or parameters
For more details on the OpenAPI specification, see the [official documentation](https://github.com/OAI/OpenAPI-Specification).
For more details on OpenAI function calling, see the [official documentation](https://platform.openai.com/docs/guides/function-calling).

Usage example:
```python
from haystack.components.converters import OpenAPIServiceToFunctions

converter = OpenAPIServiceToFunctions()
result = converter.run(sources=["path/to/openapi_definition.yaml"])
assert result["documents"]
```
"""

MIN_REQUIRED_OPENAPI_SPEC_VERSION = 3

def __init__(self):
"""
Initializes the OpenAPIServiceToFunctions instance
Create an OpenAPIServiceToFunctions component.
"""
openapi_imports.check()

Expand All @@ -52,19 +55,21 @@ def run(
self, sources: List[Union[str, Path, ByteStream]], system_messages: Optional[List[str]] = None
) -> Dict[str, Any]:
"""
Processes OpenAPI specification URLs or files to extract functions that can be invoked via OpenAI function
calling mechanism. Each source is paired with an optional system message. The system message can be potentially
used in LLM response generation.

:param sources: A list of OpenAPI specification sources, which can be URLs, file paths, or ByteStream objects.
:type sources: List[Union[str, Path, ByteStream]]
:param system_messages: A list of optional system messages corresponding to each source.
:type system_messages: Optional[List[str]]
:return: A dictionary with a key 'documents' containing a list of Document objects. Each Document object
encapsulates a function definition and relevant metadata.
:rtype: Dict[str, Any]
:raises RuntimeError: If the OpenAPI specification cannot be downloaded or processed.
:raises ValueError: If the source type is not recognized or no functions are found in the OpenAPI specification.
Converts OpenAPI definitions into OpenAI function calling format.

:param sources:
File paths, URLs or ByteStream objects of OpenAPI definitions.
:param system_messages:
Optional system messages for each source.

:returns:
A dictionary with the following keys:
- `documents`: Documents containing a function definition and relevant metadata

:raises RuntimeError:
If the OpenAPI definitions cannot be downloaded or processed.
:raises ValueError:
If the source type is not recognized or no functions are found in the OpenAPI definitions.
"""
documents: List[Document] = []
system_messages = system_messages or [""] * len(sources)
Expand Down
Loading