From 21906d0558778aa1e869aded4ab0c998405fc4f6 Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Fri, 22 Nov 2024 13:55:08 +0100 Subject: [PATCH] feat: Add `store_full_path` to converters (1/3) (#8566) * Add store_full_path param to 3 converters --- haystack/components/converters/csv.py | 21 +++++++++- haystack/components/converters/docx.py | 24 +++++++++-- haystack/components/converters/html.py | 25 +++++++++-- ...-param-to-converters-60007b232e279a33.yaml | 8 ++++ .../converters/test_csv_to_document.py | 19 +++++++++ .../converters/test_docx_file_to_document.py | 42 ++++++++++++++++--- .../converters/test_html_to_document.py | 22 ++++++++++ 7 files changed, 149 insertions(+), 12 deletions(-) create mode 100644 releasenotes/notes/add-store-full-path-param-to-converters-60007b232e279a33.yaml diff --git a/haystack/components/converters/csv.py b/haystack/components/converters/csv.py index 721d8cf625..2b36d4cc67 100644 --- a/haystack/components/converters/csv.py +++ b/haystack/components/converters/csv.py @@ -3,6 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import io +import os +import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -34,7 +36,7 @@ class CSVToDocument: ``` """ - def __init__(self, encoding: str = "utf-8"): + def __init__(self, encoding: str = "utf-8", store_full_path: bool = True): """ Creates a CSVToDocument component. @@ -42,8 +44,12 @@ def __init__(self, encoding: str = "utf-8"): The encoding of the csv files to convert. If the encoding is specified in the metadata of a source ByteStream, it overrides this value. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ self.encoding = encoding + self.store_full_path = store_full_path @component.output_types(documents=List[Document]) def run( @@ -87,6 +93,19 @@ def run( continue merged_metadata = {**bytestream.meta, **metadata} + + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + + if not self.store_full_path and "file_path" in bytestream.meta: + file_path = bytestream.meta.get("file_path") + if file_path: # Ensure the value is not None for pylint + merged_metadata["file_path"] = os.path.basename(file_path) + document = Document(content=data, meta=merged_metadata) documents.append(document) diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py index dc0a51f485..604b1a5173 100644 --- a/haystack/components/converters/docx.py +++ b/haystack/components/converters/docx.py @@ -4,6 +4,8 @@ import csv import io +import os +import warnings from dataclasses import dataclass from enum import Enum from io import StringIO @@ -107,15 +109,19 @@ class DOCXToDocument: ``` """ - def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV): + def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True): """ Create a DOCXToDocument component. :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN, DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ docx_import.check() self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format + self.store_full_path = store_full_path def to_dict(self) -> Dict[str, Any]: """ @@ -124,7 +130,7 @@ def to_dict(self) -> Dict[str, Any]: :returns: Dictionary with serialized data. """ - return default_to_dict(self, table_format=str(self.table_format)) + return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument": @@ -158,7 +164,6 @@ def run( If it's a list, the length of the list must match the number of sources, because the two lists will be zipped. If `sources` contains ByteStream objects, their `meta` will be added to the output Documents. - :returns: A dictionary with the following keys: - `documents`: Created Documents @@ -184,8 +189,21 @@ def run( ) continue + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + docx_metadata = self._get_docx_metadata(document=docx_document) merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata} + + if not self.store_full_path and "file_path" in bytestream.meta: + file_path = bytestream.meta.get("file_path") + if file_path: # Ensure the value is not None for pylint + merged_metadata["file_path"] = os.path.basename(file_path) + document = Document(content=text, meta=merged_metadata) documents.append(document) diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py index b090ec1754..4c226ab99f 100644 --- a/haystack/components/converters/html.py +++ b/haystack/components/converters/html.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +import os +import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -33,17 +35,21 @@ class HTMLToDocument: ``` """ - def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None): + def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True): """ Create an HTMLToDocument component. :param extraction_kwargs: A dictionary containing keyword arguments to customize the extraction process. These are passed to the underlying Trafilatura `extract` function. For the full list of available arguments, see the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract). + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ trafilatura_import.check() self.extraction_kwargs = extraction_kwargs or {} + self.store_full_path = store_full_path def to_dict(self) -> Dict[str, Any]: """ @@ -52,7 +58,7 @@ def to_dict(self) -> Dict[str, Any]: :returns: Dictionary with serialized data. """ - return default_to_dict(self, extraction_kwargs=self.extraction_kwargs) + return default_to_dict(self, extraction_kwargs=self.extraction_kwargs, store_full_path=self.store_full_path) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument": @@ -115,7 +121,20 @@ def run( ) continue - document = Document(content=text, meta={**bytestream.meta, **metadata}) + merged_metadata = {**bytestream.meta, **metadata} + + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + if not self.store_full_path and "file_path" in bytestream.meta: + file_path = bytestream.meta.get("file_path") + if file_path: # Ensure the value is not None for pylint + merged_metadata["file_path"] = os.path.basename(file_path) + + document = Document(content=text, meta=merged_metadata) documents.append(document) return {"documents": documents} diff --git a/releasenotes/notes/add-store-full-path-param-to-converters-60007b232e279a33.yaml b/releasenotes/notes/add-store-full-path-param-to-converters-60007b232e279a33.yaml new file mode 100644 index 0000000000..93d8dbc0d4 --- /dev/null +++ b/releasenotes/notes/add-store-full-path-param-to-converters-60007b232e279a33.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Added a new `store_full_path` parameter to the `__init__` method of `CSVToDocument`, `DOCXToDocument`, and `HTMLToDocument`. The default value is `True`, which stores full file path in the metadata of the output documents. When set to `False`, only the file name is stored. + +deprecations: + - | + The default value of the `store_full_path` parameter will be changed to `False` in Haysatck 2.9.0 to enhance privacy. diff --git a/test/components/converters/test_csv_to_document.py b/test/components/converters/test_csv_to_document.py index c8f0bac0f1..3271c09c17 100644 --- a/test/components/converters/test_csv_to_document.py +++ b/test/components/converters/test_csv_to_document.py @@ -39,6 +39,25 @@ def test_run(self, test_files_path): assert docs[1].meta["file_path"] == str(files[1]) assert docs[2].meta["file_path"] == str(files[2]) + def test_run_with_store_full_path_false(self, test_files_path): + """ + Test if the component runs correctly with store_full_path=False + """ + bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv") + bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv") + bytestream.meta["key"] = "value" + files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"] + converter = CSVToDocument(store_full_path=False) + output = converter.run(sources=files) + docs = output["documents"] + assert len(docs) == 3 + assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content + assert isinstance(docs[0].content, str) + assert docs[0].meta["file_path"] == "sample_1.csv" + assert docs[0].meta["key"] == "value" + assert docs[1].meta["file_path"] == "sample_2.csv" + assert docs[2].meta["file_path"] == "sample_3.csv" + def test_run_error_handling(self, test_files_path, caplog): """ Test if the component correctly handles errors. diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py index 64dfaa79f4..c0ce370f3e 100644 --- a/test/components/converters/test_docx_file_to_document.py +++ b/test/components/converters/test_docx_file_to_document.py @@ -32,7 +32,7 @@ def test_to_dict(self): data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"table_format": "csv"}, + "init_parameters": {"store_full_path": True, "table_format": "csv"}, } def test_to_dict_custom_parameters(self): @@ -40,28 +40,28 @@ def test_to_dict_custom_parameters(self): data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"table_format": "markdown"}, + "init_parameters": {"store_full_path": True, "table_format": "markdown"}, } converter = DOCXToDocument(table_format="csv") data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"table_format": "csv"}, + "init_parameters": {"store_full_path": True, "table_format": "csv"}, } converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN) data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"table_format": "markdown"}, + "init_parameters": {"store_full_path": True, "table_format": "markdown"}, } converter = DOCXToDocument(table_format=DOCXTableFormat.CSV) data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"table_format": "csv"}, + "init_parameters": {"store_full_path": True, "table_format": "csv"}, } def test_from_dict(self): @@ -179,6 +179,38 @@ def test_run_with_table(self, test_files_path): "Now we are in Page 2" in part for part in content_parts[table_index + 1 :] ), "Text after table not found" + def test_run_with_store_full_path_false(self, test_files_path): + """ + Test if the component runs correctly with store_full_path=False + """ + docx_converter = DOCXToDocument(store_full_path=False) + paths = [test_files_path / "docx" / "sample_docx_1.docx"] + output = docx_converter.run(sources=paths) + docs = output["documents"] + assert len(docs) == 1 + assert "History" in docs[0].content + assert docs[0].meta.keys() == {"file_path", "docx"} + assert docs[0].meta == { + "file_path": "sample_docx_1.docx", + "docx": DOCXMetadata( + author="Microsoft Office User", + category="", + comments="", + content_status="", + created="2024-06-09T21:17:00+00:00", + identifier="", + keywords="", + language="", + last_modified_by="Carlos Fernández Lorán", + last_printed=None, + modified="2024-06-09T21:27:00+00:00", + revision=2, + subject="", + title="", + version="", + ), + } + @pytest.mark.parametrize("table_format", ["markdown", "csv"]) def test_table_between_two_paragraphs(self, test_files_path, table_format): docx_converter = DOCXToDocument(table_format=table_format) diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py index 85c0ffd710..df76c8e892 100644 --- a/test/components/converters/test_html_to_document.py +++ b/test/components/converters/test_html_to_document.py @@ -38,6 +38,28 @@ def test_run_doc_metadata(self, test_files_path): assert "Haystack" in docs[0].content assert docs[0].meta["file_name"] == "what_is_haystack.html" + def test_run_with_store_full_path(self, test_files_path): + """ + Test if the component runs correctly when metadata is supplied by the user. + """ + converter = HTMLToDocument() + sources = [test_files_path / "html" / "what_is_haystack.html"] + + results = converter.run(sources=sources) # store_full_path is True by default + docs = results["documents"] + + assert len(docs) == 1 + assert "Haystack" in docs[0].content + assert docs[0].meta["file_path"] == str(sources[0]) + + converter_2 = HTMLToDocument(store_full_path=False) + results = converter_2.run(sources=sources) + docs = results["documents"] + + assert len(docs) == 1 + assert "Haystack" in docs[0].content + assert docs[0].meta["file_path"] == "what_is_haystack.html" + def test_incorrect_meta(self, test_files_path): """ Test if the component raises an error when incorrect metadata is supplied by the user.