From e98a6fea04f65fa1a3c5baed49623356c3939fef Mon Sep 17 00:00:00 2001 From: Sriniketh J <81156510+srini047@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:29:12 +0530 Subject: [PATCH] Convertor: CSVToDocument (#8328) * carry forwarded initial commit * fix: doc strings * fix: update docstrings * fix: docstring update * fix: csv encoding in actions * fix: line endings through hooks * fix: converter docs addition --- docs/pydoc/config/converters_api.yml | 3 +- haystack/components/converters/__init__.py | 2 + haystack/components/converters/csv.py | 93 +++++++++++++++++++ .../add-csv-converter-5c0d52f180d498f5.yaml | 4 + .../converters/test_csv_to_document.py | 86 +++++++++++++++++ test/test_files/csv/sample_1.csv | 4 + test/test_files/csv/sample_2.csv | 4 + test/test_files/csv/sample_3.csv | 4 + 8 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 haystack/components/converters/csv.py create mode 100644 releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml create mode 100644 test/components/converters/test_csv_to_document.py create mode 100644 test/test_files/csv/sample_1.csv create mode 100644 test/test_files/csv/sample_2.csv create mode 100644 test/test_files/csv/sample_3.csv diff --git a/docs/pydoc/config/converters_api.yml b/docs/pydoc/config/converters_api.yml index e8ec88c7a2..945ae37430 100644 --- a/docs/pydoc/config/converters_api.yml +++ b/docs/pydoc/config/converters_api.yml @@ -13,7 +13,8 @@ loaders: "txt", "output_adapter", "openapi_functions", - "docx" + "docx", + "csv" ] ignore_when_discovered: ["__init__"] processors: diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py index bde66e1589..681ab85c35 100644 --- a/haystack/components/converters/__init__.py +++ b/haystack/components/converters/__init__.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from haystack.components.converters.azure import AzureOCRDocumentConverter +from haystack.components.converters.csv import 
CSVToDocument from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument from haystack.components.converters.html import HTMLToDocument from haystack.components.converters.markdown import MarkdownToDocument @@ -27,4 +28,5 @@ "DOCXToDocument", "DOCXMetadata", "PPTXToDocument", + "CSVToDocument", ] diff --git a/haystack/components/converters/csv.py b/haystack/components/converters/csv.py new file mode 100644 index 0000000000..721d8cf625 --- /dev/null +++ b/haystack/components/converters/csv.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import io +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from haystack import Document, component, logging +from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata +from haystack.dataclasses import ByteStream + +logger = logging.getLogger(__name__) + + +@component +class CSVToDocument: + """ + Converts CSV files to Documents. + + By default, it uses UTF-8 encoding when converting files but + you can also set a custom encoding. + It can attach metadata to the resulting documents. + + ### Usage example + + ```python + from haystack.components.converters.csv import CSVToDocument + converter = CSVToDocument() + results = converter.run(sources=["sample.csv"], meta={"date_added": datetime.now().isoformat()}) + documents = results["documents"] + print(documents[0].content) + # 'col1,col2\nrow1,row1\nrow2,row2\n' + ``` + """ + + def __init__(self, encoding: str = "utf-8"): + """ + Creates a CSVToDocument component. + + :param encoding: + The encoding of the csv files to convert. + If the encoding is specified in the metadata of a source ByteStream, + it overrides this value.
+ """ + self.encoding = encoding + + @component.output_types(documents=List[Document]) + def run( + self, + sources: List[Union[str, Path, ByteStream]], + meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + ): + """ + Converts a CSV file to a Document. + + :param sources: + List of file paths or ByteStream objects. + :param meta: + Optional metadata to attach to the documents. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced documents. + If it's a list, the length of the list must match the number of sources, because the two lists will + be zipped. + If `sources` contains ByteStream objects, their `meta` will be added to the output documents. + :returns: + A dictionary with the following keys: + - `documents`: Created documents + """ + documents = [] + + meta_list = normalize_metadata(meta, sources_count=len(sources)) + + for source, metadata in zip(sources, meta_list): + try: + bytestream = get_bytestream_from_source(source) + except Exception as e: + logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e) + continue + try: + encoding = bytestream.meta.get("encoding", self.encoding) + data = io.BytesIO(bytestream.data).getvalue().decode(encoding=encoding) + except Exception as e: + logger.warning( + "Could not convert file {source}. Skipping it. 
Error message: {error}", source=source, error=e + ) + continue + + merged_metadata = {**bytestream.meta, **metadata} + document = Document(content=data, meta=merged_metadata) + documents.append(document) + + return {"documents": documents} diff --git a/releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml b/releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml new file mode 100644 index 0000000000..e5a01c9457 --- /dev/null +++ b/releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Add a CSV to Document converter component. Loads the file as bytes object. Adds the loaded string as a new document that can be used for further processing by the Document Splitter. diff --git a/test/components/converters/test_csv_to_document.py b/test/components/converters/test_csv_to_document.py new file mode 100644 index 0000000000..c8f0bac0f1 --- /dev/null +++ b/test/components/converters/test_csv_to_document.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +import logging +from unittest.mock import patch +import pandas as pd +from pathlib import Path + +import pytest + +from haystack.dataclasses import ByteStream +from haystack.components.converters.csv import CSVToDocument + + +@pytest.fixture +def csv_converter(): + return CSVToDocument() + + +class TestCSVToDocument: + def test_init(self, csv_converter): + assert isinstance(csv_converter, CSVToDocument) + + def test_run(self, test_files_path): + """ + Test if the component runs correctly. 
+ """ + bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv") + bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv") + bytestream.meta["key"] = "value" + files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"] + converter = CSVToDocument() + output = converter.run(sources=files) + docs = output["documents"] + assert len(docs) == 3 + assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content + assert isinstance(docs[0].content, str) + assert docs[0].meta == bytestream.meta + assert docs[1].meta["file_path"] == str(files[1]) + assert docs[2].meta["file_path"] == str(files[2]) + + def test_run_error_handling(self, test_files_path, caplog): + """ + Test if the component correctly handles errors. + """ + paths = [ + test_files_path / "csv" / "sample_2.csv", + "non_existing_file.csv", + test_files_path / "csv" / "sample_3.csv", + ] + converter = CSVToDocument() + with caplog.at_level(logging.WARNING): + output = converter.run(sources=paths) + assert "non_existing_file.csv" in caplog.text + docs = output["documents"] + assert len(docs) == 2 + assert docs[0].meta["file_path"] == str(paths[0]) + + def test_encoding_override(self, test_files_path, caplog): + """ + Test if the encoding metadata field is used properly + """ + bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv") + bytestream.meta["key"] = "value" + + converter = CSVToDocument(encoding="utf-16-le") + output = converter.run(sources=[bytestream]) + with caplog.at_level(logging.ERROR): + output = converter.run(sources=[bytestream]) + assert "codec can't decode" in caplog.text + + converter = CSVToDocument(encoding="utf-8") + output = converter.run(sources=[bytestream]) + assert "Name,Age\r\n" in output["documents"][0].content + + def test_run_with_meta(self): + bytestream = ByteStream( + data=b"Name,Age,City\r\nAlice,30,New York\r\nBob,25,Los 
Angeles\r\nCharlie,35,Chicago\r\n", + meta={"name": "test_name", "language": "en"}, + ) + converter = CSVToDocument() + output = converter.run(sources=[bytestream], meta=[{"language": "it"}]) + document = output["documents"][0] + + # check that the metadata from the bytestream is merged with that from the meta parameter + assert document.meta == {"name": "test_name", "language": "it"} diff --git a/test/test_files/csv/sample_1.csv b/test/test_files/csv/sample_1.csv new file mode 100644 index 0000000000..d3fc3116b1 --- /dev/null +++ b/test/test_files/csv/sample_1.csv @@ -0,0 +1,4 @@ +Name,Age +John Doe,27 +Jane Smith,37 +Mike Johnson,47 diff --git a/test/test_files/csv/sample_2.csv b/test/test_files/csv/sample_2.csv new file mode 100644 index 0000000000..3e23848a71 --- /dev/null +++ b/test/test_files/csv/sample_2.csv @@ -0,0 +1,4 @@ +Name,City +John Doe,New York +Jane Smith,Los Angeles +Mike Johnson,Chicago diff --git a/test/test_files/csv/sample_3.csv b/test/test_files/csv/sample_3.csv new file mode 100644 index 0000000000..fcc805b0a3 --- /dev/null +++ b/test/test_files/csv/sample_3.csv @@ -0,0 +1,4 @@ +Name,Email +John Doe,johndoe@example.com +Jane Smith,janesmith@example.com +Mike Johnson,mikejohnson@example.com