Convertor: CSVToDocument (#8328)

* carry forwarded initial commit * fix: doc strings * fix: update docstrings * fix: docstring update * fix: csv encoding in actions * fix: line endings through hooks * fix: converter docs addition
deepset-ai · Sep 6, 2024 · e98a6fe · e98a6fe
1 parent a292f0a
commit e98a6fe
Show file tree

Hide file tree

Showing 8 changed files with 199 additions and 1 deletion.
diff --git a/docs/pydoc/config/converters_api.yml b/docs/pydoc/config/converters_api.yml
@@ -13,7 +13,8 @@ loaders:
         "txt",
         "output_adapter",
         "openapi_functions",
-        "docx"
+        "docx",
+        "csv"
       ]
     ignore_when_discovered: ["__init__"]
 processors:

diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from haystack.components.converters.azure import AzureOCRDocumentConverter
+from haystack.components.converters.csv import CSVToDocument
 from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
 from haystack.components.converters.html import HTMLToDocument
 from haystack.components.converters.markdown import MarkdownToDocument
@@ -27,4 +28,5 @@
     "DOCXToDocument",
     "DOCXMetadata",
     "PPTXToDocument",
+    "CSVToDocument",
 ]
diff --git a/haystack/components/converters/csv.py b/haystack/components/converters/csv.py
@@ -0,0 +1,93 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import io
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from haystack import Document, component, logging
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
+from haystack.dataclasses import ByteStream
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class CSVToDocument:
+    """
+    Converts CSV files to Documents.
+
+    By default, it uses UTF-8 encoding when converting files but
+    you can also set a custom encoding.
+    It can attach metadata to the resulting documents.
+
+    ### Usage example
+
+    ```python
+    from haystack.components.converters.csv import CSVToDocument
+    converter = CSVToDocument()
+    results = converter.run(sources=["sample.csv"], meta={"date_added": datetime.now().isoformat()})
+    documents = results["documents"]
+    print(documents[0].content)
+    # 'col1,col2\nrow1,row1\nrow2,row2\n'
+    ```
+    """
+
+    def __init__(self, encoding: str = "utf-8"):
+        """
+        Creates a CSVToDocument component.
+
+        :param encoding:
+            The encoding of the csv files to convert.
+            If the encoding is specified in the metadata of a source ByteStream,
+            it overrides this value.
+        """
+        self.encoding = encoding
+
+    @component.output_types(documents=List[Document])
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
+        """
+        Converts CSV files to Documents, one Document per successfully read source.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will
+            be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created documents
+        """
+        documents = []
+
+        meta_list = normalize_metadata(meta, sources_count=len(sources))
+
+        for source, metadata in zip(sources, meta_list):
+            try:
+                bytestream = get_bytestream_from_source(source)
+            except Exception as e:
+                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
+                continue
+            try:
+                encoding = bytestream.meta.get("encoding", self.encoding)
+                data = io.BytesIO(bytestream.data).getvalue().decode(encoding=encoding)
+            except Exception as e:
+                logger.warning(
+                    "Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
+                )
+                continue
+
+            merged_metadata = {**bytestream.meta, **metadata}
+            document = Document(content=data, meta=merged_metadata)
+            documents.append(document)
+
+        return {"documents": documents}
diff --git a/releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml b/releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add a CSV to Document converter component. It loads each file as a bytes object, decodes it, and adds the resulting string as the content of a new Document that can be used for further processing by the Document Splitter.
diff --git a/test/components/converters/test_csv_to_document.py b/test/components/converters/test_csv_to_document.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from unittest.mock import patch
+import pandas as pd
+from pathlib import Path
+
+import pytest
+
+from haystack.dataclasses import ByteStream
+from haystack.components.converters.csv import CSVToDocument
+
+
+@pytest.fixture
+def csv_converter():
+    return CSVToDocument()
+
+
+class TestCSVToDocument:
+    def test_init(self, csv_converter):
+        assert isinstance(csv_converter, CSVToDocument)
+
+    def test_run(self, test_files_path):
+        """
+        Test if the component runs correctly.
+        """
+        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
+        bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")
+        bytestream.meta["key"] = "value"
+        files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]
+        converter = CSVToDocument()
+        output = converter.run(sources=files)
+        docs = output["documents"]
+        assert len(docs) == 3
+        assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
+        assert isinstance(docs[0].content, str)
+        assert docs[0].meta == bytestream.meta
+        assert docs[1].meta["file_path"] == str(files[1])
+        assert docs[2].meta["file_path"] == str(files[2])
+
+    def test_run_error_handling(self, test_files_path, caplog):
+        """
+        Test if the component correctly handles errors.
+        """
+        paths = [
+            test_files_path / "csv" / "sample_2.csv",
+            "non_existing_file.csv",
+            test_files_path / "csv" / "sample_3.csv",
+        ]
+        converter = CSVToDocument()
+        with caplog.at_level(logging.WARNING):
+            output = converter.run(sources=paths)
+            assert "non_existing_file.csv" in caplog.text
+        docs = output["documents"]
+        assert len(docs) == 2
+        assert docs[0].meta["file_path"] == str(paths[0])
+
+    def test_encoding_override(self, test_files_path, caplog):
+        """
+        Test that the `encoding` init parameter is used when decoding: a mismatched encoding fails, the right one succeeds
+        """
+        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
+        bytestream.meta["key"] = "value"
+
+        converter = CSVToDocument(encoding="utf-16-le")
+        output = converter.run(sources=[bytestream])
+        with caplog.at_level(logging.ERROR):
+            output = converter.run(sources=[bytestream])
+            assert "codec can't decode" in caplog.text
+
+        converter = CSVToDocument(encoding="utf-8")
+        output = converter.run(sources=[bytestream])
+        assert "Name,Age\r\n" in output["documents"][0].content
+
+    def test_run_with_meta(self):
+        bytestream = ByteStream(
+            data=b"Name,Age,City\r\nAlice,30,New York\r\nBob,25,Los Angeles\r\nCharlie,35,Chicago\r\n",
+            meta={"name": "test_name", "language": "en"},
+        )
+        converter = CSVToDocument()
+        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
+        document = output["documents"][0]
+
+        # check that the metadata from the bytestream is merged with that from the meta parameter
+        assert document.meta == {"name": "test_name", "language": "it"}
diff --git a/test/test_files/csv/sample_1.csv b/test/test_files/csv/sample_1.csv
@@ -0,0 +1,4 @@
+Name,Age
+John Doe,27
+Jane Smith,37
+Mike Johnson,47
diff --git a/test/test_files/csv/sample_2.csv b/test/test_files/csv/sample_2.csv
@@ -0,0 +1,4 @@
+Name,City
+John Doe,New York
+Jane Smith,Los Angeles
+Mike Johnson,Chicago
diff --git a/test/test_files/csv/sample_3.csv b/test/test_files/csv/sample_3.csv
@@ -0,0 +1,4 @@
+Name,Email
+John Doe,[email protected]
+Jane Smith,[email protected]
+Mike Johnson,[email protected]