Skip to content

Commit

Permalink
Convertor: CSVToDocument (#8328)
Browse files Browse the repository at this point in the history
* carry forwarded initial commit

* fix: doc strings

* fix: update docstrings

* fix: docstring update

* fix: csv encoding in actions

* fix: line endings through hooks

* fix: converter docs addition
  • Loading branch information
srini047 authored Sep 6, 2024
1 parent a292f0a commit e98a6fe
Show file tree
Hide file tree
Showing 8 changed files with 199 additions and 1 deletion.
3 changes: 2 additions & 1 deletion docs/pydoc/config/converters_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ loaders:
"txt",
"output_adapter",
"openapi_functions",
"docx"
"docx",
"csv"
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
2 changes: 2 additions & 0 deletions haystack/components/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.csv import CSVToDocument
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
Expand All @@ -27,4 +28,5 @@
"DOCXToDocument",
"DOCXMetadata",
"PPTXToDocument",
"CSVToDocument",
]
93 changes: 93 additions & 0 deletions haystack/components/converters/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

import io
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream

logger = logging.getLogger(__name__)


@component
class CSVToDocument:
    """
    Converts CSV files to Documents.

    Each source is decoded to text and stored unchanged as the content of a single Document;
    the CSV is not parsed into rows or columns.

    By default, it uses UTF-8 encoding when converting files but
    you can also set a custom encoding.
    It can attach metadata to the resulting documents.

    ### Usage example

    ```python
    from datetime import datetime

    from haystack.components.converters.csv import CSVToDocument

    converter = CSVToDocument()
    results = converter.run(sources=["sample.csv"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # col1,col2
    # row1,row1
    # row2,row2
    ```
    """

    def __init__(self, encoding: str = "utf-8"):
        """
        Creates a CSVToDocument component.

        :param encoding:
            The encoding of the csv files to convert.
            If the encoding is specified in the metadata of a source ByteStream,
            it overrides this value.
        """
        self.encoding = encoding

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts CSV files to Documents, one Document per readable source.

        Sources that cannot be read or decoded are skipped with a warning instead of raising.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
            When a key is present in both, the value from `meta` takes precedence.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created documents
        """
        documents = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                # A per-source "encoding" entry in the ByteStream metadata wins over the component default.
                encoding = bytestream.meta.get("encoding", self.encoding)
                # Decode the payload directly; wrapping it in a BytesIO first only copied the data.
                data = bytestream.data.decode(encoding=encoding)
            except Exception as e:
                logger.warning(
                    "Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
                )
                continue

            # Later keys win: values passed through `meta` override those carried by the ByteStream.
            merged_metadata = {**bytestream.meta, **metadata}
            document = Document(content=data, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}
4 changes: 4 additions & 0 deletions releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Add a CSV to Document converter component. It loads each file as a bytes object, decodes it, and adds the resulting string as a new document that can be used for further processing by the Document Splitter.
86 changes: 86 additions & 0 deletions test/components/converters/test_csv_to_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
import logging
from unittest.mock import patch
import pandas as pd
from pathlib import Path

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters.csv import CSVToDocument


@pytest.fixture
def csv_converter():
    """Provide a CSVToDocument instance built with its default arguments."""
    converter = CSVToDocument()
    return converter


class TestCSVToDocument:
    """Tests for the CSVToDocument converter component."""

    def test_init(self, csv_converter):
        """The fixture yields a CSVToDocument instance."""
        assert isinstance(csv_converter, CSVToDocument)

    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"
        files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]
        converter = CSVToDocument()
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 3
        assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
        assert isinstance(docs[0].content, str)
        assert docs[0].meta == bytestream.meta
        assert docs[1].meta["file_path"] == str(files[1])
        assert docs[2].meta["file_path"] == str(files[2])

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = [
            test_files_path / "csv" / "sample_2.csv",
            "non_existing_file.csv",
            test_files_path / "csv" / "sample_3.csv",
        ]
        converter = CSVToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=paths)
            assert "non_existing_file.csv" in caplog.text
        docs = output["documents"]
        assert len(docs) == 2
        assert docs[0].meta["file_path"] == str(paths[0])

    def test_encoding_override(self, test_files_path, caplog):
        """
        Test if the encoding metadata field is used properly
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"

        converter = CSVToDocument(encoding="utf-16-le")
        # The component reports decode failures at WARNING level, so capture at that level;
        # capturing at ERROR would filter the message out.
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=[bytestream])
            assert "codec can't decode" in caplog.text
        # A source that cannot be decoded is skipped rather than producing a Document.
        assert output["documents"] == []

        converter = CSVToDocument(encoding="utf-8")
        output = converter.run(sources=[bytestream])
        assert "Name,Age\r\n" in output["documents"][0].content

    def test_run_with_meta(self):
        """Metadata passed to run() is merged over the ByteStream's own metadata."""
        bytestream = ByteStream(
            data=b"Name,Age,City\r\nAlice,30,New York\r\nBob,25,Los Angeles\r\nCharlie,35,Chicago\r\n",
            meta={"name": "test_name", "language": "en"},
        )
        converter = CSVToDocument()
        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
        document = output["documents"][0]

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert document.meta == {"name": "test_name", "language": "it"}
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name,Age
John Doe,27
Jane Smith,37
Mike Johnson,47
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name,City
John Doe,New York
Jane Smith,Los Angeles
Mike Johnson,Chicago
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name,Email
John Doe,[email protected]
Jane Smith,[email protected]
Mike Johnson,[email protected]

0 comments on commit e98a6fe

Please sign in to comment.