Commit
* carry forwarded initial commit
* fix: doc strings
* fix: update docstrings
* fix: docstring update
* fix: csv encoding in actions
* fix: line endings through hooks
* fix: converter docs addition
Showing 8 changed files with 199 additions and 1 deletion.
@@ -0,0 +1,93 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

import io
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream

logger = logging.getLogger(__name__)


@component
class CSVToDocument:
    """
    Converts CSV files to Documents.

    By default, it uses UTF-8 encoding when converting files, but
    you can also set a custom encoding.
    It can attach metadata to the resulting documents.

    ### Usage example
    ```python
    from datetime import datetime

    from haystack.components.converters.csv import CSVToDocument

    converter = CSVToDocument()
    results = converter.run(sources=["sample.csv"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'col1,col2\nrow1,row1\nrow2,row2\n'
    ```
    """

    def __init__(self, encoding: str = "utf-8"):
        """
        Creates a CSVToDocument component.

        :param encoding:
            The encoding of the CSV files to convert.
            If the encoding is specified in the metadata of a source ByteStream,
            it overrides this value.
        """
        self.encoding = encoding

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts a CSV file to a Document.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
        :returns:
            A dictionary with the following keys:
            - `documents`: Created documents
        """
        documents = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                encoding = bytestream.meta.get("encoding", self.encoding)
                data = io.BytesIO(bytestream.data).getvalue().decode(encoding=encoding)
            except Exception as e:
                logger.warning(
                    "Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
                )
                continue

            merged_metadata = {**bytestream.meta, **metadata}
            document = Document(content=data, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}
@@ -0,0 +1,4 @@
---
features:
  - |
    Add a CSV to Document converter component. It loads the file as a bytes object and adds the decoded string as the content of a new Document, which can be further processed by the DocumentSplitter.
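
As a rough sketch of the usage this note describes (not something this commit ships), the converter's output can feed Haystack's `DocumentSplitter` in a pipeline; the file name and splitter settings are placeholders:

```python
from haystack import Pipeline
from haystack.components.converters.csv import CSVToDocument
from haystack.components.preprocessors import DocumentSplitter

pipeline = Pipeline()
pipeline.add_component("converter", CSVToDocument())
# Splitter settings are illustrative; pick split_by/split_length to suit the data.
pipeline.add_component("splitter", DocumentSplitter(split_by="word", split_length=150, split_overlap=0))
pipeline.connect("converter.documents", "splitter.documents")

# "sample.csv" is a placeholder path.
result = pipeline.run({"converter": {"sources": ["sample.csv"]}})
print(len(result["splitter"]["documents"]))
```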
@@ -0,0 +1,86 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
import logging
from unittest.mock import patch
import pandas as pd
from pathlib import Path

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters.csv import CSVToDocument


@pytest.fixture
def csv_converter():
    return CSVToDocument()


class TestCSVToDocument:
    def test_init(self, csv_converter):
        assert isinstance(csv_converter, CSVToDocument)

    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"
        files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]
        converter = CSVToDocument()
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 3
        assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
        assert isinstance(docs[0].content, str)
        assert docs[0].meta == bytestream.meta
        assert docs[1].meta["file_path"] == str(files[1])
        assert docs[2].meta["file_path"] == str(files[2])

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = [
            test_files_path / "csv" / "sample_2.csv",
            "non_existing_file.csv",
            test_files_path / "csv" / "sample_3.csv",
        ]
        converter = CSVToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=paths)
            assert "non_existing_file.csv" in caplog.text
        docs = output["documents"]
        assert len(docs) == 2
        assert docs[0].meta["file_path"] == str(paths[0])

    def test_encoding_override(self, test_files_path, caplog):
        """
        Test if the encoding metadata field is used properly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"

        converter = CSVToDocument(encoding="utf-16-le")
        output = converter.run(sources=[bytestream])
        with caplog.at_level(logging.ERROR):
            output = converter.run(sources=[bytestream])
            assert "codec can't decode" in caplog.text

        converter = CSVToDocument(encoding="utf-8")
        output = converter.run(sources=[bytestream])
        assert "Name,Age\r\n" in output["documents"][0].content

    def test_run_with_meta(self):
        bytestream = ByteStream(
            data=b"Name,Age,City\r\nAlice,30,New York\r\nBob,25,Los Angeles\r\nCharlie,35,Chicago\r\n",
            meta={"name": "test_name", "language": "en"},
        )
        converter = CSVToDocument()
        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
        document = output["documents"][0]

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert document.meta == {"name": "test_name", "language": "it"}
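
A short sketch of the per-source encoding override that `test_encoding_override` exercises: a `ByteStream` can declare its own `encoding` in `meta`, which takes precedence over the component-level default (the UTF-16 payload here is fabricated for illustration):

```python
from haystack.components.converters.csv import CSVToDocument
from haystack.dataclasses import ByteStream

# This source declares its own encoding in meta, overriding the component default.
utf16_stream = ByteStream(
    data="Name,Age\nÅsa,42\n".encode("utf-16"),
    meta={"encoding": "utf-16"},
)

# Without the meta override, UTF-8 decoding of these bytes would fail
# and the source would be skipped with a warning.
converter = CSVToDocument(encoding="utf-8")
docs = converter.run(sources=[utf16_stream])["documents"]
print(docs[0].content)  # decoded with UTF-16, as declared in the ByteStream meta
```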
@@ -0,0 +1,4 @@
Name,Age
John Doe,27
Jane Smith,37
Mike Johnson,47
@@ -0,0 +1,4 @@
Name,City
John Doe,New York
Jane Smith,Los Angeles
Mike Johnson,Chicago
@@ -0,0 +1,4 @@
Name,Email
John Doe,[email protected]
Jane Smith,[email protected]
Mike Johnson,[email protected]