-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add
MarkdownToTextDocument
(v2) (#6159)
* Add MarkdownToTextDocument * Add release notes * Update GitHub workflows * Update GitHub workflows * Refactor code with minimal dependencies * Update docstrings * Apply suggestions from code review Co-authored-by: Daria Fokina <[email protected]> * Update document with content and meta for backward compatibility * Refactor Document Class for Backward Compatibility Co-authored-by: Stefano Fiorucci <[email protected]> * Update tests * Improve test assertions --------- Co-authored-by: Daria Fokina <[email protected]> Co-authored-by: Stefano Fiorucci <[email protected]>
- Loading branch information
1 parent
5e12230
commit 6bf0b9d
Showing
7 changed files
with
266 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import logging | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional, Union | ||
|
||
from tqdm import tqdm | ||
|
||
from haystack.preview import Document, component | ||
from haystack.preview.dataclasses import ByteStream | ||
from haystack.preview.lazy_imports import LazyImport | ||
|
||
with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports: | ||
from markdown_it import MarkdownIt | ||
from mdit_plain.renderer import RendererPlain | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
@component
class MarkdownToDocument:
    """
    Converts a Markdown file into a text Document.

    Each source is read as UTF-8 text, rendered to plain text with `markdown-it-py` using the
    `mdit_plain` renderer, and wrapped in a `Document`. Sources that cannot be read or rendered
    are skipped with a warning instead of aborting the whole run.

    Usage example:
    ```python
    from haystack.preview.components.file_converters.markdown import MarkdownToDocument

    converter = MarkdownToDocument()
    results = converter.run(sources=["sample.md"])
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the markdown file.'
    ```
    """

    def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True):
        """
        :param table_to_single_line: Convert contents of the table into a single line. Defaults to False.
            When set, the parser's "table" rule is enabled before rendering.
        :param progress_bar: Show a progress bar for the conversion. Defaults to True.
        """
        # Fail early with an installation hint if the optional markdown dependencies are missing.
        markdown_conversion_imports.check()

        self.table_to_single_line = table_to_single_line
        self.progress_bar = progress_bar

    @component.output_types(documents=List[Document])
    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
        """
        Reads text from a markdown file and executes optional preprocessing steps.

        :param sources: A list of markdown data sources (file paths or binary objects)
        :param meta: Optional list of metadata to attach to the Documents.
            The length of the list must match the number of sources. Defaults to `None`.
        :return: A dictionary with the key `documents` holding the Documents created from the
            sources that could be read and rendered. Failed sources are skipped.
        :raises ValueError: If `meta` is provided and its length differs from the number of sources.
        """
        parser = MarkdownIt(renderer_cls=RendererPlain)
        if self.table_to_single_line:
            parser.enable("table")

        if meta is None:
            # One fresh dict per source: `[{}] * n` would make every Document share a single dict.
            meta = [{} for _ in sources]
        elif len(meta) != len(sources):
            # zip() would otherwise silently drop the sources that have no matching metadata entry.
            raise ValueError(
                f"The length of the metadata list ({len(meta)}) must match the number of sources ({len(sources)})."
            )

        documents = []
        for source, metadata in tqdm(
            zip(sources, meta),
            total=len(sources),
            desc="Converting markdown files to Documents",
            disable=not self.progress_bar,
        ):
            try:
                file_content = self._extract_content(source)
            except Exception as e:
                logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
                continue
            try:
                text = parser.render(file_content)
            except Exception as conversion_e:  # Broad on purpose: one bad source must not abort the batch
                logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                continue

            documents.append(Document(content=text, meta=metadata))

        return {"documents": documents}

    def _extract_content(self, source: Union[str, Path, ByteStream]) -> str:
        """
        Extracts content from the given data source.

        :param source: The data source to extract content from.
        :return: The extracted content.
        :raises ValueError: If the source is neither a path (`str` or `Path`) nor a `ByteStream`.
        """
        if isinstance(source, (str, Path)):
            # Read explicitly as UTF-8 so files and ByteStreams are decoded the same way on every platform.
            with open(source, encoding="utf-8") as text_file:
                return text_file.read()
        if isinstance(source, ByteStream):
            return source.data.decode("utf-8")

        raise ValueError(f"Unsupported source type: {type(source)}")
4 changes: 4 additions & 0 deletions
4
releasenotes/notes/add-MarkdownToTextDocument-f97ec6c5fb35527d.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
--- | ||
preview: | ||
- | | ||
Add MarkdownToDocument, a file converter that converts Markdown files into text Documents. |
93 changes: 93 additions & 0 deletions
93
test/preview/components/file_converters/test_markdown_to_document.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import logging | ||
|
||
import pytest | ||
|
||
from haystack.preview.components.file_converters.markdown import MarkdownToDocument | ||
from haystack.preview.dataclasses import ByteStream | ||
|
||
|
||
class TestMarkdownToDocument:
    """Tests for MarkdownToDocument: construction, conversion, metadata and error handling."""

    @pytest.mark.unit
    def test_init_params_default(self):
        """A converter created without arguments keeps the default settings."""
        default_converter = MarkdownToDocument()
        assert default_converter.progress_bar is True
        assert default_converter.table_to_single_line is False

    @pytest.mark.unit
    def test_init_params_custom(self):
        """Constructor arguments are stored on the instance."""
        custom_converter = MarkdownToDocument(table_to_single_line=True, progress_bar=False)
        assert custom_converter.progress_bar is False
        assert custom_converter.table_to_single_line is True

    @pytest.mark.integration
    def test_run(self, preview_samples_path):
        """A single markdown file is converted into one Document containing its text."""
        sample_file = preview_samples_path / "markdown" / "sample.md"
        output = MarkdownToDocument().run(sources=[sample_file])
        documents = output["documents"]

        assert len(documents) == 1
        for document in documents:
            assert "What to build with Haystack" in document.content
            assert "# git clone https://github.com/deepset-ai/haystack.git" in document.content

    @pytest.mark.integration
    def test_run_metadata(self, preview_samples_path):
        """Metadata passed to run() is attached to the resulting Document."""
        sample_file = preview_samples_path / "markdown" / "sample.md"
        output = MarkdownToDocument().run(sources=[sample_file], meta=[{"file_name": "sample.md"}])
        documents = output["documents"]

        assert len(documents) == 1
        for document in documents:
            assert "What to build with Haystack" in document.content
            assert "# git clone https://github.com/deepset-ai/haystack.git" in document.content
            assert document.meta == {"file_name": "sample.md"}

    @pytest.mark.integration
    def test_run_wrong_file_type(self, preview_samples_path, caplog):
        """
        Test if the component runs correctly when an input file is not of the expected type.
        """
        audio_file = preview_samples_path / "audio" / "answer.wav"
        converter = MarkdownToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=[audio_file])
            assert "codec can't decode byte" in caplog.text

        # The unreadable source is skipped, so nothing is produced.
        assert not output["documents"]

    @pytest.mark.integration
    def test_run_error_handling(self, caplog):
        """
        Test if the component correctly handles errors.
        """
        converter = MarkdownToDocument()
        missing_sources = ["non_existing_file.md"]
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=missing_sources)
            assert "Could not read non_existing_file.md" in caplog.text
            assert not output["documents"]

    @pytest.mark.unit
    def test_mixed_sources_run(self, preview_samples_path):
        """
        Test if the component runs correctly if the input is a mix of strings, paths and ByteStreams.
        """
        sample_file = preview_samples_path / "markdown" / "sample.md"
        with open(sample_file, "rb") as binary_file:
            raw_bytes = binary_file.read()
        # Same file three ways: Path object, absolute path string, in-memory bytes.
        mixed_sources = [sample_file, str(sample_file.absolute()), ByteStream(raw_bytes)]

        output = MarkdownToDocument().run(sources=mixed_sources)
        documents = output["documents"]
        assert len(documents) == 3
        for document in documents:
            assert "What to build with Haystack" in document.content
            assert "# git clone https://github.com/deepset-ai/haystack.git" in document.content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
--- | ||
type: intro | ||
date: 1.1.2023 | ||
--- | ||
```bash | ||
pip install farm-haystack | ||
``` | ||
## What to build with Haystack | ||
|
||
- **Ask questions in natural language** and find granular answers in your own documents. | ||
- Perform **semantic search** and retrieve documents according to meaning not keywords | ||
- Use **off-the-shelf models** or **fine-tune** them to your own domain. | ||
- Use **user feedback** to evaluate, benchmark and continuously improve your live models. | ||
- Leverage existing **knowledge bases** and better handle the long tail of queries that **chatbots** receive. | ||
- **Automate processes** by automatically applying a list of questions to new documents and using the extracted answers. | ||
|
||
![Logo](https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/logo.png) | ||
|
||
|
||
## Core Features | ||
|
||
- **Latest models**: Utilize all latest transformer based models (e.g. BERT, RoBERTa, MiniLM) for extractive QA, generative QA and document retrieval. | ||
- **Modular**: Multiple choices to fit your tech stack and use case. Pick your favorite database, file converter or modeling framework. | ||
- **Open**: 100% compatible with HuggingFace's model hub. Tight interfaces to other frameworks (e.g. Transformers, FARM, sentence-transformers) | ||
- **Scalable**: Scale to millions of docs via retrievers, production-ready backends like Elasticsearch / FAISS and a fastAPI REST API | ||
- **End-to-End**: All tooling in one place: file conversion, cleaning, splitting, training, eval, inference, labeling ... | ||
- **Developer friendly**: Easy to debug, extend and modify. | ||
- **Customizable**: Fine-tune models to your own domain or implement your custom DocumentStore. | ||
- **Continuous Learning**: Collect new training data via user feedback in production & improve your models continuously | ||
|
||
| | | | ||
|-|-| | ||
| :ledger: [Docs](https://haystack.deepset.ai/overview/intro) | Usage, Guides, API documentation ...| | ||
| :beginner: [Quick Demo](https://github.com/deepset-ai/haystack/#quick-demo) | Quickly see what Haystack offers | | ||
| :floppy_disk: [Installation](https://github.com/deepset-ai/haystack/#installation) | How to install Haystack | | ||
| :art: [Key Components](https://github.com/deepset-ai/haystack/#key-components) | Overview of core concepts | | ||
| :mortar_board: [Tutorials](https://github.com/deepset-ai/haystack/#tutorials) | Jupyter/Colab Notebooks & Scripts | | ||
| :eyes: [How to use Haystack](https://github.com/deepset-ai/haystack/#how-to-use-haystack) | Basic explanation of concepts, options and usage | | ||
| :heart: [Contributing](https://github.com/deepset-ai/haystack/#heart-contributing) | We welcome all contributions! | | ||
| :bar_chart: [Benchmarks](https://haystack.deepset.ai/benchmarks/v0.9.0) | Speed & Accuracy of Retriever, Readers and DocumentStores | | ||
| :telescope: [Roadmap](https://haystack.deepset.ai/overview/roadmap) | Public roadmap of Haystack | | ||
| :pray: [Slack](https://haystack.deepset.ai/community/join) | Join our community on Slack | | ||
| :bird: [Twitter](https://twitter.com/deepset_ai) | Follow us on Twitter for news and updates | | ||
| :newspaper: [Blog](https://medium.com/deepset-ai) | Read our articles on Medium | | ||
|
||
|
||
## Quick Demo | ||
|
||
The quickest way to see what Haystack offers is to start a [Docker Compose](https://docs.docker.com/compose/) demo application: | ||
|
||
**1. Update/install Docker and Docker Compose, then launch Docker** | ||
|
||
``` | ||
# apt-get update && apt-get install docker && apt-get install docker-compose | ||
# service docker start | ||
``` | ||
|
||
**2. Clone Haystack repository** | ||
|
||
``` | ||
# git clone https://github.com/deepset-ai/haystack.git | ||
``` | ||
|
||
### 2nd level headline for testing purposes | ||
#### 3rd level headline for testing purposes |