Skip to content

Commit

Permalink
Add MarkdownToTextDocument
Browse files Browse the repository at this point in the history
  • Loading branch information
awinml committed Oct 23, 2023
1 parent 7e6c6be commit 8cd4f7c
Show file tree
Hide file tree
Showing 4 changed files with 240 additions and 0 deletions.
2 changes: 2 additions & 0 deletions haystack/preview/components/file_converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
from haystack.preview.components.file_converters.html import HTMLToDocument
from haystack.preview.components.file_converters.markdown import MarkdownToTextDocument

# Public names of this package: `__all__` controls what `from <package> import *` exports.
# Every converter imported in this module should also be listed here.
__all__ = [
"TextFileToDocument",
"TikaDocumentConverter",
"AzureOCRDocumentConverter",
"PyPDFToDocument",
"HTMLToDocument",
"MarkdownToTextDocument",
]
119 changes: 119 additions & 0 deletions haystack/preview/components/file_converters/markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import logging
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union, Any

from tqdm import tqdm

from haystack.preview import Document, component
from haystack.preview.lazy_imports import LazyImport

with LazyImport("Run 'pip install beautifulsoup4 markdown python-frontmatter'") as markdown_conversion_imports:
import frontmatter
from bs4 import BeautifulSoup, NavigableString
from markdown import markdown


logger = logging.getLogger(__name__)


@component
class MarkdownToTextDocument:
    """
    Converts Markdown files into text Documents.

    Each file is split into frontmatter and Markdown body, the body is rendered to HTML and the
    plain text is then extracted from that HTML with BeautifulSoup.
    """

    def __init__(
        self,
        remove_code_snippets: bool = True,
        extract_headlines: bool = False,
        add_frontmatter_to_meta: bool = False,
        progress_bar: bool = True,
    ):
        """
        :param remove_code_snippets: Whether to remove code snippets from the markdown file. Defaults to True.
        :param extract_headlines: Whether to extract headings from the markdown file and store them
            under the `headlines` key of the Document metadata. Defaults to False.
        :param add_frontmatter_to_meta: Whether to add the contents of the frontmatter to `meta`. Defaults to False.
        :param progress_bar: Show a progress bar for the conversion. Defaults to True.
        """
        # Checks that the lazily imported optional dependencies (see the LazyImport block) are available.
        markdown_conversion_imports.check()

        self.remove_code_snippets = remove_code_snippets
        self.extract_headlines = extract_headlines
        self.add_frontmatter_to_meta = add_frontmatter_to_meta
        self.progress_bar = progress_bar

    @component.output_types(documents=List[Document])
    def run(self, paths: List[Union[str, Path]], metadata: Optional[List[Union[Dict, Any]]] = None):
        """
        Reads text from markdown files and executes the optional preprocessing steps.

        :param paths: Paths of the markdown files to convert.
        :param metadata: Optional list of metadata to attach to the Documents.
            The length of the list must match the number of paths. Defaults to `None`.
        :return: A dictionary with the key `documents` holding one Document per path, in input order.
        :raises ValueError: If `metadata` is given and its length differs from the number of paths.
        """
        if metadata is None:
            metadata = [None] * len(paths)
        elif len(metadata) != len(paths):
            # zip() would silently drop the surplus entries, i.e. skip files or metadata unnoticed.
            raise ValueError(
                f"The length of the metadata list ({len(metadata)}) must match the number of paths ({len(paths)})."
            )

        documents = []

        for file_path, meta in tqdm(
            zip(paths, metadata),
            total=len(paths),
            desc="Converting markdown files to Documents",
            disable=not self.progress_bar,
        ):
            # Explicit encoding so that decoding does not depend on the platform's locale default.
            with open(file_path, encoding="utf-8", errors="ignore") as f:
                file_metadata, markdown_text = frontmatter.parse(f.read())

            # md -> html -> text since BeautifulSoup can extract text cleanly
            html = markdown(markdown_text, extensions=["fenced_code"])

            # remove code snippets
            if self.remove_code_snippets:
                html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
                html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)
            soup = BeautifulSoup(html, "html.parser")

            if self.add_frontmatter_to_meta:
                # Build a new dict rather than calling meta.update(): the caller's metadata object
                # must not be modified. Frontmatter values win over caller-supplied ones, as before.
                base_meta = {} if meta is None else meta
                meta = {**base_meta, **file_metadata}

            if self.extract_headlines:
                text, headlines = self._extract_text_and_headlines(soup)
                # Same reasoning as above: never write into the dict that was passed in.
                base_meta = {} if meta is None else meta
                meta = {**base_meta, "headlines": headlines}
            else:
                text = soup.get_text()

            if meta is None:
                document = Document(text=text)
            else:
                document = Document(text=text, metadata=meta)
            documents.append(document)

        return {"documents": documents}

    @staticmethod
    def _extract_text_and_headlines(soup: "BeautifulSoup") -> Tuple[str, List[Dict]]:
        """
        Extracts text and headings from a soup object.

        :param soup: The parsed HTML of the rendered markdown.
        :return: A tuple of the concatenated text of all string nodes and a list of headline dicts.
            Each dict has the keys `headline` (heading text), `start_idx` (offset of the heading in the
            returned text) and `level` (zero-based: `h1` -> 0, `h2` -> 1, ...).
        """
        headline_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
        headlines = []
        # Collect the pieces and track the running length instead of growing a string with `+=`.
        text_parts = []
        text_length = 0
        for desc in soup.descendants:
            if desc.name in headline_tags:
                # A heading tag is visited before its own string children, so the current length
                # is exactly the position at which the heading text will start.
                headlines.append(
                    {"headline": desc.get_text(), "start_idx": text_length, "level": int(desc.name[-1]) - 1}
                )

            if isinstance(desc, NavigableString):
                part = desc.get_text()
                text_parts.append(part)
                text_length += len(part)

        return "".join(text_parts), headlines
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import pytest

from haystack.preview.components.file_converters.markdown import MarkdownToTextDocument


class TestMarkdownToTextDocument:
    """Unit tests for MarkdownToTextDocument, all run against the `markdown/sample.md` fixture."""

    @pytest.mark.unit
    def test_markdown_converter(self, preview_samples_path):
        """With default settings every path yields one Document and code snippets are removed."""
        sample_path = preview_samples_path / "markdown" / "sample.md"
        converter = MarkdownToTextDocument()
        results = converter.run(paths=[sample_path, sample_path])

        documents = results["documents"]
        # Two paths went in, so two Documents must come out.
        assert len(documents) == 2
        # Both Documents come from the same file, so the same expectations hold for each of them.
        for document in documents:
            assert document.text.startswith("\nWhat to build with Haystack")
            assert "# git clone https://github.com/deepset-ai/haystack.git" not in document.text

    @pytest.mark.unit
    def test_markdown_converter_headline_extraction(self, preview_samples_path):
        """Headlines are stored in the metadata with their text, level and start offset."""
        expected_headlines = [
            ("What to build with Haystack", 1),
            ("Core Features", 1),
            ("Quick Demo", 1),
            ("2nd level headline for testing purposes", 2),
            ("3rd level headline for testing purposes", 3),
        ]

        converter = MarkdownToTextDocument(extract_headlines=True, remove_code_snippets=False)
        results = converter.run(paths=[preview_samples_path / "markdown" / "sample.md"])
        document = results["documents"][0]
        extracted_headlines = document.metadata["headlines"]

        # Check if correct number of headlines are extracted
        assert len(extracted_headlines) == 5
        for extracted_headline, (expected_headline, expected_level) in zip(extracted_headlines, expected_headlines):
            # Check if correct headline and level is extracted
            assert extracted_headline["headline"] == expected_headline
            assert extracted_headline["level"] == expected_level

            # Check if correct start_idx is extracted: slicing the text at that offset must give the headline
            start_idx = extracted_headline["start_idx"]
            hl_len = len(extracted_headline["headline"])
            assert extracted_headline["headline"] == document.text[start_idx : start_idx + hl_len]

    @pytest.mark.unit
    def test_markdown_converter_frontmatter_to_meta(self, preview_samples_path):
        """With add_frontmatter_to_meta=True the frontmatter keys of the file end up in the metadata."""
        converter = MarkdownToTextDocument(add_frontmatter_to_meta=True)
        results = converter.run(paths=[preview_samples_path / "markdown" / "sample.md"])
        document_metadata = results["documents"][0].metadata
        assert document_metadata["type"] == "intro"
        assert document_metadata["date"] == "1.1.2023"

    @pytest.mark.unit
    def test_markdown_converter_remove_code_snippets(self, preview_samples_path):
        """With remove_code_snippets=False the code block at the top of the file is kept in the text."""
        converter = MarkdownToTextDocument(remove_code_snippets=False)
        results = converter.run(paths=[preview_samples_path / "markdown" / "sample.md"])
        assert results["documents"][0].text.startswith("pip install farm-haystack")
65 changes: 65 additions & 0 deletions test/preview/test_files/markdown/sample.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
---
type: intro
date: 1.1.2023
---
```bash
pip install farm-haystack
```
## What to build with Haystack

- **Ask questions in natural language** and find granular answers in your own documents.
- Perform **semantic search** and retrieve documents according to meaning not keywords
- Use **off-the-shelf models** or **fine-tune** them to your own domain.
- Use **user feedback** to evaluate, benchmark and continuously improve your live models.
- Leverage existing **knowledge bases** and better handle the long tail of queries that **chatbots** receive.
- **Automate processes** by automatically applying a list of questions to new documents and using the extracted answers.

![Logo](https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/logo.png)


## Core Features

- **Latest models**: Utilize all latest transformer based models (e.g. BERT, RoBERTa, MiniLM) for extractive QA, generative QA and document retrieval.
- **Modular**: Multiple choices to fit your tech stack and use case. Pick your favorite database, file converter or modeling framework.
- **Open**: 100% compatible with HuggingFace's model hub. Tight interfaces to other frameworks (e.g. Transformers, FARM, sentence-transformers)
- **Scalable**: Scale to millions of docs via retrievers, production-ready backends like Elasticsearch / FAISS and a fastAPI REST API
- **End-to-End**: All tooling in one place: file conversion, cleaning, splitting, training, eval, inference, labeling ...
- **Developer friendly**: Easy to debug, extend and modify.
- **Customizable**: Fine-tune models to your own domain or implement your custom DocumentStore.
- **Continuous Learning**: Collect new training data via user feedback in production & improve your models continuously

| | |
|-|-|
| :ledger: [Docs](https://haystack.deepset.ai/overview/intro) | Usage, Guides, API documentation ...|
| :beginner: [Quick Demo](https://github.com/deepset-ai/haystack/#quick-demo) | Quickly see what Haystack offers |
| :floppy_disk: [Installation](https://github.com/deepset-ai/haystack/#installation) | How to install Haystack |
| :art: [Key Components](https://github.com/deepset-ai/haystack/#key-components) | Overview of core concepts |
| :mortar_board: [Tutorials](https://github.com/deepset-ai/haystack/#tutorials) | Jupyter/Colab Notebooks & Scripts |
| :eyes: [How to use Haystack](https://github.com/deepset-ai/haystack/#how-to-use-haystack) | Basic explanation of concepts, options and usage |
| :heart: [Contributing](https://github.com/deepset-ai/haystack/#heart-contributing) | We welcome all contributions! |
| :bar_chart: [Benchmarks](https://haystack.deepset.ai/benchmarks/v0.9.0) | Speed & Accuracy of Retriever, Readers and DocumentStores |
| :telescope: [Roadmap](https://haystack.deepset.ai/overview/roadmap) | Public roadmap of Haystack |
| :pray: [Slack](https://haystack.deepset.ai/community/join) | Join our community on Slack |
| :bird: [Twitter](https://twitter.com/deepset_ai) | Follow us on Twitter for news and updates |
| :newspaper: [Blog](https://medium.com/deepset-ai) | Read our articles on Medium |


## Quick Demo

The quickest way to see what Haystack offers is to start a [Docker Compose](https://docs.docker.com/compose/) demo application:

**1. Update/install Docker and Docker Compose, then launch Docker**

```
# apt-get update && apt-get install docker && apt-get install docker-compose
# service docker start
```

**2. Clone Haystack repository**

```
# git clone https://github.com/deepset-ai/haystack.git
```

### 2nd level headline for testing purposes
#### 3rd level headline for testing purposes

0 comments on commit 8cd4f7c

Please sign in to comment.