Add MarkdownToTextDocument

deepset-ai · Oct 23, 2023 · 8cd4f7c · 8cd4f7c
1 parent 7e6c6be
commit 8cd4f7c
Show file tree

Hide file tree

Showing 4 changed files with 240 additions and 0 deletions.
diff --git a/haystack/preview/components/file_converters/__init__.py b/haystack/preview/components/file_converters/__init__.py
@@ -3,11 +3,13 @@
 from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
 from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
 from haystack.preview.components.file_converters.html import HTMLToDocument
+from haystack.preview.components.file_converters.markdown import MarkdownToTextDocument
 
 __all__ = [
     "TextFileToDocument",
     "TikaDocumentConverter",
     "AzureOCRDocumentConverter",
     "PyPDFToDocument",
     "HTMLToDocument",
+    "MarkdownToTextDocument",
 ]
diff --git a/haystack/preview/components/file_converters/markdown.py b/haystack/preview/components/file_converters/markdown.py
@@ -0,0 +1,119 @@
+import logging
+import re
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union, Any
+
+from tqdm import tqdm
+
+from haystack.preview import Document, component
+from haystack.preview.lazy_imports import LazyImport
+
+with LazyImport("Run 'pip install beautifulsoup4 markdown python-frontmatter'") as markdown_conversion_imports:
+    import frontmatter
+    from bs4 import BeautifulSoup, NavigableString
+    from markdown import markdown
+
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class MarkdownToTextDocument:
+    """
+    Converts a Markdown file into a text Document.
+    """
+
+    def __init__(
+        self,
+        remove_code_snippets: bool = True,
+        extract_headlines: bool = False,
+        add_frontmatter_to_meta: bool = False,
+        progress_bar: bool = True,
+    ):
+        """
+        :param remove_code_snippets: Whether to remove snippets from the markdown file. Defaults to True.
+        :param extract_headlines: Whether to extract headings from the markdown file. Defaults to False.
+        :param add_frontmatter_to_meta: Whether to add the contents of the frontmatter to `meta`. Defaults to False.
+        :param progress_bar: Show a progress bar for the conversion.
+        """
+        markdown_conversion_imports.check()
+
+        self.remove_code_snippets = remove_code_snippets
+        self.extract_headlines = extract_headlines
+        self.add_frontmatter_to_meta = add_frontmatter_to_meta
+        self.progress_bar = progress_bar
+
+    @component.output_types(documents=List[Document])
+    def run(self, paths: List[Union[str, Path]], metadata: Optional[List[Union[Dict, Any]]] = None):
+        """
+        Reads text from a markdown file and executes optional preprocessing steps.
+
+        :param file_path: path of the file to convert
+        :param metadata: Optional list of metadata to attach to the Documents.
+        The length of the list must match the number of paths. Defaults to `None`.
+        """
+
+        if metadata is None:
+            metadata = [None] * len(paths)
+
+        documents = []
+
+        for file_path, meta in tqdm(
+            zip(paths, metadata),
+            total=len(paths),
+            desc="Converting markdown files to Documents",
+            disable=not self.progress_bar,
+        ):
+            with open(file_path, errors="ignore") as f:
+                file_metadata, markdown_text = frontmatter.parse(f.read())
+
+            # md -> html -> text since BeautifulSoup can extract text cleanly
+            html = markdown(markdown_text, extensions=["fenced_code"])
+
+            # remove code snippets
+            if self.remove_code_snippets:
+                html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
+                html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)
+            soup = BeautifulSoup(html, "html.parser")
+
+            if self.add_frontmatter_to_meta:
+                if meta is None:
+                    meta = file_metadata
+                else:
+                    meta.update(file_metadata)
+
+            if self.extract_headlines:
+                text, headlines = self._extract_text_and_headlines(soup)
+                if meta is None:
+                    meta = {}
+                meta["headlines"] = headlines
+            else:
+                text = soup.get_text()
+
+            if meta is None:
+                document = Document(text=text)
+            else:
+                document = Document(text=text, metadata=meta)
+            documents.append(document)
+
+        return {"documents": documents}
+
+    @staticmethod
+    def _extract_text_and_headlines(soup: "BeautifulSoup") -> Tuple[str, List[Dict]]:
+        """
+        Extracts text and headings from a soup object.
+        """
+        headline_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
+        headlines = []
+        text = ""
+        for desc in soup.descendants:
+            if desc.name in headline_tags:
+                current_headline = desc.get_text()
+                current_start_idx = len(text)
+                current_level = int(desc.name[-1]) - 1
+                headlines.append({"headline": current_headline, "start_idx": current_start_idx, "level": current_level})
+
+            if isinstance(desc, NavigableString):
+                text += desc.get_text()
+
+        return text, headlines
diff --git a/test/preview/components/file_converters/test_markdown_to_document.py b/test/preview/components/file_converters/test_markdown_to_document.py
@@ -0,0 +1,54 @@
+import pytest
+
+from haystack.preview.components.file_converters.markdown import MarkdownToTextDocument
+
+
+class TestMarkdownToTextDocument:
+    @pytest.mark.unit
+    def test_markdown_converter(self, preview_samples_path):
+        converter = MarkdownToTextDocument()  # defaults: code snippets are removed
+        results = converter.run(
+            paths=[preview_samples_path / "markdown" / "sample.md", preview_samples_path / "markdown" / "sample.md"]
+        )
+        assert results["documents"][0].text.startswith("\nWhat to build with Haystack")
+        assert "# git clone https://github.com/deepset-ai/haystack.git" not in results["documents"][0].text
+
+    @pytest.mark.unit
+    def test_markdown_converter_headline_extraction(self, preview_samples_path):
+        expected_headlines = [  # (headline text, level); level is the heading depth minus one
+            ("What to build with Haystack", 1),
+            ("Core Features", 1),
+            ("Quick Demo", 1),
+            ("2nd level headline for testing purposes", 2),
+            ("3rd level headline for testing purposes", 3),
+        ]
+
+        converter = MarkdownToTextDocument(extract_headlines=True, remove_code_snippets=False)
+        results = converter.run(paths=[preview_samples_path / "markdown" / "sample.md"])
+
+        # Check that the correct number of headlines is extracted
+        assert len(results["documents"][0].metadata["headlines"]) == 5
+        for extracted_headline, (expected_headline, expected_level) in zip(
+            results["documents"][0].metadata["headlines"], expected_headlines
+        ):
+            # Check that the correct headline text and level are extracted
+            assert extracted_headline["headline"] == expected_headline
+            assert extracted_headline["level"] == expected_level
+
+            # Check that start_idx points at the headline inside the Document text
+            start_idx = extracted_headline["start_idx"]
+            hl_len = len(extracted_headline["headline"])
+            assert extracted_headline["headline"] == results["documents"][0].text[start_idx : start_idx + hl_len]
+
+    @pytest.mark.unit
+    def test_markdown_converter_frontmatter_to_meta(self, preview_samples_path):
+        converter = MarkdownToTextDocument(add_frontmatter_to_meta=True)
+        results = converter.run(paths=[preview_samples_path / "markdown" / "sample.md"])
+        assert results["documents"][0].metadata["type"] == "intro"  # from the frontmatter of sample.md
+        assert results["documents"][0].metadata["date"] == "1.1.2023"  # from the frontmatter of sample.md
+
+    @pytest.mark.unit
+    def test_markdown_converter_remove_code_snippets(self, preview_samples_path):
+        converter = MarkdownToTextDocument(remove_code_snippets=False)  # keep code blocks in the text
+        results = converter.run(paths=[preview_samples_path / "markdown" / "sample.md"])
+        assert results["documents"][0].text.startswith("pip install farm-haystack")
diff --git a/test/preview/test_files/markdown/sample.md b/test/preview/test_files/markdown/sample.md
@@ -0,0 +1,65 @@
+---
+type: intro
+date: 1.1.2023
+---
+```bash
+pip install farm-haystack
+```
+## What to build with Haystack
+
+- **Ask questions in natural language** and find granular answers in your own documents.
+- Perform **semantic search** and retrieve documents according to meaning not keywords
+- Use **off-the-shelf models** or **fine-tune** them to your own domain.
+- Use **user feedback** to evaluate, benchmark and continuously improve your live models.
+- Leverage existing **knowledge bases** and better handle the long tail of queries that **chatbots** receive.
+- **Automate processes** by automatically applying a list of questions to new documents and using the extracted answers.
+
+![Logo](https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/logo.png)
+
+
+## Core Features
+
+-   **Latest models**: Utilize all latest transformer based models (e.g. BERT, RoBERTa, MiniLM) for extractive QA, generative QA and document retrieval.
+-   **Modular**: Multiple choices to fit your tech stack and use case. Pick your favorite database, file converter or modeling framework.
+-   **Open**: 100% compatible with HuggingFace's model hub. Tight interfaces to other frameworks (e.g. Transformers, FARM, sentence-transformers)
+-   **Scalable**: Scale to millions of docs via retrievers, production-ready backends like Elasticsearch / FAISS and a fastAPI REST API
+-   **End-to-End**: All tooling in one place: file conversion, cleaning, splitting, training, eval, inference, labeling ...
+-   **Developer friendly**: Easy to debug, extend and modify.
+-   **Customizable**: Fine-tune models to your own domain or implement your custom DocumentStore.
+-   **Continuous Learning**: Collect new training data via user feedback in production & improve your models continuously
+
+|  |  |
+|-|-|
+| :ledger: [Docs](https://haystack.deepset.ai/overview/intro) | Usage, Guides, API documentation ...|
+| :beginner: [Quick Demo](https://github.com/deepset-ai/haystack/#quick-demo) | Quickly see what Haystack offers |
+| :floppy_disk: [Installation](https://github.com/deepset-ai/haystack/#installation) | How to install Haystack |
+| :art: [Key Components](https://github.com/deepset-ai/haystack/#key-components) | Overview of core concepts |
+| :mortar_board: [Tutorials](https://github.com/deepset-ai/haystack/#tutorials) | Jupyter/Colab Notebooks & Scripts |
+| :eyes: [How to use Haystack](https://github.com/deepset-ai/haystack/#how-to-use-haystack) | Basic explanation of concepts, options and usage |
+| :heart: [Contributing](https://github.com/deepset-ai/haystack/#heart-contributing) | We welcome all contributions! |
+| :bar_chart: [Benchmarks](https://haystack.deepset.ai/benchmarks/v0.9.0) | Speed & Accuracy of Retriever, Readers and DocumentStores |
+| :telescope: [Roadmap](https://haystack.deepset.ai/overview/roadmap) | Public roadmap of Haystack |
+| :pray: [Slack](https://haystack.deepset.ai/community/join) | Join our community on Slack |
+| :bird: [Twitter](https://twitter.com/deepset_ai) | Follow us on Twitter for news and updates |
+| :newspaper: [Blog](https://medium.com/deepset-ai) | Read our articles on Medium |
+
+
+## Quick Demo
+
+The quickest way to see what Haystack offers is to start a [Docker Compose](https://docs.docker.com/compose/) demo application:
+
+**1. Update/install Docker and Docker Compose, then launch Docker**
+
+```
+    # apt-get update && apt-get install docker && apt-get install docker-compose
+    # service docker start
+```
+
+**2. Clone Haystack repository**
+
+```
+    # git clone https://github.com/deepset-ai/haystack.git
+```
+
+### 2nd level headline for testing purposes
+#### 3rd level headline for testing purposes