Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(api): add markdown support #106

Merged
merged 2 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Tous les changements notables de l'application sont documentés dans ce fichier.

## [Alpha] - 2024-12-09

- 🎉 Ajout de la possibilité d'uploader des fichiers markdown
- 🎉 Accès au endpoint `/search` par le biais de l'endpoint `/chat/completions` avec le paramètre `search=true`
- 🎉 Ajout d'un endpoint GET `/metrics` pour récupérer les métriques de l'application via Prometheus
- 🔄 Refactoring des classes de clients
Expand Down
3 changes: 3 additions & 0 deletions app/endpoints/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ async def upload_file(file: UploadFile = File(...), request: FilesRequest = Body
For JSON, the file is structured as a list of documents: [{"text": "hello world", "title": "my document", "metadata": {"author": "me"}}, ...] or [{"text": "hello world", "title": "my document"}, ...]
Each document must have "text" and "title" keys, and may have an optional "metadata" key whose value is a dict.
- html: Hypertext Markup Language file.
- markdown: Markdown Language file.

Max file size is 20MB.
"""

file_size = len(file.file.read())
Expand Down
14 changes: 6 additions & 8 deletions app/helpers/_fileuploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,20 @@

from app.clients import SearchClient
from app.helpers.data.chunkers import *
from app.helpers.data.parsers import HTMLParser, JSONParser, PDFParser
from app.helpers.data.parsers import HTMLParser, JSONParser, PDFParser, MarkdownParser
from app.schemas.chunks import Chunk
from app.schemas.data import ParserOutput
from app.schemas.security import User
from app.utils.exceptions import InvalidJSONFormatException, NoChunksToUpsertException, ParsingFileFailedException, UnsupportedFileTypeException
from app.utils.variables import (
CHUNKERS,
DEFAULT_CHUNKER,
HTML_TYPE,
JSON_TYPE,
PDF_TYPE,
)
from app.utils.variables import CHUNKERS, DEFAULT_CHUNKER, HTML_TYPE, JSON_TYPE, PDF_TYPE, MARKDOWN_TYPE


class FileUploader:
TYPE_DICT = {
"json": JSON_TYPE,
"html": HTML_TYPE,
"pdf": PDF_TYPE,
"md": MARKDOWN_TYPE,
}

def __init__(self, collection_id: str, search_client: SearchClient, user: User):
Expand All @@ -48,6 +43,9 @@ def parse(self, file: UploadFile) -> List[ParserOutput]:
elif file_type == HTML_TYPE:
parser = HTMLParser(collection_id=self.collection_id)

elif file_type == MARKDOWN_TYPE:
parser = MarkdownParser(collection_id=self.collection_id)

try:
output = parser.parse(file=file)
except Exception as e:
Expand Down
3 changes: 2 additions & 1 deletion app/helpers/data/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from ._htmlparser import HTMLParser
from ._jsonparser import JSONParser
from ._pdfparser import PDFParser
from ._mdparser import MarkdownParser
from ._baseparser import BaseParser


__all__ = ["HTMLParser", "JSONParser", "PDFParser", "BaseParser"]
__all__ = ["HTMLParser", "JSONParser", "PDFParser", "MarkdownParser", "BaseParser"]
73 changes: 73 additions & 0 deletions app/helpers/data/parsers/_mdparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import re
import time
from typing import List, Optional, Tuple
import uuid

from bs4 import BeautifulSoup
from fastapi import UploadFile

from app.schemas.data import ParserOutput, ParserOutputMetadata

from . import HTMLParser
from ._baseparser import BaseParser


class MarkdownParser(BaseParser):
    """Parser that turns an uploaded Markdown file into a single ``ParserOutput``."""

    # A Markdown ATX header line: one or more '#' followed by whitespace.
    # Compiled once here instead of being re-resolved for every line of the file.
    _HEADER_PATTERN = re.compile(r"^#+\s")
    # Marker that opens or closes a fenced code block.
    _CODE_FENCE = "```"

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def parse(self, file: UploadFile) -> List[ParserOutput]:
        """
        Parse a Markdown file and convert it into a list of parsed outputs.

        The text is split into sections at header lines (headers inside fenced code
        blocks are ignored). Each section is rendered as "<header line>:\\n<body>",
        the sections are joined with newlines, and the result is passed through
        ``self.clean`` (not defined in this class; presumably inherited from
        ``BaseParser``).

        Args:
            file (UploadFile): Markdown file to parse.

        Returns:
            List[ParserOutput]: List holding exactly one output for the whole document.
                Its metadata title is the text of the first non-empty header, or None
                when the file has no header.

        Raises:
            UnicodeDecodeError: If the file content is not valid UTF-8.
        """

        # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading BOM that
        # would otherwise prevent a header on the first line from being recognized.
        markdown_text = file.file.read().decode(encoding="utf-8-sig")

        title, sections = self._split_sections(markdown_text)

        # Build the text with plain interpolation only: the Markdown content may contain
        # '{' or '}' (e.g. in code samples), so it must never be fed to str.format().
        # Text found before the first header has no header and is emitted as-is.
        extracted_text = [body if header is None else f"{header}:\n{body}" for header, body in sections]

        content = self.clean("\n".join(extracted_text).strip())

        name = file.filename.strip()

        metadata = ParserOutputMetadata(
            collection_id=self.collection_id, document_id=str(uuid.uuid4()), document_name=name, document_created_at=round(time.time()), title=title
        )

        return [ParserOutput(content=content, metadata=metadata)]

    @staticmethod
    def _split_sections(markdown_text: str) -> Tuple[Optional[str], List[Tuple[Optional[str], str]]]:
        """
        Split Markdown text into (header line, body) sections.

        Args:
            markdown_text (str): Raw Markdown text.

        Returns:
            Tuple[Optional[str], List[Tuple[Optional[str], str]]]: The document title
                (first non-empty header text without its leading '#' markers, or None)
                and the ordered sections. The header of a section is None for text
                located before the first header.
        """

        sections: List[Tuple[Optional[str], str]] = []
        title: Optional[str] = None
        current_header: Optional[str] = None
        current_lines: List[str] = []
        in_code_block = False

        for line in markdown_text.split("\n"):
            if line.startswith(MarkdownParser._CODE_FENCE):
                # This is the end of a code block if we are already in it, and vice versa.
                in_code_block = not in_code_block

            if not in_code_block and MarkdownParser._HEADER_PATTERN.match(line):
                # Upon first header, skip if current text chunk is empty.
                if current_header is not None or current_lines:
                    sections.append((current_header, "\n".join(current_lines)))
                if not title:
                    # Keep only the header text for the metadata title.
                    title = line.lstrip("#").strip()
                current_header = line
                current_lines = []
            else:
                current_lines.append(line)

        # Append the final section, including a trailing header that has no body, so the
        # last header is kept just like empty sections in the middle of the document.
        if current_header is not None or current_lines:
            sections.append((current_header, "\n".join(current_lines)))

        return (title or None), sections
38 changes: 38 additions & 0 deletions app/tests/assets/markdown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Dauphinois de potimarron - recette végétarienne cuisine-libre.org

## Description

Préparation : 20 min Cuisson : 1 h [Four](https://www.cuisine-libre.org/four)

![Végétarien](https://www.cuisine-libre.org/local/cache-vignettes/L40xH40/moton18-9d595.png?1644794211 "Végétarien")

Sans viande Sans œuf

Dauphinois de potimarron Rated 5.00 out of 5 based on 2 ratings.

![](https://www.cuisine-libre.org/local/cache-gd2/6d/60311ebc0c8cb1dfbbe3e5cf92e9fd.jpg?1675005547)

![Appétissante photo DR](https://www.cuisine-libre.org/local/cache-gd2/c6/f3d3dd24ed5a690a2e6ad481f8a95c.jpg?1675005547)

## Ingrédients pour 4

- potimarron de 1 kg (ou plus)
- crème fraiche liquide (fleurette)
- ail
- beurre
- sel, poivre

## Préparation

Préchauffer le four à 180/200°C.

Couper le potimarron en « taillons » de quelques millimètres d’épaisseur.
Frotter d’une gousse d’ail épluchée un plat à four en terre. Y répartir les « taillons » en couches, saler et poivrer entre chaque couche. Verser la crème, qui doit juste couvrir le potimarron (jusqu’à un litre en fonction de la taille du plat). Parsemer de quelques noisettes de beurre, pour le gratiné final.

Cuire une heure environ, forcer à 220°C les dix dernières minutes.

## <:info_post_scriptum:>

Un plat qui se réchauffe plus facilement que le vrai [gratin dauphinois](https://www.cuisine-libre.org/gratin-dauphinois) (aux pommes de terre). Cette recette m’a été proposée par ma productrice de légumes préférée : c’est la recette du gratin dauphinois appliquée à la courge. Elle convient au potimarron et aux variétés de citrouilles fermes.

- [![](https://www.cuisine-libre.org/local/cache-gd2/13/f50d96b2f12916e2df6b65f1bd381c.jpg?1644794690)Potimarron](https://www.cuisine-libre.org/potimarron)
18 changes: 18 additions & 0 deletions app/tests/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,24 @@ def test_upload_html_file_chunker_parameters(self, args, session_user, setup):
response = session_user.post(f"{args["base_url"]}/files", data=data, files=files)
assert response.status_code == 201, f"error: upload file ({response.status_code} - {response.text})"

def test_upload_mardown_file(self, args, session_user, setup):
    """Upload the Markdown test asset with default parameters and expect HTTP 201."""
    PRIVATE_COLLECTION_ID, _ = setup

    file_path = "app/tests/assets/markdown.md"
    data = {"request": '{"collection": "%s"}' % PRIVATE_COLLECTION_ID}
    # Open the asset in a context manager so its handle is closed even if the request raises.
    with open(file_path, "rb") as markdown_file:
        # "text/markdown" is the registered MIME type (the previous value was misspelled).
        files = {"file": (os.path.basename(file_path), markdown_file, "text/markdown")}
        response = session_user.post(f"{args['base_url']}/files", data=data, files=files)
    assert response.status_code == 201, f"error: upload file ({response.status_code} - {response.text})"

def test_upload_mardown_file_chunker_parameters(self, args, session_user, setup):
    """Upload the Markdown test asset with explicit chunker arguments and expect HTTP 201."""
    PRIVATE_COLLECTION_ID, _ = setup

    file_path = "app/tests/assets/markdown.md"
    data = {"request": '{"collection": "%s", "chunker": {"args": {"chunk_size": 1000}}}' % PRIVATE_COLLECTION_ID}
    # Open the asset in a context manager so its handle is closed even if the request raises.
    with open(file_path, "rb") as markdown_file:
        files = {"file": (os.path.basename(file_path), markdown_file, "text/markdown")}
        response = session_user.post(f"{args['base_url']}/files", data=data, files=files)
    assert response.status_code == 201, f"error: upload file ({response.status_code} - {response.text})"

def test_upload_json_file(self, args, session_user, setup):
PRIVATE_COLLECTION_ID, _ = setup

Expand Down
1 change: 1 addition & 0 deletions app/utils/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
JSON_TYPE = "application/json"
TXT_TYPE = "text/plain"
HTML_TYPE = "text/html"
MARKDOWN_TYPE = "text/markdown"
# @TODO : add DOCX_TYPE (application/vnd.openxmlformats-officedocument.wordprocessingml.document)

# Clients
Expand Down
2 changes: 1 addition & 1 deletion docs/search.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Ainsi, la conception de l'API rend impossible de faire une recherche de similari

## Importer un fichier

Une fois la collection créée, vous pouvez importer des fichiers dans l'API avec l'endpoint `POST /v1/files`. Plusieurs types de fichiers sont acceptés par l'API dont JSON, PDF ou encore HTML. Le endpoint va réaliser les étapes suivantes :
Une fois la collection créée, vous pouvez importer des fichiers dans l'API avec l'endpoint `POST /v1/files`. Plusieurs types de fichiers sont acceptés par l'API dont JSON, PDF, Markdown ou encore HTML. Le endpoint va réaliser les étapes suivantes :

1. Détecter le type du fichier s'il n'est pas spécifié par l'utilisateur.
2. Créer un ID unique (*document_id*).
Expand Down
2 changes: 1 addition & 1 deletion ui/pages/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
key="upload_file_selectbox",
)
collection_id = collection.split(" - ")[0] if collection else None
file_to_upload = st.file_uploader("File", type=["pdf", "html", "json"])
file_to_upload = st.file_uploader("File", type=["pdf", "html", "json", "md"])
submit_upload = st.button("Upload", disabled=not collection_id or not file_to_upload)
if file_to_upload and submit_upload and collection_id:
with st.spinner("Téléchargement et traitement du document en cours..."):
Expand Down