From 585eb23a517ecdce61dfc6a00ab24e685ea59fe4 Mon Sep 17 00:00:00 2001 From: Florian Bastin Date: Tue, 25 Jul 2023 20:01:22 +0200 Subject: [PATCH 1/3] feat(confluence): add markdown format option --- .../langchain/document_loaders/confluence.py | 48 ++++++++++++++----- libs/langchain/pyproject.toml | 1 + 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py index 96190713329c5..5a0c452428d64 100644 --- a/libs/langchain/langchain/document_loaders/confluence.py +++ b/libs/langchain/langchain/document_loaders/confluence.py @@ -205,6 +205,7 @@ def load( limit: Optional[int] = 50, max_pages: Optional[int] = 1000, ocr_languages: Optional[str] = None, + keep_markdown_format: Optional[bool] = False, ) -> List[Document]: """ :param space_key: Space key retrieved from a confluence URL, defaults to None @@ -234,6 +235,8 @@ def load( language, you'll first need to install the appropriate Tesseract language pack. :type ocr_languages: str, optional + :param keep_markdown_format: Whether to keep the markdown format, default to False + :type keep_markdown_format: bool, optional :raises ValueError: _description_ :raises ImportError: _description_ :return: _description_ @@ -263,6 +266,7 @@ def load( include_comments, content_format, ocr_languages, + keep_markdown_format, ) if label: @@ -294,6 +298,7 @@ def load( include_comments, content_format, ocr_languages, + keep_markdown_format, ) if page_ids: @@ -319,6 +324,7 @@ def load( include_comments, content_format, ocr_languages, + keep_markdown_format, ) docs.append(doc) @@ -397,6 +403,7 @@ def process_pages( include_comments: bool, content_format: ContentFormat, ocr_languages: Optional[str] = None, + keep_markdown_format: Optional[bool] = False, ) -> List[Document]: """Process a list of pages into a list of documents.""" docs = [] @@ -409,6 +416,7 @@ def process_pages( include_comments, content_format, ocr_languages, + keep_markdown_format, ) docs.append(doc) @@ -421,24 +429,42 @@ def process_page( include_comments: bool, content_format: ContentFormat, ocr_languages: Optional[str] = None, + keep_markdown_format: Optional[bool] = False, ) -> Document: - try: - from bs4 import BeautifulSoup # type: ignore - except ImportError: - raise ImportError( - "`beautifulsoup4` package not found, please run " - "`pip install beautifulsoup4`" - ) + if keep_markdown_format: + try: + from markdownify import markdownify + except ImportError: + raise ImportError( + "`markdownify` package not found, please run " + "`pip install markdownify`" + ) + else: + try: + from bs4 import BeautifulSoup # type: ignore + except ImportError: + raise ImportError( + "`beautifulsoup4` package not found, please run " + "`pip install beautifulsoup4`" + ) if include_attachments: attachment_texts = self.process_attachment(page["id"], ocr_languages) else: attachment_texts = [] - content = content_format.get_content(page) - text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join( - attachment_texts - ) + if keep_markdown_format: + # Use markdownify to keep the page Markdown style + text = markdownify( + page["body"]["storage"]["value"], heading_style="ATX" + ) + "".join(attachment_texts) + + else: + content = content_format.get_content(page) + text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join( + attachment_texts + ) + if include_comments: comments = self.confluence.get_page_comments( page["id"], expand="body.view.value", depth="all" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 7495df1a729b0..a1bbe73e33c57 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -356,6 +356,7 @@ extended_testing = [ "rank_bm25", "geopandas", "jinja2", + "markdownify" ] [[tool.poetry.source]] From 8af6d1ebe72d9a09e8a5e0bbef95fd563d4e2683 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 26 Jul 2023 14:22:03 -0700 Subject: [PATCH 2/3] cr --- libs/langchain/langchain/document_loaders/confluence.py | 4 ++-- libs/langchain/pyproject.toml | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py index 5a0c452428d64..9488bfe5d2557 100644 --- a/libs/langchain/langchain/document_loaders/confluence.py +++ b/libs/langchain/langchain/document_loaders/confluence.py @@ -205,7 +205,7 @@ def load( limit: Optional[int] = 50, max_pages: Optional[int] = 1000, ocr_languages: Optional[str] = None, - keep_markdown_format: Optional[bool] = False, + keep_markdown_format: bool = False, ) -> List[Document]: """ :param space_key: Space key retrieved from a confluence URL, defaults to None @@ -236,7 +236,7 @@ def load( Tesseract language pack. :type ocr_languages: str, optional :param keep_markdown_format: Whether to keep the markdown format, default to False - :type keep_markdown_format: bool, optional + :type keep_markdown_format: bool :raises ValueError: _description_ :raises ImportError: _description_ :return: _description_ diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index a1bbe73e33c57..7495df1a729b0 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -356,7 +356,6 @@ extended_testing = [ "rank_bm25", "geopandas", "jinja2", - "markdownify" ] [[tool.poetry.source]] From d9e820a01c28325a1fdd20f8e55ee2079419e2e4 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Wed, 26 Jul 2023 14:33:12 -0700 Subject: [PATCH 3/3] fmt --- libs/langchain/langchain/document_loaders/confluence.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py index 9488bfe5d2557..8615faa7d8fbb 100644 --- a/libs/langchain/langchain/document_loaders/confluence.py +++ b/libs/langchain/langchain/document_loaders/confluence.py @@ -235,7 +235,8 @@ def load( language, you'll first need to install the appropriate Tesseract language pack. :type ocr_languages: str, optional - :param keep_markdown_format: Whether to keep the markdown format, default to False + :param keep_markdown_format: Whether to keep the markdown format, defaults to + False :type keep_markdown_format: bool :raises ValueError: _description_ :raises ImportError: _description_