From 585eb23a517ecdce61dfc6a00ab24e685ea59fe4 Mon Sep 17 00:00:00 2001
From: Florian Bastin <florian.bastin@octo.com>
Date: Tue, 25 Jul 2023 20:01:22 +0200
Subject: [PATCH 1/3] feat(confluence): add markdown format option

---
 .../langchain/document_loaders/confluence.py  | 48 ++++++++++++++-----
 libs/langchain/pyproject.toml                 |  1 +
 2 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py
index 96190713329c5..5a0c452428d64 100644
--- a/libs/langchain/langchain/document_loaders/confluence.py
+++ b/libs/langchain/langchain/document_loaders/confluence.py
@@ -205,6 +205,7 @@ def load(
         limit: Optional[int] = 50,
         max_pages: Optional[int] = 1000,
         ocr_languages: Optional[str] = None,
+        keep_markdown_format: Optional[bool] = False,
     ) -> List[Document]:
         """
         :param space_key: Space key retrieved from a confluence URL, defaults to None
@@ -234,6 +235,8 @@ def load(
                               language, you'll first need to install the appropriate
                               Tesseract language pack.
         :type ocr_languages: str, optional
+        :param keep_markdown_format: Whether to keep the markdown format, default to False
+        :type keep_markdown_format: bool, optional
         :raises ValueError: _description_
         :raises ImportError: _description_
         :return: _description_
@@ -263,6 +266,7 @@ def load(
                 include_comments,
                 content_format,
                 ocr_languages,
+                keep_markdown_format,
             )
 
         if label:
@@ -294,6 +298,7 @@ def load(
                 include_comments,
                 content_format,
                 ocr_languages,
+                keep_markdown_format,
             )
 
         if page_ids:
@@ -319,6 +324,7 @@ def load(
                     include_comments,
                     content_format,
                     ocr_languages,
+                    keep_markdown_format,
                 )
                 docs.append(doc)
 
@@ -397,6 +403,7 @@ def process_pages(
         include_comments: bool,
         content_format: ContentFormat,
         ocr_languages: Optional[str] = None,
+        keep_markdown_format: Optional[bool] = False,
     ) -> List[Document]:
         """Process a list of pages into a list of documents."""
         docs = []
@@ -409,6 +416,7 @@ def process_pages(
                 include_comments,
                 content_format,
                 ocr_languages,
+                keep_markdown_format,
             )
             docs.append(doc)
 
@@ -421,24 +429,42 @@ def process_page(
         include_comments: bool,
         content_format: ContentFormat,
         ocr_languages: Optional[str] = None,
+        keep_markdown_format: Optional[bool] = False,
     ) -> Document:
-        try:
-            from bs4 import BeautifulSoup  # type: ignore
-        except ImportError:
-            raise ImportError(
-                "`beautifulsoup4` package not found, please run "
-                "`pip install beautifulsoup4`"
-            )
+        if keep_markdown_format:
+            try:
+                from markdownify import markdownify
+            except ImportError:
+                raise ImportError(
+                    "`markdownify` package not found, please run "
+                    "`pip install markdownify`"
+                )
+        else:
+            try:
+                from bs4 import BeautifulSoup  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "`beautifulsoup4` package not found, please run "
+                    "`pip install beautifulsoup4`"
+                )
 
         if include_attachments:
             attachment_texts = self.process_attachment(page["id"], ocr_languages)
         else:
             attachment_texts = []
 
-        content = content_format.get_content(page)
-        text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
-            attachment_texts
-        )
+        if keep_markdown_format:
+            # Use markdownify to keep the page Markdown style
+            text = markdownify(
+                page["body"]["storage"]["value"], heading_style="ATX"
+            ) + "".join(attachment_texts)
+
+        else:
+            content = content_format.get_content(page)
+            text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
+                attachment_texts
+            )
+
         if include_comments:
             comments = self.confluence.get_page_comments(
                 page["id"], expand="body.view.value", depth="all"
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
index 7495df1a729b0..a1bbe73e33c57 100644
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@@ -356,6 +356,7 @@ extended_testing = [
  "rank_bm25",
  "geopandas",
  "jinja2",
+ "markdownify"
 ]
 
 [[tool.poetry.source]]

From 8af6d1ebe72d9a09e8a5e0bbef95fd563d4e2683 Mon Sep 17 00:00:00 2001
From: Bagatur <baskaryan@gmail.com>
Date: Wed, 26 Jul 2023 14:22:03 -0700
Subject: [PATCH 2/3] cr: type load()'s keep_markdown_format as bool; drop markdownify from extended_testing

---
 libs/langchain/langchain/document_loaders/confluence.py | 4 ++--
 libs/langchain/pyproject.toml                           | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py
index 5a0c452428d64..9488bfe5d2557 100644
--- a/libs/langchain/langchain/document_loaders/confluence.py
+++ b/libs/langchain/langchain/document_loaders/confluence.py
@@ -205,7 +205,7 @@ def load(
         limit: Optional[int] = 50,
         max_pages: Optional[int] = 1000,
         ocr_languages: Optional[str] = None,
-        keep_markdown_format: Optional[bool] = False,
+        keep_markdown_format: bool = False,
     ) -> List[Document]:
         """
         :param space_key: Space key retrieved from a confluence URL, defaults to None
@@ -236,7 +236,7 @@ def load(
                               Tesseract language pack.
         :type ocr_languages: str, optional
         :param keep_markdown_format: Whether to keep the markdown format, default to False
-        :type keep_markdown_format: bool, optional
+        :type keep_markdown_format: bool
         :raises ValueError: _description_
         :raises ImportError: _description_
         :return: _description_
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
index a1bbe73e33c57..7495df1a729b0 100644
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@@ -356,7 +356,6 @@ extended_testing = [
  "rank_bm25",
  "geopandas",
  "jinja2",
- "markdownify"
 ]
 
 [[tool.poetry.source]]

From d9e820a01c28325a1fdd20f8e55ee2079419e2e4 Mon Sep 17 00:00:00 2001
From: Bagatur <baskaryan@gmail.com>
Date: Wed, 26 Jul 2023 14:33:12 -0700
Subject: [PATCH 3/3] fmt: wrap keep_markdown_format docstring line; fix "defaults to" wording

---
 libs/langchain/langchain/document_loaders/confluence.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py
index 9488bfe5d2557..8615faa7d8fbb 100644
--- a/libs/langchain/langchain/document_loaders/confluence.py
+++ b/libs/langchain/langchain/document_loaders/confluence.py
@@ -235,7 +235,8 @@ def load(
                               language, you'll first need to install the appropriate
                               Tesseract language pack.
         :type ocr_languages: str, optional
-        :param keep_markdown_format: Whether to keep the markdown format, default to False
+        :param keep_markdown_format: Whether to keep the markdown format, defaults to
+            False
         :type keep_markdown_format: bool
         :raises ValueError: _description_
         :raises ImportError: _description_