Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(confluence): add markdown format option #8246

Merged
merged 3 commits into from
Jul 26, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 38 additions & 11 deletions libs/langchain/langchain/document_loaders/confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ def load(
limit: Optional[int] = 50,
max_pages: Optional[int] = 1000,
ocr_languages: Optional[str] = None,
keep_markdown_format: bool = False,
) -> List[Document]:
"""
:param space_key: Space key retrieved from a confluence URL, defaults to None
Expand Down Expand Up @@ -234,6 +235,9 @@ def load(
language, you'll first need to install the appropriate
Tesseract language pack.
:type ocr_languages: str, optional
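:param keep_markdown_format: Whether to convert the page body to Markdown
with `markdownify` (ATX-style headings) instead of extracting plain text
with BeautifulSoup. When True, the page's ``body.storage`` value is used
directly and ``content_format`` is not consulted. Defaults to False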
:type keep_markdown_format: bool
:raises ValueError: _description_
:raises ImportError: _description_
:return: _description_
Expand Down Expand Up @@ -263,6 +267,7 @@ def load(
include_comments,
content_format,
ocr_languages,
keep_markdown_format,
)

if label:
Expand Down Expand Up @@ -294,6 +299,7 @@ def load(
include_comments,
content_format,
ocr_languages,
keep_markdown_format,
)

if page_ids:
Expand All @@ -319,6 +325,7 @@ def load(
include_comments,
content_format,
ocr_languages,
keep_markdown_format,
)
docs.append(doc)

Expand Down Expand Up @@ -397,6 +404,7 @@ def process_pages(
include_comments: bool,
content_format: ContentFormat,
ocr_languages: Optional[str] = None,
keep_markdown_format: Optional[bool] = False,
) -> List[Document]:
"""Process a list of pages into a list of documents."""
docs = []
Expand All @@ -409,6 +417,7 @@ def process_pages(
include_comments,
content_format,
ocr_languages,
keep_markdown_format,
)
docs.append(doc)

Expand All @@ -421,24 +430,42 @@ def process_page(
include_comments: bool,
content_format: ContentFormat,
ocr_languages: Optional[str] = None,
keep_markdown_format: Optional[bool] = False,
) -> Document:
try:
from bs4 import BeautifulSoup # type: ignore
except ImportError:
raise ImportError(
"`beautifulsoup4` package not found, please run "
"`pip install beautifulsoup4`"
)
if keep_markdown_format:
try:
from markdownify import markdownify
except ImportError:
raise ImportError(
"`markdownify` package not found, please run "
"`pip install markdownify`"
)
else:
try:
from bs4 import BeautifulSoup # type: ignore
except ImportError:
raise ImportError(
"`beautifulsoup4` package not found, please run "
"`pip install beautifulsoup4`"
)

if include_attachments:
attachment_texts = self.process_attachment(page["id"], ocr_languages)
else:
attachment_texts = []

content = content_format.get_content(page)
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
attachment_texts
)
if keep_markdown_format:
# Use markdownify to keep the page Markdown style
text = markdownify(
page["body"]["storage"]["value"], heading_style="ATX"
) + "".join(attachment_texts)

else:
content = content_format.get_content(page)
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
attachment_texts
)

if include_comments:
comments = self.confluence.get_page_comments(
page["id"], expand="body.view.value", depth="all"
Expand Down