Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More explicit Confluence Connector #2289

Merged
merged 10 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/pr-python-connector-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ on:
env:
# Confluence
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""migration confluence to be explicit

Revision ID: a3795dce87be
Revises: 1f60f60c3401
Create Date: 2024-09-01 13:52:12.006740

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from sqlalchemy.sql import table, column

revision = "a3795dce87be"
down_revision = "1f60f60c3401"
branch_labels = None
depends_on = None


def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
from urllib.parse import urlparse

def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
parsed_url = urlparse(wiki_url)
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
path_parts = parsed_url.path.split("/")
space = path_parts[3]
page_id = path_parts[5] if len(path_parts) > 5 else ""
return wiki_base, space, page_id

def _extract_confluence_keys_from_datacenter_url(
wiki_url: str,
) -> tuple[str, str, str]:
DISPLAY = "/display/"
PAGE = "/pages/"
parsed_url = urlparse(wiki_url)
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split(DISPLAY)[0]}"
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
page_id = ""
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
page_id = content[1]
return wiki_base, space, page_id

is_confluence_cloud = (
".atlassian.net/wiki/spaces/" in wiki_url
or ".jira.com/wiki/spaces/" in wiki_url
)

if is_confluence_cloud:
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
else:
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
wiki_url
)

return wiki_base, space, page_id, is_confluence_cloud


def reconstruct_confluence_url(
wiki_base: str, space: str, page_id: str, is_cloud: bool
) -> str:
if is_cloud:
url = f"{wiki_base}/spaces/{space}"
if page_id:
url += f"/pages/{page_id}"
else:
url = f"{wiki_base}/display/{space}"
if page_id:
url += f"/pages/{page_id}"
return url


def upgrade() -> None:
connector = table(
"connector",
column("id", sa.Integer),
column("source", sa.String()),
column("input_type", sa.String()),
column("connector_specific_config", postgresql.JSONB),
)

# Fetch all Confluence connectors
connection = op.get_bind()
confluence_connectors = connection.execute(
sa.select(connector).where(
sa.and_(
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
)
)
).fetchall()

for row in confluence_connectors:
config = row.connector_specific_config
wiki_page_url = config["wiki_page_url"]
wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
wiki_page_url
)

new_config = {
"wiki_base": wiki_base,
"space": space,
"page_id": page_id,
"is_cloud": is_cloud,
}

for key, value in config.items():
if key not in ["wiki_page_url"]:
new_config[key] = value

op.execute(
connector.update()
.where(connector.c.id == row.id)
.values(connector_specific_config=new_config)
)


def downgrade() -> None:
connector = table(
"connector",
column("id", sa.Integer),
column("source", sa.String()),
column("input_type", sa.String()),
column("connector_specific_config", postgresql.JSONB),
)

confluence_connectors = (
op.get_bind()
.execute(
sa.select(connector).where(
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
)
)
.fetchall()
)

for row in confluence_connectors:
config = row.connector_specific_config
if all(key in config for key in ["wiki_base", "space", "is_cloud"]):
wiki_page_url = reconstruct_confluence_url(
config["wiki_base"],
config["space"],
config.get("page_id", ""),
config["is_cloud"],
)

new_config = {"wiki_page_url": wiki_page_url}
new_config.update(
{
k: v
for k, v in config.items()
if k not in ["wiki_base", "space", "page_id", "is_cloud"]
}
)

op.execute(
connector.update()
.where(connector.c.id == row.id)
.values(connector_specific_config=new_config)
)
102 changes: 18 additions & 84 deletions backend/danswer/connectors/confluence/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from functools import lru_cache
from typing import Any
from typing import cast
from urllib.parse import urlparse

import bs4
from atlassian import Confluence # type:ignore
Expand Down Expand Up @@ -53,79 +52,6 @@
)


def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
"""Sample
URL w/ page: https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
URL w/o page: https://danswer.atlassian.net/wiki/spaces/ASAM/overview

wiki_base is https://danswer.atlassian.net/wiki
space is 1234abcd
page_id is 5678efgh
"""
parsed_url = urlparse(wiki_url)
wiki_base = (
parsed_url.scheme
+ "://"
+ parsed_url.netloc
+ parsed_url.path.split("/spaces")[0]
)

path_parts = parsed_url.path.split("/")
space = path_parts[3]

page_id = path_parts[5] if len(path_parts) > 5 else ""
return wiki_base, space, page_id


def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str, str]:
"""Sample
URL w/ page https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
URL w/o page https://danswer.ai/confluence/display/1234abcd/overview
wiki_base is https://danswer.ai/confluence
space is 1234abcd
page_id is 5678efgh
"""
# /display/ is always right before the space and at the end of the base print()
DISPLAY = "/display/"
PAGE = "/pages/"

parsed_url = urlparse(wiki_url)
wiki_base = (
parsed_url.scheme
+ "://"
+ parsed_url.netloc
+ parsed_url.path.split(DISPLAY)[0]
)
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
page_id = ""
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
page_id = content[1]
return wiki_base, space, page_id


def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
is_confluence_cloud = (
".atlassian.net/wiki/spaces/" in wiki_url
or ".jira.com/wiki/spaces/" in wiki_url
)

try:
if is_confluence_cloud:
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
wiki_url
)
else:
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
wiki_url
)
except Exception as e:
error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base, space, and page id. Exception: {e}"
logger.error(error_msg)
raise ValueError(error_msg)

return wiki_base, space, page_id, is_confluence_cloud


@lru_cache()
def _get_user(user_id: str, confluence_client: Confluence) -> str:
"""Get Confluence Display Name based on the account-id or userkey value
Expand Down Expand Up @@ -372,7 +298,10 @@ def _fetch_single_depth_child_pages(
class ConfluenceConnector(LoadConnector, PollConnector):
def __init__(
self,
wiki_page_url: str,
wiki_base: str,
space: str,
is_cloud: bool,
page_id: str = "",
index_recursively: bool = True,
batch_size: int = INDEX_BATCH_SIZE,
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
Expand All @@ -386,15 +315,15 @@ def __init__(
self.labels_to_skip = set(labels_to_skip)
self.recursive_indexer: RecursiveIndexer | None = None
self.index_recursively = index_recursively
(
self.wiki_base,
self.space,
self.page_id,
self.is_cloud,
) = extract_confluence_keys_from_url(wiki_page_url)

self.space_level_scan = False
# Remove trailing slash from wiki_base if present
self.wiki_base = wiki_base.rstrip("/")
self.space = space
self.page_id = page_id

self.is_cloud = is_cloud

self.space_level_scan = False
self.confluence_client: Confluence | None = None

if self.page_id is None or self.page_id == "":
Expand All @@ -414,7 +343,6 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None
username=username if self.is_cloud else None,
password=access_token if self.is_cloud else None,
token=access_token if not self.is_cloud else None,
cloud=self.is_cloud,
)
return None

Expand Down Expand Up @@ -866,7 +794,13 @@ def poll_source(


if __name__ == "__main__":
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
connector = ConfluenceConnector(
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
space=os.environ["CONFLUENCE_TEST_SPACE"],
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
index_recursively=True,
)
connector.load_credentials(
{
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@

@pytest.fixture
def confluence_connector() -> ConfluenceConnector:
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
connector = ConfluenceConnector(
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
space=os.environ["CONFLUENCE_TEST_SPACE"],
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
)

connector.load_credentials(
{
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
Expand Down
14 changes: 10 additions & 4 deletions web/src/components/admin/connectors/ConnectorTitle.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,16 @@ export const ConnectorTitle = ({
);
} else if (connector.source === "confluence") {
const typedConnector = connector as Connector<ConfluenceConfig>;
additionalMetadata.set(
"Wiki URL",
typedConnector.connector_specific_config.wiki_page_url
);
const wikiUrl = typedConnector.connector_specific_config.is_cloud
? `${typedConnector.connector_specific_config.wiki_base}/wiki/spaces/${typedConnector.connector_specific_config.space}`
: `${typedConnector.connector_specific_config.wiki_base}/spaces/${typedConnector.connector_specific_config.space}`;
additionalMetadata.set("Wiki URL", wikiUrl);
if (typedConnector.connector_specific_config.page_id) {
additionalMetadata.set(
"Page ID",
typedConnector.connector_specific_config.page_id
);
}
} else if (connector.source === "jira") {
const typedConnector = connector as Connector<JiraConfig>;
additionalMetadata.set(
Expand Down
Loading
Loading