diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml
index f79d0609822..00b92c9b003 100644
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@@ -10,6 +10,9 @@ on:
 env:
   # Confluence
   CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
+  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
+  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
+  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
   CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
   CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
 
diff --git a/backend/alembic/versions/a3795dce87be_migration_confluence_to_be_explicit.py b/backend/alembic/versions/a3795dce87be_migration_confluence_to_be_explicit.py
new file mode 100644
index 00000000000..ad23892a428
--- /dev/null
+++ b/backend/alembic/versions/a3795dce87be_migration_confluence_to_be_explicit.py
@@ -0,0 +1,175 @@
+"""migration confluence to be explicit
+
+Revision ID: a3795dce87be
+Revises: 1f60f60c3401
+Create Date: 2024-09-01 13:52:12.006740
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy.sql import table, column
+
+revision = "a3795dce87be"
+down_revision = "1f60f60c3401"
+branch_labels = None
+depends_on = None
+
+
+def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
+    """Split a legacy wiki_page_url into (wiki_base, space, page_id, is_cloud).
+
+    Mirrors the URL parsing that this change removes from the Confluence
+    connector, so existing connector configs can be converted in place.
+    """
+    from urllib.parse import urlparse
+
+    def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
+        # https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
+        # -> ("https://danswer.atlassian.net/wiki", "1234abcd", "5678efgh")
+        parsed_url = urlparse(wiki_url)
+        wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
+        path_parts = parsed_url.path.split("/")
+        space = path_parts[3]
+        page_id = path_parts[5] if len(path_parts) > 5 else ""
+        return wiki_base, space, page_id
+
+    def _extract_confluence_keys_from_datacenter_url(
+        wiki_url: str,
+    ) -> tuple[str, str, str]:
+        # https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
+        # -> ("https://danswer.ai/confluence", "1234abcd", "5678efgh")
+        DISPLAY = "/display/"
+        PAGE = "/pages/"
+        parsed_url = urlparse(wiki_url)
+        wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split(DISPLAY)[0]}"
+        space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
+        page_id = ""
+        if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
+            # Keep only the id segment; whatever follows it (e.g. "/overview")
+            # is not part of the page id.
+            page_id = content[1].split("/")[0]
+        return wiki_base, space, page_id
+
+    is_confluence_cloud = (
+        ".atlassian.net/wiki/spaces/" in wiki_url
+        or ".jira.com/wiki/spaces/" in wiki_url
+    )
+
+    if is_confluence_cloud:
+        wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
+    else:
+        wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
+            wiki_url
+        )
+
+    return wiki_base, space, page_id, is_confluence_cloud
+
+
+def reconstruct_confluence_url(
+    wiki_base: str, space: str, page_id: str, is_cloud: bool
+) -> str:
+    """Rebuild a legacy wiki_page_url from the explicit config keys."""
+    # Cloud URLs put the space under /spaces/, Server/Data Center under /display/.
+    space_segment = "spaces" if is_cloud else "display"
+    url = f"{wiki_base}/{space_segment}/{space}"
+    if page_id:
+        url += f"/pages/{page_id}"
+    return url
+
+
+def upgrade() -> None:
+    """Replace wiki_page_url with wiki_base/space/page_id/is_cloud in each config."""
+    connector = table(
+        "connector",
+        column("id", sa.Integer),
+        column("source", sa.String()),
+        column("input_type", sa.String()),
+        column("connector_specific_config", postgresql.JSONB),
+    )
+
+    # Fetch all Confluence connectors
+    connection = op.get_bind()
+    confluence_connectors = connection.execute(
+        sa.select(connector).where(
+            sa.and_(
+                connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
+            )
+        )
+    ).fetchall()
+
+    for row in confluence_connectors:
+        config = row.connector_specific_config
+        # Rows without the legacy key have nothing to convert; skip them rather
+        # than aborting the whole migration with a KeyError/TypeError.
+        if not config or "wiki_page_url" not in config:
+            continue
+
+        wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
+            config["wiki_page_url"]
+        )
+
+        new_config = {
+            "wiki_base": wiki_base,
+            "space": space,
+            "page_id": page_id,
+            "is_cloud": is_cloud,
+        }
+
+        # Carry every other setting over unchanged.
+        for key, value in config.items():
+            if key != "wiki_page_url":
+                new_config[key] = value
+
+        op.execute(
+            connector.update()
+            .where(connector.c.id == row.id)
+            .values(connector_specific_config=new_config)
+        )
+
+
+def downgrade() -> None:
+    """Collapse the explicit keys back into a single wiki_page_url per config."""
+    connector = table(
+        "connector",
+        column("id", sa.Integer),
+        column("source", sa.String()),
+        column("input_type", sa.String()),
+        column("connector_specific_config", postgresql.JSONB),
+    )
+
+    confluence_connectors = (
+        op.get_bind()
+        .execute(
+            sa.select(connector).where(
+                connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
+            )
+        )
+        .fetchall()
+    )
+
+    for row in confluence_connectors:
+        config = row.connector_specific_config
+        # Only configs that carry all required explicit keys can be rebuilt.
+        if all(key in config for key in ["wiki_base", "space", "is_cloud"]):
+            wiki_page_url = reconstruct_confluence_url(
+                config["wiki_base"],
+                config["space"],
+                config.get("page_id", ""),
+                config["is_cloud"],
+            )
+
+            new_config = {"wiki_page_url": wiki_page_url}
+            new_config.update(
+                {
+                    k: v
+                    for k, v in config.items()
+                    if k not in ["wiki_base", "space", "page_id", "is_cloud"]
+                }
+            )
+
+            op.execute(
+                connector.update()
+                .where(connector.c.id == row.id)
+                .values(connector_specific_config=new_config)
+            )
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index b8dc967a3d9..78efce4ab98 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -7,7 +7,6 @@
 from functools import lru_cache
 from typing import Any
 from typing import cast
-from urllib.parse import urlparse
 
 import bs4
 from atlassian import Confluence  # type:ignore
@@ -53,79 +52,6 @@
 )
 
 
-def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
-    """Sample
-    URL w/ page: https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
-    URL w/o page: https://danswer.atlassian.net/wiki/spaces/ASAM/overview
-
-    wiki_base is https://danswer.atlassian.net/wiki
-    space is 1234abcd
-    page_id is 5678efgh
-    """
-    parsed_url = urlparse(wiki_url)
-    wiki_base = (
-        parsed_url.scheme
-        + "://"
-        + parsed_url.netloc
-        + parsed_url.path.split("/spaces")[0]
-    )
-
-    path_parts = parsed_url.path.split("/")
-    space = path_parts[3]
-
-    page_id = path_parts[5] if len(path_parts) > 5 else ""
-    return wiki_base, space, page_id
-
-
-def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str, str]:
-    """Sample
-    URL w/ page https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
-    URL w/o page https://danswer.ai/confluence/display/1234abcd/overview
-    wiki_base is https://danswer.ai/confluence
-    space is 1234abcd
-    page_id is 5678efgh
-    """
-    # /display/ is always right before the space and at the end of the base print()
-    DISPLAY = "/display/"
-    PAGE = "/pages/"
-
-    parsed_url = urlparse(wiki_url)
-    wiki_base = (
-        parsed_url.scheme
-        + "://"
-        + parsed_url.netloc
-        + parsed_url.path.split(DISPLAY)[0]
-    )
-    space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
-    page_id = ""
-    if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
-        page_id = content[1]
-    return wiki_base, space, page_id
-
-
-def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
-    is_confluence_cloud = (
-        ".atlassian.net/wiki/spaces/" in wiki_url
-        or ".jira.com/wiki/spaces/" in wiki_url
-    )
-
-    try:
-        if is_confluence_cloud:
-            wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
-                wiki_url
-            )
-        else:
-            wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
-                wiki_url
-            )
-    except Exception as e:
-        error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base, space, and page id. Exception: {e}"
-        logger.error(error_msg)
-        raise ValueError(error_msg)
-
-    return wiki_base, space, page_id, is_confluence_cloud
-
-
 @lru_cache()
 def _get_user(user_id: str, confluence_client: Confluence) -> str:
     """Get Confluence Display Name based on the account-id or userkey value
@@ -372,7 +298,10 @@ def _fetch_single_depth_child_pages(
 class ConfluenceConnector(LoadConnector, PollConnector):
     def __init__(
         self,
-        wiki_page_url: str,
+        wiki_base: str,
+        space: str,
+        is_cloud: bool,
+        page_id: str = "",
         index_recursively: bool = True,
         batch_size: int = INDEX_BATCH_SIZE,
         continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
@@ -386,15 +315,15 @@ def __init__(
         self.labels_to_skip = set(labels_to_skip)
         self.recursive_indexer: RecursiveIndexer | None = None
         self.index_recursively = index_recursively
-        (
-            self.wiki_base,
-            self.space,
-            self.page_id,
-            self.is_cloud,
-        ) = extract_confluence_keys_from_url(wiki_page_url)
 
-        self.space_level_scan = False
+        # Remove trailing slash from wiki_base if present
+        self.wiki_base = wiki_base.rstrip("/")
+        self.space = space
+        self.page_id = page_id
 
+        self.is_cloud = is_cloud
+
+        self.space_level_scan = False
         self.confluence_client: Confluence | None = None
 
         if self.page_id is None or self.page_id == "":
@@ -414,7 +343,6 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None
             username=username if self.is_cloud else None,
             password=access_token if self.is_cloud else None,
             token=access_token if not self.is_cloud else None,
-            cloud=self.is_cloud,
         )
         return None
 
@@ -866,7 +794,13 @@ def poll_source(
 
 
 if __name__ == "__main__":
-    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector = ConfluenceConnector(
+        wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
+        space=os.environ["CONFLUENCE_TEST_SPACE"],
+        is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
+        page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
+        index_recursively=True,
+    )
     connector.load_credentials(
         {
             "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
diff --git a/backend/tests/daily/connectors/confluence/test_confluence_basic.py b/backend/tests/daily/connectors/confluence/test_confluence_basic.py
index 7f05242c50b..4eb25207814 100644
--- a/backend/tests/daily/connectors/confluence/test_confluence_basic.py
+++ b/backend/tests/daily/connectors/confluence/test_confluence_basic.py
@@ -8,7 +8,13 @@
 
 @pytest.fixture
 def confluence_connector() -> ConfluenceConnector:
-    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector = ConfluenceConnector(
+        wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
+        space=os.environ["CONFLUENCE_TEST_SPACE"],
+        is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
+        page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
+    )
+
     connector.load_credentials(
         {
             "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
diff --git a/web/src/components/admin/connectors/ConnectorTitle.tsx b/web/src/components/admin/connectors/ConnectorTitle.tsx
index 3fd62bd261b..269c72e905f 100644
--- a/web/src/components/admin/connectors/ConnectorTitle.tsx
+++ b/web/src/components/admin/connectors/ConnectorTitle.tsx
@@ -48,10 +48,18 @@ export const ConnectorTitle = ({
     );
   } else if (connector.source === "confluence") {
     const typedConnector = connector as Connector;
-    additionalMetadata.set(
-      "Wiki URL",
-      typedConnector.connector_specific_config.wiki_page_url
-    );
+    // wiki_base already ends at the Confluence root (e.g. ".../wiki" on Cloud),
+    // so only the space path is appended: /spaces/ on Cloud, /display/ otherwise.
+    const wikiUrl = typedConnector.connector_specific_config.is_cloud
+      ? `${typedConnector.connector_specific_config.wiki_base}/spaces/${typedConnector.connector_specific_config.space}`
+      : `${typedConnector.connector_specific_config.wiki_base}/display/${typedConnector.connector_specific_config.space}`;
+    additionalMetadata.set("Wiki URL", wikiUrl);
+    if (typedConnector.connector_specific_config.page_id) {
+      additionalMetadata.set(
+        "Page ID",
+        typedConnector.connector_specific_config.page_id
+      );
+    }
   } else if (connector.source === "jira") {
     const typedConnector = connector as Connector;
     additionalMetadata.set(
diff --git a/web/src/lib/connectors/connectors.ts b/web/src/lib/connectors/connectors.ts
index c66dbd2453b..90fa0b732a6 100644
--- a/web/src/lib/connectors/connectors.ts
+++ b/web/src/lib/connectors/connectors.ts
@@ -219,19 +219,37 @@ export const connectorConfigs: Record = {
   },
   confluence: {
     description: "Configure Confluence connector",
-    subtext: `Specify any link to a Confluence page below and click "Index" to Index. If the provided link is for an entire space, we will index the entire space. However, if you want to index a specific page, you can do so by entering the page's URL.
-
-For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page (and optionally the page's children).
+    subtext: `Specify the base URL of your Confluence instance, the space name, and optionally a specific page ID to index. If no page ID is provided, the entire space will be indexed.
 
-Selecting the "Index Recursively" checkbox will index the single page's children in addition to itself.`,
+For example, entering "https://pablosfsanchez.atlassian.net/wiki" as the Wiki Base URL, "KB" as the Space, and "164331" as the Page ID will index the specific page at https://pablosfsanchez.atlassian.net/wiki/spaces/KB/pages/164331/Page. If you leave the Page ID empty, it will index the entire KB space.
+
+Selecting the "Index Recursively" checkbox will index the specified page and all of its children.`,
     values: [
       {
         type: "text",
-        query: "Enter the wiki page URL:",
-        label: "Wiki Page URL",
-        name: "wiki_page_url",
+        query: "Enter the wiki base URL:",
+        label: "Wiki Base URL",
+        name: "wiki_base",
         optional: false,
-        description: "Enter any link to a Confluence space or Page",
+        description:
+          "The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)",
+      },
+      {
+        type: "text",
+        query: "Enter the space:",
+        label: "Space",
+        name: "space",
+        optional: false,
+        description: "The Confluence space name to index (e.g. `KB`)",
+      },
+      {
+        type: "text",
+        query: "Enter the page ID (optional):",
+        label: "Page ID",
+        name: "page_id",
+        optional: true,
+        description:
+          "Specific page ID to index - leave empty to index the entire space (e.g. `131368`)",
       },
       {
         type: "checkbox",
@@ -241,6 +259,16 @@ Selecting the "Index Recursively" checkbox will index the single page's children
         name: "index_recursively",
         optional: false,
       },
+      {
+        type: "checkbox",
+        query: "Is this a Confluence Cloud instance?",
+        label: "Is Cloud",
+        name: "is_cloud",
+        optional: false,
+        default: true,
+        description:
+          "Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center",
+      },
     ],
   },
   jira: {
@@ -817,7 +845,10 @@ export interface GmailConfig {}
 export interface BookstackConfig {}
 
 export interface ConfluenceConfig {
-  wiki_page_url: string;
+  wiki_base: string;
+  space: string;
+  page_id?: string;
+  is_cloud?: boolean;
   index_recursively?: boolean;
 }
 