Merge branch 'danswer-ai:main' into main
colachg authored Mar 24, 2024
2 parents 772ac28 + b8f767a commit 093fe96
Showing 46 changed files with 1,644 additions and 365 deletions.
28 changes: 28 additions & 0 deletions backend/alembic/versions/4738e4b3bae1_pg_file_store.py
@@ -0,0 +1,28 @@
"""PG File Store
Revision ID: 4738e4b3bae1
Revises: e91df4e935ef
Create Date: 2024-03-20 18:53:32.461518
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "4738e4b3bae1"
down_revision = "e91df4e935ef"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.create_table(
"file_store",
sa.Column("file_name", sa.String(), nullable=False),
sa.Column("lobj_oid", sa.Integer(), nullable=False),
sa.PrimaryKeyConstraint("file_name"),
)


def downgrade() -> None:
op.drop_table("file_store")
71 changes: 71 additions & 0 deletions backend/alembic/versions/776b3bbe9092_remove_remaining_enums.py
@@ -0,0 +1,71 @@
"""Remove Remaining Enums
Revision ID: 776b3bbe9092
Revises: 4738e4b3bae1
Create Date: 2024-03-22 21:34:27.629444
"""
from alembic import op
import sqlalchemy as sa

from danswer.db.models import IndexModelStatus
from danswer.search.models import RecencyBiasSetting
from danswer.search.models import SearchType

# revision identifiers, used by Alembic.
revision = "776b3bbe9092"
down_revision = "4738e4b3bae1"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.alter_column(
"persona",
"search_type",
type_=sa.String,
existing_type=sa.Enum(SearchType, native_enum=False),
existing_nullable=False,
)
op.alter_column(
"persona",
"recency_bias",
type_=sa.String,
existing_type=sa.Enum(RecencyBiasSetting, native_enum=False),
existing_nullable=False,
)

# Because the indexmodelstatus enum does not have a mapping to a string type
# we need this workaround instead of directly changing the type
op.add_column("embedding_model", sa.Column("temp_status", sa.String))
op.execute("UPDATE embedding_model SET temp_status = status::text")
op.drop_column("embedding_model", "status")
op.alter_column("embedding_model", "temp_status", new_column_name="status")

op.execute("DROP TYPE IF EXISTS searchtype")
op.execute("DROP TYPE IF EXISTS recencybiassetting")
op.execute("DROP TYPE IF EXISTS indexmodelstatus")


def downgrade() -> None:
op.alter_column(
"persona",
"search_type",
type_=sa.Enum(SearchType, native_enum=False),
existing_type=sa.String(length=50),
existing_nullable=False,
)
op.alter_column(
"persona",
"recency_bias",
type_=sa.Enum(RecencyBiasSetting, native_enum=False),
existing_type=sa.String(length=50),
existing_nullable=False,
)
op.alter_column(
"embedding_model",
"status",
type_=sa.Enum(IndexModelStatus, native_enum=False),
existing_type=sa.String(length=50),
existing_nullable=False,
)
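Since the three native enum types are removed with raw DDL, one way to confirm the upgrade left nothing behind is to query pg_type afterwards. An illustrative check, not part of the migration itself (the connection URL is a placeholder):

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/danswer")
with engine.connect() as conn:
    leftover = conn.execute(
        text(
            "SELECT typname FROM pg_type WHERE typtype = 'e' "
            "AND typname IN ('searchtype', 'recencybiassetting', 'indexmodelstatus')"
        )
    ).fetchall()
    # Expect an empty result once the upgrade has run.
    assert not leftover, f"native enum types still present: {leftover}"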
@@ -0,0 +1,36 @@
"""Remove DocumentSource from Tag
Revision ID: 91fd3b470d1a
Revises: 173cae5bba26
Create Date: 2024-03-21 12:05:23.956734
"""
from alembic import op
import sqlalchemy as sa
from danswer.configs.constants import DocumentSource

# revision identifiers, used by Alembic.
revision = "91fd3b470d1a"
down_revision = "173cae5bba26"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.alter_column(
"tag",
"source",
type_=sa.String(length=50),
existing_type=sa.Enum(DocumentSource, native_enum=False),
existing_nullable=False,
)


def downgrade() -> None:
op.alter_column(
"tag",
"source",
type_=sa.Enum(DocumentSource, native_enum=False),
existing_type=sa.String(length=50),
existing_nullable=False,
)
118 changes: 118 additions & 0 deletions backend/alembic/versions/e91df4e935ef_private_personas_documentsets.py
@@ -0,0 +1,118 @@
"""Private Personas DocumentSets
Revision ID: e91df4e935ef
Revises: 91fd3b470d1a
Create Date: 2024-03-17 11:47:24.675881
"""
import fastapi_users_db_sqlalchemy
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "e91df4e935ef"
down_revision = "91fd3b470d1a"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.create_table(
"document_set__user",
sa.Column("document_set_id", sa.Integer(), nullable=False),
sa.Column(
"user_id",
fastapi_users_db_sqlalchemy.generics.GUID(),
nullable=False,
),
sa.ForeignKeyConstraint(
["document_set_id"],
["document_set.id"],
),
sa.ForeignKeyConstraint(
["user_id"],
["user.id"],
),
sa.PrimaryKeyConstraint("document_set_id", "user_id"),
)
op.create_table(
"persona__user",
sa.Column("persona_id", sa.Integer(), nullable=False),
sa.Column(
"user_id",
fastapi_users_db_sqlalchemy.generics.GUID(),
nullable=False,
),
sa.ForeignKeyConstraint(
["persona_id"],
["persona.id"],
),
sa.ForeignKeyConstraint(
["user_id"],
["user.id"],
),
sa.PrimaryKeyConstraint("persona_id", "user_id"),
)
op.create_table(
"document_set__user_group",
sa.Column("document_set_id", sa.Integer(), nullable=False),
sa.Column(
"user_group_id",
sa.Integer(),
nullable=False,
),
sa.ForeignKeyConstraint(
["document_set_id"],
["document_set.id"],
),
sa.ForeignKeyConstraint(
["user_group_id"],
["user_group.id"],
),
sa.PrimaryKeyConstraint("document_set_id", "user_group_id"),
)
op.create_table(
"persona__user_group",
sa.Column("persona_id", sa.Integer(), nullable=False),
sa.Column(
"user_group_id",
sa.Integer(),
nullable=False,
),
sa.ForeignKeyConstraint(
["persona_id"],
["persona.id"],
),
sa.ForeignKeyConstraint(
["user_group_id"],
["user_group.id"],
),
sa.PrimaryKeyConstraint("persona_id", "user_group_id"),
)

op.add_column(
"document_set",
sa.Column("is_public", sa.Boolean(), nullable=True),
)
# fill in is_public for existing rows
op.execute("UPDATE document_set SET is_public = true WHERE is_public IS NULL")
op.alter_column("document_set", "is_public", nullable=False)

op.add_column(
"persona",
sa.Column("is_public", sa.Boolean(), nullable=True),
)
# fill in is_public for existing rows
op.execute("UPDATE persona SET is_public = true WHERE is_public IS NULL")
op.alter_column("persona", "is_public", nullable=False)


def downgrade() -> None:
op.drop_column("persona", "is_public")

op.drop_column("document_set", "is_public")

op.drop_table("persona__user")
op.drop_table("document_set__user")
op.drop_table("persona__user_group")
op.drop_table("document_set__user_group")
33 changes: 23 additions & 10 deletions backend/danswer/auth/users.py
@@ -279,13 +279,32 @@ async def logout(
# take care of that in `double_check_user` ourself. This is needed, since
# we want the /me endpoint to still return a user even if they are not
# yet verified, so that the frontend knows they exist
optional_valid_user = fastapi_users.current_user(active=True, optional=True)
optional_fastapi_current_user = fastapi_users.current_user(active=True, optional=True)


async def double_check_user(
async def optional_user_(
request: Request,
user: User | None,
db_session: Session,
) -> User | None:
"""NOTE: `request` and `db_session` are not used here, but are included
for the EE version of this function."""
return user


async def optional_user(
request: Request,
user: User | None = Depends(optional_fastapi_current_user),
db_session: Session = Depends(get_session),
) -> User | None:
versioned_fetch_user = fetch_versioned_implementation(
"danswer.auth.users", "optional_user_"
)
return await versioned_fetch_user(request, user, db_session)


async def double_check_user(
user: User | None,
optional: bool = DISABLE_AUTH,
) -> User | None:
if optional:
@@ -307,15 +326,9 @@ async def double_check_user(


async def current_user(
request: Request,
user: User | None = Depends(optional_valid_user),
db_session: Session = Depends(get_session),
user: User | None = Depends(optional_user),
) -> User | None:
double_check_user = fetch_versioned_implementation(
"danswer.auth.users", "double_check_user"
)
user = await double_check_user(request, user, db_session)
return user
return await double_check_user(user)


async def current_admin_user(user: User | None = Depends(current_user)) -> User | None:
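current_user now resolves its hook through fetch_versioned_implementation, so an enterprise build can swap in its own optional_user_ without touching this module. Roughly, such a resolver only needs a dynamic import; the sketch below is illustrative, and the environment-variable name and "ee." module prefix are assumptions rather than the project's confirmed behavior:

import importlib
import os

def fetch_versioned_implementation_sketch(module: str, attribute: str):
    # Prefer an enterprise-edition override of the module when one is enabled,
    # otherwise fall back to the open-source implementation.
    if os.environ.get("ENTERPRISE_EDITION_ENABLED", "").lower() == "true":
        try:
            return getattr(importlib.import_module("ee." + module), attribute)
        except (ModuleNotFoundError, AttributeError):
            pass
    return getattr(importlib.import_module(module), attribute)

# e.g. fetch_versioned_implementation_sketch("danswer.auth.users", "optional_user_")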
6 changes: 1 addition & 5 deletions backend/danswer/background/celery/celery.py
@@ -182,7 +182,7 @@ def check_for_document_sets_sync_task() -> None:
with Session(get_sqlalchemy_engine()) as db_session:
# check if any document sets are not synced
document_set_info = fetch_document_sets(
db_session=db_session, include_outdated=True
user_id=None, db_session=db_session, include_outdated=True
)
for document_set, _ in document_set_info:
if not document_set.is_up_to_date:
@@ -226,8 +226,4 @@ def clean_old_temp_files_task(
"task": "check_for_document_sets_sync_task",
"schedule": timedelta(seconds=5),
},
"clean-old-temp-files": {
"task": "clean_old_temp_files_task",
"schedule": timedelta(minutes=30),
},
}
1 change: 1 addition & 0 deletions backend/danswer/chat/load_yamls.py
@@ -97,6 +97,7 @@ def load_personas_from_yaml(
document_sets=doc_sets,
default_persona=True,
shared=True,
is_public=True,
db_session=db_session,
)

2 changes: 1 addition & 1 deletion backend/danswer/connectors/confluence/connector.py
@@ -75,7 +75,7 @@ def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, st


def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, bool]:
is_confluence_cloud = ".atlassian.net/wiki/spaces/" in wiki_url
is_confluence_cloud = ".atlassian.net/wiki/spaces/" in wiki_url or ".jira.com/wiki/spaces/" in wiki_url

try:
if is_confluence_cloud:
22 changes: 11 additions & 11 deletions backend/danswer/connectors/cross_connector_utils/file_utils.py
Expand Up @@ -2,8 +2,7 @@
import os
import re
import zipfile
from collections.abc import Generator
from pathlib import Path
from collections.abc import Iterator
from typing import Any
from typing import IO

@@ -78,11 +77,11 @@ def is_macos_resource_fork_file(file_name: str) -> bool:
# to the zip file. This file should contain a list of objects with the following format:
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
def load_files_from_zip(
zip_location: str | Path,
zip_file_io: IO,
ignore_macos_resource_fork_files: bool = True,
ignore_dirs: bool = True,
) -> Generator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]], None, None]:
with zipfile.ZipFile(zip_location, "r") as zip_file:
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
zip_metadata = {}
try:
metadata_file_info = zip_file.getinfo(".danswer_metadata.json")
@@ -109,18 +108,19 @@ def load_files_from_zip(
yield file_info, file, zip_metadata.get(file_info.filename, {})


def detect_encoding(file_path: str | Path) -> str:
with open(file_path, "rb") as file:
raw_data = file.read(50000) # Read a portion of the file to guess encoding
return chardet.detect(raw_data)["encoding"] or "utf-8"
def detect_encoding(file: IO[bytes]) -> str:
raw_data = file.read(50000)
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
file.seek(0)
return encoding


def read_file(
file_reader: IO[Any], encoding: str = "utf-8", errors: str = "replace"
file: IO, encoding: str = "utf-8", errors: str = "replace"
) -> tuple[str, dict]:
metadata = {}
file_content_raw = ""
for ind, line in enumerate(file_reader):
for ind, line in enumerate(file):
try:
line = line.decode(encoding) if isinstance(line, bytes) else line
except UnicodeDecodeError:
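The file helpers now take file-like objects instead of filesystem paths, which suits content that may come from the new Postgres-backed file store rather than local disk. A small usage sketch under that assumption; the zip name and surrounding glue are illustrative:

from io import BytesIO

from danswer.connectors.cross_connector_utils.file_utils import (
    detect_encoding,
    load_files_from_zip,
    read_file,
)

# The bytes could equally come from disk or from the database-backed file store.
with open("docs_export.zip", "rb") as f:
    zip_io = BytesIO(f.read())

for file_info, file_obj, zip_metadata in load_files_from_zip(zip_io):
    encoding = detect_encoding(file_obj)  # samples the stream, then seeks back to 0
    text, file_metadata = read_file(file_obj, encoding=encoding)
    print(file_info.filename, len(text), zip_metadata or file_metadata)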
