
Commit

feat(helpers): vectorstore oriented object (#18)
Co-authored-by: leoguillaume <[email protected]>
2 people authored and anthonyjacquelin committed Sep 13, 2024
1 parent 1cb3d6d commit cf6a263
Showing 23 changed files with 684 additions and 932 deletions.
69 changes: 69 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,69 @@
# Contributions

To contribute to the project, please follow the instructions below.

# Commit

Please follow the convention below for your commit messages:

```
[doc|feat|fix](*) commit object (in english)
# example
feat(collections): collection name retriever
```

\*The theme is optional and must correspond to an area of the code base (deploy, collections, models, ...).

# Packages

1. Install [libmagic](https://man7.org/linux/man-pages/man3/libmagic.3.html)

2. In a Python virtual environment, install the Python packages listed in *[pyproject.toml](./app/pyproject.toml)*:

```bash
pip install app/.
```

# Tests

Before each pull request, please check that your API deploys correctly by running the unit tests.

1. After creating a *config.yml* file, start the API locally:

```bash
uvicorn app.main:app --port 8080 --log-level debug --reload
```

2. Run the unit tests (a minimal example test is sketched after the command):

```bash
PYTHONPATH=. pytest -v --exitfirst app/tests --base-url http://localhost:8080/v1 --api-key API_KEY
```
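
For reference, a new test typically calls the running API through the base URL and API key passed on the command line above. Below is a minimal, hypothetical sketch: the `base_url` and `api_key` fixtures and the Bearer authorization scheme are assumptions mirroring the `--base-url` and `--api-key` options, not the project's actual conftest.

```python
# Hypothetical minimal test. The base_url and api_key fixtures are
# assumptions mirroring the --base-url / --api-key CLI options above;
# the real fixtures are defined in the test suite's conftest.py.
import requests


def test_get_collections(base_url, api_key):
    # GET /collections is exposed by the API (see app/endpoints/collections.py).
    response = requests.get(f"{base_url}/collections", headers={"Authorization": f"Bearer {api_key}"})
    assert response.status_code == 200
```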

# Linter

The project's linter is [Ruff](https://beta.ruff.rs/docs/configuration/). The project-specific formatting rules are defined in *[pyproject.toml](./app/pyproject.toml)*.

## Configuring Ruff in VSCode

1. Install the *Ruff* extension (charliermarsh.ruff) in VSCode
2. Configure the Ruff linter in VSCode to use *[pyproject.toml](./app/pyproject.toml)*.
Using the VSCode command palette (⇧⌘P), search for and select *Preferences: Open User Settings (JSON)*.

In the JSON file that opens, add the following lines at the end:
```json
"ruff.configuration": "<path to pyproject.toml>",
"ruff.format.preview": true,
"ruff.lineLength": 150,
"ruff.codeAction.fixViolation": {
"enable": false
},
"ruff.nativeServer": "on"
```
⚠️ **Warning**: make sure *[pyproject.toml](./app/pyproject.toml)* is correctly specified in the configuration.
3. **To run the linter, open the VSCode command palette (⇧⌘P) from the file you want to lint, then search for and select *Ruff: Format document* and *Ruff: Format imports*.**
18 changes: 3 additions & 15 deletions app/endpoints/chat.py
@@ -17,9 +17,7 @@


@router.post("/chat/completions")
async def chat_completions(
request: ChatCompletionRequest, user: str = Security(check_api_key)
) -> Union[ChatCompletion, ChatCompletionChunk]:
async def chat_completions(request: ChatCompletionRequest, user: str = Security(check_api_key)) -> Union[ChatCompletion, ChatCompletionChunk]:
"""Completion API similar to OpenAI's API.
See https://platform.openai.com/docs/api-reference/chat/create for the API specification.
"""
@@ -60,12 +58,7 @@ async def chat_completions(
# non stream case
if not request["stream"]:
async_client = httpx.AsyncClient(timeout=20)
response = await async_client.request(
method="POST",
url=url,
headers=headers,
json=request,
)
response = await async_client.request(method="POST", url=url, headers=headers, json=request)
response.raise_for_status()
data = response.json()
data["metadata"] = metadata
@@ -74,12 +67,7 @@ async def chat_completions(
# stream case
async def forward_stream(client, request: dict):
async with httpx.AsyncClient(timeout=20) as async_client:
async with async_client.stream(
method="POST",
url=url,
headers=headers,
json=request,
) as response:
async with async_client.stream(method="POST", url=url, headers=headers, json=request) as response:
i = 0
async for chunk in response.aiter_raw():
if i == 0:
5 changes: 3 additions & 2 deletions app/endpoints/chunks.py
@@ -6,7 +6,7 @@
from app.schemas.chunks import Chunks, Chunk, ChunkRequest
from app.utils.security import check_api_key
from app.utils.lifespan import clients
from app.utils.data import get_chunks
from app.helpers._vectorstore import VectorStore

router = APIRouter()

@@ -23,9 +23,10 @@ async def chunks(
Get a chunk.
"""

vectorstore = VectorStore(clients=clients, user=user)
ids = [chunk] if chunk else dict(request)["chunks"]
filter = Filter(must=[HasIdCondition(has_id=ids)])
chunks = get_chunks(vectorstore=clients["vectors"], collection=collection, user=user, filter=filter)
chunks = vectorstore.get_chunks(collection_name=collection, filter=filter)
if not request:
return chunks[0]

52 changes: 30 additions & 22 deletions app/endpoints/collections.py
@@ -5,44 +5,52 @@
from app.schemas.collections import Collections, Collection
from app.utils.security import check_api_key
from app.utils.lifespan import clients
from app.utils.data import get_collection_metadata, get_collections_metadata, delete_contents
from app.utils.data import delete_contents
from app.utils.config import LOGGER
from app.helpers import VectorStore

router = APIRouter()


@router.get("/collections/{collection}")
@router.get("/collections")
async def get_collections(
collection: Optional[str] = None, user: str = Security(check_api_key)
) -> Union[Collection, Collections]:
async def get_collections(collection: Optional[str] = None, user: str = Security(check_api_key)) -> Union[Collection, Collections]:
"""
Get list of collections.
"""
if collection is None:
collections = get_collections_metadata(vectorstore=clients["vectors"], user=user)
LOGGER.debug(f"collections: {collections}")
return collections
else:
collection = get_collection_metadata(
vectorstore=clients["vectors"], user=user, collection=collection

vectorstore = VectorStore(clients=clients, user=user)

collections = vectorstore.get_collection_metadata(collection_names=[collection])
LOGGER.debug(f"collections: {collections}")
data = []
for row in collections:
data.append(
Collection(
id=row.name,
type=row.type,
model=row.model,
user=row.user,
description=row.description,
created_at=row.created_at,
updated_at=row.updated_at,
)
)
LOGGER.debug(f"collection: {collection}")
return collection

if collection:
return data[0]

return Collections(data=data)


@router.delete("/collections/{collection}")
@router.delete("/collections")
async def delete_collections(
collection: Optional[str] = None, user: str = Security(check_api_key)
) -> Response:
async def delete_collections(collection: Optional[str] = None, user: str = Security(check_api_key)) -> Response:
"""
Get private collections and relative files.
"""
response = delete_contents(
s3=clients["files"],
vectorstore=clients["vectors"],
user=user,
collection=collection,
)

vectorstore = VectorStore(clients=clients, user=user)
response = delete_contents(s3=clients["files"], vectorstore=vectorstore, collection_name=collection)

return response
76 changes: 22 additions & 54 deletions app/endpoints/files.py
@@ -2,19 +2,16 @@
import uuid
from typing import List, Optional, Union

from fastapi import APIRouter, Response, Security, UploadFile, HTTPException
from botocore.exceptions import ClientError
from langchain_qdrant import QdrantVectorStore
from qdrant_client.http.models import Filter, FieldCondition, MatchAny
from fastapi import APIRouter, HTTPException, Response, Security, UploadFile
from qdrant_client.http.models import FieldCondition, Filter, MatchAny

from app.helpers import S3FileLoader, VectorStore
from app.schemas.files import File, Files, Upload, Uploads
from app.schemas.config import PRIVATE_COLLECTION_TYPE
from app.utils.config import LOGGER
from app.utils.security import check_api_key
from app.utils.data import get_chunks, get_collection_id, delete_contents, create_collection
from app.utils.data import delete_contents
from app.utils.lifespan import clients
from app.helpers import S3FileLoader

from app.utils.security import check_api_key

router = APIRouter()

@@ -50,23 +47,14 @@ async def upload_files(
- **files** : Files to upload.
"""

loader = S3FileLoader(s3=clients["files"], chunk_size=chunk_size, chunk_overlap=chunk_overlap, chunk_min_size=chunk_min_size)
vectorstore = VectorStore(clients=clients, user=user)

# if collection already exists, return collection ID too
collection_id = create_collection(
collection=collection,
vectorstore=clients["vectors"],
embeddings_model=embeddings_model,
user=user,
)
collection_id = vectorstore.create_collection(collection_name=collection, model=embeddings_model)

# upload
data = list()
loader = S3FileLoader(
s3=clients["files"],
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
chunk_min_size=chunk_min_size,
)

try:
clients["files"].head_bucket(Bucket=collection_id)
except ClientError:
@@ -82,13 +70,7 @@
file.file,
collection_id,
file_id,
ExtraArgs={
"ContentType": file.content_type,
"Metadata": {
"filename": encoded_file_name,
"id": file_id,
},
},
ExtraArgs={"ContentType": file.content_type, "Metadata": {"filename": encoded_file_name, "id": file_id}},
)
except Exception as e:
LOGGER.error(f"store {file_name}:\n{e}")
@@ -108,14 +90,11 @@ async def files(
continue

try:
for document in documents:
document.id = str(uuid.uuid4())
# create vectors from documents
db = await QdrantVectorStore.afrom_documents(
documents=documents,
embedding=clients["models"][embeddings_model].embedding,
collection_name=collection_id,
url=clients["vectors"].url,
api_key=clients["vectors"].api_key,
)
vectorstore.from_documents(documents=documents, model=embeddings_model, collection_name=collection)

except Exception as e:
LOGGER.error(f"create vectors of {file_name}:\n{e}")
clients["files"].delete_object(Bucket=collection_id, Key=file_id)
@@ -138,19 +117,15 @@ async def files(
Get files from a collection. Only files from private collections are returned.
"""

collection_id = get_collection_id(
vectorstore=clients["vectors"],
collection=collection,
user=user,
type=PRIVATE_COLLECTION_TYPE,
)
vectorstore = VectorStore(clients=clients, user=user)
collection = vectorstore.get_collection_metadata(collection_names=[collection])[0]

data = list()
objects = clients["files"].list_objects_v2(Bucket=collection_id).get("Contents", [])
objects = [object | clients["files"].head_object(Bucket=collection_id, Key=object["Key"])["Metadata"] for object in objects] # fmt: off
objects = clients["files"].list_objects_v2(Bucket=collection.id).get("Contents", [])
objects = [object | clients["files"].head_object(Bucket=collection.id, Key=object["Key"])["Metadata"] for object in objects]
file_ids = [object["Key"] for object in objects]
filter = Filter(must=[FieldCondition(key="metadata.file_id", match=MatchAny(any=file_ids))])
chunks = get_chunks(vectorstore=clients["vectors"], collection=collection, filter=filter, user=user)
chunks = vectorstore.get_chunks(collection_name=collection.name, filter=filter)

for object in objects:
chunk_ids = list()
@@ -180,19 +155,12 @@ async def files(

@router.delete("/files/{collection}/{file}")
@router.delete("/files/{collection}")
async def delete_file(
collection: str, file: Optional[str] = None, user: str = Security(check_api_key)
) -> Response:
async def delete_file(collection: str, file: Optional[str] = None, user: str = Security(check_api_key)) -> Response:
"""
Delete files and relative collections. Only files from private collections can be deleted.
"""

response = delete_contents(
s3=clients["files"],
vectorstore=clients["vectors"],
user=user,
collection=collection,
file=file,
)
vectorstore = VectorStore(clients=clients, user=user)
response = delete_contents(s3=clients["files"], vectorstore=vectorstore, collection_name=collection, file=file)

return response
1 change: 1 addition & 0 deletions app/helpers/__init__.py
@@ -2,3 +2,4 @@
from ._textcleaner import TextCleaner
from ._universalparser import UniversalParser
from ._gristkeymanager import GristKeyManager
from ._vectorstore import VectorStore
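
The excerpt above shows only the call sites of the new helper; the implementation in app/helpers/_vectorstore.py is not included in this diff view. For orientation, here is a rough interface sketch reconstructed from those call sites (chunks.py, collections.py, files.py). Method signatures, return types, and attribute names are inferred assumptions, not the actual implementation.

```python
# Interface sketch only, inferred from the call sites in this diff.
# The real implementation in app/helpers/_vectorstore.py is not shown here,
# so every signature and attribute below is an assumption.
from typing import Any, List, Optional


class VectorStore:
    def __init__(self, clients: dict, user: str) -> None:
        # Wraps the shared clients (Qdrant, models, ...) and scopes every call to a user.
        self.vectors = clients["vectors"]
        self.models = clients["models"]
        self.user = user

    def create_collection(self, collection_name: str, model: str) -> str:
        """Create the collection if it does not exist and return its id
        (the id is also used as the S3 bucket name in files.py)."""
        raise NotImplementedError

    def get_collection_metadata(self, collection_names: Optional[List[str]] = None) -> List[Any]:
        """Return metadata rows exposing id, name, type, model, user,
        description, created_at and updated_at (see collections.py)."""
        raise NotImplementedError

    def get_chunks(self, collection_name: str, filter: Optional[Any] = None) -> List[Any]:
        """Return the chunks of a collection, optionally restricted by a Qdrant Filter."""
        raise NotImplementedError

    def from_documents(self, documents: List[Any], model: str, collection_name: str) -> None:
        """Embed the documents with the given embeddings model and upsert them into the collection."""
        raise NotImplementedError
```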