Commit 66b7378 (1 parent: 57dc7e1)
Showing 14 changed files with 1,334 additions and 66 deletions.
@@ -6,6 +6,9 @@ __pycache__/
# C extensions
*.so

# PDF Files

# Distribution / packaging
.Python
build/
@@ -1,4 +1,5 @@
 from ._baserag import BaseRAG
 from ._usefiles import UseFiles
+from ._multiagents import MultiAgents

-__all__ = ["BaseRAG", "UseFiles"]
+__all__ = ["BaseRAG", "UseFiles", "MultiAgents"]
@@ -0,0 +1,80 @@
from typing import List, Optional
from fastapi import HTTPException
from utils.lifespan import clients
from app.tools.multiagent.tools import go_pipeline
import redis


class MultiAgents:
    """
    MultiAgents, multiple agents for RAG: Recursive Document Retrieval & Web Search.

    Args:
        embeddings_model (str): OpenAI embeddings model.
        collections (List[Optional[str]]): Collection names. Defaults to the "user" parameter.
        file_ids (Optional[List[str]], optional): List of file ids for user collections (after uploading files). Defaults to None.
        k (int, optional): Top K per collection (max: 6). Defaults to 4.
        prompt_template (Optional[str], optional): Prompt template. Defaults to DEFAULT_PROMPT_TEMPLATE.
    """

    DEFAULT_PROMPT_TEMPLATE = "Réponds à la question suivante en te basant sur les documents ci-dessous : %(prompt)s\n\nDocuments :\n\n%(docs)s"
    MAX_K = 6

    def __init__(self, clients: dict, user: str):
        self.user = user
        self.clients = clients

    async def get_rag_prompt(
        self,
        embeddings_model: str,
        collections: List[Optional[str]],
        file_ids: Optional[List[str]] = None,
        k: Optional[int] = 4,
        prompt_template: Optional[str] = DEFAULT_PROMPT_TEMPLATE,
        **request,
    ) -> str:
        chat_id = request.get("chat_id")

        if k > self.MAX_K:
            raise HTTPException(
                status_code=400, detail=f"K must be less than or equal to {self.MAX_K}"
            )

        try:
            model_url = str(self.clients["openai"][embeddings_model].base_url)
            model_url = model_url.replace("/v1/", "/tei/")
        except KeyError:
            raise HTTPException(status_code=404, detail="Model not found.")

        prompt = request["messages"][-1]["content"]

        if self.user:
            if chat_id:
                try:
                    chat_history = clients["chathistory"].get_chat_history(user_id=request["user"], chat_id=chat_id)  # fmt: off
                    if "messages" in chat_history.keys():  # to avoid empty chat history
                        request["messages"] = chat_history["messages"] + request["messages"]
                except redis.exceptions.ResponseError as e:
                    print(f"Redis path error: {str(e)}")
                    request["messages"] = request["messages"]
            else:
                print("No chat_id provided")
        else:
            print("No user provided")

        history = request["messages"] if request["messages"] else list()

        answer, refs = await go_pipeline(
            prompt,
            docs=[],
            refs=[],
            n=0,
            fact=3,
            history=history,
        )

        answer = answer + "\n\n" + refs
        answer = answer.strip()

        return answer
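To show how this class is meant to be driven, here is a minimal usage sketch; the `clients` dict contents, the user id, and the request payload are assumptions (in the app they come from `utils.lifespan` and the incoming chat request), and `go_pipeline` still needs its own backends to run:

    import asyncio

    # Hypothetical wiring: `clients` must expose "openai" (embedding models) and
    # "chathistory" (Redis-backed history), as the method above expects.
    agent = MultiAgents(clients=clients, user="alice")

    answer = asyncio.run(
        agent.get_rag_prompt(
            embeddings_model="my-embeddings-model",  # assumed key in clients["openai"]
            collections=["user"],
            k=4,
            user="alice",
            messages=[{"role": "user", "content": "Quelle est la politique de congés ?"}],
        )
    )
    print(answer)  # pipeline answer followed by its references

With no chat_id in the request, the Redis history lookup is skipped and only the incoming messages are passed to the pipeline.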
@@ -0,0 +1,159 @@
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
from nltk.corpus import stopwords
import numpy as np

from openai import OpenAI

API_KEY = "multivac-FQ1cWX4DpshdhkXY2m"
MODEL_URL = "http://llama38b.multivacplatform.org/v1"
client = OpenAI(api_key=API_KEY, base_url=MODEL_URL)
models = [model.id for model in client.models.list()]

french_stopwords = set(stopwords.words("french"))
new_stopwords = {
    "s'",
    "quel",
    "que",
    "quoi",
    "comment",
    "l'",
    "d'",
    "mais",
    "ou",
    "et",
    "donc",
    "or",
    "ni",
    "car",
    "quelle",
    "quelles",
    "pourquoi",
}
french_stopwords.update(new_stopwords)


def get_prompt_potential_questions(text):
    prompt = f"""
    <|system|> Tu parles en Français
    <|end|>
    <|user|>
    Voilà un texte : {text}
    En utilisant le contenu du texte fourni, crée deux questions spécifique mais pas trop longue dont la réponse est clairement indiquée dans le texte. Assure-toi que la question n'est pas vague et qu'elle peut être facilement associée au texte, même parmi d'autres questions portant sur d'autres textes. Si le texte est simple, la question peut être simple aussi. Voici quelques exemples pour te guider :
    Exemple de mauvaises questions à éviter : ["De quoi parle ce texte ?", "Qui est mentionné ici ?"] (trop vague)
    Exemple de bonne question : ["Quel département est affecté par la politique XX-XXX-XX ?"] (précise et claire)
    Génères uniquement des questions et non leurs réponses.
    Formattes ta réponse en une liste Python contenant deux questions spécifiques sur le texte ci-dessus.
    Format de réponse attendu : [ "{{question1}}", "{{question2}}" ]
    Réponds uniquement avec la liste de questions et rien d'autre.
    <|end|>\n<|assistant|>
    """
    return prompt


def get_potential_question(text):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": get_prompt_potential_questions(text),
            }
        ],
        model=models[0],
        temperature=0.2,
        stream=False,
    )
    answer = chat_completion.choices[0].message.content
    return answer


def remove_french_stopwords(text):
    text = text.lower()
    tokens = text.split()  # Split text into words
    filtered_tokens = [token for token in tokens if token.lower() not in french_stopwords]
    return " ".join(filtered_tokens)


def extract_keywords_tfidf(docs, corpus, top_n=5):
    """
    Extracts the top N keywords from a given document or list of documents using TF-IDF.

    Parameters:
    - docs (str or list of str): A single document or a list of documents.
    - corpus (list of str): Corpus used to fit the TF-IDF vectorizer.
    - top_n (int): Number of top keywords to extract from each document.

    Returns:
    - list of str: Top keywords per document (a single list if one document was given).
    """
    if isinstance(docs, str):
        docs = [docs]
    # Initialize the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words=list(french_stopwords))
    # Fit the vectorizer on the corpus, then transform the documents
    model = vectorizer.fit(corpus)
    tfidf_matrix = model.transform(docs)
    # Get feature names to access the corresponding columns in the matrix
    feature_names = np.array(vectorizer.get_feature_names_out())
    # Initialize a list to hold the results
    keywords_list = []
    # Iterate through each document
    for doc_idx in range(tfidf_matrix.shape[0]):
        # Get the row corresponding to the document
        row = np.squeeze(tfidf_matrix[doc_idx].toarray())
        # Get the indices of the top N values
        top_n_indices = row.argsort()[-top_n:][::-1]
        # Extract the corresponding keywords
        keywords = [
            feature_names[i] for i in top_n_indices
        ]  # [(feature_names[i], row[i]) for i in top_n_indices]
        # Add to the list of results
        keywords_list.append(keywords)
    # If only one document was processed, return its keywords directly
    if len(docs) == 1:
        return keywords_list[0]
    return keywords_list


def create_rag_df(df, text_col, metadata_cols, chunk_size=3500):
    # Make a document list with every column we could need in the metadata
    df_loader = DataFrameLoader(
        df[[text_col] + metadata_cols], page_content_column=text_col
    )  # '_clean'
    df_document = df_loader.load()
    # Split the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
        length_function=len,
        separators=["\n\n", ". "],
    )
    baseline_docs = text_splitter.split_documents(df_document)

    chunk_ids = []
    chunk_contents = []
    chunk_metadatas = []
    # Add a chunk_id and potential questions to the metadata
    # llm = load_llm(model_path=model_path)
    for i, doc in tqdm(
        enumerate(baseline_docs), total=len(baseline_docs), desc="Processing documents"
    ):
        if len(doc.page_content) <= chunk_size:
            doc.metadata["chunk_id"] = i
            doc.metadata["potential_questions"] = get_potential_question(doc.page_content)

            # print(doc.metadata['potential_questions'])
            chunk_ids.append(i)
            chunk_contents.append(doc.page_content)
            chunk_metadatas.append(doc.metadata)

    df_chunks = pd.DataFrame()
    # df_chunks["chunk_id"] = chunk_ids
    df_chunks["chunk_content"] = chunk_contents
    df_chunks["chunk_metadata"] = chunk_metadatas

    # Recreate the columns from the metadata
    metadata_df = df_chunks["chunk_metadata"].apply(pd.Series)
    df_chunks = pd.concat([df_chunks.drop(columns=["chunk_metadata"]), metadata_df], axis=1)
    return df_chunks
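For context, a rough sketch of how these helpers chain together on a small DataFrame; the column names and sample rows are made up, and running it for real requires the Multivac endpoint above plus a one-time `nltk.download("stopwords")`:

    import pandas as pd

    # Hypothetical input: one row per source document.
    df = pd.DataFrame({
        "text": ["Premier document sur la politique RH...", "Deuxième document sur les congés..."],
        "source": ["doc1.pdf", "doc2.pdf"],
    })

    # Chunk the documents and attach LLM-generated potential questions to each chunk.
    df_chunks = create_rag_df(df, text_col="text", metadata_cols=["source"], chunk_size=3500)

    # Fit TF-IDF on the whole chunk corpus, then pull the top keywords of one chunk.
    corpus = [remove_french_stopwords(t) for t in df_chunks["chunk_content"]]
    print(extract_keywords_tfidf(corpus[0], corpus, top_n=5))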