import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest
from config import GEMINI_API, BING_API
import nltk
# Define a function to check and download each resource if necessary
def download_nltk_resource(resource_name):
    try:
        nltk.data.find(f"tokenizers/{resource_name}" if resource_name.startswith("punkt") else f"corpora/{resource_name}")
        print(f"{resource_name} is already installed.")
    except LookupError:
        print(f"{resource_name} not found. Downloading...")
        nltk.download(resource_name)
# List of resources to check and download if missing
resources = ['stopwords', 'punkt', 'wordnet', 'punkt_tab']
for resource in resources:
    download_nltk_resource(resource)
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('punkt_tab')
# Load the Sentence Transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')
# with open(r"C:/Users/mkolla1/OneDrive - Georgia State University/Desktop/CareerSwipe/Google_AI_API.txt", "r") as f:
# GOOGLE_API_KEY = f.read()
genai.configure(api_key=GEMINI_API)
prompt= """ You are Georgia State University's dedicated website chatbot. Your primary function is to provide accurate and helpful information about Georgia State University. You will draw on your knowledge base and the provided reference material {reference_text} to answer user queries.
Please strictly adhere to the following guidelines:
GSU Focus: Limit your responses to topics directly related to Georgia State University, for any other questions please respectfully tell the user to ask questions related to the universiy and you cannot help beyond that.
Informative and Concise: Provide clear, concise, and relevant information.
Helpful and Respectful: Maintain a positive and helpful tone, treating all users with respect.
External Reference: If a query requires information beyond your current knowledge, provide relevant links to official GSU websites or other credible sources.
Example:
User: What are the tuition fees at GSU?
You: Tuition fees at Georgia State University vary depending on your program and residency status. Please visit the official tuition and fees page for the most accurate information: [link to GSU tuition and fees page]"""
prompt1="""You are Georgia State University's Website Chatbot, you will be assisting with any type of questions the user might have about
Georgia State University, you will be assisting with your own knowledge and some reference materials provided to you, here is the reference material {reference_text}
Please provide with any additional links and reference URL's for the users. The user will be asking questions and please be kind and respectful to the user. Good Luck!"""
import os
import glob
import numpy as np
import fitz  # PyMuPDF for PDF processing
from pprint import pprint
import requests
def bing_search(query):
    subscription_key = BING_API
    endpoint = "https://api.bing.microsoft.com/v7.0/search"
    # Construct the request
    mkt = 'en-US'
    params = {"q": query, "textDecorations": True, "textFormat": "HTML"}
    # params = {'q': query, 'mkt': mkt}
    headers = {'Ocp-Apim-Subscription-Key': subscription_key}
    # Call the API
    try:
        response = requests.get(endpoint, headers=headers, params=params)
        response.raise_for_status()
        # print("Headers:")
        # print(response.headers)
        # print("JSON Response:")
        # pprint(response.json())
        search_results = response.json()
        # Render each web result as an HTML table row (url, name, snippet)
        rows = "\n".join(["""<tr>
        <td><a href=\"{0}\">{1}</a></td>
        <td>{2}</td>
        </tr>""".format(v["url"], v["name"], v["snippet"])
            for v in search_results["webPages"]["value"]])
        return rows
    except Exception as ex:
        print("Bing search failed:", ex)
        return ""  # Return an empty string so downstream text processing never receives None
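# Example (illustrative sketch, not part of the original pipeline): bing_search returns the
# web results as HTML table rows, so they can be dropped into a simple page or parsed further.
# The query string below is a made-up placeholder.
# rows_html = bing_search("Georgia State University Computer Science research opportunities")
# if rows_html:
#     print("<table>{}</table>".format(rows_html))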
# Download necessary NLTK resources (uncomment if running for the first time)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
def preprocess_text(text):
    # Step 1: Convert to lowercase
    text = text.lower()
    # Step 2: Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Step 3: Tokenization
    tokens = word_tokenize(text)
    # Step 4: Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Step 5: Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Step 6: Summarization (optional, reduces the text further)
    summarized_text = summarize_text(' '.join(tokens))
    return summarized_text
def summarize_text(text, top_n=5):
    """
    Summarizes text by selecting the most important sentences based on TF-IDF.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out().tolist()
    # Rank sentences based on their summed TF-IDF scores
    sentences = text.split('.')
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        score = 0
        for word in sentence.split():
            if word in feature_names:
                score += X[0, feature_names.index(word)]
        sentence_scores[i] = score
    # Select the top_n most important sentences
    top_sentences = nlargest(top_n, sentence_scores, key=sentence_scores.get)
    top_sentences = [sentences[i] for i in top_sentences]
    return '. '.join(top_sentences)
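# Example (illustrative sketch with a made-up passage): summarize_text keeps the top_n
# sentences with the highest summed TF-IDF scores; preprocess_text applies the cleaning
# steps above and then runs the same summarizer on the cleaned tokens.
# sample_passage = (
#     "Georgia State University is a public research university in Atlanta. "
#     "The Computer Science department offers undergraduate and graduate programs. "
#     "Students can join research labs and apply for assistantships."
# )
# print(summarize_text(sample_passage, top_n=2))
# print(preprocess_text(sample_passage))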
import json
from transformers import AutoTokenizer
query2 = """Can you provide me with the director and associate chair of GSU CS Department? and also can you provide me with some helpful link to get a research oppurtunity at GSU Computer Science department"""
query = """Can you provide me with 5 GSU Computer Science Faculty who are working as professors and their emails please. Also who is Dr.Parag Tamhankar from Directory"""
def corrective_rag(query, reference_text):
    prompt = """
    You are a highly accurate and knowledgeable retrieval evaluator. Your task is to assess the relevance of the provided reference text to a given query. For each piece of reference text, classify it as one of the following:
    Correct: The text directly answers the query, providing accurate and relevant information.
    Ambiguous: The text is partially relevant but requires additional context or interpretation to fully answer the query.
    Wrong: The text is irrelevant to the query or provides incorrect information.
    Generate the output as JSON; here is a sample format: {format}
    Include every reference piece in the output, whether it is Correct, Ambiguous, or Wrong.
    You will be provided with a query and a list of reference texts. Here are the documents: {reference}
    """
    corrective_model = genai.GenerativeModel(
        'models/gemini-1.5-flash',
        system_instruction=prompt.format(
            reference=reference_text,
            format=[{'file_name': 'file name', 'relevance': 'Correct'},
                    {'file_name': 'file name', 'relevance': 'Wrong'},
                    {'file_name': 'file name', 'relevance': 'Ambiguous'}]),
        generation_config={"response_mime_type": "application/json"}
    )
    response = corrective_model.generate_content(query)
    # Parse the response to extract classifications
    try:
        classifications = json.loads(response.text)
    except Exception as e:
        print(e)
        classifications = response.text
    print(classifications)
    # Map each file name back to the original document text returned by search_query
    reference_dict = {item['file_name']: item['text'] for item in reference_text}
    # Iterate through the classifications and add the text where the file_name matches
    for item in classifications:
        file_name = item['file_name']
        if file_name in reference_dict:
            item['text'] = reference_dict[file_name]
    # classifications now carry the 'text' wherever a file name matched
    # Filter and refine the correct documents (falling back to web search otherwise)
    final_knowledge_chunk = process_chunks(classifications, query)
    # Save and return the final knowledge chunk
    with open("refined_knowledge_chunk.json", "w") as file:
        json.dump({"refined_knowledge_chunk": final_knowledge_chunk}, file, indent=2)
    return final_knowledge_chunk
    # return classifications
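# Example (illustrative sketch): corrective_rag expects the query plus the list of dicts
# returned by search_query (each with 'file_name', 'text', and 'distance'). The reference
# list below is a hypothetical stand-in, not real retrieval output.
# sample_references = [
#     {'file_name': 'data/cs_faculty.txt', 'text': 'Sample faculty directory text...', 'distance': 0.42},
# ]
# refined = corrective_rag("Who chairs the GSU CS department?", sample_references)
# print(refined)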
def convert_to_chunks(text, max_tokens=5000):
    # Initialize the tokenizer (adjust the model name if necessary)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Tentatively add the sentence to the current chunk and check the token length
        candidate = (current_chunk + " " + sentence).strip()
        tokenized_length = len(tokenizer.encode(candidate))
        if tokenized_length > max_tokens and current_chunk:
            # Close the current chunk and start a new one with this sentence,
            # so the boundary sentence is not duplicated across chunks
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk = candidate
    # Append any remaining text as a final chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
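# Example (illustrative sketch): convert_to_chunks splits long text into pieces that stay
# under max_tokens BERT tokens. A small max_tokens is used here only to make the split visible.
# demo_text = "First sentence. Second sentence. Third sentence. Fourth sentence."
# for piece in convert_to_chunks(demo_text, max_tokens=10):
#     print(repr(piece))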
def process_chunks(classifications, query):
    knowledge_chunks = []
    status = True
    for item in classifications:
        # Documents judged "Correct" are chunked and kept as local knowledge
        if item['relevance'] == 'Correct':
            text = item.get('text', '')  # 'text' contains the document content
            file_name = item.get('file_name', 'unknown')
            # Convert document text to smaller knowledge chunks
            chunks = convert_to_chunks(text)
            # chunks = preprocess_text([text])
            # Append each chunk as a knowledge piece
            for chunk in chunks:
                knowledge_chunks.append({
                    "file_name": file_name,
                    "chunk_text": chunk
                })
        if item['relevance'] == 'Wrong' or item['relevance'] == 'Ambiguous':
            # Fall back to a Bing search, run at most once per call
            if status:
                print("BING SEARCH")
                chunks = bing_search(query)
                chunks = preprocess_text(chunks)
                knowledge_chunks.append({
                    "file_name": 'Bing Search Results',
                    "chunk_text": chunks
                })
                status = False
    return knowledge_chunks
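# Example (illustrative sketch with hypothetical classifications): documents marked 'Correct'
# are chunked locally, while the first 'Wrong' or 'Ambiguous' document triggers a single
# Bing search whose summarized results are appended instead.
# sample_classifications = [
#     {'file_name': 'data/cs_faculty.txt', 'relevance': 'Correct', 'text': 'Sample faculty text...'},
#     {'file_name': 'data/old_catalog.pdf', 'relevance': 'Wrong', 'text': 'Outdated catalog text...'},
# ]
# print(process_chunks(sample_classifications, "GSU CS department chair"))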
# Dictionary to store FAISS indices and corresponding document texts
indices = {}
doc_texts = {}
# Helper function to create a FAISS index and store the text and embeddings
def create_and_store_index(text, file_name):
    embeddings = model.encode([text])
    embeddings = np.array(embeddings).astype('float32')
    # Initialize the FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    # Store the index and document text in the module-level dictionaries
    indices[file_name] = index
    doc_texts[file_name] = text
    print(f"Index created for '{file_name}' with {index.ntotal} vectors.")
# Function to search every index and return the closest matching documents
def search_query(query, indices, doc_texts, model):
    # Encode the query into an embedding
    query_embedding = model.encode([query])
    query_embedding = np.array(query_embedding).astype('float32')
    top_result_texts = []  # List to store results with file names and distances
    # Search each FAISS index
    for file_name, index in indices.items():
        # Search for the nearest neighbors in the current index
        k = min(5, index.ntotal)
        D, I = index.search(query_embedding, k=k)
        # Collect the valid hits for this file
        for i in range(k):
            if I[0][i] == -1:
                continue
            top_result_texts.append({
                'file_name': file_name,
                'text': doc_texts[file_name],  # Full document text for this index
                'distance': D[0][i]  # Distance to the match
            })
    # Sort results by the closest distance across all files
    sorted_results = sorted(top_result_texts, key=lambda x: x['distance'])
    # Display the top 5 matches
    for i, result in enumerate(sorted_results[:5]):
        print(f"\nTop {i+1} match in '{result['file_name']}':")
        print("Distance:", result['distance'])
    return sorted_results[:5]  # Return only the top 5 results
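# Example (illustrative sketch): once some documents have been indexed, search_query embeds
# the question, searches every per-file index, and returns the closest matches with distances.
# results = search_query("Who is the director of the GSU CS department?", indices, doc_texts, model)
# for r in results:
#     print(r['file_name'], r['distance'])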
# Folder path where PDF and TXT files are stored
data_folder = "data"
# Retrieve all PDF and TXT files from the folder
pdf_files = glob.glob(os.path.join(data_folder, "*.pdf"))
txt_files = glob.glob(os.path.join(data_folder, "*.txt"))
# Process PDF files and store their indices and texts
for pdf_file in pdf_files:
    with fitz.open(pdf_file) as doc:
        text = "".join([page.get_text() for page in doc])
    create_and_store_index(text, pdf_file)
# Process TXT files and store their indices and texts
for txt_file in txt_files:
    with open(txt_file, 'r', encoding='utf-8') as file:
        text = file.read()
    create_and_store_index(text, txt_file)
history = []
def answer(query):
    global history
    top_results = search_query(query, indices, doc_texts, model)
    print(len(top_results))
    classification = corrective_rag(query, top_results)
    print(len(classification))
    # Top result text for sending to the Gemini bot
    if top_results:
        top_result_text = top_results[0]['text']
        print("Got the Reference Text")
    # print(classification)
    ai_model = genai.GenerativeModel('models/gemini-1.5-flash',
                                     system_instruction=prompt.format(reference_text=classification))
    chat = ai_model.start_chat(history=history)
    history.append({'role': 'user', 'parts': query})
    response = chat.send_message(query)
    history.append({'role': 'model', 'parts': response.text})
    print(response.text)
    return response.text
query2 = """Can you provide me with the director and associate chair of GSU CS Department? and also can you provide me with some helpful link to get a research oppurtunity at GSU Computer Science department"""
query = """Can you provide me with 5 GSU Computer Science Faculty who are working as professors and their emails please. Also who is Dr.Parag Tamhankar from Directory"""
query3="""organic and inorganic chemistry research professors at Georgia State University """
query4=""" Can you give me a list of 5 GSU Chemistry lecturers"""
# answer(query2)