-
Notifications
You must be signed in to change notification settings - Fork 0
/
Comparator.py
61 lines (50 loc) · 2.11 KB
/
Comparator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import pandas as pd
import nltk
from dotenv import load_dotenv
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Variator import OpenAIQuery
class Comparator:
    """
    Compare LLM prompts with the completions they produce.

    Each prompt is sent to the OpenAI completion client and the returned text
    is scored against the prompt with a TF-IDF / cosine-similarity measure.
    """

    def __init__(self):
        # Download the NLTK resources on construction (kept for backward
        # compatibility; nltk.download skips packages that are up to date).
        nltk.download('punkt')
        nltk.download('wordnet')
        # Pull OPENAI_API_KEY (and any other settings) from a local .env file.
        load_dotenv()
        self.openai_query = OpenAIQuery(api_key=os.getenv('OPENAI_API_KEY'))

    def compute_similarity_for_variations(self, input_variations: list) -> DataFrame:
        """
        Query the LLM once per prompt variation and score every answer.

        Args:
            input_variations (list): prompt strings to send to the LLM

        Returns:
            DataFrame: one row per variation with the columns ``prompt``,
            ``completion`` and ``score``, sorted by ``score`` in descending
            order. An empty input yields an empty frame with those columns.
        """
        columns = ['prompt', 'completion', 'score']
        rows = []
        for variation in input_variations:
            # n=1 -> a single completion is requested; only its text is used.
            completion = self.openai_query.completions(
                prompt_text=variation, n=1, return_prompt=False)[0]['text']
            rows.append({
                'prompt': variation,
                'completion': completion,
                'score': self.compute_similarity(variation, completion),
            })
        # Build the frame once instead of appending row by row: avoids the
        # private DataFrame._append API and the quadratic copying it implies.
        variations = pd.DataFrame(rows, columns=columns)
        return variations.sort_values(by=['score'], ascending=False)

    @staticmethod
    def compute_similarity(question: str, answer: str) -> float:
        """
        Calculates the similarity between the question and answer using
        TF-IDF vectors and cosine similarity.

        Args:
            question (str): prompt provided to LLM
            answer (str): answer generated by LLM

        Returns:
            score (float): the similarity score between 0 and 1 assigned to
            the answer; 0.0 when neither text contains a usable term
            (both empty or made up of English stop words only)
        """
        corpus = [question, answer]
        vect = TfidfVectorizer(min_df=1, stop_words="english")
        try:
            tfidf = vect.fit_transform(corpus)
        except ValueError:
            # scikit-learn raises ValueError for an empty vocabulary; texts
            # without any comparable term are treated as having no overlap
            # rather than aborting the whole batch.
            return 0.0
        # Pairwise 2x2 matrix; entry [0, 1] is question vs. answer.
        similarity_score = cosine_similarity(tfidf)
        # Cast so callers receive a plain Python float, as annotated.
        return float(similarity_score[0, 1])