-
Notifications
You must be signed in to change notification settings - Fork 0
/
pre_processing.py
51 lines (40 loc) · 1.43 KB
/
pre_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
# Name of the pretrained sentence-embedding checkpoint handed to SentenceTransformer
# (presumably a Hugging Face Hub model id — confirm).
check_point = 'nomic-ai/nomic-embed-text-v1'
# The model is instantiated once, at import time, and reused by get_embeddings()
# and create_embedding().
# NOTE(review): trust_remote_code=True appears to allow code shipped with the
# checkpoint to execute locally — confirm this is intended and consider pinning
# a revision.
embedding_model = SentenceTransformer(check_point,trust_remote_code=True)
def parese_doc(doc, first_section, ignore_after, *, group_size=20, overlap=5):
    """Extract a document's text and split it into overlapping sentence chunks.

    The text of every page is concatenated, newlines are replaced by spaces and
    the result is lower-cased. If both markers are found (and the end marker
    lies after the start marker) only the text between them is kept. The text
    is then split on ``'. '`` and grouped into windows of ``group_size``
    sentences, each window starting ``group_size - overlap`` sentences after
    the previous one.

    Args:
        doc: Object exposing ``pages``, an iterable whose items provide
            ``extract_text()`` (presumably a PDF reader — confirm with caller).
        first_section: Marker text; the kept region starts at its first
            occurrence. Matched case-insensitively.
        ignore_after: Marker text; the kept region ends just before its last
            occurrence. Matched case-insensitively.
        group_size: Number of sentences per chunk.
        overlap: Number of sentences shared by consecutive chunks.

    Returns:
        A list of lower-cased chunk strings. It always holds at least one
        element (``['']`` for a document without text); the last chunk may
        contain fewer than ``group_size`` sentences.

    Raises:
        ValueError: If ``group_size`` is not positive or ``overlap`` is not in
            ``range(group_size)`` (the window would never advance).
    """
    if group_size <= 0 or not 0 <= overlap < group_size:
        raise ValueError(
            'group_size must be positive and overlap must satisfy '
            '0 <= overlap < group_size'
        )

    # join() avoids quadratic concatenation; "or ''" tolerates pages for which
    # no text could be extracted.
    raw_text = ''.join(page.extract_text() or '' for page in doc.pages)
    cleaned_string = raw_text.replace('\n', ' ').lower()

    # The text has been lower-cased, so the markers must be lower-cased as well
    # or a mixed-case marker could never match.
    start_index = cleaned_string.find(first_section.lower())
    end_index = cleaned_string.rfind(ignore_after.lower())
    # Trim only when the markers delimit a real region; otherwise keep the
    # whole text instead of collapsing it to an empty string.
    if start_index != -1 and end_index != -1 and start_index < end_index:
        cleaned_string = cleaned_string[start_index:end_index]

    sentence_list = cleaned_string.split('. ')
    step = group_size - overlap
    return [
        '. '.join(sentence_list[i:i + group_size])
        for i in range(0, len(sentence_list), step)
    ]
def get_embeddings(doc):
    """Encode ``doc`` with the module-level ``embedding_model``.

    Args:
        doc: Input forwarded unchanged to ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``doc``.
    """
    return embedding_model.encode(doc)
def create_embedding(context_list):
    """Build a FAISS L2 index over the embeddings of ``context_list``.

    Args:
        context_list: Iterable of text chunks to embed; vectors are added to
            the index in the same order.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimension is the embedding model's
        sentence-embedding dimension. The index is empty when
        ``context_list`` is empty.
    """
    embedding_dimension = embedding_model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(embedding_dimension)

    texts = list(context_list)
    if not texts:
        # Nothing to embed: an empty array cannot be shaped into the 2-D
        # matrix that index.add() requires, so return the empty index.
        return index

    # Encode all chunks in one batched call instead of one model call per
    # chunk.
    embeddings = get_embeddings(texts)
    # FAISS expects a C-contiguous float32 matrix of shape (n, dimension).
    embeddings_array = np.ascontiguousarray(embeddings, dtype=np.float32)
    embeddings_array = embeddings_array.reshape(-1, embedding_dimension)
    index.add(embeddings_array)
    return index