Skip to content

Commit

Permalink
Merge pull request #307 from Shubh-Goyal-07/restructure
Browse files Browse the repository at this point in the history
adds kenlm model update functionality
  • Loading branch information
Gautam-Rajeev authored Apr 9, 2024
2 parents 85c736b + 6a3e94f commit 5de7101
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 44 deletions.
14 changes: 14 additions & 0 deletions src/spell_check/kenlm/local/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,32 @@ RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

RUN echo "Downloading the language model files"
# Install kenlm using pip
RUN pip3 install https://github.com/kpu/kenlm/archive/master.zip
RUN apt-get update && apt-get install -y wget


RUN echo "Downloading the language model files"
RUN apt-get install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev

RUN echo "Downloading the language model files"
# Download the files using wget
RUN wget "https://drive.google.com/uc?export=download&id=1frSw5-qfRMgrYs4QL961s2yYuq2KplEM" -O '5gram_model.bin'
RUN wget "https://drive.google.com/uc?export=download&id=1o31Z4TZbAOEt6E8Rx7VMONJOGJH-5Mwk" -O 'lexicon.txt'
RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1eVWwarCm8Wqq3vYqsE9f2jvrp-rvr6QZ" -O 'texts.txt'

RUN wget "https://drive.google.com/uc?export=download&id=1-Dtk5socjYdeGyqhbQzG-rvWJfWVFGqv" -O '5gram_model_eng.bin'
RUN wget "https://drive.google.com/uc?export=download&id=1-59pDTvEXCMUZ-NQ8BwmCnHQZh4Eg6Gw" -O 'lexicon_eng.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1-iZvej7L92Aga9VZ33BM5ybUTiR0hMF8" -O 'texts_eng.txt'

RUN echo "Downloading the language model files"
# Download the kenlm training files
RUN wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
RUN mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2

# Copy the rest of the application code to the working directory
COPY . /app/

Expand Down
14 changes: 13 additions & 1 deletion src/spell_check/kenlm/local/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from model import Model
from update import UpdationModel
from request import ModelRequest, ModelUpdateRequest
from quart import Quart, request
import aiohttp
Expand All @@ -22,13 +23,20 @@
'eng': 'freq_dict_eng.txt'
}

texts_paths = {
'ory': 'texts.txt',
'eng': 'texts_eng.txt'
}


@app.before_serving
async def startup():
app.client = aiohttp.ClientSession()
global model
model = Model(app, model_paths, vocab_paths, freq_dict_paths)

print("Model loaded successfully")

@app.route('/', methods=['POST'])
async def embed():
global model
Expand All @@ -42,7 +50,11 @@ async def update():
global model
data = await request.get_json()
req = ModelUpdateRequest(**data)
result = await model.update_symspell(req)
result = await UpdationModel(model_paths, vocab_paths, freq_dict_paths, texts_paths).update(req)

if result:
model = Model(app, model_paths, vocab_paths, freq_dict_paths)

return result

if __name__ == "__main__":
Expand Down
52 changes: 11 additions & 41 deletions src/spell_check/kenlm/local/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
'eng': 'freq_dict_eng.txt'
}

texts_paths = {
'ory': 'texts.txt',
'eng': 'texts_eng.txt'
}


class TextCorrector:
def __init__(self, model_paths, vocab_paths, freq_dict_paths):
Expand All @@ -29,11 +34,14 @@ def __init__(self, model_paths, vocab_paths, freq_dict_paths):
'ory': kenlm.Model(model_paths['ory']),
'eng': kenlm.Model(model_paths['eng'])
}

print('Loading vocabularies...')
self.vocabs = {
'ory': self.create_vocab_lexicon(vocab_paths['ory']),
'eng': self.create_vocab_lexicon(vocab_paths['eng'])
}

print('Loading symspell models...')
self.symspell_models = {
'ory': self.create_symspell_model(freq_dict_paths['ory']),
'eng': self.create_symspell_model(freq_dict_paths['eng'])
Expand All @@ -47,6 +55,8 @@ def set_language(self, lang):
self.vocab = self.vocabs[lang]
self.symspell_model = self.symspell_models[lang]

print(self.symspell_models['eng'].words)

def create_vocab_lexicon(self, lexicon_path):
vocabulary = []
with open(lexicon_path, 'r', encoding='utf-8') as file:
Expand All @@ -60,11 +70,6 @@ def create_symspell_model(self, freq_dict_path):
sym_spell.load_dictionary(freq_dict_path, term_index=0, count_index=1, separator=' ')
return sym_spell

# def generate_candidates(self, word, max_distance=1):
# len_range = range(len(word) - max_distance, len(word) + max_distance + 1)
# filtered_vocab = [vocab_word for vocab_word in self.vocab if len(vocab_word) in len_range]
# return [vocab_word for vocab_word in filtered_vocab if 0 <= Levenshtein.distance(word, vocab_word) <= max_distance]

def generate_candidates(self, word, max_distance=1):
suggestions = self.symspell_model.lookup(word, Verbosity.CLOSEST, max_distance)
return [suggestion.term for suggestion in suggestions]
Expand Down Expand Up @@ -112,49 +117,14 @@ def correct_text_with_beam_search(self, text, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5,
corrected_sentences.append(best_sentence)

return ' '.join(corrected_sentences)

def load_freq_dict(self, freq_dict_path):
freq_dict = {}
with open(freq_dict_path, 'r') as f:
for line in f:
word, freq = line.split()
freq_dict[word] = int(freq)
return freq_dict

def make_updation_counter(self, text):

if type(text) == list:
text = ' '.join(text)

# remove punctuations from the text
text = ''.join(e for e in text if e.isalnum() or e.isspace())
words = text.split()

# create a dictionary of words and their frequencies
dict = Counter(words)

return dict

def update_symspell_model(self, lang, text):
# update the frequency dictionary
current_freq_dict_counter = Counter(self.load_freq_dict(freq_dict_paths[lang]))
new_freq_dict_counter = self.make_updation_counter(text)

# merge the two frequency dictionaries
freq_dict_counter = current_freq_dict_counter + new_freq_dict_counter

freq_dict = {}
for word, freq in freq_dict_counter.items():
freq_dict[word] = int(freq)

with open(freq_dict_paths[lang], 'w') as f:
for word, freq in freq_dict.items():
f.write(word + ' ' + str(freq) + '\n')

# retrain the model with the updated frequency dictionary
self.symspell_models[lang] = self.create_symspell_model(freq_dict_paths[lang])

return 'Model updated successfully'


class Model():
Expand Down
151 changes: 151 additions & 0 deletions src/spell_check/kenlm/local/update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import os

# Per-language artifact locations, keyed by language code
# ('ory' presumably Odia/Oriya, 'eng' English — filenames match the
# Dockerfile downloads).
# NOTE(review): the same four dicts are re-declared in api.py and model.py,
# and UpdationModel receives its paths through __init__ — these module-level
# copies appear to be unused defaults; consider consolidating.

# Trained kenlm binary models.
model_paths = {
    'ory': '5gram_model.bin',
    'eng': '5gram_model_eng.bin'
}

# Lexicon files: one word per line with space-separated characters.
vocab_paths = {
    'ory': 'lexicon.txt',
    'eng': 'lexicon_eng.txt'
}

# Frequency dictionaries: 'word count' per line.
freq_dict_paths = {
    'ory': 'freq_dict.txt',
    'eng': 'freq_dict_eng.txt'
}

# Raw text corpora the models are (re)trained from.
texts_paths = {
    'ory': 'texts.txt',
    'eng': 'texts_eng.txt'
}


class UpdationModel():
    """Rebuild the kenlm language model, lexicon and frequency dictionary
    for one language after new text is appended to its corpus.

    Per-language file locations are injected as dicts keyed by language
    code (e.g. 'ory', 'eng'). Call set_language() (done by update())
    before any of the per-file steps.
    """

    def __init__(self, model_paths, vocab_paths, freq_dict_paths, texts_paths):
        # Each argument is a dict: language code -> file path.
        self.model_paths = model_paths
        self.vocab_paths = vocab_paths
        self.freq_dict_paths = freq_dict_paths
        self.texts_paths = texts_paths

    def set_language(self, lang):
        """Select the per-language file paths used by the update steps."""
        self.model_path = self.model_paths[lang]
        self.vocab_path = self.vocab_paths[lang]
        self.freq_dict_path = self.freq_dict_paths[lang]
        self.texts_path = self.texts_paths[lang]

    def _unique_corpus_path(self):
        # 'texts.txt' -> 'texts_unique.txt'. os.path.splitext is safer than
        # split('.')[0] when directories in the path contain dots.
        root, _ = os.path.splitext(self.texts_path)
        return root + '_unique.txt'

    def train_kenlm_model(self):
        """Train a 5-gram kenlm model on the de-duplicated corpus and write
        it to the selected language's model path.

        Fixes two defects in the original:
        - the binary was always written to '5gram_model.bin', so updating
          'eng' overwrote the Oriya model and never refreshed the eng one;
        - os.system() exit statuses were ignored, so a failed training run
          still reported success.

        Raises RuntimeError if lmplz or build_binary exits non-zero.
        """
        input_path = self._unique_corpus_path()

        # Derive intermediate arpa names from the target model path so
        # concurrent-language artifacts never collide.
        base, _ = os.path.splitext(self.model_path)
        arpa_file = base + ".arpa"
        fixed_arpa_file = base + "_correct.arpa"
        output_bin_file = self.model_path

        status = os.system(
            f"kenlm/build/bin/lmplz -o 5 <{input_path} > {arpa_file} --discount_fallback"
        )
        if status != 0:
            raise RuntimeError(f"lmplz failed with exit status {status}")

        # lmplz's arpa output declares <s> but not </s>: clone the <s>
        # unigram entry as </s> and bump the unigram count by one.
        with open(arpa_file, "r") as read_file, open(fixed_arpa_file, "w") as write_file:
            has_added_eos = False
            for line in read_file:
                if not has_added_eos and "ngram 1=" in line:
                    count = line.strip().split("=")[-1]
                    # Rebuild the header line rather than str.replace():
                    # replace() substitutes the first occurrence of the
                    # digits, which can hit the '1' in 'ngram 1=' itself.
                    write_file.write(f"ngram 1={int(count) + 1}\n")
                elif not has_added_eos and "<s>" in line:
                    write_file.write(line)
                    write_file.write(line.replace("<s>", "</s>"))
                    has_added_eos = True
                else:
                    write_file.write(line)

        # Convert the patched arpa file to kenlm's binary format.
        status = os.system(f"kenlm/build/bin/build_binary {fixed_arpa_file} {output_bin_file}")
        if status != 0:
            raise RuntimeError(f"build_binary failed with exit status {status}")

        # Drop the intermediate arpa files; only the .bin is served.
        os.remove(arpa_file)
        os.remove(fixed_arpa_file)

    def update_lexicon_file(self):
        """Regenerate the lexicon from the full corpus.

        One line per unique word: 'word c h a r s |' (characters stand in
        for phonemes). Returns True on completion.
        """
        with open(self.texts_path, 'r') as f:
            words = set(f.read().split())

        with open(self.vocab_path, 'w') as f:
            for word in words:
                spaced_chars = " ".join(word)
                f.write(word + " " + spaced_chars + " |\n")

        return True

    def update_text_file(self, text):
        """Append new text to the corpus, then rewrite '<corpus>_unique.txt'
        with the de-duplicated lines of the WHOLE corpus.

        `text` may be a single string or a list of lines. Words containing
        punctuation are dropped entirely (str.isalnum), matching the
        original filtering.

        Bug fix: the unique training file previously contained only the
        newly added lines, so retraining silently discarded the existing
        corpus.
        """
        lines = text if isinstance(text, list) else [text]
        cleaned = [' '.join(w for w in line.split() if w.isalnum()) for line in lines]

        with open(self.texts_path, 'a') as f:
            for line in cleaned:
                f.write(line + '\n')

        # De-duplicate the full corpus, not just the new lines.
        with open(self.texts_path, 'r') as f:
            unique_lines = set(f.read().splitlines())

        with open(self._unique_corpus_path(), 'w') as f:
            for line in unique_lines:
                f.write(line + '\n')

    def load_freq_dict(self, freq_dict_path):
        """Parse a 'word count' frequency file into a dict[str, int]."""
        freq_dict = {}
        with open(freq_dict_path, 'r') as f:
            for line in f.read().splitlines():
                word, freq = line.split()
                freq_dict[word] = int(freq)
        return freq_dict

    def update_freq_dict(self):
        """Recount word frequencies over the full corpus and rewrite the
        frequency dictionary file ('word count' per line)."""
        freq_dict = {}
        with open(self.texts_path, 'r') as f:
            for word in f.read().split():
                freq_dict[word] = freq_dict.get(word, 0) + 1

        with open(self.freq_dict_path, 'w') as f:
            for word, freq in freq_dict.items():
                f.write(word + ' ' + str(freq) + '\n')

    async def update(self, req):
        """Run the full update pipeline for req.lang / req.text.

        Returns 'Model updated successfully' on success; the caller treats
        a truthy return as the signal to reload the serving model.
        Raises RuntimeError if kenlm training fails (see train_kenlm_model).
        """
        self.set_language(req.lang)
        self.update_text_file(req.text)
        self.update_freq_dict()
        self.update_lexicon_file()
        self.train_kenlm_model()

        return 'Model updated successfully'
3 changes: 1 addition & 2 deletions src/spell_check/spello/local/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ curl -X POST -H "Content-Type: application/json" -d '{
"lang" : "eng"
}' http://localhost:8000/


**curl request for update:**

curl -X PUT -H "Content-Type: application/json" -d '{
Expand All @@ -32,6 +31,6 @@ curl -X PUT -H "Content-Type: application/json" -d '{
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["how to apply for", "scheme for my paddi crop"],
"text": ["how to apply for", "scheme for my paddy crop"],
"lang" : "eng"
}' http://localhost:8000/

0 comments on commit 5de7101

Please sign in to comment.