diff --git a/src/spell_check/kenlm/local/Dockerfile b/src/spell_check/kenlm/local/Dockerfile
index 62a218d..a837057 100644
--- a/src/spell_check/kenlm/local/Dockerfile
+++ b/src/spell_check/kenlm/local/Dockerfile
@@ -10,18 +10,32 @@ RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev
 
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
 
+RUN echo "Installing the kenlm Python bindings"
 # Install kenlm using pip
 RUN pip3 install https://github.com/kpu/kenlm/archive/master.zip
 
 RUN apt-get update && apt-get install -y wget
+
+RUN echo "Installing the KenLM build dependencies"
+RUN apt-get install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
+
+RUN echo "Downloading the language model files"
 # Download the files using wget
 RUN wget "https://drive.google.com/uc?export=download&id=1frSw5-qfRMgrYs4QL961s2yYuq2KplEM" -O '5gram_model.bin'
 RUN wget "https://drive.google.com/uc?export=download&id=1o31Z4TZbAOEt6E8Rx7VMONJOGJH-5Mwk" -O 'lexicon.txt'
 RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
+RUN wget "https://drive.google.com/uc?export=download&id=1eVWwarCm8Wqq3vYqsE9f2jvrp-rvr6QZ" -O 'texts.txt'
+
 RUN wget "https://drive.google.com/uc?export=download&id=1-Dtk5socjYdeGyqhbQzG-rvWJfWVFGqv" -O '5gram_model_eng.bin'
 RUN wget "https://drive.google.com/uc?export=download&id=1-59pDTvEXCMUZ-NQ8BwmCnHQZh4Eg6Gw" -O 'lexicon_eng.txt'
 RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'
+RUN wget "https://drive.google.com/uc?export=download&id=1-iZvej7L92Aga9VZ33BM5ybUTiR0hMF8" -O 'texts_eng.txt'
+RUN echo "Building the KenLM training binaries"
+# Download and build kenlm so lmplz and build_binary are available for retraining
+RUN wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
+RUN mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
+
 
 # Copy the rest of the application code to the working directory
 COPY . /app/
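
Note: the plain `uc?export=download` links above return an HTML confirmation page instead of the file once Google Drive considers it too large to virus-scan, so a post-build smoke test is worth having. A minimal sketch, assuming the artifacts sit in the image's working directory and the `kenlm` pip package from the build is importable (the script name and size threshold are illustrative, not part of the diff):

```python
# smoke_test.py — hypothetical post-build check, not part of the diff above
import os
import kenlm

for path in ('5gram_model.bin', '5gram_model_eng.bin'):
    # A Drive confirmation page is a few KB; a real 5-gram binary is far larger.
    assert os.path.getsize(path) > 1_000_000, f"{path} looks like an HTML stub"
    model = kenlm.Model(path)  # raises if the file is corrupt or not a KenLM binary
    print(path, model.score("hello world", bos=True, eos=True))
```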
diff --git a/src/spell_check/kenlm/local/api.py b/src/spell_check/kenlm/local/api.py
index de05f4a..5562472 100644
--- a/src/spell_check/kenlm/local/api.py
+++ b/src/spell_check/kenlm/local/api.py
@@ -1,4 +1,5 @@
 from model import Model
+from update import UpdationModel
 from request import ModelRequest, ModelUpdateRequest
 from quart import Quart, request
 import aiohttp
@@ -22,6 +23,11 @@
     'eng': 'freq_dict_eng.txt'
 }
 
+texts_paths = {
+    'ory': 'texts.txt',
+    'eng': 'texts_eng.txt'
+}
+
 
 @app.before_serving
 async def startup():
@@ -29,6 +35,8 @@ async def startup():
     global model
     model = Model(app, model_paths, vocab_paths, freq_dict_paths)
 
+    print("Model loaded successfully")
+
 @app.route('/', methods=['POST'])
 async def embed():
     global model
@@ -42,7 +50,11 @@ async def update():
     global model
     data = await request.get_json()
     req = ModelUpdateRequest(**data)
-    result = await model.update_symspell(req)
+    result = await UpdationModel(model_paths, vocab_paths, freq_dict_paths, texts_paths).update(req)
+
+    if result:
+        model = Model(app, model_paths, vocab_paths, freq_dict_paths)
+
     return result
 
 if __name__ == "__main__":
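
The rewired `update` route now delegates to `UpdationModel`, which rebuilds the whole pipeline (append text → frequency dict → lexicon → KenLM model) and then reloads `Model` on success. A sketch of exercising it end to end, assuming the route is mounted as `PUT /` on port 8000 with the same `text`/`lang` payload as the spello README below (route, method, and port are assumptions, not confirmed by this diff):

```python
# client_sketch.py — hypothetical client for the update endpoint
import asyncio
import aiohttp

async def trigger_update():
    payload = {
        "text": ["how to apply for", "scheme for my paddy crop"],
        "lang": "eng",
    }
    async with aiohttp.ClientSession() as session:
        async with session.put("http://localhost:8000/", json=payload) as resp:
            # A 200 with 'Model updated successfully' means the retrain ran.
            print(resp.status, await resp.text())

asyncio.run(trigger_update())
```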
diff --git a/src/spell_check/kenlm/local/model.py b/src/spell_check/kenlm/local/model.py
index bd58a50..e294905 100644
--- a/src/spell_check/kenlm/local/model.py
+++ b/src/spell_check/kenlm/local/model.py
@@ -21,6 +21,11 @@
     'eng': 'freq_dict_eng.txt'
 }
 
+texts_paths = {
+    'ory': 'texts.txt',
+    'eng': 'texts_eng.txt'
+}
+
 
 class TextCorrector:
     def __init__(self, model_paths, vocab_paths, freq_dict_paths):
@@ -29,11 +34,14 @@ def __init__(self, model_paths, vocab_paths, freq_dict_paths):
             'ory': kenlm.Model(model_paths['ory']),
             'eng': kenlm.Model(model_paths['eng'])
         }
+
+        print('Loading vocabularies...')
         self.vocabs = {
             'ory': self.create_vocab_lexicon(vocab_paths['ory']),
             'eng': self.create_vocab_lexicon(vocab_paths['eng'])
         }
 
+        print('Loading symspell models...')
         self.symspell_models = {
             'ory': self.create_symspell_model(freq_dict_paths['ory']),
             'eng': self.create_symspell_model(freq_dict_paths['eng'])
@@ -47,6 +55,8 @@ def set_language(self, lang):
         self.vocab = self.vocabs[lang]
         self.symspell_model = self.symspell_models[lang]
 
+        print(self.symspell_models['eng'].words)
+
     def create_vocab_lexicon(self, lexicon_path):
         vocabulary = []
         with open(lexicon_path, 'r', encoding='utf-8') as file:
@@ -60,11 +70,6 @@ def create_symspell_model(self, freq_dict_path):
         sym_spell.load_dictionary(freq_dict_path, term_index=0, count_index=1, separator=' ')
         return sym_spell
 
-    # def generate_candidates(self, word, max_distance=1):
-    #     len_range = range(len(word) - max_distance, len(word) + max_distance + 1)
-    #     filtered_vocab = [vocab_word for vocab_word in self.vocab if len(vocab_word) in len_range]
-    #     return [vocab_word for vocab_word in filtered_vocab if 0 <= Levenshtein.distance(word, vocab_word) <= max_distance]
-
     def generate_candidates(self, word, max_distance=1):
         suggestions = self.symspell_model.lookup(word, Verbosity.CLOSEST, max_distance)
         return [suggestion.term for suggestion in suggestions]
@@ -112,7 +117,7 @@ def correct_text_with_beam_search(self, text, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5,
             corrected_sentences.append(best_sentence)
 
         return ' '.join(corrected_sentences)
-    
+
     def load_freq_dict(self, freq_dict_path):
         freq_dict = {}
         with open(freq_dict_path, 'r') as f:
@@ -120,41 +125,6 @@ def load_freq_dict(self, freq_dict_path):
             word, freq = line.split()
             freq_dict[word] = int(freq)
         return freq_dict
-
-    def make_updation_counter(self, text):
-
-        if type(text) == list:
-            text = ' '.join(text)
-
-        # remove punctuations from the text
-        text = ''.join(e for e in text if e.isalnum() or e.isspace())
-        words = text.split()
-
-        # create a dictionary of words and their frequencies
-        dict = Counter(words)
-
-        return dict
-
-    def update_symspell_model(self, lang, text):
-        # update the frequency dictionary
-        current_freq_dict_counter = Counter(self.load_freq_dict(freq_dict_paths[lang]))
-        new_freq_dict_counter = self.make_updation_counter(text)
-
-        # merge the two frequency dictionaries
-        freq_dict_counter = current_freq_dict_counter + new_freq_dict_counter
-
-        freq_dict = {}
-        for word, freq in freq_dict_counter.items():
-            freq_dict[word] = int(freq)
-
-        with open(freq_dict_paths[lang], 'w') as f:
-            for word, freq in freq_dict.items():
-                f.write(word + ' ' + str(freq) + '\n')
-
-        # retrain the model with the updated frequency dictionary
-        self.symspell_models[lang] = self.create_symspell_model(freq_dict_paths[lang])
-
-        return 'Model updated successfully'
 
 
 class Model():
diff --git a/src/spell_check/kenlm/local/update.py b/src/spell_check/kenlm/local/update.py
new file mode 100644
index 0000000..792af74
--- /dev/null
+++ b/src/spell_check/kenlm/local/update.py
@@ -0,0 +1,151 @@
+import os
+
+model_paths = {
+    'ory': '5gram_model.bin',
+    'eng': '5gram_model_eng.bin'
+}
+
+vocab_paths = {
+    'ory': 'lexicon.txt',
+    'eng': 'lexicon_eng.txt'
+}
+
+freq_dict_paths = {
+    'ory': 'freq_dict.txt',
+    'eng': 'freq_dict_eng.txt'
+}
+
+texts_paths = {
+    'ory': 'texts.txt',
+    'eng': 'texts_eng.txt'
+}
+
+
+class UpdationModel():
+    def __init__(self, model_paths, vocab_paths, freq_dict_paths, texts_paths):
+        self.model_paths = model_paths
+        self.vocab_paths = vocab_paths
+        self.freq_dict_paths = freq_dict_paths
+        self.texts_paths = texts_paths
+
+    def set_language(self, lang):
+        self.model_path = self.model_paths[lang]
+        self.vocab_path = self.vocab_paths[lang]
+        self.freq_dict_path = self.freq_dict_paths[lang]
+        self.texts_path = self.texts_paths[lang]
+
+    def train_kenlm_model(self):
+        input_path = self.texts_path.split('.')[0] + '_unique.txt'
+
+        output_file = self.model_path.split('.')[0]
+
+        # Making the arpa files; deriving the names from the language's model
+        # path keeps an 'eng' retrain from overwriting the 'ory' binary
+        output_file1 = output_file + ".arpa"
+        output_file2 = output_file + "_correct.arpa"
+        output_bin_file = output_file + ".bin"
+
+        os.system(f"kenlm/build/bin/lmplz -o 5 <{input_path} > {output_file1} --discount_fallback")
+
+        # adding the </s> end-of-sentence token to the arpa file
+        with open(output_file1, "r") as read_file, open(output_file2, "w") as write_file:
+            has_added_eos = False
+            for line in read_file:
+                if not has_added_eos and "ngram 1=" in line:
+                    count = line.strip().split("=")[-1]
+                    write_file.write(line.replace(f"{count}", f"{int(count)+1}"))
+                elif not has_added_eos and "<s>" in line:
+                    write_file.write(line)
+                    write_file.write(line.replace("<s>", "</s>"))
+                    has_added_eos = True
+                else:
+                    write_file.write(line)
+
+        # converting arpa file to bin file
+        os.system(f"kenlm/build/bin/build_binary {output_file2} {output_bin_file}")
+
+        os.remove(output_file1)
+        os.remove(output_file2)
+
+    def update_lexicon_file(self):
+        with open(self.texts_path, 'r') as f:
+            text = f.read()
+
+        # Tokenize the text into words
+        words = set(text.split())
+
+        with open(self.vocab_path, 'w') as f:
+            for word in words:
+                phonemes = " ".join(list(word))
+                line = word + " " + phonemes + " |\n"
+                f.write(line)
+
+        return True
+
+    def update_text_file(self, text):
+        final_text = []
+
+        if type(text) == list:
+            # remove punctuations from the text
+            for line in text:
+                final_text.append(' '.join([word for word in line.split() if word.isalnum()]))
+        else:
+            final_text.append(' '.join([word for word in text.split() if word.isalnum()]))
+
+        print(final_text)
+
+        with open(self.texts_path, 'a') as f:
+            for line in final_text:
+                f.write(line + '\n')
+
+        unique_text = set(final_text)
+
+        with open(self.texts_path.split('.')[0] + '_unique.txt', 'w') as f:
+            for line in unique_text:
+                f.write(line + '\n')
+
+    def load_freq_dict(self, freq_dict_path):
+        freq_dict = {}
+
+        # read the frequency dictionary file
+        with open(freq_dict_path, 'r') as f:
+            freq_file = f.read().splitlines()
+
+        # create a dictionary from the frequency file
+        for line in freq_file:
+            word, freq = line.split()
+            freq_dict[word] = int(freq)
+
+        return freq_dict
+
+    def update_freq_dict(self):
+        with open(self.texts_path, 'r') as f:
+            lines = f.readlines()
+
+        freq_dict = {}
+
+        for line in lines:
+            words = line.split()
+            for word in words:
+                if word in freq_dict:
+                    freq_dict[word] += 1
+                else:
+                    freq_dict[word] = 1
+
+        with open(self.freq_dict_path, 'w') as f:
+            for word, freq in freq_dict.items():
+                f.write(word + ' ' + str(freq) + '\n')
+
+    async def update(self, req):
+        self.lang = req.lang
+        self.text = req.text
+
+        self.set_language(self.lang)
+        self.update_text_file(self.text)
+
+        self.update_freq_dict()
+
+        self.update_lexicon_file()
+
+        self.train_kenlm_model()
+
+        return 'Model updated successfully'
\ No newline at end of file
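
The least obvious step in `update.py` is the ARPA rewrite inside `train_kenlm_model`: the ARPA produced by lmplz lists a begin-of-sentence `<s>` unigram but no matching end-of-sentence `</s>` entry, so the code bumps the `ngram 1=` count by one and clones the `<s>` line as `</s>`. A standalone sketch of that same transformation on a toy ARPA header (the sample data is hypothetical; the loop mirrors the diff):

```python
# arpa_patch_demo.py — replays the train_kenlm_model header rewrite on toy data
sample = """\\data\\
ngram 1=3

\\1-grams:
-1.0 <unk> 0
-1.0 <s> -0.3
-1.0 hello -0.3
"""

patched = []
has_added_eos = False
for line in sample.splitlines(keepends=True):
    if not has_added_eos and "ngram 1=" in line:
        count = line.strip().split("=")[-1]
        patched.append(line.replace(count, str(int(count) + 1)))  # ngram 1=3 -> 4
    elif not has_added_eos and "<s>" in line:
        patched.append(line)
        patched.append(line.replace("<s>", "</s>"))  # clone the <s> entry as </s>
        has_added_eos = True
    else:
        patched.append(line)

print("".join(patched))  # unigram count is now 4 and a </s> entry follows <s>
```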
diff --git a/src/spell_check/spello/local/README.md b/src/spell_check/spello/local/README.md
index 2e6fbb5..fde331d 100644
--- a/src/spell_check/spello/local/README.md
+++ b/src/spell_check/spello/local/README.md
@@ -14,7 +14,6 @@ curl -X POST -H "Content-Type: application/json" -d '{
 "lang" : "eng"
 }' http://localhost:8000/
 
-
 **curl request for update:**
 
 curl -X PUT -H "Content-Type: application/json" -d '{
@@ -32,6 +31,6 @@ curl -X PUT -H "Content-Type: application/json" -d '{
 }' http://localhost:8000/
 
 curl -X PUT -H "Content-Type: application/json" -d '{
-"text": ["how to apply for", "scheme for my paddi crop"],
+"text": ["how to apply for", "scheme for my paddy crop"],
 "lang" : "eng"
 }' http://localhost:8000/