Skip to content

Commit

Permalink
Merge pull request #307 from Shubh-Goyal-07/restructure
Browse files Browse the repository at this point in the history
adds kenlm model update functionality
  • Loading branch information
Gautam-Rajeev authored Apr 9, 2024
2 parents 85c736b + 6a3e94f commit 5de7101
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 44 deletions.
14 changes: 14 additions & 0 deletions src/spell_check/kenlm/local/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,32 @@ RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

RUN echo "Downloading the language model files"
# Install kenlm using pip
RUN pip3 install https://github.com/kpu/kenlm/archive/master.zip
RUN apt-get update && apt-get install -y wget


RUN echo "Downloading the language model files"
RUN apt-get install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev

RUN echo "Downloading the language model files"
# Download the files using wget
RUN wget "https://drive.google.com/uc?export=download&id=1frSw5-qfRMgrYs4QL961s2yYuq2KplEM" -O '5gram_model.bin'
RUN wget "https://drive.google.com/uc?export=download&id=1o31Z4TZbAOEt6E8Rx7VMONJOGJH-5Mwk" -O 'lexicon.txt'
RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1eVWwarCm8Wqq3vYqsE9f2jvrp-rvr6QZ" -O 'texts.txt'

RUN wget "https://drive.google.com/uc?export=download&id=1-Dtk5socjYdeGyqhbQzG-rvWJfWVFGqv" -O '5gram_model_eng.bin'
RUN wget "https://drive.google.com/uc?export=download&id=1-59pDTvEXCMUZ-NQ8BwmCnHQZh4Eg6Gw" -O 'lexicon_eng.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1-iZvej7L92Aga9VZ33BM5ybUTiR0hMF8" -O 'texts_eng.txt'

RUN echo "Downloading the language model files"
# Download the kenlm training files
RUN wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
RUN mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2

# Copy the rest of the application code to the working directory
COPY . /app/

Expand Down
14 changes: 13 additions & 1 deletion src/spell_check/kenlm/local/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from model import Model
from update import UpdationModel
from request import ModelRequest, ModelUpdateRequest
from quart import Quart, request
import aiohttp
Expand All @@ -22,13 +23,20 @@
'eng': 'freq_dict_eng.txt'
}

texts_paths = {
'ory': 'texts.txt',
'eng': 'texts_eng.txt'
}


@app.before_serving
async def startup():
app.client = aiohttp.ClientSession()
global model
model = Model(app, model_paths, vocab_paths, freq_dict_paths)

print("Model loaded successfully")

@app.route('/', methods=['POST'])
async def embed():
global model
Expand All @@ -42,7 +50,11 @@ async def update():
global model
data = await request.get_json()
req = ModelUpdateRequest(**data)
result = await model.update_symspell(req)
result = await UpdationModel(model_paths, vocab_paths, freq_dict_paths, texts_paths).update(req)

if result:
model = Model(app, model_paths, vocab_paths, freq_dict_paths)

return result

if __name__ == "__main__":
Expand Down
52 changes: 11 additions & 41 deletions src/spell_check/kenlm/local/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
'eng': 'freq_dict_eng.txt'
}

texts_paths = {
'ory': 'texts.txt',
'eng': 'texts_eng.txt'
}


class TextCorrector:
def __init__(self, model_paths, vocab_paths, freq_dict_paths):
Expand All @@ -29,11 +34,14 @@ def __init__(self, model_paths, vocab_paths, freq_dict_paths):
'ory': kenlm.Model(model_paths['ory']),
'eng': kenlm.Model(model_paths['eng'])
}

print('Loading vocabularies...')
self.vocabs = {
'ory': self.create_vocab_lexicon(vocab_paths['ory']),
'eng': self.create_vocab_lexicon(vocab_paths['eng'])
}

print('Loading symspell models...')
self.symspell_models = {
'ory': self.create_symspell_model(freq_dict_paths['ory']),
'eng': self.create_symspell_model(freq_dict_paths['eng'])
Expand All @@ -47,6 +55,8 @@ def set_language(self, lang):
self.vocab = self.vocabs[lang]
self.symspell_model = self.symspell_models[lang]

print(self.symspell_models['eng'].words)

def create_vocab_lexicon(self, lexicon_path):
vocabulary = []
with open(lexicon_path, 'r', encoding='utf-8') as file:
Expand All @@ -60,11 +70,6 @@ def create_symspell_model(self, freq_dict_path):
sym_spell.load_dictionary(freq_dict_path, term_index=0, count_index=1, separator=' ')
return sym_spell

# def generate_candidates(self, word, max_distance=1):
# len_range = range(len(word) - max_distance, len(word) + max_distance + 1)
# filtered_vocab = [vocab_word for vocab_word in self.vocab if len(vocab_word) in len_range]
# return [vocab_word for vocab_word in filtered_vocab if 0 <= Levenshtein.distance(word, vocab_word) <= max_distance]

def generate_candidates(self, word, max_distance=1):
suggestions = self.symspell_model.lookup(word, Verbosity.CLOSEST, max_distance)
return [suggestion.term for suggestion in suggestions]
Expand Down Expand Up @@ -112,49 +117,14 @@ def correct_text_with_beam_search(self, text, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5,
corrected_sentences.append(best_sentence)

return ' '.join(corrected_sentences)

def load_freq_dict(self, freq_dict_path):
freq_dict = {}
with open(freq_dict_path, 'r') as f:
for line in f:
word, freq = line.split()
freq_dict[word] = int(freq)
return freq_dict

def make_updation_counter(self, text):

if type(text) == list:
text = ' '.join(text)

# remove punctuations from the text
text = ''.join(e for e in text if e.isalnum() or e.isspace())
words = text.split()

# create a dictionary of words and their frequencies
dict = Counter(words)

return dict

def update_symspell_model(self, lang, text):
# update the frequency dictionary
current_freq_dict_counter = Counter(self.load_freq_dict(freq_dict_paths[lang]))
new_freq_dict_counter = self.make_updation_counter(text)

# merge the two frequency dictionaries
freq_dict_counter = current_freq_dict_counter + new_freq_dict_counter

freq_dict = {}
for word, freq in freq_dict_counter.items():
freq_dict[word] = int(freq)

with open(freq_dict_paths[lang], 'w') as f:
for word, freq in freq_dict.items():
f.write(word + ' ' + str(freq) + '\n')

# retrain the model with the updated frequency dictionary
self.symspell_models[lang] = self.create_symspell_model(freq_dict_paths[lang])

return 'Model updated successfully'


class Model():
Expand Down
151 changes: 151 additions & 0 deletions src/spell_check/kenlm/local/update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import os

# Per-language artifact locations, keyed by language code
# ('ory' presumably Odia/Oriya, 'eng' English — filenames match the
# Dockerfile downloads).
# NOTE(review): the same four dicts are re-declared in api.py and model.py,
# and UpdationModel receives its paths through __init__ — these module-level
# copies appear to be unused defaults; consider consolidating.

# Trained kenlm binary models.
model_paths = {
    'ory': '5gram_model.bin',
    'eng': '5gram_model_eng.bin'
}

# Lexicon files: one word per line with space-separated characters.
vocab_paths = {
    'ory': 'lexicon.txt',
    'eng': 'lexicon_eng.txt'
}

# Frequency dictionaries: 'word count' per line.
freq_dict_paths = {
    'ory': 'freq_dict.txt',
    'eng': 'freq_dict_eng.txt'
}

# Raw text corpora the models are (re)trained from.
texts_paths = {
    'ory': 'texts.txt',
    'eng': 'texts_eng.txt'
}


class UpdationModel():
    """Rebuild the kenlm language model, lexicon and frequency dictionary
    for one language after new text is appended to its corpus.

    Per-language file locations are injected as dicts keyed by language
    code (e.g. 'ory', 'eng'). Call set_language() (done by update())
    before any of the per-file steps.
    """

    def __init__(self, model_paths, vocab_paths, freq_dict_paths, texts_paths):
        # Each argument is a dict: language code -> file path.
        self.model_paths = model_paths
        self.vocab_paths = vocab_paths
        self.freq_dict_paths = freq_dict_paths
        self.texts_paths = texts_paths

    def set_language(self, lang):
        """Select the per-language file paths used by the update steps."""
        self.model_path = self.model_paths[lang]
        self.vocab_path = self.vocab_paths[lang]
        self.freq_dict_path = self.freq_dict_paths[lang]
        self.texts_path = self.texts_paths[lang]

    def _unique_corpus_path(self):
        # 'texts.txt' -> 'texts_unique.txt'. os.path.splitext is safer than
        # split('.')[0] when directories in the path contain dots.
        root, _ = os.path.splitext(self.texts_path)
        return root + '_unique.txt'

    def train_kenlm_model(self):
        """Train a 5-gram kenlm model on the de-duplicated corpus and write
        it to the selected language's model path.

        Fixes two defects in the original:
        - the binary was always written to '5gram_model.bin', so updating
          'eng' overwrote the Oriya model and never refreshed the eng one;
        - os.system() exit statuses were ignored, so a failed training run
          still reported success.

        Raises RuntimeError if lmplz or build_binary exits non-zero.
        """
        input_path = self._unique_corpus_path()

        # Derive intermediate arpa names from the target model path so
        # concurrent-language artifacts never collide.
        base, _ = os.path.splitext(self.model_path)
        arpa_file = base + ".arpa"
        fixed_arpa_file = base + "_correct.arpa"
        output_bin_file = self.model_path

        status = os.system(
            f"kenlm/build/bin/lmplz -o 5 <{input_path} > {arpa_file} --discount_fallback"
        )
        if status != 0:
            raise RuntimeError(f"lmplz failed with exit status {status}")

        # lmplz's arpa output declares <s> but not </s>: clone the <s>
        # unigram entry as </s> and bump the unigram count by one.
        with open(arpa_file, "r") as read_file, open(fixed_arpa_file, "w") as write_file:
            has_added_eos = False
            for line in read_file:
                if not has_added_eos and "ngram 1=" in line:
                    count = line.strip().split("=")[-1]
                    # Rebuild the header line rather than str.replace():
                    # replace() substitutes the first occurrence of the
                    # digits, which can hit the '1' in 'ngram 1=' itself.
                    write_file.write(f"ngram 1={int(count) + 1}\n")
                elif not has_added_eos and "<s>" in line:
                    write_file.write(line)
                    write_file.write(line.replace("<s>", "</s>"))
                    has_added_eos = True
                else:
                    write_file.write(line)

        # Convert the patched arpa file to kenlm's binary format.
        status = os.system(f"kenlm/build/bin/build_binary {fixed_arpa_file} {output_bin_file}")
        if status != 0:
            raise RuntimeError(f"build_binary failed with exit status {status}")

        # Drop the intermediate arpa files; only the .bin is served.
        os.remove(arpa_file)
        os.remove(fixed_arpa_file)

    def update_lexicon_file(self):
        """Regenerate the lexicon from the full corpus.

        One line per unique word: 'word c h a r s |' (characters stand in
        for phonemes). Returns True on completion.
        """
        with open(self.texts_path, 'r') as f:
            words = set(f.read().split())

        with open(self.vocab_path, 'w') as f:
            for word in words:
                spaced_chars = " ".join(word)
                f.write(word + " " + spaced_chars + " |\n")

        return True

    def update_text_file(self, text):
        """Append new text to the corpus, then rewrite '<corpus>_unique.txt'
        with the de-duplicated lines of the WHOLE corpus.

        `text` may be a single string or a list of lines. Words containing
        punctuation are dropped entirely (str.isalnum), matching the
        original filtering.

        Bug fix: the unique training file previously contained only the
        newly added lines, so retraining silently discarded the existing
        corpus.
        """
        lines = text if isinstance(text, list) else [text]
        cleaned = [' '.join(w for w in line.split() if w.isalnum()) for line in lines]

        with open(self.texts_path, 'a') as f:
            for line in cleaned:
                f.write(line + '\n')

        # De-duplicate the full corpus, not just the new lines.
        with open(self.texts_path, 'r') as f:
            unique_lines = set(f.read().splitlines())

        with open(self._unique_corpus_path(), 'w') as f:
            for line in unique_lines:
                f.write(line + '\n')

    def load_freq_dict(self, freq_dict_path):
        """Parse a 'word count' frequency file into a dict[str, int]."""
        freq_dict = {}
        with open(freq_dict_path, 'r') as f:
            for line in f.read().splitlines():
                word, freq = line.split()
                freq_dict[word] = int(freq)
        return freq_dict

    def update_freq_dict(self):
        """Recount word frequencies over the full corpus and rewrite the
        frequency dictionary file ('word count' per line)."""
        freq_dict = {}
        with open(self.texts_path, 'r') as f:
            for word in f.read().split():
                freq_dict[word] = freq_dict.get(word, 0) + 1

        with open(self.freq_dict_path, 'w') as f:
            for word, freq in freq_dict.items():
                f.write(word + ' ' + str(freq) + '\n')

    async def update(self, req):
        """Run the full update pipeline for req.lang / req.text.

        Returns 'Model updated successfully' on success; the caller treats
        a truthy return as the signal to reload the serving model.
        Raises RuntimeError if kenlm training fails (see train_kenlm_model).
        """
        self.set_language(req.lang)
        self.update_text_file(req.text)
        self.update_freq_dict()
        self.update_lexicon_file()
        self.train_kenlm_model()

        return 'Model updated successfully'
3 changes: 1 addition & 2 deletions src/spell_check/spello/local/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ curl -X POST -H "Content-Type: application/json" -d '{
"lang" : "eng"
}' http://localhost:8000/


**curl request for update:**

curl -X PUT -H "Content-Type: application/json" -d '{
Expand All @@ -32,6 +31,6 @@ curl -X PUT -H "Content-Type: application/json" -d '{
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["how to apply for", "scheme for my paddi crop"],
"text": ["how to apply for", "scheme for my paddy crop"],
"lang" : "eng"
}' http://localhost:8000/

0 comments on commit 5de7101

Please sign in to comment.