Skip to content

Commit

Permalink
Merge pull request #304 from Shubh-Goyal-07/restructure
Browse files Browse the repository at this point in the history
Updates spell_check module
  • Loading branch information
Gautam-Rajeev authored Apr 5, 2024
2 parents d8b414e + f0237de commit 1f4d19d
Show file tree
Hide file tree
Showing 14 changed files with 397 additions and 14 deletions.
2 changes: 2 additions & 0 deletions src/spell_check/kenlm/local/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ RUN apt-get update && apt-get install -y wget
# Download the files using wget
RUN wget "https://drive.google.com/uc?export=download&id=1frSw5-qfRMgrYs4QL961s2yYuq2KplEM" -O '5gram_model.bin'
RUN wget "https://drive.google.com/uc?export=download&id=1o31Z4TZbAOEt6E8Rx7VMONJOGJH-5Mwk" -O 'lexicon.txt'
RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1-Dtk5socjYdeGyqhbQzG-rvWJfWVFGqv" -O '5gram_model_eng.bin'
RUN wget "https://drive.google.com/uc?export=download&id=1-59pDTvEXCMUZ-NQ8BwmCnHQZh4Eg6Gw" -O 'lexicon_eng.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'

# Copy the rest of the application code to the working directory
COPY . /app/
Expand Down
27 changes: 24 additions & 3 deletions src/spell_check/kenlm/local/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
curl request :
curl request:

curl -X POST -H "Content-Type: application/json" -d '{
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
Expand All @@ -8,19 +8,40 @@ curl -X POST -H "Content-Type: application/json" -d '{
"lang" : "ory"
}' http://localhost:8000/


curl -X POST -H "Content-Type: application/json" -d '{
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
"BEAM_WIDTH": 5,
"SCORE_THRESHOLD": 1.5,
"max_distance": 1
}' http://localhost:8000/


curl -X POST -H "Content-Type: application/json" -d '{
"text": "how to apply for go-sugem scheme for my paddi crop",
"BEAM_WIDTH": 5,
"SCORE_THRESHOLD": 1.5,
"max_distance": 1,
"lang" : "eng"
}' http://localhost:8000/



**curl request for update:**

curl -X PUT -H "Content-Type: application/json" -d '{
"text": "ମିଶନରୀ",
"lang" : "ory"
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"]
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": "go-sugem",
"lang" : "eng"
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["how to apply for", "scheme for my paddi crop"],
"lang" : "eng"
}' http://localhost:8000/
16 changes: 14 additions & 2 deletions src/spell_check/kenlm/local/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from model import Model
from request import ModelRequest
from request import ModelRequest, ModelUpdateRequest
from quart import Quart, request
import aiohttp

Expand All @@ -17,13 +17,17 @@
'eng': 'lexicon_eng.txt'
}

freq_dict_paths = {
'ory': 'freq_dict.txt',
'eng': 'freq_dict_eng.txt'
}


@app.before_serving
async def startup():
    """Create the shared HTTP session and load the spell-check model once, before serving.

    Runs in Quart's before_serving phase so the (slow) model load happens
    a single time instead of per request.
    """
    app.client = aiohttp.ClientSession()
    global model
    # Diff artifact resolved: the final Model signature takes the frequency-
    # dictionary paths as well, so SymSpell models can be built per language.
    model = Model(app, model_paths, vocab_paths, freq_dict_paths)

@app.route('/', methods=['POST'])
async def embed():
Expand All @@ -33,5 +37,13 @@ async def embed():
result = await model.inference(req)
return result

@app.route('/', methods=['PUT'])
async def update():
    """PUT / — merge the submitted text into the frequency dictionary and rebuild the model."""
    global model
    payload = await request.get_json()
    update_request = ModelUpdateRequest(**payload)
    return await model.update_symspell(update_request)

if __name__ == "__main__":
app.run()
93 changes: 85 additions & 8 deletions src/spell_check/kenlm/local/model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import kenlm
from request import ModelRequest
from request import ModelRequest, ModelUpdateRequest
import Levenshtein

from symspellpy import SymSpell, Verbosity

from collections import Counter

model_paths = {
'ory': '5gram_model.bin',
'eng': '5gram_model_eng.bin'
Expand All @@ -12,9 +16,14 @@
'eng': 'lexicon_eng.txt'
}

freq_dict_paths = {
'ory': 'freq_dict.txt',
'eng': 'freq_dict_eng.txt'
}


class TextCorrector:
def __init__(self, model_paths, vocab_paths):
def __init__(self, model_paths, vocab_paths, freq_dict_paths):
# Initialize both models and vocabularies
self.models = {
'ory': kenlm.Model(model_paths['ory']),
Expand All @@ -24,13 +33,19 @@ def __init__(self, model_paths, vocab_paths):
'ory': self.create_vocab_lexicon(vocab_paths['ory']),
'eng': self.create_vocab_lexicon(vocab_paths['eng'])
}

self.symspell_models = {
'ory': self.create_symspell_model(freq_dict_paths['ory']),
'eng': self.create_symspell_model(freq_dict_paths['eng'])
}
# Set the default language
self.set_language('ory')

def set_language(self, lang):
# Switch the model and vocabulary based on language
self.model = self.models[lang]
self.vocab = self.vocabs[lang]
self.symspell_model = self.symspell_models[lang]

def create_vocab_lexicon(self, lexicon_path):
vocabulary = []
Expand All @@ -40,14 +55,23 @@ def create_vocab_lexicon(self, lexicon_path):
vocabulary.append(word)
return vocabulary

def create_symspell_model(self, freq_dict_path):
    """Build a SymSpell checker from a space-separated frequency dictionary file.

    The file format is one "<term> <count>" entry per line.
    """
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    checker.load_dictionary(freq_dict_path, term_index=0, count_index=1, separator=' ')
    return checker

# def generate_candidates(self, word, max_distance=1):
# len_range = range(len(word) - max_distance, len(word) + max_distance + 1)
# filtered_vocab = [vocab_word for vocab_word in self.vocab if len(vocab_word) in len_range]
# return [vocab_word for vocab_word in filtered_vocab if 0 <= Levenshtein.distance(word, vocab_word) <= max_distance]

def generate_candidates(self, word, max_distance=1):
    """Return spelling candidates for *word* within *max_distance* edits.

    Uses the active language's SymSpell model (Verbosity.CLOSEST returns
    only suggestions at the smallest found edit distance). The earlier
    Levenshtein scan over the whole lexicon (kept commented out above)
    left an unreachable duplicate body here; it has been removed.
    """
    suggestions = self.symspell_model.lookup(word, Verbosity.CLOSEST, max_distance)
    return [suggestion.term for suggestion in suggestions]

def beam_search(self, chunk, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5, max_distance=1):
original_score = self.model.score(' '.join(chunk))

initial_candidates = self.generate_candidates(chunk[0], max_distance=1)
if not initial_candidates:
initial_candidates = [chunk[0]]
Expand Down Expand Up @@ -88,11 +112,55 @@ def correct_text_with_beam_search(self, text, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5,
corrected_sentences.append(best_sentence)

return ' '.join(corrected_sentences)

def load_freq_dict(self, freq_dict_path):
    """Read a "<word> <count>" frequency file into a dict of int counts."""
    frequencies = {}
    with open(freq_dict_path, 'r') as handle:
        for entry in handle:
            term, count = entry.split()
            frequencies[term] = int(count)
    return frequencies

def make_updation_counter(self, text):
    """Tokenize *text* and count word frequencies.

    *text* may be a single string or a list of strings (a list is joined
    with spaces first). Punctuation is stripped — only alphanumeric
    characters and whitespace are kept — then the text is split on
    whitespace.

    Returns a collections.Counter mapping word -> frequency.
    """
    # Accept either a string or a list of strings (isinstance, not type ==).
    if isinstance(text, list):
        text = ' '.join(text)

    # Remove punctuation from the text.
    cleaned = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())

    # Count word frequencies; avoids shadowing the builtin `dict`.
    return Counter(cleaned.split())

def update_symspell_model(self, lang, text):
    """Merge word counts from *text* into the on-disk frequency dictionary
    for *lang*, rewrite the file, and rebuild that language's SymSpell model.

    Returns a human-readable status string.
    """
    # Current on-disk counts plus the counts observed in the new text.
    current_counts = Counter(self.load_freq_dict(freq_dict_paths[lang]))
    new_counts = self.make_updation_counter(text)

    # Counter addition sums frequencies for words present in both.
    merged_counts = current_counts + new_counts

    # Persist the merged dictionary in "<word> <count>" format.
    # (Counter values are already ints, so no re-conversion loop is needed.)
    with open(freq_dict_paths[lang], 'w') as f:
        for word, freq in merged_counts.items():
            f.write(word + ' ' + str(freq) + '\n')

    # Rebuild from the updated dictionary. If the rebuilt model is the one
    # currently active (set_language cached it on self.symspell_model),
    # refresh that reference too so lookups don't keep using the stale model.
    was_active = getattr(self, 'symspell_model', None) is self.symspell_models[lang]
    self.symspell_models[lang] = self.create_symspell_model(freq_dict_paths[lang])
    if was_active:
        self.symspell_model = self.symspell_models[lang]

    return 'Model updated successfully'


class Model():
def __init__(self, context, model_paths, vocab_paths, freq_dict_paths):
    """Wrap a TextCorrector built from the given model/lexicon/frequency paths.

    *context* is the Quart app; it is kept only as a reference here.
    Diff artifact resolved: the duplicated old signature/assignment
    (without freq_dict_paths) has been removed in favor of the final form.
    """
    self.context = context
    self.text_corrector = TextCorrector(model_paths, vocab_paths, freq_dict_paths)

async def inference(self, request: ModelRequest):
# Set the correct language model based on the request
Expand All @@ -105,3 +173,12 @@ async def inference(self, request: ModelRequest):
max_distance=request.max_distance
)
return corrected_text

async def update_symspell(self, request: ModelUpdateRequest):
    """Switch to the requested language, then merge request.text into its
    frequency dictionary and rebuild that language's SymSpell model."""
    # Set the correct language model based on the request
    self.text_corrector.set_language(request.lang)

    # Propagate the corrector's own status message instead of duplicating
    # the literal here (same string today, single source of truth).
    return self.text_corrector.update_symspell_model(request.lang, request.text)
8 changes: 8 additions & 0 deletions src/spell_check/kenlm/local/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,11 @@ def __init__(self, text, BEAM_WIDTH, SCORE_THRESHOLD, max_distance, lang='ory'):

def to_json(self):
return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

class ModelUpdateRequest():
    """Payload for the PUT / endpoint: text to fold into a language's
    frequency dictionary. *text* may be a string or a list of strings."""

    def __init__(self, text, lang='ory'):
        self.text = text
        self.lang = lang

    def to_json(self):
        # vars(self) yields the same attribute dict the default= lambda
        # produced, so the serialized output is unchanged.
        return json.dumps(vars(self), sort_keys=True, indent=4)
3 changes: 2 additions & 1 deletion src/spell_check/kenlm/local/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
quart
aiohttp
python-Levenshtein
requests
symspellpy
Empty file.
26 changes: 26 additions & 0 deletions src/spell_check/spello/local/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

WORKDIR /app

# Install system packages required for building kenlm
RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev

# Install requirements
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# Install wget
RUN apt-get update && apt-get install -y wget

# Download the files using wget
RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'

# Copy the rest of the application code to the working directory
COPY . /app/

EXPOSE 8000

# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
37 changes: 37 additions & 0 deletions src/spell_check/spello/local/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
**curl request for inference:**

curl -X POST -H "Content-Type: application/json" -d '{
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
"lang" : "ory"
}' http://localhost:8000/

curl -X POST -H "Content-Type: application/json" -d '{
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି"
}' http://localhost:8000/

curl -X POST -H "Content-Type: application/json" -d '{
"text": "how to apply for go-sugem scheme for my paddi crop",
"lang" : "eng"
}' http://localhost:8000/


**curl request for update:**

curl -X PUT -H "Content-Type: application/json" -d '{
"text": "ମିଶନରୀ",
"lang" : "ory"
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"]
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": "go-sugem",
"lang" : "eng"
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["how to apply for", "scheme for my paddi crop"],
"lang" : "eng"
}' http://localhost:8000/
2 changes: 2 additions & 0 deletions src/spell_check/spello/local/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .request import *
from .model import *
46 changes: 46 additions & 0 deletions src/spell_check/spello/local/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from model import Model
from request import ModelRequest
from quart import Quart, request
import aiohttp

app = Quart(__name__)

# Global model handle; populated once in startup() before serving begins.
model = None

# Per-language word-frequency dictionary files (downloaded by the Dockerfile).
freq_dict_paths = {
    'ory': 'freq_dict.txt',
    'eng': 'freq_dict_eng.txt'
}

# NOTE(review): defined but never passed to Model() below — presumably the
# pickled spello model locations; confirm against model.py whether this
# should be forwarded or removed.
spello_model_paths = {
    'ory': 'spello_model.pkl',
    'eng': 'spello_model_eng.pkl'
}


@app.before_serving
async def startup():
    # Create the shared HTTP session and load the model once, before serving.
    app.client = aiohttp.ClientSession()
    global model
    model = Model(app, freq_dict_paths)

@app.route('/', methods=['POST'])
async def infer():
    # POST / — run spell-correction on the posted JSON payload.
    global model
    data = await request.get_json()
    req = ModelRequest(**data)
    result = await model.inference(req)
    return result

@app.route('/', methods=['PUT'])
async def update():
    # print("PUT")
    # PUT / — fold the submitted text into the frequency data and retrain.
    global model
    data = await request.get_json()
    req = ModelRequest(**data)
    result = await model.update(req)
    return result


if __name__ == "__main__":
    app.run()
Loading

0 comments on commit 1f4d19d

Please sign in to comment.