Skip to content

Commit

Permalink
Merge pull request #304 from Shubh-Goyal-07/restructure
Browse files Browse the repository at this point in the history
Updates spell_check module
  • Loading branch information
Gautam-Rajeev authored Apr 5, 2024
2 parents d8b414e + f0237de commit 1f4d19d
Show file tree
Hide file tree
Showing 14 changed files with 397 additions and 14 deletions.
2 changes: 2 additions & 0 deletions src/spell_check/kenlm/local/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ RUN apt-get update && apt-get install -y wget
# Download the files using wget
RUN wget "https://drive.google.com/uc?export=download&id=1frSw5-qfRMgrYs4QL961s2yYuq2KplEM" -O '5gram_model.bin'
RUN wget "https://drive.google.com/uc?export=download&id=1o31Z4TZbAOEt6E8Rx7VMONJOGJH-5Mwk" -O 'lexicon.txt'
RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1-Dtk5socjYdeGyqhbQzG-rvWJfWVFGqv" -O '5gram_model_eng.bin'
RUN wget "https://drive.google.com/uc?export=download&id=1-59pDTvEXCMUZ-NQ8BwmCnHQZh4Eg6Gw" -O 'lexicon_eng.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'

# Copy the rest of the application code to the working directory
COPY . /app/
Expand Down
27 changes: 24 additions & 3 deletions src/spell_check/kenlm/local/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
curl request :
curl request:

curl -X POST -H "Content-Type: application/json" -d '{
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
Expand All @@ -8,19 +8,40 @@ curl -X POST -H "Content-Type: application/json" -d '{
"lang" : "ory"
}' http://localhost:8000/


curl -X POST -H "Content-Type: application/json" -d '{
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
"BEAM_WIDTH": 5,
"SCORE_THRESHOLD": 1.5,
"max_distance": 1
}' http://localhost:8000/


curl -X POST -H "Content-Type: application/json" -d '{
"text": "how to apply for go-sugem scheme for my paddi crop",
"BEAM_WIDTH": 5,
"SCORE_THRESHOLD": 1.5,
"max_distance": 1,
"lang" : "eng"
}' http://localhost:8000/



**curl request for update:**

curl -X PUT -H "Content-Type: application/json" -d '{
"text": "ମିଶନରୀ",
"lang" : "ory"
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"]
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": "go-sugem",
"lang" : "eng"
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["how to apply for", "scheme for my paddi crop"],
"lang" : "eng"
}' http://localhost:8000/
16 changes: 14 additions & 2 deletions src/spell_check/kenlm/local/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from model import Model
from request import ModelRequest
from request import ModelRequest, ModelUpdateRequest
from quart import Quart, request
import aiohttp

Expand All @@ -17,13 +17,17 @@
'eng': 'lexicon_eng.txt'
}

freq_dict_paths = {
'ory': 'freq_dict.txt',
'eng': 'freq_dict_eng.txt'
}


@app.before_serving
async def startup():
    """Create the shared HTTP session and load the spell-check model once, before serving.

    Runs in Quart's before_serving phase so the (slow) model load happens
    a single time instead of per request.
    """
    app.client = aiohttp.ClientSession()
    global model
    # Diff artifact resolved: the final Model signature takes the frequency-
    # dictionary paths as well, so SymSpell models can be built per language.
    model = Model(app, model_paths, vocab_paths, freq_dict_paths)

@app.route('/', methods=['POST'])
async def embed():
Expand All @@ -33,5 +37,13 @@ async def embed():
result = await model.inference(req)
return result

@app.route('/', methods=['PUT'])
async def update():
    """PUT / — merge the submitted text into the frequency dictionary and rebuild the model."""
    global model
    payload = await request.get_json()
    update_request = ModelUpdateRequest(**payload)
    return await model.update_symspell(update_request)

if __name__ == "__main__":
app.run()
93 changes: 85 additions & 8 deletions src/spell_check/kenlm/local/model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import kenlm
from request import ModelRequest
from request import ModelRequest, ModelUpdateRequest
import Levenshtein

from symspellpy import SymSpell, Verbosity

from collections import Counter

model_paths = {
'ory': '5gram_model.bin',
'eng': '5gram_model_eng.bin'
Expand All @@ -12,9 +16,14 @@
'eng': 'lexicon_eng.txt'
}

freq_dict_paths = {
'ory': 'freq_dict.txt',
'eng': 'freq_dict_eng.txt'
}


class TextCorrector:
def __init__(self, model_paths, vocab_paths):
def __init__(self, model_paths, vocab_paths, freq_dict_paths):
# Initialize both models and vocabularies
self.models = {
'ory': kenlm.Model(model_paths['ory']),
Expand All @@ -24,13 +33,19 @@ def __init__(self, model_paths, vocab_paths):
'ory': self.create_vocab_lexicon(vocab_paths['ory']),
'eng': self.create_vocab_lexicon(vocab_paths['eng'])
}

self.symspell_models = {
'ory': self.create_symspell_model(freq_dict_paths['ory']),
'eng': self.create_symspell_model(freq_dict_paths['eng'])
}
# Set the default language
self.set_language('ory')

def set_language(self, lang):
# Switch the model and vocabulary based on language
self.model = self.models[lang]
self.vocab = self.vocabs[lang]
self.symspell_model = self.symspell_models[lang]

def create_vocab_lexicon(self, lexicon_path):
vocabulary = []
Expand All @@ -40,14 +55,23 @@ def create_vocab_lexicon(self, lexicon_path):
vocabulary.append(word)
return vocabulary

def create_symspell_model(self, freq_dict_path):
    """Build a SymSpell checker from a space-separated frequency dictionary file.

    The file format is one "<term> <count>" entry per line.
    """
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    checker.load_dictionary(freq_dict_path, term_index=0, count_index=1, separator=' ')
    return checker

# def generate_candidates(self, word, max_distance=1):
# len_range = range(len(word) - max_distance, len(word) + max_distance + 1)
# filtered_vocab = [vocab_word for vocab_word in self.vocab if len(vocab_word) in len_range]
# return [vocab_word for vocab_word in filtered_vocab if 0 <= Levenshtein.distance(word, vocab_word) <= max_distance]

def generate_candidates(self, word, max_distance=1):
    """Return spelling candidates for *word* within *max_distance* edits.

    Uses the active language's SymSpell model (Verbosity.CLOSEST returns
    only suggestions at the smallest found edit distance). The earlier
    Levenshtein scan over the whole lexicon (kept commented out above)
    left an unreachable duplicate body here; it has been removed.
    """
    suggestions = self.symspell_model.lookup(word, Verbosity.CLOSEST, max_distance)
    return [suggestion.term for suggestion in suggestions]

def beam_search(self, chunk, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5, max_distance=1):
original_score = self.model.score(' '.join(chunk))

initial_candidates = self.generate_candidates(chunk[0], max_distance=1)
if not initial_candidates:
initial_candidates = [chunk[0]]
Expand Down Expand Up @@ -88,11 +112,55 @@ def correct_text_with_beam_search(self, text, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5,
corrected_sentences.append(best_sentence)

return ' '.join(corrected_sentences)

def load_freq_dict(self, freq_dict_path):
    """Read a "<word> <count>" frequency file into a dict of int counts."""
    frequencies = {}
    with open(freq_dict_path, 'r') as handle:
        for entry in handle:
            term, count = entry.split()
            frequencies[term] = int(count)
    return frequencies

def make_updation_counter(self, text):
    """Tokenize *text* and count word frequencies.

    *text* may be a single string or a list of strings (a list is joined
    with spaces first). Punctuation is stripped — only alphanumeric
    characters and whitespace are kept — then the text is split on
    whitespace.

    Returns a collections.Counter mapping word -> frequency.
    """
    # Accept either a string or a list of strings (isinstance, not type ==).
    if isinstance(text, list):
        text = ' '.join(text)

    # Remove punctuation from the text.
    cleaned = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())

    # Count word frequencies; avoids shadowing the builtin `dict`.
    return Counter(cleaned.split())

def update_symspell_model(self, lang, text):
    """Merge word counts from *text* into the on-disk frequency dictionary
    for *lang*, rewrite the file, and rebuild that language's SymSpell model.

    Returns a human-readable status string.
    """
    # Current on-disk counts plus the counts observed in the new text.
    current_counts = Counter(self.load_freq_dict(freq_dict_paths[lang]))
    new_counts = self.make_updation_counter(text)

    # Counter addition sums frequencies for words present in both.
    merged_counts = current_counts + new_counts

    # Persist the merged dictionary in "<word> <count>" format.
    # (Counter values are already ints, so no re-conversion loop is needed.)
    with open(freq_dict_paths[lang], 'w') as f:
        for word, freq in merged_counts.items():
            f.write(word + ' ' + str(freq) + '\n')

    # Rebuild from the updated dictionary. If the rebuilt model is the one
    # currently active (set_language cached it on self.symspell_model),
    # refresh that reference too so lookups don't keep using the stale model.
    was_active = getattr(self, 'symspell_model', None) is self.symspell_models[lang]
    self.symspell_models[lang] = self.create_symspell_model(freq_dict_paths[lang])
    if was_active:
        self.symspell_model = self.symspell_models[lang]

    return 'Model updated successfully'


class Model():
def __init__(self, context, model_paths, vocab_paths, freq_dict_paths):
    """Wrap a TextCorrector built from the given model/lexicon/frequency paths.

    *context* is the Quart app; it is kept only as a reference here.
    Diff artifact resolved: the duplicated old signature/assignment
    (without freq_dict_paths) has been removed in favor of the final form.
    """
    self.context = context
    self.text_corrector = TextCorrector(model_paths, vocab_paths, freq_dict_paths)

async def inference(self, request: ModelRequest):
# Set the correct language model based on the request
Expand All @@ -105,3 +173,12 @@ async def inference(self, request: ModelRequest):
max_distance=request.max_distance
)
return corrected_text

async def update_symspell(self, request: ModelUpdateRequest):
    """Switch to the requested language, then merge request.text into its
    frequency dictionary and rebuild that language's SymSpell model."""
    # Set the correct language model based on the request
    self.text_corrector.set_language(request.lang)

    # Propagate the corrector's own status message instead of duplicating
    # the literal here (same string today, single source of truth).
    return self.text_corrector.update_symspell_model(request.lang, request.text)
8 changes: 8 additions & 0 deletions src/spell_check/kenlm/local/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,11 @@ def __init__(self, text, BEAM_WIDTH, SCORE_THRESHOLD, max_distance, lang='ory'):

def to_json(self):
return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

class ModelUpdateRequest():
    """Payload for the PUT / endpoint: text to fold into a language's
    frequency dictionary. *text* may be a string or a list of strings."""

    def __init__(self, text, lang='ory'):
        self.text = text
        self.lang = lang

    def to_json(self):
        # vars(self) yields the same attribute dict the default= lambda
        # produced, so the serialized output is unchanged.
        return json.dumps(vars(self), sort_keys=True, indent=4)
3 changes: 2 additions & 1 deletion src/spell_check/kenlm/local/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
quart
aiohttp
python-Levenshtein
requests
symspellpy
Empty file.
26 changes: 26 additions & 0 deletions src/spell_check/spello/local/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

WORKDIR /app

# Install system packages required for building kenlm
RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev

# Install requirements
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# Install wget
RUN apt-get update && apt-get install -y wget

# Download the files using wget
RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'

# Copy the rest of the application code to the working directory
COPY . /app/

EXPOSE 8000

# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
37 changes: 37 additions & 0 deletions src/spell_check/spello/local/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
**curl request for inference:**

curl -X POST -H "Content-Type: application/json" -d '{
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
"lang" : "ory"
}' http://localhost:8000/

curl -X POST -H "Content-Type: application/json" -d '{
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି"
}' http://localhost:8000/

curl -X POST -H "Content-Type: application/json" -d '{
"text": "how to apply for go-sugem scheme for my paddi crop",
"lang" : "eng"
}' http://localhost:8000/


**curl request for update:**

curl -X PUT -H "Content-Type: application/json" -d '{
"text": "ମିଶନରୀ",
"lang" : "ory"
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"]
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": "go-sugem",
"lang" : "eng"
}' http://localhost:8000/

curl -X PUT -H "Content-Type: application/json" -d '{
"text": ["how to apply for", "scheme for my paddi crop"],
"lang" : "eng"
}' http://localhost:8000/
2 changes: 2 additions & 0 deletions src/spell_check/spello/local/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .request import *
from .model import *
46 changes: 46 additions & 0 deletions src/spell_check/spello/local/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from model import Model
from request import ModelRequest
from quart import Quart, request
import aiohttp

app = Quart(__name__)

# Global model handle; populated once in startup() before serving begins.
model = None

# Per-language word-frequency dictionary files (downloaded by the Dockerfile).
freq_dict_paths = {
    'ory': 'freq_dict.txt',
    'eng': 'freq_dict_eng.txt'
}

# NOTE(review): defined but never passed to Model() below — presumably the
# pickled spello model locations; confirm against model.py whether this
# should be forwarded or removed.
spello_model_paths = {
    'ory': 'spello_model.pkl',
    'eng': 'spello_model_eng.pkl'
}


@app.before_serving
async def startup():
    # Create the shared HTTP session and load the model once, before serving.
    app.client = aiohttp.ClientSession()
    global model
    model = Model(app, freq_dict_paths)

@app.route('/', methods=['POST'])
async def infer():
    # POST / — run spell-correction on the posted JSON payload.
    global model
    data = await request.get_json()
    req = ModelRequest(**data)
    result = await model.inference(req)
    return result

@app.route('/', methods=['PUT'])
async def update():
    # print("PUT")
    # PUT / — fold the submitted text into the frequency data and retrain.
    global model
    data = await request.get_json()
    req = ModelRequest(**data)
    result = await model.update(req)
    return result


if __name__ == "__main__":
    app.run()
Loading

0 comments on commit 1f4d19d

Please sign in to comment.