Changes for words passing issue due to calculating wrong confidence and constructed text
Sunbird-ALL · May 15, 2024 · c81b31a · c81b31a
1 parent ab6a667
commit c81b31a
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 52 deletions.
diff --git a/.github/workflows/Prod.yml b/.github/workflows/Prod.yml
@@ -2,7 +2,7 @@ name: PROD DEPLOYMENT
 
 on:
   push:
-    branches: [ all-1.0-prod ]
+    branches: [ all-1.0-prod-hotfix ]
 
 jobs:
   build:
@@ -57,4 +57,4 @@ jobs:
               docker rmi ${{ secrets.CONTAINER_REGISTRY_PROD }}:${{ secrets.IMAGE_TAG }}
               docker pull ${{ secrets.CONTAINER_REGISTRY_PROD }}:${{ secrets.IMAGE_TAG }}
 
-              docker run -d --name ${{ secrets.CONTAINER_NAME }} --network ${{ secrets.NETWORK }} -p ${{ secrets.CONTAINER_PORT }} -t ${{ secrets.CONTAINER_REGISTRY_PROD }}:${{ secrets.IMAGE_TAG }} 
+              docker run -d --name ${{ secrets.CONTAINER_NAME }} --network ${{ secrets.NETWORK }} -p ${{ secrets.CONTAINER_PORT }} -t ${{ secrets.CONTAINER_REGISTRY_PROD }}:${{ secrets.IMAGE_TAG }}
diff --git a/app.py b/app.py
@@ -218,68 +218,60 @@ def split_into_phonemes(token):
     return ph_list
 
 def identify_missing_tokens(orig_text, resp_text):
+# Splitting text into words
     if resp_text == None:
         resp_text = ""
-    orig_word_list = orig_text.split()
-    resp_word_list = resp_text.split()
-    construct_word_list =[]
-    missing_word_list=[]
+    orig_word_list = orig_text.lower().split()
+    resp_word_list = resp_text.lower().split()
+
+    # Initialize lists and dictionaries
+    construct_word_list = []
+    missing_word_list = []
     orig_phoneme_list = []
     construct_phoneme_list = []
-    missing_phoneme_list =[]
-    construct_text=''
-    index=0
+    missing_phoneme_list = []
+    construct_text = []
+
+    # Precompute phonemes for response words for quick lookup
+    resp_phonemes = {word: p.convert(word) for word in resp_word_list}
+
     for word in orig_word_list:
-        #use similarity algo euclidean distance and add them, if there is no direct match
-        closest_match, similarity_score = find_closest_match(word, resp_text)
-        print(f"word:{word}: closest match: {closest_match}: sim score:{similarity_score}")
+        # Precompute original word phonemes
         p_word = p.convert(word)
-        print(f"word - {word}:: phonemes - {p_word}")#p_word = split_into_phonemes(p_word)
-        if closest_match != None and (similarity_score > 80 or len(orig_word_list) == 1):
-            #print("matched word")
+
+        # Find closest match based on precomputed phonemes to avoid redundant calculations
+        closest_match, similarity_score = find_closest_match(word, resp_text)
+
+        # Check similarity and categorize word
+        if (closest_match != None) and (similarity_score >= 80 and len(orig_word_list) > 1) or (len(orig_word_list) == 1 and similarity_score >= 60):
             construct_word_list.append(closest_match)
-            p_closest_match = p.convert(closest_match)
+            p_closest_match = resp_phonemes[closest_match]
             construct_phoneme_list.append(split_into_phonemes(p_closest_match))
-            construct_text += closest_match + ' '
+            construct_text.append(closest_match)
         else:
-            print(f"no match for - {word}: closest match: {closest_match}: sim score:{similarity_score}")
             missing_word_list.append(word)
-            missing_phoneme_list.append(split_into_phonemes(p_word))
-        index = index+1
+            p_word_phonemes = split_into_phonemes(p_word)
+            missing_phoneme_list.append(p_word_phonemes)
+
+        # Store original phonemes for each word
         orig_phoneme_list.append(split_into_phonemes(p_word))
 
-        # iterate through the sublist using List comprehension to flatten the nested list to single list
-        orig_flatList = [element for innerList in orig_phoneme_list for element in innerList]
-        missing_flatList = [element for innerList in missing_phoneme_list for element in innerList]
-        construct_flatList = [element for innerList in construct_phoneme_list for element in innerList]
-
-        # ensure duplicates are removed and only unique set are available
-        orig_flatList = list(set(orig_flatList))
-        missing_flatList = list(set(missing_flatList))
-        construct_flatList = list(set(construct_flatList))
-
-        #For words like pew and few, we are adding to construct word and
-        # we just need to eliminate the matching phonemes and
-        # add missing phonemes into missing list
-        for m in orig_flatList:
-            print(m, " in construct phonemelist")
-            if m not in construct_flatList:
-                missing_flatList.append(m)
-                print('adding to missing list', m)
-        missing_flatList = list(set(missing_flatList))
-
-        print(f"orig Text: {orig_text}")
-        print(f"Resp Text: {resp_text}")
-        print(f"construct Text: {construct_text}")
-
-        print(f"original phonemes: {orig_phoneme_list}")
-        #print(f"flat original phonemes: {orig_flatList}")
-        print(f"Construct phonemes: {construct_phoneme_list}")
-
-        #print(f"flat Construct phonemes: {construct_flatList}")
-        #print(f"missing phonemes: {missing_phoneme_list}")
-        print(f"flat missing phonemes: {missing_flatList}")
-    return construct_flatList, missing_flatList,construct_text
+    # Convert list of words to a single string
+    construct_text = ' '.join(construct_text)
+
+    # Efficiently deduplicate and flatten phoneme lists
+    orig_flatList = set(phoneme for sublist in orig_phoneme_list for phoneme in sublist)
+    #missing_flatList = set(phoneme for sublist in missing_phoneme_list for phoneme in sublist)
+    construct_flatList = set(phoneme for sublist in construct_phoneme_list for phoneme in sublist)
+
+    #For words like pew and few, we are adding to construct word and
+    # we just need to eliminate the matching phonemes and
+    # add missing phonemes into missing list
+    for m in orig_flatList:
+        if m not in construct_flatList:
+            missing_phoneme_list.append(m)
+    missing_flatList = set(phoneme for sublist in missing_phoneme_list for phoneme in sublist)
+    return list(construct_flatList), list(missing_flatList),construct_text
 
 def processLP(orig_text, resp_text):
     cons_list, miss_list,construct_text = identify_missing_tokens(orig_text, resp_text)