From 204ee2668d6e800f1a554c60066057d638f424ad Mon Sep 17 00:00:00 2001
From: Gabriel Mechali <gabriel.mechali@gmail.com>
Date: Mon, 28 Oct 2024 14:37:26 -0400
Subject: [PATCH] Improves Typo recognition for autocomplete (#4690)

This PR modifies the scoring algorithm for place autocomplete to count a
small score for non-exact matches, to account for one typo.
With these changes, we will favor "San Diego" over "Dieppe" for the
query "Sna Die".
Prod: https://screenshot.googleplex.com/Bsx2BbyLZArbQuX
Local with this change:
https://screenshot.googleplex.com/9jHqKb2uHJLz37k

Note that "Sne Die" will still resolve to "Dieppe" because that's 2
typos, so "San Diego" is ruled out even if it was returned by Google Maps
predictions: https://screenshot.googleplex.com/9LViJoVFni3Lui6

The typo check compares the two words as bags of letters and allows them
to differ by at most one letter. We do this check on top of the Google Maps
Predictions, which already take typo correction into account. This part
is just to choose the best prediction from Google Maps.

Doing this as part of gaps identified in place autocomplete:
https://docs.google.com/document/d/15RVckX9ck5eyyhBHW8Nb9lmxPBDPMIeLbax14HbN-GI/edit?tab=t.0
---
 .../routes/shared_api/autocomplete/helpers.py | 46 ++++++++++++++++++-
 server/tests/routes/api/autocomplete_test.py  | 22 ++++++++-
 2 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/server/routes/shared_api/autocomplete/helpers.py b/server/routes/shared_api/autocomplete/helpers.py
index b4299331ed..08e96617d7 100644
--- a/server/routes/shared_api/autocomplete/helpers.py
+++ b/server/routes/shared_api/autocomplete/helpers.py
@@ -73,6 +73,40 @@ def execute_maps_request(query: str, language: str) -> Dict:
   return json.loads(response.text)
 
 
+def bag_of_letters(text: str) -> Dict:
+  """Counts how often each alphabetic character occurs in `text`.
+
+  Case is ignored and non-alphabetic characters are skipped.
+  Returns a dict mapping each lowercased letter to its count.
+  """
+  counts = {}
+  for letter in filter(str.isalpha, text.lower()):
+    counts[letter] = counts.get(letter, 0) + 1
+  return counts
+
+
+# TODO(gmechali): Look into a better typo algo, e.g. Levenshtein distance.
+def off_by_one_letter(str1_word: str, name_word: str) -> bool:
+  """Checks whether two words differ by at most one letter occurrence.
+
+  The words are compared as bags of letters (see bag_of_letters), so letter
+  order is ignored and swapping one letter for another counts as two.
+  Returns True when the summed per-letter count difference is 0 or 1.
+  """
+  query_counts = bag_of_letters(str1_word)
+  name_counts = bag_of_letters(name_word)
+
+  # A letter missing from one bag counts as zero occurrences there, so the
+  # union of keys also covers letters that appear in only one of the words.
+  distance = 0
+  for letter in set(query_counts) | set(name_counts):
+    in_query = query_counts.get(letter, 0)
+    in_name = name_counts.get(letter, 0)
+    distance += abs(in_query - in_name)
+
+  return distance <= 1
+
+
 def get_match_score(match_string: str, name: str) -> float:
   """Computes a 'score' based on the matching words in two strings. Lowest
   score is best match.
@@ -86,6 +120,7 @@ def get_match_score(match_string: str, name: str) -> float:
   start_index = 0
   for str1_word in words_in_str1:
     str1_word = str1_word.lower()
+    found_match = False
     for idx, name_word in enumerate(words_in_name):
       if idx < start_index:
         continue
@@ -94,13 +129,20 @@ def get_match_score(match_string: str, name: str) -> float:
       if str1_word == name_word:
         start_index = idx + 1
         score -= 1
+        found_match = True
         break
       elif str1_word in name_word:
         start_index = idx + 1
         score -= 0.5
+        found_match = True
         break
-      else:
-        score += 1
+      elif off_by_one_letter(str1_word, name_word):
+        start_index = idx + 1
+        found_match = True
+        score -= 0.25
+
+    if not found_match:
+      score += 1
 
   return score
 
diff --git a/server/tests/routes/api/autocomplete_test.py b/server/tests/routes/api/autocomplete_test.py
index 72c9c48a31..ee19c7f9ca 100644
--- a/server/tests/routes/api/autocomplete_test.py
+++ b/server/tests/routes/api/autocomplete_test.py
@@ -15,6 +15,7 @@
 import unittest
 from unittest.mock import patch
 
+from server.routes.shared_api.autocomplete import helpers
 import server.tests.routes.api.mock_data as mock_data
 from web_app import app
 
@@ -64,4 +65,23 @@ def mock_predict_effect(query, lang):
     self.assertEqual(response.status_code, 200)
 
     response_dict = json.loads(response.data.decode("utf-8"))
-    self.assertEqual(len(response_dict["predictions"]), 5)
\ No newline at end of file
+    self.assertEqual(len(response_dict["predictions"]), 5)
+
+  # Tests for helpers within autocomplete.
+  def test_bag_of_words_same(self):
+    """Tests that a pure reordering of letters is accepted."""
+    # "Sna" has exactly the letters of "San", so the bags are identical.
+    is_match = helpers.off_by_one_letter("San", "Sna")
+    self.assertTrue(is_match)
+
+  def test_bag_of_words_off_by_one(self):
+    """Tests that a single missing letter is still accepted."""
+    # "Digo" drops the "e" from "Diego": a letter difference of one.
+    is_match = helpers.off_by_one_letter("Diego", "Digo")
+    self.assertTrue(is_match)
+
+  def test_bag_of_words_off_by_two(self):
+    """Tests that a difference of more than one letter is rejected."""
+    # "Diaga" vs "Diego": "e" and "o" are missing and two "a"s are extra.
+    is_match = helpers.off_by_one_letter("Diego", "Diaga")
+    self.assertFalse(is_match)