From 204ee2668d6e800f1a554c60066057d638f424ad Mon Sep 17 00:00:00 2001 From: Gabriel Mechali Date: Mon, 28 Oct 2024 14:37:26 -0400 Subject: [PATCH] Improves Typo recognition for autocomplete (#4690) This PR modifies the scoring algorithm for place autocomplete to count a small score for non-exact matches, to account for one typo. With these changes, we will favor "San Diego" over "Dieppe" for the query "Sna Die". Prod: https://screenshot.googleplex.com/Bsx2BbyLZArbQuX Local with this change: https://screenshot.googleplex.com/9jHqKb2uHJLz37k Note that "Sne Die" will still go back to "Dieppe" because that's 2 typos, so San Diego is out even if it was returned by Google Maps predictions: https://screenshot.googleplex.com/9LViJoVFni3Lui6 The typo check compares the two words as bags of letters and accepts them when their letter counts differ by at most one in total. We do this check on top of the Google Maps Predictions which already take into account typo correction. This part is just to choose the best prediction from Google Maps. Doing this as part of gaps identified in place autocomplete: https://docs.google.com/document/d/15RVckX9ck5eyyhBHW8Nb9lmxPBDPMIeLbax14HbN-GI/edit?tab=t.0 --- .../routes/shared_api/autocomplete/helpers.py | 46 ++++++++++++++++++- server/tests/routes/api/autocomplete_test.py | 22 ++++++++- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/server/routes/shared_api/autocomplete/helpers.py b/server/routes/shared_api/autocomplete/helpers.py index b4299331ed..08e96617d7 100644 --- a/server/routes/shared_api/autocomplete/helpers.py +++ b/server/routes/shared_api/autocomplete/helpers.py @@ -73,6 +73,40 @@ def execute_maps_request(query: str, language: str) -> Dict: return json.loads(response.text) +def bag_of_letters(text: str) -> Dict: + """Creates a bag-of-letters representation of a given string. + Returns: + dict: A dictionary where keys are letters and values are their counts. 
+ """ + bag = {} + for char in text.lower(): + if char.isalpha(): + bag[char] = bag.get(char, 0) + 1 + return bag + + +# TODO(gmechali): Look into a better typo algo e.g Levenshtein distance. +def off_by_one_letter(str1_word: str, name_word: str) -> bool: + """Function to do off by one check. + Returns whether the two strings are off by at most one letter. + """ + offby = 0 + str1_bag = bag_of_letters(str1_word) + str2_bag = bag_of_letters(name_word) + for key, value in str1_bag.items(): + if key in str2_bag: + offby += abs(str2_bag[key] - value) + else: + offby += value + + # Add to offby for letters in str2 but not str1. + for key, value in str2_bag.items(): + if key not in str1_bag: + offby += value + + return offby <= 1 + + def get_match_score(match_string: str, name: str) -> float: """Computes a 'score' based on the matching words in two strings. Lowest score is best match. @@ -86,6 +120,7 @@ def get_match_score(match_string: str, name: str) -> float: start_index = 0 for str1_word in words_in_str1: str1_word = str1_word.lower() + found_match = False for idx, name_word in enumerate(words_in_name): if idx < start_index: continue @@ -94,13 +129,20 @@ def get_match_score(match_string: str, name: str) -> float: if str1_word == name_word: start_index = idx + 1 score -= 1 + found_match = True break elif str1_word in name_word: start_index = idx + 1 score -= 0.5 + found_match = True break - else: - score += 1 + elif off_by_one_letter(str1_word, name_word): + start_index = idx + 1 + found_match = True + score -= 0.25 + + if not found_match: + score += 1 return score diff --git a/server/tests/routes/api/autocomplete_test.py b/server/tests/routes/api/autocomplete_test.py index 72c9c48a31..ee19c7f9ca 100644 --- a/server/tests/routes/api/autocomplete_test.py +++ b/server/tests/routes/api/autocomplete_test.py @@ -15,6 +15,7 @@ import unittest from unittest.mock import patch +from server.routes.shared_api.autocomplete import helpers import server.tests.routes.api.mock_data 
as mock_data from web_app import app @@ -64,4 +65,23 @@ def mock_predict_effect(query, lang): self.assertEqual(response.status_code, 200) response_dict = json.loads(response.data.decode("utf-8")) - self.assertEqual(len(response_dict["predictions"]), 5) \ No newline at end of file + self.assertEqual(len(response_dict["predictions"]), 5) + + # Tests for helpers within autocomplete. + def test_bag_of_words_same(self): + """Tests that bag of words passes for same letters.""" + text = "San" + reordered_text = "Sna" + self.assertTrue(helpers.off_by_one_letter(text, reordered_text)) + + def test_bag_of_words_off_by_one(self): + """Tests that bag of words passes when off by one.""" + text = "Diego" + off_by_one_text = "Digo" + self.assertTrue(helpers.off_by_one_letter(text, off_by_one_text)) + + def test_bag_of_words_off_by_two(self): + """Tests that bag of words fails when off by more than one.""" + text = "Diego" + off_by_one_text = "Diaga" + self.assertFalse(helpers.off_by_one_letter(text, off_by_one_text))