Adds Continents to AutoComplete (#4693)

This PR implements a hack to add autocomplete for continents. We always fake that Continents came back in the response from the Google Maps Predictions API response, and hardcode the Place ID to DCID. The Place IDs were fetched from here: https://developers.google.com/maps/documentation/places/web-service/place-id For one word continent, we just add a matching for the single word query. For two word continents (S|N America), we add matching for the single word query, and two word query if applicable. We then compute the match score, and remove all continents with positive scores (negative score means a better match). Note that we prepend the continents in order to give the continents an edge over other places that have the same score. I also added a boost to scoring if the query matches the response from the start in order to give "America" an edge over "North America" when typing "Ameri".
datacommonsorg · Oct 31, 2024 · 1ebe115 · 1ebe115
1 parent 6e141a4
commit 1ebe115
Show file tree

Hide file tree

Showing 2 changed files with 108 additions and 13 deletions.
diff --git a/server/routes/shared_api/autocomplete/autocomplete.py b/server/routes/shared_api/autocomplete/autocomplete.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
 import logging
 
 from flask import Blueprint
@@ -22,7 +21,6 @@
 from server.routes.shared_api.autocomplete import helpers
 from server.routes.shared_api.autocomplete.types import AutoCompleteApiResponse
 from server.routes.shared_api.autocomplete.types import AutoCompleteResult
-from server.routes.shared_api.place import findplacedcid
 
 # TODO(gmechali): Add Stat Var search.
 
@@ -45,15 +43,7 @@ def autocomplete():
   # Send requests to the Google Maps Predictions API.
   prediction_responses = helpers.predict(queries, lang)
 
-  place_ids = []
-  for prediction in prediction_responses:
-    place_ids.append(prediction.place_id)
-
-  place_id_to_dcid = []
-  if place_ids:
-    place_id_to_dcid = json.loads(findplacedcid(place_ids).data)
-  logging.info("[Place_Autocomplete] Found %d place ID to DCID mappings.",
-               len(place_id_to_dcid))
+  place_id_to_dcid = helpers.fetch_place_id_to_dcid(prediction_responses)
 
   final_predictions = []
   for prediction in prediction_responses:
@@ -64,6 +54,10 @@ def autocomplete():
           matched_query=prediction.matched_query,
           dcid=place_id_to_dcid[prediction.place_id])
       final_predictions.append(current_prediction)
+
+      if len(final_predictions) == helpers.DISPLAYED_RESPONSE_COUNT_LIMIT:
+        break
+
   logging.info(
       "[Place_Autocomplete] Returning a total of %d place predictions.",
       len(final_predictions))

diff --git a/server/routes/shared_api/autocomplete/helpers.py b/server/routes/shared_api/autocomplete/helpers.py
@@ -22,11 +22,41 @@
 import requests
 
 from server.routes.shared_api.autocomplete.types import ScoredPrediction
+from server.routes.shared_api.place import findplacedcid
 
 MAPS_API_URL = "https://maps.googleapis.com/maps/api/place/autocomplete/json?"
 MIN_CHARACTERS_PER_QUERY = 3
 MAX_NUM_OF_QUERIES = 4
 DISPLAYED_RESPONSE_COUNT_LIMIT = 5
+TWO_WORD_CONTINENTS = [{
+    'description': 'North America',
+    'place_id': 'ChIJnXKOaXELs1IRgqNhl4MoExM'
+}, {
+    'description': 'South America',
+    'place_id': 'ChIJtTRdNRw0CZQRK-PGyc8M1Gk'
+}]
+CONTINENTS = [{
+    'description': 'Europe',
+    'place_id': 'ChIJhdqtz4aI7UYRefD8s-aZ73I'
+}, {
+    'description': 'Oceania',
+    'place_id': 'ChIJQbA4_Cu8QW4RbuvrxISzaks'
+}, {
+    'description': 'Africa',
+    'place_id': 'ChIJ1fWMlApsoBARs_CQnslwghA'
+}, {
+    'description': 'Asia',
+    'place_id': 'ChIJV-jLJIrxYzYRWfSg0_xrQak'
+}] + TWO_WORD_CONTINENTS
+CONTINENT_PLACE_ID_TO_DCID = {
+    'ChIJhdqtz4aI7UYRefD8s-aZ73I': 'europe',
+    'ChIJtTRdNRw0CZQRK-PGyc8M1Gk': 'southamerica',
+    'ChIJnXKOaXELs1IRgqNhl4MoExM': 'northamerica',
+    'ChIJV-jLJIrxYzYRWfSg0_xrQak': 'asia',
+    'ChIJS3WQM3uWuaQRdSAPdB--Um4': 'antarctica',
+    'ChIJQbA4_Cu8QW4RbuvrxISzaks': 'oceania',
+    'ChIJ1fWMlApsoBARs_CQnslwghA': 'africa'
+}
 
 
 def find_queries(user_query: str) -> List[str]:
@@ -112,20 +142,25 @@ def get_match_score(match_string: str, name: str) -> float:
   score is best match.
   Returns:
     Float score."""
+
   rgx = re.compile(r'[\s|,]+')
   words_in_name = re.split(rgx, name)
   words_in_str1 = re.split(rgx, match_string)
 
   score = 0
   start_index = 0
-  for str1_word in words_in_str1:
+  for str1_idx, str1_word in enumerate(words_in_str1):
     str1_word = str1_word.lower()
     found_match = False
     for idx, name_word in enumerate(words_in_name):
       if idx < start_index:
         continue
 
       name_word = name_word.lower()
+      if idx == 0 and str1_idx == 0 and name_word.startswith(str1_word):
+        # boost score for start of query.
+        score -= 0.25
+
       if str1_word == name_word:
         start_index = idx + 1
         score -= 1
@@ -147,6 +182,45 @@ def get_match_score(match_string: str, name: str) -> float:
   return score
 
 
+def score_below_zero(pred: ScoredPrediction) -> bool:
+  """Returns whether the score is below 0."""
+  return pred.score < 0
+
+
+def prepend_continent_hack(responses: List[ScoredPrediction],
+                           queries: List[str]) -> List[ScoredPrediction]:
+  """Prepend continents as responses in order to hack it in autocomplete.
+  Returns:
+    List of scored predictions."""
+
+  continent_responses = []
+  single_word_query = queries[-1]
+  for continent in CONTINENTS:
+    scored_prediction = ScoredPrediction(description=continent['description'],
+                                         place_id=continent['place_id'],
+                                         matched_query=single_word_query,
+                                         score=get_match_score(
+                                             single_word_query,
+                                             continent['description']))
+    continent_responses.append(scored_prediction)
+
+  if len(queries) > 1:
+    two_word_query = queries[-2]
+    # If we have a 2 two word query, also place the two word continents as responses.
+    for continent in TWO_WORD_CONTINENTS:
+      scored_prediction = ScoredPrediction(description=continent['description'],
+                                           place_id=continent['place_id'],
+                                           matched_query=two_word_query,
+                                           score=get_match_score(
+                                               two_word_query,
+                                               continent['description']))
+      continent_responses.append(scored_prediction)
+
+  # Only keep continents with a score below 0 as it implies it's close to the query.
+  continent_responses = list(filter(score_below_zero, continent_responses))
+  return continent_responses + responses
+
+
 def predict(queries: List[str], lang: str) -> List[ScoredPrediction]:
   """Trigger maps prediction api requests and parse the output. Remove duplication responses and limit the number of results.
   Returns:
@@ -164,14 +238,18 @@ def predict(queries: List[str], lang: str) -> List[ScoredPrediction]:
                                                query, pred['description']))
       all_responses.append(scored_prediction)
 
+  # Continent hack - Continents not supported by Google Maps Predictions API.
+  # This hack will always evaluate continents for each response. They will get filtered in/out based on the match_score we compute.
+  all_responses = prepend_continent_hack(all_responses, queries)
+
   all_responses.sort(key=get_score)
   logging.info("[Place_Autocomplete] Received %d total place predictions.",
                len(all_responses))
 
   responses = []
   place_ids = set()
   index = 0
-  while len(responses) < DISPLAYED_RESPONSE_COUNT_LIMIT and index < len(
+  while len(responses) < 2 * DISPLAYED_RESPONSE_COUNT_LIMIT and index < len(
       all_responses):
     if all_responses[index].place_id not in place_ids:
       responses.append(all_responses[index])
@@ -181,6 +259,29 @@ def predict(queries: List[str], lang: str) -> List[ScoredPrediction]:
   return responses
 
 
+def fetch_place_id_to_dcid(
+    prediction_responses: List[ScoredPrediction]) -> Dict[str, str]:
+  """Fetches the associated DCID for each place ID returned by Google.
+  Returns:
+    Mapping of Place ID to DCID."""
+
+  place_ids = []
+  for prediction in prediction_responses:
+    place_ids.append(prediction.place_id)
+
+  place_id_to_dcid = dict()
+  if place_ids:
+    place_id_to_dcid = json.loads(findplacedcid(place_ids).data)
+
+  # Add hardcoded continent Place IDs to DCIDs.
+  place_id_to_dcid.update(CONTINENT_PLACE_ID_TO_DCID)
+
+  logging.info("[Place_Autocomplete] Found %d place ID to DCID mappings.",
+               len(place_id_to_dcid))
+
+  return place_id_to_dcid
+
+
 def get_score(p: ScoredPrediction) -> float:
   """Returns the score."""
   return p.score