Replaces accents and more into ascii equivalents for matching algorit…

…hm. (#4716) Places like Curaçao, España, or Algérie were being unfairly demoted in scoring due to their non-ASCII characters. This PR replaces all special characters before determining the match score. Screencast with the fix: https://screencast.googleplex.com/cast/NDkzOTc5NTE5MjA4NjUyOHw1N2FlYTVmMC00Zg
datacommonsorg · Nov 7, 2024 · 82d3c9c · 82d3c9c
1 parent 81c43b6
commit 82d3c9c
Showing 1 changed file with 11 additions and 0 deletions.
diff --git a/server/routes/shared_api/autocomplete/helpers.py b/server/routes/shared_api/autocomplete/helpers.py
@@ -16,6 +16,7 @@
 import logging
 import re
 from typing import Dict, List
+import unicodedata
 from urllib.parse import urlencode
 
 from flask import current_app
@@ -135,11 +136,21 @@ def off_by_one_letter(str1_word: str, name_word: str) -> bool:
   return offby <= 1
 
 
+def sanitize_and_replace_non_ascii(string: str) -> str:
+  """Strip diacritics: NFKD-decompose the string, then drop combining marks.
+  Returns:
+    The string without accents, cedillas, or tildes (e.g. 'ç' becomes 'c')."""
+  nfkd_form = unicodedata.normalize('NFKD', string)
+  return "".join([c for c in nfkd_form if not unicodedata.combining(c)])
+
+
 def get_match_score(match_string: str, name: str) -> float:
   """Computes a 'score' based on the matching words in two strings. Lowest
   score is best match.
   Returns:
     Float score."""
+  name = sanitize_and_replace_non_ascii(name)
+  match_string = sanitize_and_replace_non_ascii(match_string)
 
   rgx = re.compile(r'[\s|,]+')
   words_in_name = re.split(rgx, name)