From 2fa8f2a74e507cecffe227caa9723e5184595c99 Mon Sep 17 00:00:00 2001 From: Gabriel Mechali <gabriel.mechali@gmail.com> Date: Wed, 6 Nov 2024 11:43:07 -0500 Subject: [PATCH 1/3] Replaces accents, etc., with ASCII equivalents so that these match better in the scoring algorithm. --- server/routes/shared_api/autocomplete/helpers.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/server/routes/shared_api/autocomplete/helpers.py b/server/routes/shared_api/autocomplete/helpers.py index 53a667f0c9..cface98fb2 100644 --- a/server/routes/shared_api/autocomplete/helpers.py +++ b/server/routes/shared_api/autocomplete/helpers.py @@ -16,6 +16,7 @@ import logging import re from typing import Dict, List +import unicodedata from urllib.parse import urlencode from flask import current_app @@ -135,11 +136,21 @@ def off_by_one_letter(str1_word: str, name_word: str) -> bool: return offby <= 1 +def sanitize_and_replace_non_ascii(str: str) -> str: + """Sanitize and replace non ascii. + Returns: + String sanitized and without accents, cedillas, or enye.""" + nfkd_form = unicodedata.normalize('NFKD', str) + return u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) + + def get_match_score(match_string: str, name: str) -> float: """Computes a 'score' based on the matching words in two strings. Lowest score is best match. Returns: Float score.""" + name = sanitize_and_replace_non_ascii(name) + match_string = sanitize_and_replace_non_ascii(match_string) rgx = re.compile(r'[\s|,]+') words_in_name = re.split(rgx, name) From 2ec66d6d420cde0c8ffae29f89f298ca1cc5f160 Mon Sep 17 00:00:00 2001 From: Gabriel Mechali <gabriel.mechali@gmail.com> Date: Wed, 6 Nov 2024 12:04:17 -0500 Subject: [PATCH 2/3] Rename param to make code analysis happy. 
--- server/routes/shared_api/autocomplete/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/routes/shared_api/autocomplete/helpers.py b/server/routes/shared_api/autocomplete/helpers.py index cface98fb2..0f1706f30c 100644 --- a/server/routes/shared_api/autocomplete/helpers.py +++ b/server/routes/shared_api/autocomplete/helpers.py @@ -136,11 +136,11 @@ def off_by_one_letter(str1_word: str, name_word: str) -> bool: return offby <= 1 -def sanitize_and_replace_non_ascii(str: str) -> str: +def sanitize_and_replace_non_ascii(input: str) -> str: """Sanitize and replace non ascii. Returns: String sanitized and without accents, cedillas, or enye.""" - nfkd_form = unicodedata.normalize('NFKD', str) + nfkd_form = unicodedata.normalize('NFKD', input) return u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) From 2f407a1ec42e1c799bbddfa8bd94763e6e01c9eb Mon Sep 17 00:00:00 2001 From: Gabriel Mechali <gabriel.mechali@gmail.com> Date: Wed, 6 Nov 2024 12:09:47 -0500 Subject: [PATCH 3/3] Rename param to make code analysis happy; `input` was still bad. --- server/routes/shared_api/autocomplete/helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/routes/shared_api/autocomplete/helpers.py b/server/routes/shared_api/autocomplete/helpers.py index 0f1706f30c..4a87ff5372 100644 --- a/server/routes/shared_api/autocomplete/helpers.py +++ b/server/routes/shared_api/autocomplete/helpers.py @@ -136,12 +136,12 @@ def off_by_one_letter(str1_word: str, name_word: str) -> bool: return offby <= 1 -def sanitize_and_replace_non_ascii(input: str) -> str: +def sanitize_and_replace_non_ascii(string: str) -> str: """Sanitize and replace non ascii. 
Returns: String sanitized and without accents, cedillas, or enye.""" - nfkd_form = unicodedata.normalize('NFKD', input) - return u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) + nfkd_form = unicodedata.normalize('NFKD', string) + return "".join([c for c in nfkd_form if not unicodedata.combining(c)]) def get_match_score(match_string: str, name: str) -> float: