Skip to content

Commit

Permalink
fix(clean): fix bug in validate_country
Browse files Browse the repository at this point in the history
  • Loading branch information
brandonlockhart committed Apr 15, 2021
1 parent 125c072 commit 28068d4
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ series
Name: country, dtype: bool
```

- **Currently supports functions for:** Column Headers | Country Names | Dates and Times | Email Addresses | Geographic Coordinates | IP Addresses | Phone Numbers | URLs | US Street Addresses
+ **Currently supports functions for:** Column Headers | Country Names | Dates and Times | Duplicate Strings | Email Addresses | Geographic Coordinates | IP Addresses | Phone Numbers | URLs | US Street Addresses

## Documentation

Expand Down
18 changes: 9 additions & 9 deletions dataprep/clean/clean_country.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from functools import lru_cache
from operator import itemgetter
from os import path
from re import error
from typing import Any, Union

import dask
Expand All @@ -20,8 +21,6 @@
DATA = pd.read_csv(COUNTRY_DATA_FILE, sep="\t", encoding="utf-8", dtype=str)

REGEXES = [re.compile(entry, re.IGNORECASE) for entry in DATA.regex]
# alternative regex search strategy given on line 243
# REGEXES = re.compile("|".join(f"(?P<a{i}>{x})" for i, x in enumerate(DATA.regex)), re.IGNORECASE)


def clean_country(
Expand Down Expand Up @@ -290,19 +289,20 @@ def _check_country(country: str, input_format: str, strict: bool, clean: bool) -

if strict and input_format == "regex":
for form in ("name", "official"):
ind = DATA[DATA[form].str.contains(f"^{country}$", flags=re.IGNORECASE, na=False)].index
if np.size(ind) > 0:
return (ind[0], "success") if clean else True
try:
ind = DATA[
DATA[form].str.contains(f"^{country}$", flags=re.IGNORECASE, na=False)
].index
if np.size(ind) > 0:
return (ind[0], "success") if clean else True
except error:
return (None, "unknown") if clean else False

elif not strict and input_format in ("regex", "name", "official"):
for index, country_regex in enumerate(REGEXES):
if country_regex.search(country):
return (index, "success") if clean else True

# alternative regex search strategy
# match = REGEXES.search(country)
# if match:
# return (int(match.lastgroup[1:]), "success") if clean else True
else:
ind = DATA[
DATA[input_format].str.contains(f"^{country}$", flags=re.IGNORECASE, na=False)
Expand Down

0 comments on commit 28068d4

Please sign in to comment.