Skip to content

Commit

Permalink
Improve caching of knms results
Browse files Browse the repository at this point in the history
  • Loading branch information
alrichardbollans committed Apr 9, 2024
1 parent 5ad544f commit 85401e2
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 16 deletions.
3 changes: 3 additions & 0 deletions wcvp_name_matching/get_accepted_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ def _get_knms_matches_and_accepted_info_from_names_in_column(df: pd.DataFrame, m
"""
if len(df.index) > 0:
match_records = get_knms_name_matches(df[matching_name_col].unique())
if match_records is None:
none_data = pd.DataFrame(columns=[unique_submission_id_col])
return none_data
match_records = pd.merge(match_records, df, left_on='submitted', right_on=matching_name_col)
match_records['ipni_id'] = match_records['ipni_id'].apply(clean_urn_ids)
match_records = get_accepted_wcvp_info_from_ipni_ids_in_column(match_records, 'ipni_id', all_taxa)
Expand Down
44 changes: 28 additions & 16 deletions wcvp_name_matching/knms_name_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,12 @@ def only_roman_chars(unistr):
if uchr.isalpha())


def get_knms_name_matches(names: List[str]):
def get_knms_name_matches(names: List[str]) -> pd.DataFrame:
"""
Searches knms for matching names
:param names:
:return:
"""
# Save previous searches using a hash of names to avoid repeating searches
names = list(names)
unique_name_list = list(np.unique(names))

str_to_hash = str(unique_name_list).encode()
temp_csv = "knmns_matches_" + str(hashlib.md5(str_to_hash).hexdigest()) + ".csv"

try:
os.mkdir(temp_outputs_dir)
except FileExistsError as error:
Expand All @@ -55,11 +48,25 @@ def get_knms_name_matches(names: List[str]):
except FileExistsError as error:
pass

temp_output_knms_csv = os.path.join(knms_outputs_dir, temp_csv)
if os.path.isfile(temp_output_knms_csv):
# Pandas will read TRUE/true as bools and therefore as True rather than true
records = pd.read_csv(temp_output_knms_csv, dtype={'match_state': str}, index_col=0)
else:
unique_name_list = list(np.unique(names))

temp_file_tag = 'knms_matches_cache_'
existing_df = None
existing_info = []
for temp_cl_file in os.listdir(knms_outputs_dir):
if temp_cl_file.startswith(temp_file_tag):
# Pandas will read TRUE/true as bools and therefore as True rather than true
existing_info.append(pd.read_csv(os.path.join(knms_outputs_dir, temp_cl_file), dtype={'match_state': str}, index_col=0))
if len(existing_info) > 0:
existing_df = pd.concat(existing_info)

already_known_names = existing_df['submitted'].tolist()
# remove the item for all its occurrences
for alread_known in already_known_names:
c = unique_name_list.count(alread_known)
for i in range(c):
unique_name_list.remove(alread_known)
if len(unique_name_list) > 0:
knms_url = "http://namematch.science.kew.org/api/v2/powo/match"
res = requests.post(knms_url, json=unique_name_list)
headings = ['submitted', 'match_state', 'ipni_id', 'matched_name']
Expand Down Expand Up @@ -91,11 +98,16 @@ def get_knms_name_matches(names: List[str]):
records['match_state'].ffill(inplace=True)

if (records['match_state'] == 'false').all():
print('All KNMS records return false. Not saving these records')
print('All KNMS records return false. Not saving these records as this sometimes indicates server issues.')
else:

str_to_hash = str(unique_name_list).encode()
temp_output_knms_csv = os.path.join(knms_outputs_dir, temp_file_tag + str(hashlib.md5(str_to_hash).hexdigest()) + ".csv")
records.to_csv(temp_output_knms_csv)

if existing_df is not None:
records = pd.concat([records, existing_df])
else:
print(f'Already searched for these name in KNMS. Returning records from cache directory: {knms_outputs_dir}')
records = existing_df
return records


Expand Down

0 comments on commit 85401e2

Please sign in to comment.