
Commit

Replace in-place operations with copy methods to fix SettingWithCopyWarning.
alrichardbollans committed Apr 9, 2024
1 parent 85401e2 commit c8e6b4d
Showing 7 changed files with 28 additions and 25 deletions.
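
The fix is the same throughout: pandas raises SettingWithCopyWarning when you assign to a DataFrame that may be a view of another frame (the result of filtering, `dropna` or `drop_duplicates`), because the write may silently fail to propagate. Taking an explicit `.copy()` makes ownership of the data unambiguous. A minimal sketch of the before/after on toy data (not code from this repository):

```python
import pandas as pd

df = pd.DataFrame({'name': ['Poa annua', 'Poa annua L.', 'Ficus carica'],
                   'score': [1, 2, 3]})

# Before: the filtered frame may be a view of df, so assigning a new
# column triggers SettingWithCopyWarning and may not stick.
matches = df[df['score'] > 1]
# matches['matched_by'] = 'openrefine_unique'  # <- warns

# After: an explicit copy owns its data, so the assignment is safe.
matches = df[df['score'] > 1].copy()
matches['matched_by'] = 'openrefine_unique'
```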
4 changes: 2 additions & 2 deletions OpenRefineMatching/find_OpenRefine_matches.py
@@ -29,10 +29,10 @@ def _reconcile(row, full_name_col):

 def openrefine_match_full_names(df: pd.DataFrame, full_name_col: str,
                                 output_csv: str = None) -> pd.DataFrame:
-    out_df = df.copy(deep=True)
+    out_df = df.copy()
     out_df[reco_submitted_name_col_id] = df[full_name_col]
     out_df = out_df.drop_duplicates(subset=[reco_submitted_name_col_id])
 
     print(f'Trying to resolve {len(out_df)} names with OpenRefine')
     # Reconcile
     out_df['reco_results'] = out_df.apply(
         lambda row: _reconcile(row, full_name_col=full_name_col),
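
Worth noting for this first change: DataFrame.copy is deep by default, so `df.copy()` and `df.copy(deep=True)` are equivalent; the change above is cosmetic tidying rather than part of the warning fix. For example:

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
# copy() defaults to deep=True, so both produce independent frames.
assert df.copy().equals(df.copy(deep=True))
```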
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

 setup(
     name='automatchnames',
-    version='1.3',
+    version='1.3.1',
     packages=find_packages(),
     package_data={"wcvp_download": ["inputs/*", "inputs/wgsrpd-master/level3/*"]},
     install_requires=[
2 changes: 1 addition & 1 deletion wcvp_download/unit_tests/test_string_hygiene.py
@@ -67,7 +67,7 @@ def test_whitespace(self):

     def test_authors(self):
         test_columns = [wcvp_columns['paranthet_author'], wcvp_columns['primary_author'], wcvp_columns['authors']]
-        taxa_to_test = wcvp_data.dropna(subset=test_columns, how='any')
+        taxa_to_test = wcvp_data.dropna(subset=test_columns, how='any').copy()
         taxa_to_test['()'] = '(' + taxa_to_test[wcvp_columns['paranthet_author']] + ')'
 
         for c in test_columns:
2 changes: 1 addition & 1 deletion wcvp_download/unit_tests/test_taxa_outputs.py
@@ -206,7 +206,7 @@ def test_unusual_genera(self):
     def test_author_information(self):
         # accepted cases
         # acc name with author = acc name + authors
-        accepted = wcvp_data[wcvp_data[wcvp_columns['status']].isin(['Accepted', 'Artificial Hybrid'])]
+        accepted = wcvp_data[wcvp_data[wcvp_columns['status']].isin(['Accepted', 'Artificial Hybrid'])].copy()
         accepted['auth_check'] = accepted[wcvp_columns['name']].str.cat(
             [accepted[wcvp_columns['authors']].fillna('')],
             sep=' ').str.strip()
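
Context for the test above: it rebuilds the expected "name + authors" string with Series.str.cat, where `fillna('')` stops missing authors from turning the whole concatenation into NaN. A toy version with invented column names:

```python
import pandas as pd

df = pd.DataFrame({'taxon_name': ['Poa annua', 'Ficus carica'],
                   'authors': ['L.', None]})

# str.cat propagates NaN unless missing values are filled first;
# strip() removes the trailing space left by an empty author string.
df['auth_check'] = df['taxon_name'].str.cat([df['authors'].fillna('')],
                                            sep=' ').str.strip()
print(df['auth_check'].tolist())  # ['Poa annua L.', 'Ficus carica']
```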
22 changes: 11 additions & 11 deletions wcvp_name_matching/get_accepted_info.py
@@ -227,20 +227,20 @@ def _find_best_matches_from_multiple_knms_matches(multiple_match_records: pd.DataFrame
     # First, use matches where accepted name is the same as the submitted name
     accepted_names_matching_submitted_names = multiple_match_records[
         multiple_match_records['submitted'] == multiple_match_records[
-            wcvp_accepted_columns['name']]]
+            wcvp_accepted_columns['name']]].copy()
 
     unmatched_containment_df = multiple_match_records[
         ~multiple_match_records[unique_submission_id_col].isin(
-            accepted_names_matching_submitted_names[unique_submission_id_col].values)]
+            accepted_names_matching_submitted_names[unique_submission_id_col].values)].copy()
 
     # reduce list to remove essentially repeated matches
     unique_accepted_matches = unmatched_containment_df.drop_duplicates(
-        subset=[unique_submission_id_col, wcvp_accepted_columns['ipni_id']], keep='first')
+        subset=[unique_submission_id_col, wcvp_accepted_columns['ipni_id']], keep='first').copy()
 
     # Next use matches where the submitted name has a unique match
     submitted_names_with_single_accepted_match = unique_accepted_matches.drop_duplicates(
         subset=[unique_submission_id_col],
-        keep=False)
+        keep=False).copy()
 
     # Where neither of the above apply, sort by accepted rank
     unresolved_submissions = unique_accepted_matches[
@@ -249,21 +249,20 @@ def _find_best_matches_from_multiple_knms_matches(multiple_match_records: pd.DataFrame

     # matches where the accepted name is contained in the submitted name
     # In case of duplicates, these are sorted by specificity of rank
-    unresolved_submissions = unresolved_submissions.dropna(subset=[wcvp_accepted_columns['name']])
+    unresolved_submissions = unresolved_submissions.dropna(subset=[wcvp_accepted_columns['name']]).copy()
     accepted_names_in_submitted_names = unresolved_submissions[
         unresolved_submissions.apply(lambda x: x[wcvp_accepted_columns['name']] in x['submitted'],
-                                     axis=1)]
+                                     axis=1)].copy()
 
     for r in accepted_names_in_submitted_names[wcvp_accepted_columns['rank']].unique():
         if r not in rank_priority:
             raise ValueError(f'Rank priority list does not contain {r} and needs updating.')
     accepted_names_in_submitted_names[wcvp_accepted_columns['rank']] = pd.Categorical(
         accepted_names_in_submitted_names[wcvp_accepted_columns['rank']], rank_priority)
-    accepted_names_in_submitted_names.sort_values(wcvp_accepted_columns['rank'], inplace=True)
+    accepted_names_in_submitted_names = accepted_names_in_submitted_names.sort_values(wcvp_accepted_columns['rank']).copy()
 
     # Get the most precise match by dropping duplicate submissions
-    accepted_names_in_submitted_names.drop_duplicates(subset=[unique_submission_id_col], keep='first',
-                                                      inplace=True)
+    accepted_names_in_submitted_names = accepted_names_in_submitted_names.drop_duplicates(subset=[unique_submission_id_col], keep='first').copy()
     accepted_names_in_submitted_names[wcvp_accepted_columns['rank']] = accepted_names_in_submitted_names[
         wcvp_accepted_columns['rank']].astype(object)
@@ -273,7 +272,7 @@ def _find_best_matches_from_multiple_knms_matches(multiple_match_records: pd.DataFrame
     matches_to_use = pd.concat([accepted_names_matching_submitted_names,
                                 submitted_names_with_single_accepted_match,
                                 accepted_names_in_submitted_names])
-    matches_to_use.drop_duplicates(subset=[unique_submission_id_col], keep='first', inplace=True)
+    matches_to_use = matches_to_use.drop_duplicates(subset=[unique_submission_id_col], keep='first').copy()
 
     unmatched_df = multiple_match_records[
         ~multiple_match_records[unique_submission_id_col].isin(
@@ -453,7 +452,8 @@ def get_accepted_info_from_names_in_column(in_df: pd.DataFrame, name_col: str,
                                                            unique_submission_index_col,
                                                            all_taxa,
                                                            family_column=family_column)
-
+    # This will raise a pandas warning
+    # https://github.com/pandas-dev/pandas/issues/55928
     final_resolved_df = pd.concat(
         [unmatched_resolutions, fuzzy_resolved_df], axis=0)
 
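
On the comment added above: recent pandas (2.1+) emits a FutureWarning when pd.concat receives empty or all-NA frames, because their influence on the result dtype is being removed; the linked issue tracks that deprecation. If the warning ever needs silencing, one common workaround (a sketch under that assumption, not this library's code) is to drop empty frames before concatenating:

```python
import pandas as pd

a = pd.DataFrame({'x': [1.0, 2.0]})
b = pd.DataFrame({'x': []})  # empty: concatenating it can trigger the warning

# Skip empty frames so dtype inference never has to consider them.
frames = [f for f in (a, b) if not f.empty]
combined = pd.concat(frames, axis=0) if frames else a.iloc[0:0]
print(combined)
```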
8 changes: 4 additions & 4 deletions wcvp_name_matching/resolve_openrefine_matches.py
@@ -29,7 +29,7 @@ def resolve_openrefine_to_best_matches(reco_df: pd.DataFrame, all_taxa: pd.DataFrame
             out_df[wcvp_accepted_columns['family']].isin(families_of_interest))]
 
     # Unique matches
-    unique_matches = out_df[~out_df[reco_submitted_name_col_id].duplicated(keep=False)]
+    unique_matches = out_df.drop_duplicates(subset=[reco_submitted_name_col_id], keep=False).copy()
     unique_matches['matched_by'] = 'openrefine_unique'
 
     non_unique_reco_matches = out_df[out_df[reco_submitted_name_col_id].duplicated(keep=False)]
@@ -39,16 +39,16 @@ def resolve_openrefine_to_best_matches(reco_df: pd.DataFrame, all_taxa: pd.DataFrame
         subset=[reco_submitted_name_col_id, wcvp_accepted_columns['ipni_id']], keep='first')
     submitted_names_with_single_accepted_match = unique_acc_names.drop_duplicates(
         subset=[reco_submitted_name_col_id],
-        keep=False)
+        keep=False).copy()
     submitted_names_with_single_accepted_match['matched_by'] = 'openrefine_unique_accepted_name'
 
     non_unique_matches = non_unique_reco_matches[
         ~non_unique_reco_matches[reco_submitted_name_col_id].isin(
-            submitted_names_with_single_accepted_match[reco_submitted_name_col_id].values)]
+            submitted_names_with_single_accepted_match[reco_submitted_name_col_id].values)].copy()
 
     # Best scoring matches
     non_unique_matches['reco_score_max'] = non_unique_matches.groupby([reco_submitted_name_col_id])[
-        'reco_score'].transform(max)
+        'reco_score'].transform('max')
 
     top_scorers = non_unique_matches[non_unique_matches['reco_score'] == non_unique_matches['reco_score_max']]
     top_unique_scorers = top_scorers[~top_scorers[reco_submitted_name_col_id].duplicated(keep=False)]
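
Two idioms in this file deserve a note: `drop_duplicates(subset=..., keep=False)` keeps only rows whose key occurs exactly once (equivalent to filtering on `~duplicated(keep=False)`), and `transform('max')` uses the string alias because passing the builtin `max` to a groupby now draws a FutureWarning in recent pandas. A small sketch of the best-score selection, with toy column names:

```python
import pandas as pd

df = pd.DataFrame({'submitted': ['a', 'a', 'b'],
                   'reco_score': [0.4, 0.9, 0.7]})

# Per-row maximum score within each submitted-name group; the string
# alias 'max' avoids the deprecation warning the builtin now triggers.
df['reco_score_max'] = df.groupby('submitted')['reco_score'].transform('max')

# Keep only the best-scoring match for each submission.
top_scorers = df[df['reco_score'] == df['reco_score_max']]
print(top_scorers)
```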
13 changes: 8 additions & 5 deletions wcvp_name_matching/wcvp_matching.py
@@ -53,7 +53,7 @@ def tidy_value_for_matching(given_value: str) -> str:
 def match_name_to_concatenated_columns(df: pd.DataFrame, matching_name_col: str, all_taxa: pd.DataFrame,
                                        columns: List[str]):
     # Remove taxa without author information as these are matched later without authors
-    taxa_to_use = all_taxa.dropna(subset=columns)
+    taxa_to_use = all_taxa.dropna(subset=columns).copy()
     column_series = [all_taxa[c].fillna('') for c in columns]
     taxa_to_use['taxon_name_with_extra_columns'] = taxa_to_use[wcvp_columns['name']].str.cat(column_series,
                                                                                              sep=' ')
@@ -66,7 +66,7 @@ def match_name_to_concatenated_columns(df: pd.DataFrame, matching_name_col: str,
         right_on='taxon_name_with_extra_columns')
     author_merged = author_merged.dropna(subset=[wcvp_columns['wcvp_id']])
 
-    unmatched_with_authors_df = df[~df[lowercase_name_col].isin(author_merged[lowercase_name_col].values)]
+    unmatched_with_authors_df = df[~df[lowercase_name_col].isin(author_merged[lowercase_name_col].values)].copy()
 
     # Repeat but with 'tidied' authors
     unmatched_with_authors_df[tidied_taxon_authors_col] = unmatched_with_authors_df[matching_name_col].apply(
@@ -97,7 +97,10 @@ def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str,
     Appends accepted info columns to df from list of taxa, based on names in matching_name_col
     :param df:
     :param matching_name_col:
+    :param unique_submission_id_col:
     :param all_taxa:
+    :param family_column:
+    :param wcvp_version:
     :return:
     """
     if all_taxa is None:
@@ -140,7 +143,7 @@ def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str,

     merged_with_wcvp = pd.concat([author_merged, paranthet_author_merged, primary_author_merged, just_name_merged])
     match_df = get_family_specific_resolutions(merged_with_wcvp, family_column=family_column)
-    match_df = match_df[[unique_submission_id_col] + output_record_col_names + ['matched_by', 'matched_name']]
+    match_df = match_df[[unique_submission_id_col] + output_record_col_names + ['matched_by', 'matched_name']].copy()
 
     # Remove duplicates in match_df based on priority

@@ -149,11 +152,11 @@ def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str,
             raise ValueError(f'Status priority list does not contain {r} and needs updating.')
     match_df['taxon_status'] = pd.Categorical(match_df['taxon_status'],
                                               status_priority)
-    match_df.sort_values('taxon_status', inplace=True)
+    match_df = match_df.sort_values('taxon_status')
     # Appropriately label unique matches
     match_df['matched_by'] = np.where(match_df.duplicated(keep=False, subset=[unique_submission_id_col]),
                                       match_df['matched_by'],
                                       match_df['matched_by'] + '_unique')
-    match_df.drop_duplicates(subset=[unique_submission_id_col], inplace=True, keep='first')
+    match_df = match_df.drop_duplicates(subset=[unique_submission_id_col], keep='first')
     match_df['taxon_status'] = match_df['taxon_status'].astype(object)
     return match_df
