
Commit

Replace in-place operations with copy methods to fix SettingWithCopyWarning.
alrichardbollans committed Apr 9, 2024
1 parent 85401e2 commit c8e6b4d
Showing 7 changed files with 28 additions and 25 deletions.
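
The fix is the same throughout: pandas raises SettingWithCopyWarning when you assign to a DataFrame that may be a view of another frame (the result of filtering, `dropna` or `drop_duplicates`), because the write may silently fail to propagate. Taking an explicit `.copy()` makes ownership of the data unambiguous. A minimal sketch of the before/after on toy data (not code from this repository):

```python
import pandas as pd

df = pd.DataFrame({'name': ['Poa annua', 'Poa annua L.', 'Ficus carica'],
                   'score': [1, 2, 3]})

# Before: the filtered frame may be a view of df, so assigning a new
# column triggers SettingWithCopyWarning and may not stick.
matches = df[df['score'] > 1]
# matches['matched_by'] = 'openrefine_unique'  # <- warns

# After: an explicit copy owns its data, so the assignment is safe.
matches = df[df['score'] > 1].copy()
matches['matched_by'] = 'openrefine_unique'
```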
4 changes: 2 additions & 2 deletions OpenRefineMatching/find_OpenRefine_matches.py
@@ -29,10 +29,10 @@ def _reconcile(row, full_name_col):

 def openrefine_match_full_names(df: pd.DataFrame, full_name_col: str,
                                 output_csv: str = None) -> pd.DataFrame:
-    out_df = df.copy(deep=True)
+    out_df = df.copy()
     out_df[reco_submitted_name_col_id] = df[full_name_col]
     out_df = out_df.drop_duplicates(subset=[reco_submitted_name_col_id])
 
     print(f'Trying to resolve {len(out_df)} names with OpenRefine')
     # Reconcile
     out_df['reco_results'] = out_df.apply(
         lambda row: _reconcile(row, full_name_col=full_name_col),
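
Worth noting for this first change: DataFrame.copy is deep by default, so `df.copy()` and `df.copy(deep=True)` are equivalent; the change above is cosmetic tidying rather than part of the warning fix. For example:

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
# copy() defaults to deep=True, so both produce independent frames.
assert df.copy().equals(df.copy(deep=True))
```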
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

 setup(
     name='automatchnames',
-    version='1.3',
+    version='1.3.1',
     packages=find_packages(),
     package_data={"wcvp_download": ["inputs/*", "inputs/wgsrpd-master/level3/*"]},
     install_requires=[
2 changes: 1 addition & 1 deletion wcvp_download/unit_tests/test_string_hygiene.py
@@ -67,7 +67,7 @@ def test_whitespace(self):

     def test_authors(self):
         test_columns = [wcvp_columns['paranthet_author'], wcvp_columns['primary_author'], wcvp_columns['authors']]
-        taxa_to_test = wcvp_data.dropna(subset=test_columns, how='any')
+        taxa_to_test = wcvp_data.dropna(subset=test_columns, how='any').copy()
         taxa_to_test['()'] = '(' + taxa_to_test[wcvp_columns['paranthet_author']] + ')'
 
         for c in test_columns:
2 changes: 1 addition & 1 deletion wcvp_download/unit_tests/test_taxa_outputs.py
@@ -206,7 +206,7 @@ def test_unusual_genera(self):
     def test_author_information(self):
         # accepted cases
         # acc name with author = acc name + authors
-        accepted = wcvp_data[wcvp_data[wcvp_columns['status']].isin(['Accepted', 'Artificial Hybrid'])]
+        accepted = wcvp_data[wcvp_data[wcvp_columns['status']].isin(['Accepted', 'Artificial Hybrid'])].copy()
         accepted['auth_check'] = accepted[wcvp_columns['name']].str.cat(
             [accepted[wcvp_columns['authors']].fillna('')],
             sep=' ').str.strip()
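
Context for the test above: it rebuilds the expected "name + authors" string with Series.str.cat, where `fillna('')` stops missing authors from turning the whole concatenation into NaN. A toy version with invented column names:

```python
import pandas as pd

df = pd.DataFrame({'taxon_name': ['Poa annua', 'Ficus carica'],
                   'authors': ['L.', None]})

# str.cat propagates NaN unless missing values are filled first;
# strip() removes the trailing space left by an empty author string.
df['auth_check'] = df['taxon_name'].str.cat([df['authors'].fillna('')],
                                            sep=' ').str.strip()
print(df['auth_check'].tolist())  # ['Poa annua L.', 'Ficus carica']
```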
22 changes: 11 additions & 11 deletions wcvp_name_matching/get_accepted_info.py
@@ -227,20 +227,20 @@ def _find_best_matches_from_multiple_knms_matches(multiple_match_records: pd.DataFrame
     # First, use matches where accepted name is the same as the submitted name
     accepted_names_matching_submitted_names = multiple_match_records[
         multiple_match_records['submitted'] == multiple_match_records[
-            wcvp_accepted_columns['name']]]
+            wcvp_accepted_columns['name']]].copy()
 
     unmatched_containment_df = multiple_match_records[
         ~multiple_match_records[unique_submission_id_col].isin(
-            accepted_names_matching_submitted_names[unique_submission_id_col].values)]
+            accepted_names_matching_submitted_names[unique_submission_id_col].values)].copy()
 
     # reduce list to remove essentially repeated matches
     unique_accepted_matches = unmatched_containment_df.drop_duplicates(
-        subset=[unique_submission_id_col, wcvp_accepted_columns['ipni_id']], keep='first')
+        subset=[unique_submission_id_col, wcvp_accepted_columns['ipni_id']], keep='first').copy()
 
     # Next use matches where the submitted name has a unique match
     submitted_names_with_single_accepted_match = unique_accepted_matches.drop_duplicates(
         subset=[unique_submission_id_col],
-        keep=False)
+        keep=False).copy()
 
     # Where neither of the above apply, sort by accepted rank
     unresolved_submissions = unique_accepted_matches[
@@ -249,21 +249,20 @@ def _find_best_matches_from_multiple_knms_matches(multiple_match_records: pd.DataFrame

     # matches where the accepted name is contained in the submitted name
     # In case of duplicates, these are sorted by specificity of rank
-    unresolved_submissions = unresolved_submissions.dropna(subset=[wcvp_accepted_columns['name']])
+    unresolved_submissions = unresolved_submissions.dropna(subset=[wcvp_accepted_columns['name']]).copy()
     accepted_names_in_submitted_names = unresolved_submissions[
         unresolved_submissions.apply(lambda x: x[wcvp_accepted_columns['name']] in x['submitted'],
-                                     axis=1)]
+                                     axis=1)].copy()
 
     for r in accepted_names_in_submitted_names[wcvp_accepted_columns['rank']].unique():
         if r not in rank_priority:
             raise ValueError(f'Rank priority list does not contain {r} and needs updating.')
     accepted_names_in_submitted_names[wcvp_accepted_columns['rank']] = pd.Categorical(
         accepted_names_in_submitted_names[wcvp_accepted_columns['rank']], rank_priority)
-    accepted_names_in_submitted_names.sort_values(wcvp_accepted_columns['rank'], inplace=True)
+    accepted_names_in_submitted_names = accepted_names_in_submitted_names.sort_values(wcvp_accepted_columns['rank']).copy()
 
     # Get the most precise match by dropping duplicate submissions
-    accepted_names_in_submitted_names.drop_duplicates(subset=[unique_submission_id_col], keep='first',
-                                                      inplace=True)
+    accepted_names_in_submitted_names = accepted_names_in_submitted_names.drop_duplicates(subset=[unique_submission_id_col], keep='first').copy()
     accepted_names_in_submitted_names[wcvp_accepted_columns['rank']] = accepted_names_in_submitted_names[
         wcvp_accepted_columns['rank']].astype(object)
@@ -273,7 +272,7 @@ def _find_best_matches_from_multiple_knms_matches(multiple_match_records: pd.DataFrame
     matches_to_use = pd.concat([accepted_names_matching_submitted_names,
                                 submitted_names_with_single_accepted_match,
                                 accepted_names_in_submitted_names])
-    matches_to_use.drop_duplicates(subset=[unique_submission_id_col], keep='first', inplace=True)
+    matches_to_use = matches_to_use.drop_duplicates(subset=[unique_submission_id_col], keep='first').copy()
 
     unmatched_df = multiple_match_records[
         ~multiple_match_records[unique_submission_id_col].isin(
@@ -453,7 +452,8 @@ def get_accepted_info_from_names_in_column(in_df: pd.DataFrame, name_col: str,
                                                            unique_submission_index_col,
                                                            all_taxa,
                                                            family_column=family_column)
-
+    # This will raise a pandas warning
+    # https://github.com/pandas-dev/pandas/issues/55928
     final_resolved_df = pd.concat(
         [unmatched_resolutions, fuzzy_resolved_df], axis=0)
 
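
On the comment added above: recent pandas (2.1+) emits a FutureWarning when pd.concat receives empty or all-NA frames, because their influence on the result dtype is being removed; the linked issue tracks that deprecation. If the warning ever needs silencing, one common workaround (a sketch under that assumption, not this library's code) is to drop empty frames before concatenating:

```python
import pandas as pd

a = pd.DataFrame({'x': [1.0, 2.0]})
b = pd.DataFrame({'x': []})  # empty: concatenating it can trigger the warning

# Skip empty frames so dtype inference never has to consider them.
frames = [f for f in (a, b) if not f.empty]
combined = pd.concat(frames, axis=0) if frames else a.iloc[0:0]
print(combined)
```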
8 changes: 4 additions & 4 deletions wcvp_name_matching/resolve_openrefine_matches.py
@@ -29,7 +29,7 @@ def resolve_openrefine_to_best_matches(reco_df: pd.DataFrame, all_taxa: pd.DataFrame
             out_df[wcvp_accepted_columns['family']].isin(families_of_interest))]
 
     # Unique matches
-    unique_matches = out_df[~out_df[reco_submitted_name_col_id].duplicated(keep=False)]
+    unique_matches = out_df.drop_duplicates(subset=[reco_submitted_name_col_id], keep=False).copy()
     unique_matches['matched_by'] = 'openrefine_unique'
 
     non_unique_reco_matches = out_df[out_df[reco_submitted_name_col_id].duplicated(keep=False)]
@@ -39,16 +39,16 @@ def resolve_openrefine_to_best_matches(reco_df: pd.DataFrame, all_taxa: pd.DataFrame
         subset=[reco_submitted_name_col_id, wcvp_accepted_columns['ipni_id']], keep='first')
     submitted_names_with_single_accepted_match = unique_acc_names.drop_duplicates(
         subset=[reco_submitted_name_col_id],
-        keep=False)
+        keep=False).copy()
     submitted_names_with_single_accepted_match['matched_by'] = 'openrefine_unique_accepted_name'
 
     non_unique_matches = non_unique_reco_matches[
         ~non_unique_reco_matches[reco_submitted_name_col_id].isin(
-            submitted_names_with_single_accepted_match[reco_submitted_name_col_id].values)]
+            submitted_names_with_single_accepted_match[reco_submitted_name_col_id].values)].copy()
 
     # Best scoring matches
     non_unique_matches['reco_score_max'] = non_unique_matches.groupby([reco_submitted_name_col_id])[
-        'reco_score'].transform(max)
+        'reco_score'].transform('max')
 
     top_scorers = non_unique_matches[non_unique_matches['reco_score'] == non_unique_matches['reco_score_max']]
     top_unique_scorers = top_scorers[~top_scorers[reco_submitted_name_col_id].duplicated(keep=False)]
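
Two idioms in this file deserve a note: `drop_duplicates(subset=..., keep=False)` keeps only rows whose key occurs exactly once (equivalent to filtering on `~duplicated(keep=False)`), and `transform('max')` uses the string alias because passing the builtin `max` to a groupby now draws a FutureWarning in recent pandas. A small sketch of the best-score selection, with toy column names:

```python
import pandas as pd

df = pd.DataFrame({'submitted': ['a', 'a', 'b'],
                   'reco_score': [0.4, 0.9, 0.7]})

# Per-row maximum score within each submitted-name group; the string
# alias 'max' avoids the deprecation warning the builtin now triggers.
df['reco_score_max'] = df.groupby('submitted')['reco_score'].transform('max')

# Keep only the best-scoring match for each submission.
top_scorers = df[df['reco_score'] == df['reco_score_max']]
print(top_scorers)
```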
13 changes: 8 additions & 5 deletions wcvp_name_matching/wcvp_matching.py
@@ -53,7 +53,7 @@ def tidy_value_for_matching(given_value: str) -> str:
 def match_name_to_concatenated_columns(df: pd.DataFrame, matching_name_col: str, all_taxa: pd.DataFrame,
                                        columns: List[str]):
     # Remove taxa without author information as these are matched later without authors
-    taxa_to_use = all_taxa.dropna(subset=columns)
+    taxa_to_use = all_taxa.dropna(subset=columns).copy()
     column_series = [all_taxa[c].fillna('') for c in columns]
     taxa_to_use['taxon_name_with_extra_columns'] = taxa_to_use[wcvp_columns['name']].str.cat(column_series,
                                                                                              sep=' ')
@@ -66,7 +66,7 @@ def match_name_to_concatenated_columns(df: pd.DataFrame, matching_name_col: str,
         right_on='taxon_name_with_extra_columns')
     author_merged = author_merged.dropna(subset=[wcvp_columns['wcvp_id']])
 
-    unmatched_with_authors_df = df[~df[lowercase_name_col].isin(author_merged[lowercase_name_col].values)]
+    unmatched_with_authors_df = df[~df[lowercase_name_col].isin(author_merged[lowercase_name_col].values)].copy()
 
     # Repeat but with 'tidied' authors
     unmatched_with_authors_df[tidied_taxon_authors_col] = unmatched_with_authors_df[matching_name_col].apply(
@@ -97,7 +97,10 @@ def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str,
     Appends accepted info columns to df from list of taxa, based on names in matching_name_col
     :param df:
     :param matching_name_col:
+    :param unique_submission_id_col:
     :param all_taxa:
+    :param family_column:
+    :param wcvp_version:
     :return:
     """
     if all_taxa is None:
@@ -140,7 +143,7 @@ def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str,

     merged_with_wcvp = pd.concat([author_merged, paranthet_author_merged, primary_author_merged, just_name_merged])
     match_df = get_family_specific_resolutions(merged_with_wcvp, family_column=family_column)
-    match_df = match_df[[unique_submission_id_col] + output_record_col_names + ['matched_by', 'matched_name']]
+    match_df = match_df[[unique_submission_id_col] + output_record_col_names + ['matched_by', 'matched_name']].copy()
 
     # Remove duplicates in match_df based on priority

@@ -149,11 +152,11 @@ def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str,
             raise ValueError(f'Status priority list does not contain {r} and needs updating.')
     match_df['taxon_status'] = pd.Categorical(match_df['taxon_status'],
                                               status_priority)
-    match_df.sort_values('taxon_status', inplace=True)
+    match_df = match_df.sort_values('taxon_status')
     # Appropriately label unique matches
     match_df['matched_by'] = np.where(match_df.duplicated(keep=False, subset=[unique_submission_id_col]),
                                       match_df['matched_by'],
                                       match_df['matched_by'] + '_unique')
-    match_df.drop_duplicates(subset=[unique_submission_id_col], inplace=True, keep='first')
+    match_df = match_df.drop_duplicates(subset=[unique_submission_id_col], keep='first')
     match_df['taxon_status'] = match_df['taxon_status'].astype(object)
     return match_df
