Improve matching with author strings and version

alrichardbollans · Oct 25, 2023 · 5ef8079 · 5ef8079
1 parent e246ab6
commit 5ef8079
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 9 deletions.
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='automatchnames',
-    version='1.2.5',
+    version='1.2.6',
     packages=find_packages(),
     package_data={"wcvp_download": ["inputs/*", "inputs/wgsrpd-master/level3/*"]},
     install_requires=[

diff --git a/wcvp_name_matching/unit_tests/test_string_utils.py b/wcvp_name_matching/unit_tests/test_string_utils.py
@@ -115,8 +115,10 @@ def test_adding_space(self):
 
     def test_whitespace_removal(self):
         test_dict = {'first second third': 'first second third',
-                     'A   B B  C ': 'A B B C', 2:2, '  ':''}
+                     'A   B B  C ': 'A B B C', 2: 2, '  ': ''}
         for t in test_dict:
             self.assertEqual(clean_whitespaces_in_names(t), test_dict[t])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/wcvp_name_matching/wcvp_matching.py b/wcvp_name_matching/wcvp_matching.py
@@ -56,7 +56,7 @@ def match_name_to_concatenated_columns(df: pd.DataFrame, matching_name_col: str,
     taxa_to_use = all_taxa.dropna(subset=columns)
     column_series = [all_taxa[c].fillna('') for c in columns]
     taxa_to_use['taxon_name_with_extra_columns'] = taxa_to_use[wcvp_columns['name']].str.cat(column_series,
-                                                                                       sep=' ')
+                                                                                             sep=' ')
     taxa_to_use['taxon_name_with_extra_columns'] = taxa_to_use['taxon_name_with_extra_columns'].apply(
         tidy_value_for_matching)
 
@@ -92,7 +92,7 @@ def match_name_to_concatenated_columns(df: pd.DataFrame, matching_name_col: str,
 
 
 def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str, unique_submission_id_col: str,
-                                      all_taxa: pd.DataFrame = None, family_column: str = None, wcvp_version:str = None):
+                                      all_taxa: pd.DataFrame = None, family_column: str = None, wcvp_version: str = None):
     """
     Appends accepted info columns to df from list of taxa, based on names in matching_name_col
     :param df:
@@ -104,33 +104,41 @@ def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str,
         all_taxa = get_all_taxa(version=wcvp_version)
 
     # First try with author info i.e. taxon name + taxon_authors and then
-    # taxon name + parenthetical_author + primary_author
+    # taxon name + parenthetical_author + primary_author then taxon name + primary author
     author_merged, unmatched_with_authors_df = match_name_to_concatenated_columns(df, matching_name_col,
                                                                                   all_taxa,
                                                                                   [wcvp_columns['authors']])
 
     author_merged['matched_by'] = 'direct_wcvp_w_author'
 
-    # Try with paranthetical and primary author columns
+    # Try with paranthetical and primary author columns - slight difference with taxon authors as no parantheses around paranthet author
     paranthet_author_merged, unmatched_with_paranthet_authors_df = match_name_to_concatenated_columns(
         unmatched_with_authors_df, matching_name_col,
         all_taxa,
         [wcvp_columns['paranthet_author'], wcvp_columns['primary_author']])
 
     paranthet_author_merged['matched_by'] = 'direct_wcvp_w_author'
 
+    # Try with primary author columns
+    primary_author_merged, unmatched_with_primary_author_df = match_name_to_concatenated_columns(
+        unmatched_with_paranthet_authors_df, matching_name_col,
+        all_taxa,
+        [wcvp_columns['primary_author']])
+
+    primary_author_merged['matched_by'] = 'direct_wcvp_w_author'
+
     # Match with just name
     all_taxa['tidied_taxon_name'] = all_taxa[wcvp_columns['name']].apply(tidy_value_for_matching)
-    unmatched_with_paranthet_authors_df[lowercase_name_col] = unmatched_with_paranthet_authors_df[
+    unmatched_with_primary_author_df[lowercase_name_col] = unmatched_with_primary_author_df[
         matching_name_col].apply(tidy_value_for_matching)
-    just_name_merged = pd.merge(unmatched_with_paranthet_authors_df, all_taxa, how='left',
+    just_name_merged = pd.merge(unmatched_with_primary_author_df, all_taxa, how='left',
                                 left_on=lowercase_name_col,
                                 right_on='tidied_taxon_name')
     just_name_merged = just_name_merged.dropna(subset=[wcvp_columns['wcvp_id']])
     just_name_merged['matched_by'] = 'direct_wcvp'
     just_name_merged['matched_name'] = just_name_merged[wcvp_columns['name']]
 
-    merged_with_wcvp = pd.concat([author_merged, paranthet_author_merged, just_name_merged])
+    merged_with_wcvp = pd.concat([author_merged, paranthet_author_merged, primary_author_merged, just_name_merged])
     match_df = get_family_specific_resolutions(merged_with_wcvp, family_column=family_column)
     match_df = match_df[[unique_submission_id_col] + output_record_col_names + ['matched_by', 'matched_name']]