Skip to content

Commit

Permalink
Improve matching with author strings and version
Browse files Browse the repository at this point in the history
  • Loading branch information
alrichardbollans committed Oct 25, 2023
1 parent e246ab6 commit 5ef8079
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 9 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='automatchnames',
version='1.2.5',
version='1.2.6',
packages=find_packages(),
package_data={"wcvp_download": ["inputs/*", "inputs/wgsrpd-master/level3/*"]},
install_requires=[
Expand Down
4 changes: 3 additions & 1 deletion wcvp_name_matching/unit_tests/test_string_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,10 @@ def test_adding_space(self):

def test_whitespace_removal(self):
test_dict = {'first second third': 'first second third',
'A B B C ': 'A B B C', 2:2, ' ':''}
'A B B C ': 'A B B C', 2: 2, ' ': ''}
for t in test_dict:
self.assertEqual(clean_whitespaces_in_names(t), test_dict[t])


if __name__ == '__main__':
unittest.main()
22 changes: 15 additions & 7 deletions wcvp_name_matching/wcvp_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def match_name_to_concatenated_columns(df: pd.DataFrame, matching_name_col: str,
taxa_to_use = all_taxa.dropna(subset=columns)
column_series = [all_taxa[c].fillna('') for c in columns]
taxa_to_use['taxon_name_with_extra_columns'] = taxa_to_use[wcvp_columns['name']].str.cat(column_series,
sep=' ')
sep=' ')
taxa_to_use['taxon_name_with_extra_columns'] = taxa_to_use['taxon_name_with_extra_columns'].apply(
tidy_value_for_matching)

Expand Down Expand Up @@ -92,7 +92,7 @@ def match_name_to_concatenated_columns(df: pd.DataFrame, matching_name_col: str,


def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str, unique_submission_id_col: str,
all_taxa: pd.DataFrame = None, family_column: str = None, wcvp_version:str = None):
all_taxa: pd.DataFrame = None, family_column: str = None, wcvp_version: str = None):
"""
Appends accepted info columns to df from list of taxa, based on names in matching_name_col
:param df:
Expand All @@ -104,33 +104,41 @@ def get_wcvp_info_for_names_in_column(df: pd.DataFrame, matching_name_col: str,
all_taxa = get_all_taxa(version=wcvp_version)

# First try with author info i.e. taxon name + taxon_authors and then
# taxon name + parenthetical_author + primary_author
# taxon name + parenthetical_author + primary_author, and finally taxon name + primary_author
author_merged, unmatched_with_authors_df = match_name_to_concatenated_columns(df, matching_name_col,
all_taxa,
[wcvp_columns['authors']])

author_merged['matched_by'] = 'direct_wcvp_w_author'

# Try with paranthetical and primary author columns
# Try with parenthetical and primary author columns - this differs slightly from taxon authors as there are no parentheses around the parenthetical author
paranthet_author_merged, unmatched_with_paranthet_authors_df = match_name_to_concatenated_columns(
unmatched_with_authors_df, matching_name_col,
all_taxa,
[wcvp_columns['paranthet_author'], wcvp_columns['primary_author']])

paranthet_author_merged['matched_by'] = 'direct_wcvp_w_author'

# Try with the primary author column only
primary_author_merged, unmatched_with_primary_author_df = match_name_to_concatenated_columns(
unmatched_with_paranthet_authors_df, matching_name_col,
all_taxa,
[wcvp_columns['primary_author']])

primary_author_merged['matched_by'] = 'direct_wcvp_w_author'

# Match with just name
all_taxa['tidied_taxon_name'] = all_taxa[wcvp_columns['name']].apply(tidy_value_for_matching)
unmatched_with_paranthet_authors_df[lowercase_name_col] = unmatched_with_paranthet_authors_df[
unmatched_with_primary_author_df[lowercase_name_col] = unmatched_with_primary_author_df[
matching_name_col].apply(tidy_value_for_matching)
just_name_merged = pd.merge(unmatched_with_paranthet_authors_df, all_taxa, how='left',
just_name_merged = pd.merge(unmatched_with_primary_author_df, all_taxa, how='left',
left_on=lowercase_name_col,
right_on='tidied_taxon_name')
just_name_merged = just_name_merged.dropna(subset=[wcvp_columns['wcvp_id']])
just_name_merged['matched_by'] = 'direct_wcvp'
just_name_merged['matched_name'] = just_name_merged[wcvp_columns['name']]

merged_with_wcvp = pd.concat([author_merged, paranthet_author_merged, just_name_merged])
merged_with_wcvp = pd.concat([author_merged, paranthet_author_merged, primary_author_merged, just_name_merged])
match_df = get_family_specific_resolutions(merged_with_wcvp, family_column=family_column)
match_df = match_df[[unique_submission_id_col] + output_record_col_names + ['matched_by', 'matched_name']]

Expand Down

0 comments on commit 5ef8079

Please sign in to comment.