From e246ab619170728db16214be6ea8096d65ad2d4d Mon Sep 17 00:00:00 2001
From: alrichardbollans <38588335+alrichardbollans@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:44:20 +0100
Subject: [PATCH] Improve cleaning of author strings

---
 wcvp_download/get_taxa_from_wcvp.py     | 13 +++++++++----
 .../unit_tests/test_string_hygiene.py   | 19 +++++++++++++++++--
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/wcvp_download/get_taxa_from_wcvp.py b/wcvp_download/get_taxa_from_wcvp.py
index 6a2fe6b..3cd75a7 100644
--- a/wcvp_download/get_taxa_from_wcvp.py
+++ b/wcvp_download/get_taxa_from_wcvp.py
@@ -21,9 +21,11 @@
     'status': 'taxon_status',
     'parent_name': 'parent_name',
     'parent_ipni_id': 'parent_ipni_id',
-    'authors': 'taxon_authors',
-    'paranthet_author': 'parenthetical_author',
-    'primary_author': 'primary_author',
+    'authors': 'taxon_authors',  # Concatenation of parenthetical and primary authors.
+    # Missing values indicate instances where authorship is unknown or non-applicable (e.g. autonyms).
+    'paranthet_author': 'parenthetical_author',  # The author of the basionym. Empty when there is no basionym.
+    'primary_author': 'primary_author',  # The author or authors who published the scientific name.
+    # Missing values indicate instances where authorship is non-applicable (i.e. autonyms) or unknown.
     'wcvp_id': 'plant_name_id',
     'parent_plant_name_id': 'parent_plant_name_id',
     'acc_plant_name_id': 'accepted_plant_name_id',
@@ -64,7 +66,10 @@ def clean_whitespaces_in_names(given_str: str):
         else:
             stripped = given_str.strip()
             out = " ".join(stripped.split())
-            return out
+            # fixing authors
+            fixed_authors = out.replace('. )', '.)')
+            fixed_authors2 = fixed_authors.replace(' )', ')')
+            return fixed_authors2
     except AttributeError:
         return given_str
 
diff --git a/wcvp_download/unit_tests/test_string_hygiene.py b/wcvp_download/unit_tests/test_string_hygiene.py
index ea61ea0..781e9de 100644
--- a/wcvp_download/unit_tests/test_string_hygiene.py
+++ b/wcvp_download/unit_tests/test_string_hygiene.py
@@ -2,15 +2,18 @@
 import re
 import unittest
 
+import pandas.testing
+
 from wcvp_download import get_all_taxa, wcvp_columns_used_in_direct_matching, infraspecific_chars, \
-    hybrid_characters, wcvp_columns, wcvp_accepted_columns
+    hybrid_characters, wcvp_columns, wcvp_accepted_columns, clean_whitespaces_in_names
 
 wcvp_data = get_all_taxa()
 _output_path = 'test_outputs'
 
+things_not_in_checklist = [' ', ' .', '. )', ' )', '\t']
+
 
 def string_hygeine_tests(df, output_dir):
-    things_not_in_checklist = [' ', ' .', '\t']
     notin_problem_dfs = []
     for col in wcvp_columns_used_in_direct_matching:
         for unused_string in things_not_in_checklist:
@@ -62,6 +65,18 @@ def test_whitespace(self):
                 problems.to_csv(os.path.join(_output_path, str(c) + '.csv'))
                 raise ValueError
 
+    def test_authors(self):
+        test_columns = [wcvp_columns['paranthet_author'], wcvp_columns['primary_author'], wcvp_columns['authors']]
+        taxa_to_test = wcvp_data.dropna(subset=test_columns, how='any')
+        taxa_to_test['()'] = '(' + taxa_to_test[wcvp_columns['paranthet_author']] + ')'
+
+        for c in test_columns:
+            taxa_to_test[c] = taxa_to_test[c].apply(clean_whitespaces_in_names)
+        taxa_to_test['test_col'] = taxa_to_test['()'].str.cat([taxa_to_test[wcvp_columns['primary_author']]],
+                                                              sep=' ')
+
+        pandas.testing.assert_series_equal(taxa_to_test['test_col'], taxa_to_test[wcvp_columns['authors']], check_names=False)
+
 
 if __name__ == '__main__':
     unittest.main()