Skip to content

Commit

Permalink
Improve cleaning of author strings
Browse files Browse the repository at this point in the history
  • Loading branch information
alrichardbollans committed Oct 25, 2023
1 parent ce6e16c commit e246ab6
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 6 deletions.
13 changes: 9 additions & 4 deletions wcvp_download/get_taxa_from_wcvp.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@
'status': 'taxon_status',
'parent_name': 'parent_name',
'parent_ipni_id': 'parent_ipni_id',
'authors': 'taxon_authors',
'paranthet_author': 'parenthetical_author',
'primary_author': 'primary_author',
'authors': 'taxon_authors', # Concatenation of parenthetical and primary authors.
# Missing values indicate instances where authorship is unknown or non-applicable (e.g. autonyms).
'paranthet_author': 'parenthetical_author', # The author of the basionym. Empty when there is no basionym.
'primary_author': 'primary_author', # The author or authors who published the scientific name.
# Missing values indicate instances where authorship is non-applicable (i.e. autonyms) or unknown.
'wcvp_id': 'plant_name_id',
'parent_plant_name_id': 'parent_plant_name_id',
'acc_plant_name_id': 'accepted_plant_name_id',
Expand Down Expand Up @@ -64,7 +66,10 @@ def clean_whitespaces_in_names(given_str: str):
else:
stripped = given_str.strip()
out = " ".join(stripped.split())
return out
# fixing authors
fixed_authors = out.replace('. )', '.)')
fixed_authors2 = fixed_authors.replace(' )', ')')
return fixed_authors2
except AttributeError:
return given_str

Expand Down
19 changes: 17 additions & 2 deletions wcvp_download/unit_tests/test_string_hygiene.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@
import re
import unittest

import pandas.testing

from wcvp_download import get_all_taxa, wcvp_columns_used_in_direct_matching, infraspecific_chars, \
hybrid_characters, wcvp_columns, wcvp_accepted_columns
hybrid_characters, wcvp_columns, wcvp_accepted_columns, clean_whitespaces_in_names

# Checklist data shared by the tests in this module; fetched once when the module is imported.
wcvp_data = get_all_taxa()
# Directory that failing checks write their problem CSVs into.
_output_path = 'test_outputs'

# Substrings iterated over by the string hygiene check for each direct-matching column.
# NOTE(review): presumably none of these should occur in the checklist values -- confirm against the check itself.
things_not_in_checklist = [' ', ' .', '. )', ' )', '\t']


def string_hygeine_tests(df, output_dir):
things_not_in_checklist = [' ', ' .', '\t']
notin_problem_dfs = []
for col in wcvp_columns_used_in_direct_matching:
for unused_string in things_not_in_checklist:
Expand Down Expand Up @@ -62,6 +65,18 @@ def test_whitespace(self):
problems.to_csv(os.path.join(_output_path, str(c) + '.csv'))
raise ValueError

def test_authors(self):
    """Check that the authors column equals '(parenthetical author) primary author'.

    Only rows where the parenthetical author, primary author and combined
    authors columns are all present are compared. The parenthesised string
    is built from the parenthetical author values as they appear in the
    data (before cleaning); the three source columns are then passed
    through ``clean_whitespaces_in_names`` and the rebuilt string is
    compared with the cleaned authors column.

    Raises:
        AssertionError: if the rebuilt series differs from the authors column.
    """
    parenthetical_col = wcvp_columns['paranthet_author']
    primary_col = wcvp_columns['primary_author']
    authors_col = wcvp_columns['authors']
    test_columns = [parenthetical_col, primary_col, authors_col]

    # Take an explicit copy: columns are added and overwritten below, and doing
    # that on a frame derived from ``wcvp_data`` can trigger pandas'
    # chained-assignment warning and blurs which object is being modified.
    taxa_to_test = wcvp_data.dropna(subset=test_columns, how='any').copy()
    taxa_to_test['()'] = '(' + taxa_to_test[parenthetical_col] + ')'

    for c in test_columns:
        taxa_to_test[c] = taxa_to_test[c].apply(clean_whitespaces_in_names)

    # Rebuild the expected authors string: "(parenthetical) primary".
    taxa_to_test['test_col'] = taxa_to_test['()'].str.cat([taxa_to_test[primary_col]], sep=' ')

    # Series names differ ('test_col' vs the authors column), so only values and index are compared.
    pandas.testing.assert_series_equal(taxa_to_test['test_col'], taxa_to_test[authors_col], check_names=False)


if __name__ == '__main__':
unittest.main()

0 comments on commit e246ab6

Please sign in to comment.