From e246ab619170728db16214be6ea8096d65ad2d4d Mon Sep 17 00:00:00 2001
From: alrichardbollans <38588335+alrichardbollans@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:44:20 +0100
Subject: [PATCH] Improve cleaning of author strings

---
 wcvp_download/get_taxa_from_wcvp.py           | 13 +++++++++----
 .../unit_tests/test_string_hygiene.py         | 19 +++++++++++++++++--
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/wcvp_download/get_taxa_from_wcvp.py b/wcvp_download/get_taxa_from_wcvp.py
index 6a2fe6b..3cd75a7 100644
--- a/wcvp_download/get_taxa_from_wcvp.py
+++ b/wcvp_download/get_taxa_from_wcvp.py
@@ -21,9 +21,11 @@
                 'status': 'taxon_status',
                 'parent_name': 'parent_name',
                 'parent_ipni_id': 'parent_ipni_id',
-                'authors': 'taxon_authors',
-                'paranthet_author': 'parenthetical_author',
-                'primary_author': 'primary_author',
+                'authors': 'taxon_authors',  # Concatenation of parenthetical and primary authors.
+                # Missing values indicate instances where authorship is unknown or non-applicable (e.g. autonyms).
+                'paranthet_author': 'parenthetical_author',  # The author of the basionym. Empty when there is no basionym.
+                'primary_author': 'primary_author',  # The author or authors who published the scientific name.
+                # Missing values indicate instances where authorship is non-applicable (i.e. autonyms) or unknown.
                 'wcvp_id': 'plant_name_id',
                 'parent_plant_name_id': 'parent_plant_name_id',
                 'acc_plant_name_id': 'accepted_plant_name_id',
@@ -64,7 +66,10 @@ def clean_whitespaces_in_names(given_str: str):
         else:
             stripped = given_str.strip()
             out = " ".join(stripped.split())
-            return out
+            # Tidy author strings: drop the stray space before a closing parenthesis, e.g. '(L. )' -> '(L.)'.
+            fixed_authors = out.replace('. )', '.)')
+            fixed_authors2 = fixed_authors.replace(' )', ')')
+            return fixed_authors2
     except AttributeError:
         return given_str
 
diff --git a/wcvp_download/unit_tests/test_string_hygiene.py b/wcvp_download/unit_tests/test_string_hygiene.py
index ea61ea0..781e9de 100644
--- a/wcvp_download/unit_tests/test_string_hygiene.py
+++ b/wcvp_download/unit_tests/test_string_hygiene.py
@@ -2,15 +2,18 @@
 import re
 import unittest
 
+import pandas.testing
+
 from wcvp_download import get_all_taxa, wcvp_columns_used_in_direct_matching, infraspecific_chars, \
-    hybrid_characters, wcvp_columns, wcvp_accepted_columns
+    hybrid_characters, wcvp_columns, wcvp_accepted_columns, clean_whitespaces_in_names
 
 wcvp_data = get_all_taxa()
 _output_path = 'test_outputs'
 
+things_not_in_checklist = ['  ', ' .', '. )', ' )', '\t']
+
 
 def string_hygeine_tests(df, output_dir):
-    things_not_in_checklist = ['  ', ' .',  '\t']
     notin_problem_dfs = []
     for col in wcvp_columns_used_in_direct_matching:
         for unused_string in things_not_in_checklist:
@@ -62,6 +65,18 @@ def test_whitespace(self):
                 problems.to_csv(os.path.join(_output_path, str(c) + '.csv'))
                 raise ValueError
 
+    def test_authors(self):
+        test_columns = [wcvp_columns['paranthet_author'], wcvp_columns['primary_author'], wcvp_columns['authors']]
+        taxa_to_test = wcvp_data.dropna(subset=test_columns, how='any')
+        taxa_to_test['()'] = '(' + taxa_to_test[wcvp_columns['paranthet_author']] + ')'
+
+        for c in test_columns:
+            taxa_to_test[c] = taxa_to_test[c].apply(clean_whitespaces_in_names)
+        taxa_to_test['test_col'] = taxa_to_test['()'].str.cat([taxa_to_test[wcvp_columns['primary_author']]],
+                                                              sep=' ')
+
+        pandas.testing.assert_series_equal(taxa_to_test['test_col'], taxa_to_test[wcvp_columns['authors']], check_names=False)
+
 
 if __name__ == '__main__':
     unittest.main()