-
Notifications
You must be signed in to change notification settings - Fork 128
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[titlecase] Adds augur curate titlecase sub-command
Adds a new sub-command `augur curate titlecase` based on the transform-string-fields script in the monkeypox repo. The `augur curate normalize` sub-command has already been added based on the same script (#1039). Overall this is part of filling in the gaps in the augur curate suite of commands (#860), specifically addressing issue (#999), and is a follow-up to #1039. `augur curate titlecase` transforms the values of the given metadata fields to titlecase. This is useful for normalizing the values of a string field that may contain inconsistent capitalization, such as "North America" and "north america". This commit also adds a test for the new sub-command and updates the documentation. To test the conversion of an upper case "o" with a circumflex to lower case, the test had to use the escaped Unicode character. Co-authored-by: Jover Lee <[email protected]>
- Loading branch information
1 parent
9ef4711
commit 9940898
Showing
5 changed files
with
163 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
""" | ||
Applies titlecase to string fields in a metadata record | ||
""" | ||
import re | ||
from typing import Optional, Set, Union | ||
|
||
from augur.errors import AugurError | ||
from augur.io.print import print_err | ||
from augur.types import DataErrorMethod | ||
|
||
def register_parser(parent_subparsers):
    """
    Attach the ``titlecase`` sub-command to *parent_subparsers*.

    The new parser inherits the options of ``parent_subparsers.shared_parser``
    and uses this module's docstring as its help text. The configured parser
    is returned.
    """
    parser = parent_subparsers.add_parser(
        "titlecase",
        parents=[parent_subparsers.shared_parser],
        help=__doc__,
    )

    # Options the user must always provide.
    required_group = parser.add_argument_group(title="REQUIRED")
    required_group.add_argument(
        "--titlecase-fields",
        nargs="*",
        required=True,
        help="List of fields to convert to titlecase.",
    )

    # Options that tune which words are exempt from titlecasing and how
    # failures are surfaced.
    optional_group = parser.add_argument_group(title="OPTIONAL")
    optional_group.add_argument(
        "--articles",
        nargs="*",
        help="List of articles that should not be converted to titlecase.",
    )
    optional_group.add_argument(
        "--abbreviations",
        nargs="*",
        help="List of abbreviations that should not be converted to titlecase, keeps uppercase.",
    )
    optional_group.add_argument(
        "--failure-reporting",
        type=DataErrorMethod.argtype,
        choices=list(DataErrorMethod),
        default=DataErrorMethod.ERROR_FIRST,
        help="How should failed titlecase formatting be reported.",
    )

    return parser
|
||
|
||
def titlecase(text: Union[str, None], articles: Optional[Set[str]] = None, abbreviations: Optional[Set[str]] = None) -> Optional[str]:
    """
    Originally from nextstrain/ncov-ingest

    Returns a title cased version of *text*, or ``None`` when *text* is not a
    string.

    Words found in *articles* (compared after case folding, so *articles*
    should hold lowercase entries) are lowercased, except for the first word
    of *text*, which is always title cased. Words whose uppercase form is in
    *abbreviations* (so *abbreviations* should hold uppercase entries) are
    returned in uppercase. Every other word is title cased.

    Both *articles* and *abbreviations* default to "no exemptions" when
    omitted or ``None``.

    >>> articles = {'a', 'and', 'of', 'the', 'le'}
    >>> abbreviations = {'USA', 'DC'}
    >>> titlecase("the night OF THE LIVING DEAD", articles)
    'The Night of the Living Dead'
    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
    'Braine-le-Comte, France'
    >>> titlecase("auvergne-RHÔNE-alpes", articles)
    'Auvergne-Rhône-Alpes'
    >>> titlecase("washington DC, usa", articles, abbreviations)
    'Washington DC, USA'
    >>> titlecase(None) is None
    True
    """
    if not isinstance(text, str):
        return None

    # Resolve the defaults here rather than using mutable objects in the
    # signature, which would be shared between calls.
    if articles is None:
        articles = set()
    if abbreviations is None:
        abbreviations = set()

    # Splitting on word boundaries keeps the separators (spaces, hyphens,
    # punctuation) as their own tokens so that joining the tokens back
    # reproduces the original spacing and punctuation exactly.
    # The token at index 0 is whatever precedes the first boundary (an empty
    # string or leading non-word characters), so the first word of the text
    # is always at index 1.
    tokens = re.split(r'\b', text)

    def changecase(index, word):
        upper = word.upper()
        if upper in abbreviations:
            return upper

        # The first word is title cased even when it is an article.
        if index != 1 and word.casefold() in articles:
            return word.lower()

        return word.title()

    return ''.join(changecase(index, word) for index, word in enumerate(tokens))
|
||
|
||
def run(args, records):
    """
    Titlecase the requested fields of each record and yield the result.

    For every record a copy is made (via ``record.copy()``), each field named
    in ``args.titlecase_fields`` is passed through :func:`titlecase`, and the
    copy is yielded. Fields that are not present in a record are left alone;
    no new empty field is added to the record.

    Values that cannot be titlecased (``titlecase`` returns ``None``, i.e. the
    value is not a string) are handled according to ``args.failure_reporting``:

    * ``DataErrorMethod.ERROR_FIRST``: raise ``AugurError`` on the first failure.
    * ``DataErrorMethod.WARN``: print a warning per failure and a summary
      warning after all records have been processed.
    * ``DataErrorMethod.ERROR_ALL``: process all records, then raise
      ``AugurError`` listing every failure.
    * ``DataErrorMethod.SILENT``: ignore failures.

    Any other value of ``args.failure_reporting`` raises ``ValueError`` once
    all records have been processed, if at least one failure occurred.

    Records are identified in messages by their zero-based position in
    *records*.
    """
    failures = []
    failure_reporting = args.failure_reporting

    # The options are None (or empty) when not given on the command line.
    articles = set(args.articles) if args.articles else set()
    abbreviations = set(args.abbreviations) if args.abbreviations else set()

    for index, record in enumerate(records):
        record = record.copy()
        record_id = index

        for field in args.titlecase_fields:
            # Skip fields the record does not have. Falling back to "" here
            # would "succeed" and write an empty value back, silently adding
            # a field that was never in the record.
            if field not in record:
                continue

            original_value = record[field]
            titlecased_string = titlecase(original_value, articles, abbreviations)

            if titlecased_string is not None:
                record[field] = titlecased_string
                continue

            # Only build the message once a failure has actually happened.
            failure_message = f"Failed to titlecase {field!r}:{original_value!r} in record {record_id!r}"
            if failure_reporting is DataErrorMethod.ERROR_FIRST:
                raise AugurError(failure_message)

            if failure_reporting is DataErrorMethod.WARN:
                print_err(f"WARNING: {failure_message}")

            # Keep track of failures for final summary
            failures.append((record_id, field, original_value))

        yield record

    if failure_reporting is not DataErrorMethod.SILENT and failures:
        failure_message = (
            "Unable to change to titlecase for the following (record, field, field value):\n"
            + '\n'.join(map(repr, failures))
        )
        if failure_reporting is DataErrorMethod.ERROR_ALL:
            raise AugurError(failure_message)

        elif failure_reporting is DataErrorMethod.WARN:
            print_err(f"WARNING: {failure_message}")

        else:
            raise ValueError(f"Encountered unhandled failure reporting method: {failure_reporting!r}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
================= | ||
titlecase | ||
================= | ||
|
||
.. argparse:: | ||
:module: augur | ||
:func: make_parser | ||
:prog: augur | ||
:path: curate titlecase |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
Setup | ||
|
||
$ pushd "$TESTDIR" > /dev/null | ||
$ export AUGUR="${AUGUR:-../../../../bin/augur}" | ||
|
||
|
||
Test output with articles and a mixture of lower and uppercase letters. | ||
|
||
$ echo '{"title": "the night OF THE LIVING DEAD"}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "title" --articles "a" "and" "of" "the" "le" | ||
{"title": "The Night of the Living Dead"} | ||
|
||
Test output with hyphenated location. | ||
|
||
$ echo '{"location": "BRAINE-LE-COMTE, FRANCE"}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "location" --articles "a" "and" "of" "the" "le" | ||
{"location": "Braine-le-Comte, France"} | ||
|
||
Test output with unicode characters | ||
|
||
$ echo '{"location": "Auvergne-Rhône-Alpes" }' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "location" | ||
{"location": "Auvergne-Rh\u00f4ne-Alpes"} | ||
|
||
Test output with abbreviations | ||
|
||
$ echo '{"city": "Washington DC, USA"}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "city" --abbreviations "USA" "DC" | ||
{"city": "Washington DC, USA"} |