-
Notifications
You must be signed in to change notification settings - Fork 128
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds augur curate titlecase sub-command
Adds a new sub-command `augur curate titlecase` based on the transform-string-fields script in the monkeypox repo. The `augur curate normalize` sub-command has already been added based on the same script (#1039). Overall this is part of filling in the gaps in the augur curate suite of commands (#860), specifically addressing issue (#999), and is a follow-up to #1039. As part of the augur curate suite of commands, `augur curate titlecase` would transform the values of a given metadata field to titlecase. This is useful for normalizing the values of a string that may contain inconsistent capitalization such as "North America" and "north america". Co-authored-by: Jover Lee <[email protected]>
- Loading branch information
1 parent
e6ff9ef
commit 83a016b
Showing
3 changed files
with
147 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#!/usr/bin/env python3 | ||
""" | ||
Standardizes string fields of the NDJSON record from stdin and outputs the | ||
modified record to stdout. | ||
""" | ||
import argparse | ||
|
||
import re | ||
import unicodedata | ||
from typing import Optional, Set, Union | ||
|
||
from augur.errors import AugurError | ||
from augur.io.print import print_err | ||
from augur.types import DataErrorMethod | ||
|
||
def register_parser(parent_subparsers):
    """
    Register the ``titlecase`` sub-command on *parent_subparsers*.

    The new parser inherits from ``parent_subparsers.shared_parser`` and uses
    this module's docstring as its help text. It defines one required option
    (``--titlecase-fields``) and three optional ones (``--articles``,
    ``--abbreviations``, ``--failure-reporting``).

    Returns the newly created parser.
    """
    parser = parent_subparsers.add_parser(
        "titlecase",
        parents=[parent_subparsers.shared_parser],
        help=__doc__,
    )

    required_group = parser.add_argument_group(title="REQUIRED")
    required_group.add_argument(
        "--titlecase-fields",
        nargs="*",
        required=True,
        help="List of fields to convert to titlecase.",
    )

    optional_group = parser.add_argument_group(title="OPTIONAL")
    optional_group.add_argument(
        "--articles",
        nargs="*",
        help="List of articles that should not be cast to titlecase.",
    )
    optional_group.add_argument(
        "--abbreviations",
        nargs="*",
        help="List of abbreviations that should not be cast to titlecase, keeps uppercase.",
    )
    # Failures stop processing at the first offending record unless the user
    # picks another reporting method.
    optional_group.add_argument(
        "--failure-reporting",
        type=DataErrorMethod,
        choices=list(DataErrorMethod),
        default=DataErrorMethod.ERROR_FIRST,
        help="How should failed titlecase formatting be reported.",
    )

    return parser
|
||
|
||
def titlecase(text: Union[str, None], articles: Optional[Set[str]] = None, abbreviations: Optional[Set[str]] = None) -> Optional[str]:
    """
    Originally from nextstrain/ncov-ingest

    Returns a title cased copy of *text*.

    The text is split on word boundaries and each piece is handled as follows:

    * pieces whose uppercase form is in *abbreviations* are returned in
      uppercase;
    * pieces whose casefolded form is in *articles* are returned in
      lowercase, unless the piece is the first word of the text;
    * every other piece is title cased.

    Entries in *articles* are expected to be lowercase and entries in
    *abbreviations* uppercase, since they are compared against the casefolded
    and uppercased piece respectively. Either may be omitted or ``None``,
    which is treated as an empty set.

    Returns ``None`` if *text* is not a string.

    >>> articles = {'a', 'and', 'of', 'the', 'le'}
    >>> abbreviations = {'USA', 'DC'}
    >>> titlecase("the night OF THE LIVING DEAD", articles)
    'The Night of the Living Dead'
    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
    'Braine-le-Comte, France'
    >>> titlecase("auvergne-RHÔNE-alpes", articles)
    'Auvergne-Rhône-Alpes'
    >>> titlecase("washington DC, usa", articles, abbreviations)
    'Washington DC, USA'
    >>> titlecase("washington dc")
    'Washington Dc'
    >>> titlecase(None) is None
    True
    """
    if not isinstance(text, str):
        return None

    # Use None as the default instead of a shared mutable literal, and accept
    # an explicit None from callers.
    if articles is None:
        articles = frozenset()
    if abbreviations is None:
        abbreviations = frozenset()

    # Splitting on r'\b' keeps the separators, so joining the pieces back
    # together reproduces the original punctuation and spacing.
    words = enumerate(re.split(r'\b', text))

    def changecase(index, word):
        casefold = word.casefold()
        upper = word.upper()

        if upper in abbreviations:
            return upper
        # The first word always sits at index 1: index 0 holds whatever
        # precedes the first word boundary (an empty string or leading
        # punctuation). A leading article is therefore still capitalized.
        elif casefold in articles and index != 1:
            return word.lower()
        else:
            return word.title()

    return ''.join(changecase(i, w) for i, w in words)
|
||
|
||
def run(args, records):
    """
    Yield a copy of each record with the requested fields converted to titlecase.

    For every field named in ``args.titlecase_fields`` the record's value
    (or an empty string when the field is absent) is passed to ``titlecase``
    together with the sets built from ``args.articles`` and
    ``args.abbreviations``. A successful result is stored back on the copied
    record; a ``None`` result counts as a failure and leaves the field
    untouched.

    Failures are handled according to ``args.failure_reporting``:

    * ``ERROR_FIRST`` raises ``AugurError`` on the first failure;
    * ``WARN`` prints a warning per failure and a summary at the end;
    * ``ERROR_ALL`` raises ``AugurError`` with a summary after all records
      have been yielded;
    * ``SILENT`` reports nothing.

    Any other method with recorded failures raises ``ValueError``.
    """
    reporting = args.failure_reporting
    articles = set(args.articles) if args.articles else set()
    abbreviations = set(args.abbreviations) if args.abbreviations else set()
    failed = []

    # The position of the record in the input stream doubles as its id.
    for record_id, original in enumerate(records):
        record = original.copy()

        for field in args.titlecase_fields:
            result = titlecase(record.get(field, ""), articles, abbreviations)

            if result is not None:
                record[field] = result
                continue

            message = f"Failed to titlecase {field} in record {record_id}"
            if reporting is DataErrorMethod.ERROR_FIRST:
                raise AugurError(message)

            if reporting is DataErrorMethod.WARN:
                print_err(f"WARNING: {message}")

            # Keep track of failures for final summary
            failed.append((record_id, field, record.get(field, "")))

        yield record

    if reporting is DataErrorMethod.SILENT or not failed:
        return

    summary = (
        "Unable to change to titlecase for the following (record, field string):\n"
        + '\n'.join(map(repr, failed))
    )

    if reporting is DataErrorMethod.ERROR_ALL:
        raise AugurError(summary)

    if reporting is DataErrorMethod.WARN:
        print_err(f"WARNING: {summary}")
        return

    raise ValueError(f"Encountered unhandled failure reporting method: {reporting!r}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
Setup | ||
|
||
$ pushd "$TESTDIR" > /dev/null | ||
$ export AUGUR="${AUGUR:-../../../../bin/augur}" | ||
|
||
|
||
Create NDJSON file for testing titlecase with lowercase and already-titlecased values | ||
|
||
$ cat >$TMP/records.ndjson <<~~ | ||
> {"record": 1, "authors": "john smith", "author2": "Jane Doe"} | ||
> ~~ | ||
|
||
|
||
Test output with the "authors" and "author2" fields converted to titlecase. | ||
|
||
$ cat $TMP/records.ndjson \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "authors" "author2" | ||
{"record": 1, "authors": "John Smith", "author2": "Jane Doe"} |