From c074ae2878d92f348be20308dc0ae6dbb59007fe Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Mon, 16 Dec 2024 01:40:28 +0100 Subject: [PATCH] Comment and file formatting --- src/scribe_data/cli/main.py | 9 +++--- src/scribe_data/wikidata/wikidata_utils.py | 35 ++++++++++++++-------- tests/cli/test_get.py | 3 +- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index df1f7cdd..a18eb607 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -28,13 +28,13 @@ from scribe_data.cli.cli_utils import validate_language_and_data_type from scribe_data.cli.convert import convert_wrapper +from scribe_data.cli.download import download_wrapper from scribe_data.cli.get import get_data from scribe_data.cli.interactive import start_interactive_mode from scribe_data.cli.list import list_wrapper from scribe_data.cli.total import total_wrapper from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message -from scribe_data.cli.download import download_wrapper LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." GET_DESCRIPTION = ( @@ -161,7 +161,7 @@ def main() -> None: help="The case format for identifiers in the output data (default: camel).", ) get_parser.add_argument( - "-wd", + "-wdp", "--wikidata-dump-path", type=str, help="Path to a local Wikidata lexemes dump required for running with '--all'.", @@ -284,7 +284,7 @@ def main() -> None: ) download_parser._actions[0].help = "Show this help message and exit." download_parser.add_argument( - "-wd", + "-wdv", "--wikidata-dump-version", nargs="?", const="latest", @@ -310,7 +310,7 @@ def main() -> None: return try: - # Only validate language and data_type for relevant commands + # Only validate language and data_type for relevant commands. if args.command in ["list", "l", "get", "g", "total", "t", "convert", "c"]: if ( hasattr(args, "data_type") @@ -327,6 +327,7 @@ def main() -> None: if hasattr(args, "data_type") else None, ) + except ValueError as e: print(f"Input validation failed with error: {e}") return diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 3afb79a0..96bac231 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -20,6 +20,7 @@ --> """ +import contextlib import re from datetime import datetime @@ -54,15 +55,19 @@ def parse_date(date_string): for fmt in formats: try: return datetime.strptime(date_string, fmt).date() + except ValueError: continue + print( f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD." ) return None -def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_dump): +def available_closest_lexeme_dumpfile( + target_entity: str, other_old_dumps: str, check_wd_dump_exists +): """ Finds the closest available dump file based on the target date. @@ -74,7 +79,7 @@ def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_du other_old_dumps : list List of available dump folders as strings. - try_old_dump : function + check_wd_dump_exists : function A function to validate if the dump file exists. Returns @@ -82,17 +87,18 @@ def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_du str : The closest available dump file date (as a string). None : If no suitable dump is found. """ - available_dates = [] target_date = parse_date(target_entity) closest_date = None closest_diff = None if target_date: + available_dates = [] for i in other_old_dumps: if i == "..": continue - try: - if try_old_dump(i): + + with contextlib.suppress(requests.exceptions.HTTPError): + if check_wd_dump_exists(i): available_dates.append(i) current_date = parse_date(i) diff = abs((current_date - target_date).days) @@ -103,12 +109,11 @@ def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_du if current_date >= target_date: break - except requests.exceptions.HTTPError: - pass + return closest_date -def download_wiki_lexeme_dump(target_entity="latest-lexemes"): +def download_wiki_lexeme_dump(target_entity: str = "latest-lexemes"): """ Downloads a Wikimedia lexeme dump based on the specified target entity or date. @@ -127,7 +132,7 @@ def download_wiki_lexeme_dump(target_entity="latest-lexemes"): """ base_url = "https://dumps.wikimedia.org/wikidatawiki/entities" - def try_old_dump(target_entity): + def check_wd_dump_exists(target_entity): """ Checks if the specified dump file exists for a target entity. @@ -146,15 +151,16 @@ def try_old_dump(target_entity): entity_response.raise_for_status() dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text) - fileurl = f"wikidata-{target_entity}-lexemes.json.bz2" - if fileurl in dump_filenames: + file_url = f"wikidata-{target_entity}-lexemes.json.bz2" + + if file_url in dump_filenames: return f"{base_url}/{target_entity}/{fileurl}" if target_entity != "latest-lexemes": try: if parse_date(target_entity): target_entity = target_entity.replace("/", "").replace("-", "") - return try_old_dump(target_entity) + return check_wd_dump_exists(target_entity) except requests.exceptions.HTTPError as http_err: print( @@ -175,16 +181,19 @@ def try_old_dump(target_entity): if user_input == "y" or user_input == "": if other_old_dumps: closest_date = available_closest_lexeme_dumpfile( - target_entity, other_old_dumps, try_old_dump + target_entity, other_old_dumps, check_wd_dump_exists ) print( f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}" ) fileurl = f"{closest_date}/wikidata-{closest_date}-lexemes.json.bz2" + if closest_date: return f"{base_url}/{fileurl}" + else: return + return other_old_dumps try: diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 02716a5e..532195f3 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -47,7 +47,8 @@ def test_invalid_arguments(self): # MARK: All Data - # Using sparql based data extract tests + # Note: Wikidata dumps are required for extracting all data. + # @patch("scribe_data.cli.get.query_data") # def test_get_all_data_types_for_language(self, mock_query_data): # get_data(all=True, language="English")