Skip to content

Commit

Permalink
Comment and file formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Dec 16, 2024
1 parent 6e70995 commit c074ae2
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 18 deletions.
9 changes: 5 additions & 4 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@

from scribe_data.cli.cli_utils import validate_language_and_data_type
from scribe_data.cli.convert import convert_wrapper
from scribe_data.cli.download import download_wrapper
from scribe_data.cli.get import get_data
from scribe_data.cli.interactive import start_interactive_mode
from scribe_data.cli.list import list_wrapper
from scribe_data.cli.total import total_wrapper
from scribe_data.cli.upgrade import upgrade_cli
from scribe_data.cli.version import get_version_message
from scribe_data.cli.download import download_wrapper

LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for."
GET_DESCRIPTION = (
Expand Down Expand Up @@ -161,7 +161,7 @@ def main() -> None:
help="The case format for identifiers in the output data (default: camel).",
)
get_parser.add_argument(
"-wd",
"-wdp",
"--wikidata-dump-path",
type=str,
help="Path to a local Wikidata lexemes dump required for running with '--all'.",
Expand Down Expand Up @@ -284,7 +284,7 @@ def main() -> None:
)
download_parser._actions[0].help = "Show this help message and exit."
download_parser.add_argument(
"-wd",
"-wdv",
"--wikidata-dump-version",
nargs="?",
const="latest",
Expand All @@ -310,7 +310,7 @@ def main() -> None:
return

try:
# Only validate language and data_type for relevant commands
# Only validate language and data_type for relevant commands.
if args.command in ["list", "l", "get", "g", "total", "t", "convert", "c"]:
if (
hasattr(args, "data_type")
Expand All @@ -327,6 +327,7 @@ def main() -> None:
if hasattr(args, "data_type")
else None,
)

except ValueError as e:
print(f"Input validation failed with error: {e}")
return
Expand Down
35 changes: 22 additions & 13 deletions src/scribe_data/wikidata/wikidata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
-->
"""

import contextlib
import re
from datetime import datetime

Expand Down Expand Up @@ -54,15 +55,19 @@ def parse_date(date_string):
for fmt in formats:
try:
return datetime.strptime(date_string, fmt).date()

except ValueError:
continue

print(
f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD."
)
return None


def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_dump):
def available_closest_lexeme_dumpfile(
target_entity: str, other_old_dumps: str, check_wd_dump_exists
):
"""
Finds the closest available dump file based on the target date.
Expand All @@ -74,25 +79,26 @@ def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_du
other_old_dumps : list
List of available dump folders as strings.
try_old_dump : function
check_wd_dump_exists : function
A function to validate if the dump file exists.
Returns
-------
str : The closest available dump file date (as a string).
None : If no suitable dump is found.
"""
available_dates = []
target_date = parse_date(target_entity)
closest_date = None
closest_diff = None

if target_date:
available_dates = []
for i in other_old_dumps:
if i == "..":
continue
try:
if try_old_dump(i):

with contextlib.suppress(requests.exceptions.HTTPError):
if check_wd_dump_exists(i):
available_dates.append(i)
current_date = parse_date(i)
diff = abs((current_date - target_date).days)
Expand All @@ -103,12 +109,11 @@ def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_du

if current_date >= target_date:
break
except requests.exceptions.HTTPError:
pass

return closest_date


def download_wiki_lexeme_dump(target_entity="latest-lexemes"):
def download_wiki_lexeme_dump(target_entity: str = "latest-lexemes"):
"""
Downloads a Wikimedia lexeme dump based on the specified target entity or date.
Expand All @@ -127,7 +132,7 @@ def download_wiki_lexeme_dump(target_entity="latest-lexemes"):
"""
base_url = "https://dumps.wikimedia.org/wikidatawiki/entities"

def try_old_dump(target_entity):
def check_wd_dump_exists(target_entity):
"""
Checks if the specified dump file exists for a target entity.
Expand All @@ -146,15 +151,16 @@ def try_old_dump(target_entity):
entity_response.raise_for_status()
dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text)

fileurl = f"wikidata-{target_entity}-lexemes.json.bz2"
if fileurl in dump_filenames:
file_url = f"wikidata-{target_entity}-lexemes.json.bz2"

if file_url in dump_filenames:
return f"{base_url}/{target_entity}/{file_url}"

if target_entity != "latest-lexemes":
try:
if parse_date(target_entity):
target_entity = target_entity.replace("/", "").replace("-", "")
return try_old_dump(target_entity)
return check_wd_dump_exists(target_entity)

except requests.exceptions.HTTPError as http_err:
print(
Expand All @@ -175,16 +181,19 @@ def try_old_dump(target_entity):
if user_input == "y" or user_input == "":
if other_old_dumps:
closest_date = available_closest_lexeme_dumpfile(
target_entity, other_old_dumps, try_old_dump
target_entity, other_old_dumps, check_wd_dump_exists
)
print(
f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}"
)
fileurl = f"{closest_date}/wikidata-{closest_date}-lexemes.json.bz2"

if closest_date:
return f"{base_url}/{fileurl}"

else:
return

return other_old_dumps

try:
Expand Down
3 changes: 2 additions & 1 deletion tests/cli/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def test_invalid_arguments(self):

# MARK: All Data

# Using sparql based data extract tests
# Note: Wikidata dumps are required for extracting all data.

# @patch("scribe_data.cli.get.query_data")
# def test_get_all_data_types_for_language(self, mock_query_data):
# get_data(all=True, language="English")
Expand Down

0 comments on commit c074ae2

Please sign in to comment.