Skip to content

Commit

Permalink
Move files from Wiktionary utils to WD utils - delete Wiktionary dir
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Dec 15, 2024
1 parent 29707db commit 27378b5
Show file tree
Hide file tree
Showing 10 changed files with 201 additions and 210 deletions.
1 change: 0 additions & 1 deletion docs/source/scribe_data/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Scribe-Data
unicode/index
wikidata/index
wikipedia/index
wiktionary/index

.. toctree::
:maxdepth: 1
Expand Down
6 changes: 0 additions & 6 deletions docs/source/scribe_data/wiktionary/index.rst

This file was deleted.

26 changes: 13 additions & 13 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Functions for downloading Wikidata dumps.
Functions for downloading Wikidata lexeme dumps.
.. raw:: html
<!--
Expand Down Expand Up @@ -29,7 +29,7 @@
from tqdm import tqdm

from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download
from scribe_data.wiktionary.wikitionary_utils import download_wiki_lexeme_dump
from scribe_data.wikidata.wikidata_utils import download_wiki_lexeme_dump


def download_wrapper(
Expand All @@ -41,25 +41,20 @@ def download_wrapper(
wikidata_dump: Optional date string in YYYYMMDD format for specific dumps
output_dir: Optional directory path for the downloaded file. Defaults to 'scribe_data_wikidumps' directory
"""
dump_url = download_wiki_lexeme_dump(
"latest-lexemes" if not wikidata_dump else wikidata_dump
)
dump_url = download_wiki_lexeme_dump(wikidata_dump or "latest-lexemes")

if not dump_url:
rprint("[bold red]No dump URL found.[/bold red]")
return False

try:
output_dir = output_dir if output_dir else DEFAULT_DUMP_EXPORT_DIR
output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR

os.makedirs(output_dir, exist_ok=True)

# Don't check for lexeme if date given
# Don't check for lexeme if date given.
if not wikidata_dump:
useable_file_dir = check_lexeme_dump_prompt_download(output_dir)

# Check for existing .json.bz2 files
if useable_file_dir:
if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
return useable_file_dir

filename = dump_url.split("/")[-1]
Expand All @@ -68,13 +63,13 @@ def download_wrapper(
user_response = (
input(
"We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
"Do you want to proceed? (Yes/Cancel): "
"Do you want to proceed? (y/n): "
)
.strip()
.lower()
)

if user_response == "yes" or user_response == "":
if user_response == "y":
rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

response = requests.get(dump_url, stream=True)
Expand All @@ -90,9 +85,14 @@ def download_wrapper(
pbar.update(len(chunk))

rprint("[bold green]Download completed successfully![/bold green]")

return output_path

else:
return

except requests.exceptions.RequestException as e:
rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

except Exception as e:
rprint(f"[bold red]An error occurred: {e}[/bold red]")
6 changes: 3 additions & 3 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ def get_data(
subprocess_result = False

# MARK: Get All
if all:
# Using wikimedia lexeme based dump

if all:
# Using Wikidata lexeme based dumps.
if wikidata_dump:
print("wikidata_dump", wikidata_dump)
download_wrapper(None, wikidata_dump)
Expand All @@ -125,7 +125,7 @@ def get_data(
"[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
)

# Using sparql based data extract
# Using Wikidata Query Service based data extraction.

# if language:
# language_or_sub_language = language.split(" ")[0]
Expand Down
39 changes: 30 additions & 9 deletions src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@
import json
import os
import re
from datetime import datetime
from importlib import resources
from pathlib import Path
from typing import Any, Optional

from rich import print as rprint
from datetime import datetime

# MARK: Utils Variables

Expand Down Expand Up @@ -620,57 +621,77 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages):


def camel_to_snake(name: str) -> str:
    """
    Convert camelCase to snake_case.

    Parameters
    ----------
    name : str
        The camelCase string to convert.

    Returns
    -------
    str
        The given string converted to snake_case.
    """
    # Insert an underscore before every uppercase letter except a leading one,
    # then lowercase the whole string.
    return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()


# MARK : Check Dump
# MARK: Check Dump


def check_lexeme_dump_prompt_download(output_dir: str):
    """
    Checks to see if a Wikidata lexeme dump exists and prompts the user to download one if not.

    Parameters
    ----------
    output_dir : str
        The directory to check for the existence of a Wikidata lexeme dump.

    Returns
    -------
    The result of the interactive prompt:
        - Path : the existing dump to reuse (user chose 'u').
        - True : skip the download (user chose 's' or declined after deleting).
        - False : proceed with a fresh download (user chose 'd' then 'y').
        - None : no existing dumps were found (caller should download).
    """
    existing_dumps = list(Path(output_dir).glob("*.json.bz2"))
    if existing_dumps:
        rprint("[bold yellow]Existing dump files found:[/bold yellow]")
        for dump in existing_dumps:
            rprint(f" - {Path(output_dir)}/{dump.name}")

        user_input = input(
            "\nDo you want to:\n - Delete existing dumps (d)?\n - Skip download (s)?\n - Use existing latest dump (u)?\n -Download new version(n)?\n[d/s/u/n]: "
        ).lower()

        if user_input == "d":
            for dump in existing_dumps:
                dump.unlink()

            rprint("[bold green]Existing dumps deleted.[/bold green]")
            user_input = input("Do you want to download latest lexeme dump? (y/N): ")
            # True means "skip download"; only an explicit 'y' triggers a download.
            return user_input != "y"

        elif user_input == "u":
            # Check for the latest dump file.
            latest_dump = None
            if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps):
                latest_dump = Path(output_dir) / "latest-lexemes.json.bz2"

            else:
                # Extract dates from filenames using datetime validation.
                dated_dumps = []
                for dump in existing_dumps:
                    parts = dump.stem.split("-")
                    if len(parts) > 1:
                        try:
                            date = datetime.strptime(parts[1], "%Y%m%d")
                            dated_dumps.append((dump, date))

                        except ValueError:
                            continue  # skip files without a valid date

                if dated_dumps:
                    # Find the dump with the most recent date.
                    latest_dump = max(dated_dumps, key=lambda x: x[1])[0]

            if latest_dump:
                rprint(f"[bold green]Using latest dump:[/bold green] {latest_dump}")
                return latest_dump

            else:
                rprint("[bold red]No valid dumps found.[/bold red]")
                return None

        else:
            rprint("[bold blue]Skipping download.[/bold blue]")
            return True
155 changes: 155 additions & 0 deletions src/scribe_data/wikidata/wikidata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,163 @@
-->
"""

import re
from datetime import datetime

import requests
from SPARQLWrapper import JSON, POST, SPARQLWrapper

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod(POST)


def parse_date(date_string):
    """
    Parses a date string into a `datetime.date` object.

    Supported formats:
        - YYYYMMDD
        - YYYY/MM/DD
        - YYYY-MM-DD

    Args:
        date_string (str): The date string to be parsed.

    Returns:
        datetime.date: Parsed date object if the format is valid.
        None: If the date format is invalid.
    """
    # Try each accepted layout in turn; the first successful parse wins.
    for candidate_format in ("%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"):
        try:
            return datetime.strptime(date_string, candidate_format).date()

        except ValueError:
            pass

    print(
        f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD."
    )
    return None


def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_dump):
    """
    Finds the closest available dump file based on the target date.

    Args:
        target_entity (str): The target date for which the dump is requested (format: YYYY/MM/DD or similar).
        other_old_dumps (list): List of available dump folders as strings.
        try_old_dump (function): A function to validate if the dump file exists.

    Returns:
        str: The closest available dump file date (as a string).
        None: If no suitable dump is found.
    """
    available_dates = []
    target_date = parse_date(target_entity)
    closest_date = None
    closest_diff = None

    if target_date:
        for i in other_old_dumps:
            # Skip the parent-directory entry from the HTML listing.
            if i == "..":
                continue

            try:
                if try_old_dump(i):
                    available_dates.append(i)
                    current_date = parse_date(i)

                    # Fix: folder names that are not valid dates (e.g. "dcatap")
                    # made the subtraction below raise a TypeError.
                    if current_date is None:
                        continue

                    diff = abs((current_date - target_date).days)

                    if closest_diff is None or diff < closest_diff:
                        closest_date = i
                        closest_diff = diff

                    # Listings are chronological, so once we pass the target
                    # date no later folder can be closer.
                    if current_date >= target_date:
                        break

            except requests.exceptions.HTTPError:
                pass

    return closest_date


def download_wiki_lexeme_dump(target_entity="latest-lexemes"):
    """
    Downloads a Wikimedia lexeme dump based on the specified target entity or date.

    Args:
        target_entity (str, optional): The target dump to download. Defaults to "latest-lexemes".
            - If "latest-lexemes", downloads the latest dump.
            - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date.

    Returns:
        str: The URL of the requested or closest available dump.
        None: If no suitable dump is found or the request fails.
    """
    base_url = "https://dumps.wikimedia.org/wikidatawiki/entities"

    def try_old_dump(target_entity):
        """
        Checks if the specified dump file exists for a target entity.

        Args:
            target_entity (str): The target entity or date folder to check.

        Returns:
            str: The URL of the dump file if it exists.
            None: If the dump file does not exist.
        """
        entity_url = f"{base_url}/{target_entity}/"
        entity_response = requests.get(entity_url)
        entity_response.raise_for_status()
        dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text)

        fileurl = f"wikidata-{target_entity}-lexemes.json.bz2"
        if fileurl in dump_filenames:
            return f"{base_url}/{target_entity}/{fileurl}"

    if target_entity != "latest-lexemes":
        try:
            if parse_date(target_entity):
                # Normalize YYYY/MM/DD or YYYY-MM-DD to the YYYYMMDD folder name.
                target_entity = target_entity.replace("/", "").replace("-", "")
                return try_old_dump(target_entity)

        except requests.exceptions.HTTPError as http_err:
            print(
                f"HTTP error occurred: {http_err} Status code: {http_err.response.status_code}"
            )
            print("We could not find your requested Wikidata lexeme dump.")

            response = requests.get(base_url)
            other_old_dumps = re.findall(r'href="([^"]+)/"', response.text)

            user_input = input(
                "Do you want to see the closest available older dumps? [Y/n]"
            ).lower()

            if user_input == "n":
                return

            if user_input == "y" or user_input == "":
                if other_old_dumps:
                    closest_date = available_closest_lexeme_dumpfile(
                        target_entity, other_old_dumps, try_old_dump
                    )

                    # Fix: verify a close dump exists BEFORE using it —
                    # parse_date(None) raised a TypeError when none was found.
                    if not closest_date:
                        return

                    print(
                        f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}"
                    )
                    fileurl = f"{closest_date}/wikidata-{closest_date}-lexemes.json.bz2"
                    return f"{base_url}/{fileurl}"

                return other_old_dumps

    try:
        response = requests.get(base_url)
        response.raise_for_status()
        latest_dump = re.findall(r'href="([^"]+)"', response.text)

        # Fix: check for the lexemes dump we actually return, not
        # latest-all.json.bz2 (a different, unrelated dump file).
        if "latest-lexemes.json.bz2" in latest_dump:
            return f"{base_url}/latest-lexemes.json.bz2"

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
Empty file.
Empty file.
Empty file.
Loading

0 comments on commit 27378b5

Please sign in to comment.