Skip to content

Commit

Permalink
Move files from Wiktionary utils to WD utils - delete Wiktionary dir
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Dec 15, 2024
1 parent 29707db commit 27378b5
Show file tree
Hide file tree
Showing 10 changed files with 201 additions and 210 deletions.
1 change: 0 additions & 1 deletion docs/source/scribe_data/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Scribe-Data
unicode/index
wikidata/index
wikipedia/index
wiktionary/index

.. toctree::
:maxdepth: 1
Expand Down
6 changes: 0 additions & 6 deletions docs/source/scribe_data/wiktionary/index.rst

This file was deleted.

26 changes: 13 additions & 13 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Functions for downloading Wikidata dumps.
Functions for downloading Wikidata lexeme dumps.
.. raw:: html
<!--
Expand Down Expand Up @@ -29,7 +29,7 @@
from tqdm import tqdm

from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download
from scribe_data.wiktionary.wikitionary_utils import download_wiki_lexeme_dump
from scribe_data.wikidata.wikidata_utils import download_wiki_lexeme_dump


def download_wrapper(
Expand All @@ -41,25 +41,20 @@ def download_wrapper(
wikidata_dump: Optional date string in YYYYMMDD format for specific dumps
output_dir: Optional directory path for the downloaded file. Defaults to 'scribe_data_wikidumps' directory
"""
dump_url = download_wiki_lexeme_dump(
"latest-lexemes" if not wikidata_dump else wikidata_dump
)
dump_url = download_wiki_lexeme_dump(wikidata_dump or "latest-lexemes")

if not dump_url:
rprint("[bold red]No dump URL found.[/bold red]")
return False

try:
output_dir = output_dir if output_dir else DEFAULT_DUMP_EXPORT_DIR
output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR

os.makedirs(output_dir, exist_ok=True)

# Don't check for lexeme if date given
# Don't check for lexeme if date given.
if not wikidata_dump:
useable_file_dir = check_lexeme_dump_prompt_download(output_dir)

# Check for existing .json.bz2 files
if useable_file_dir:
if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
return useable_file_dir

filename = dump_url.split("/")[-1]
Expand All @@ -68,13 +63,13 @@ def download_wrapper(
user_response = (
input(
"We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
"Do you want to proceed? (Yes/Cancel): "
"Do you want to proceed? (y/n): "
)
.strip()
.lower()
)

if user_response == "yes" or user_response == "":
if user_response == "y":
rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

response = requests.get(dump_url, stream=True)
Expand All @@ -90,9 +85,14 @@ def download_wrapper(
pbar.update(len(chunk))

rprint("[bold green]Download completed successfully![/bold green]")

return output_path

else:
return

except requests.exceptions.RequestException as e:
rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

except Exception as e:
rprint(f"[bold red]An error occurred: {e}[/bold red]")
6 changes: 3 additions & 3 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ def get_data(
subprocess_result = False

# MARK: Get All
if all:
# Using wikimedia lexeme based dump

if all:
# Using Wikidata lexeme based dumps.
if wikidata_dump:
print("wikidata_dump", wikidata_dump)
download_wrapper(None, wikidata_dump)
Expand All @@ -125,7 +125,7 @@ def get_data(
"[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
)

# Using sparql based data extract
# Using Wikidata Query Service based data extraction.

# if language:
# language_or_sub_language = language.split(" ")[0]
Expand Down
39 changes: 30 additions & 9 deletions src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@
import json
import os
import re
from datetime import datetime
from importlib import resources
from pathlib import Path
from typing import Any, Optional

from rich import print as rprint
from datetime import datetime

# MARK: Utils Variables

Expand Down Expand Up @@ -620,57 +621,77 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages):


def camel_to_snake(name: str) -> str:
    """
    Convert camelCase to snake_case.

    Parameters
    ----------
    name : str
        The camelCase string to convert.

    Returns
    -------
    str
        The given string converted to snake_case.
    """
    # Insert an underscore before every uppercase letter except a leading one,
    # then lowercase the whole string.
    return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()


# MARK : Check Dump
# MARK: Check Dump


def check_lexeme_dump_prompt_download(output_dir: str):
    """
    Checks to see if a Wikidata lexeme dump exists and prompts the user to download one if not.

    Parameters
    ----------
    output_dir : str
        The directory to check for the existence of a Wikidata lexeme dump.

    Returns
    -------
    The result of the interactive prompt:
        - Path : the existing dump to reuse (user chose 'u').
        - True : skip the download (user chose 's' or declined after deleting).
        - False : proceed with a fresh download (user chose 'd' then 'y').
        - None : no existing dumps were found (caller should download).
    """
    existing_dumps = list(Path(output_dir).glob("*.json.bz2"))
    if existing_dumps:
        rprint("[bold yellow]Existing dump files found:[/bold yellow]")
        for dump in existing_dumps:
            rprint(f" - {Path(output_dir)}/{dump.name}")

        user_input = input(
            "\nDo you want to:\n - Delete existing dumps (d)?\n - Skip download (s)?\n - Use existing latest dump (u)?\n -Download new version(n)?\n[d/s/u/n]: "
        ).lower()

        if user_input == "d":
            for dump in existing_dumps:
                dump.unlink()

            rprint("[bold green]Existing dumps deleted.[/bold green]")
            user_input = input("Do you want to download latest lexeme dump? (y/N): ")
            # True means "skip download"; only an explicit 'y' triggers a download.
            return user_input != "y"

        elif user_input == "u":
            # Check for the latest dump file.
            latest_dump = None
            if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps):
                latest_dump = Path(output_dir) / "latest-lexemes.json.bz2"

            else:
                # Extract dates from filenames using datetime validation.
                dated_dumps = []
                for dump in existing_dumps:
                    parts = dump.stem.split("-")
                    if len(parts) > 1:
                        try:
                            date = datetime.strptime(parts[1], "%Y%m%d")
                            dated_dumps.append((dump, date))

                        except ValueError:
                            continue  # skip files without a valid date

                if dated_dumps:
                    # Find the dump with the most recent date.
                    latest_dump = max(dated_dumps, key=lambda x: x[1])[0]

            if latest_dump:
                rprint(f"[bold green]Using latest dump:[/bold green] {latest_dump}")
                return latest_dump

            else:
                rprint("[bold red]No valid dumps found.[/bold red]")
                return None

        else:
            rprint("[bold blue]Skipping download.[/bold blue]")
            return True
155 changes: 155 additions & 0 deletions src/scribe_data/wikidata/wikidata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,163 @@
-->
"""

import re
from datetime import datetime

import requests
from SPARQLWrapper import JSON, POST, SPARQLWrapper

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod(POST)


def parse_date(date_string):
    """
    Parses a date string into a `datetime.date` object.

    Supported formats:
        - YYYYMMDD
        - YYYY/MM/DD
        - YYYY-MM-DD

    Args:
        date_string (str): The date string to be parsed.

    Returns:
        datetime.date: Parsed date object if the format is valid.
        None: If the date format is invalid.
    """
    # Try each accepted layout in turn; the first successful parse wins.
    for candidate_format in ("%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"):
        try:
            return datetime.strptime(date_string, candidate_format).date()

        except ValueError:
            pass

    print(
        f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD."
    )
    return None


def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_dump):
    """
    Finds the closest available dump file based on the target date.

    Args:
        target_entity (str): The target date for which the dump is requested (format: YYYY/MM/DD or similar).
        other_old_dumps (list): List of available dump folders as strings.
        try_old_dump (function): A function to validate if the dump file exists.

    Returns:
        str: The closest available dump file date (as a string).
        None: If no suitable dump is found.
    """
    available_dates = []
    target_date = parse_date(target_entity)
    closest_date = None
    closest_diff = None

    if target_date:
        for i in other_old_dumps:
            # Skip the parent-directory entry from the HTML listing.
            if i == "..":
                continue

            try:
                if try_old_dump(i):
                    available_dates.append(i)
                    current_date = parse_date(i)

                    # Fix: folder names that are not valid dates (e.g. "dcatap")
                    # made the subtraction below raise a TypeError.
                    if current_date is None:
                        continue

                    diff = abs((current_date - target_date).days)

                    if closest_diff is None or diff < closest_diff:
                        closest_date = i
                        closest_diff = diff

                    # Listings are chronological, so once we pass the target
                    # date no later folder can be closer.
                    if current_date >= target_date:
                        break

            except requests.exceptions.HTTPError:
                pass

    return closest_date


def download_wiki_lexeme_dump(target_entity="latest-lexemes"):
    """
    Downloads a Wikimedia lexeme dump based on the specified target entity or date.

    Args:
        target_entity (str, optional): The target dump to download. Defaults to "latest-lexemes".
            - If "latest-lexemes", downloads the latest dump.
            - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date.

    Returns:
        str: The URL of the requested or closest available dump.
        None: If no suitable dump is found or the request fails.
    """
    base_url = "https://dumps.wikimedia.org/wikidatawiki/entities"

    def try_old_dump(target_entity):
        """
        Checks if the specified dump file exists for a target entity.

        Args:
            target_entity (str): The target entity or date folder to check.

        Returns:
            str: The URL of the dump file if it exists.
            None: If the dump file does not exist.
        """
        entity_url = f"{base_url}/{target_entity}/"
        entity_response = requests.get(entity_url)
        entity_response.raise_for_status()
        dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text)

        fileurl = f"wikidata-{target_entity}-lexemes.json.bz2"
        if fileurl in dump_filenames:
            return f"{base_url}/{target_entity}/{fileurl}"

    if target_entity != "latest-lexemes":
        try:
            if parse_date(target_entity):
                # Normalize YYYY/MM/DD or YYYY-MM-DD to the YYYYMMDD folder name.
                target_entity = target_entity.replace("/", "").replace("-", "")
                return try_old_dump(target_entity)

        except requests.exceptions.HTTPError as http_err:
            print(
                f"HTTP error occurred: {http_err} Status code: {http_err.response.status_code}"
            )
            print("We could not find your requested Wikidata lexeme dump.")

            response = requests.get(base_url)
            other_old_dumps = re.findall(r'href="([^"]+)/"', response.text)

            user_input = input(
                "Do you want to see the closest available older dumps? [Y/n]"
            ).lower()

            if user_input == "n":
                return

            if user_input == "y" or user_input == "":
                if other_old_dumps:
                    closest_date = available_closest_lexeme_dumpfile(
                        target_entity, other_old_dumps, try_old_dump
                    )

                    # Fix: verify a close dump exists BEFORE using it —
                    # parse_date(None) raised a TypeError when none was found.
                    if not closest_date:
                        return

                    print(
                        f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}"
                    )
                    fileurl = f"{closest_date}/wikidata-{closest_date}-lexemes.json.bz2"
                    return f"{base_url}/{fileurl}"

                return other_old_dumps

    try:
        response = requests.get(base_url)
        response.raise_for_status()
        latest_dump = re.findall(r'href="([^"]+)"', response.text)

        # Fix: check for the lexemes dump we actually return, not
        # latest-all.json.bz2 (a different, unrelated dump file).
        if "latest-lexemes.json.bz2" in latest_dump:
            return f"{base_url}/latest-lexemes.json.bz2"

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
Empty file.
Empty file.
Empty file.
Loading

0 comments on commit 27378b5

Please sign in to comment.