scribe-org · andrewtavis · Dec 16, 2024 · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024
diff --git a/.gitignore b/.gitignore
@@ -40,3 +40,7 @@ scribe_data_csv_export/*
 scribe_data_json_export/*
 scribe_data_sqlite_export/*
 scribe_data_tsv_export/*
+
+# MARK: Wiki Dumps
+
+*.json.bz2
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,7 +10,13 @@ Scribe-Data tries to follow [semantic versioning](https://semver.org/), a MAJOR.
 
 Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/).
 
-## [Upcoming] Scribe-Data 4.x
+## [Upcoming] Scribe-Data 5.0.0
+
+### ✨ Features
+
+- Scribe-Data now has the ability to download the most recent or a specific Wikidata lexemes dump ([#517](https://github.com/scribe-org/Scribe-Data/issues/517)).
+  - The user is prompted to download a dump when requesting all data ([#518](https://github.com/scribe-org/Scribe-Data/issues/518)).
+  - Scribe-Data must now use a lexeme dump to download all Wikidata lexeme data ([#519](https://github.com/scribe-org/Scribe-Data/issues/519)).
 
 ## Scribe-Data 4.1.0
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -287,6 +287,33 @@ Scribe does not accept direct edits to the grammar JSON files as they are source
 
 The documentation for Scribe-Data can be found at [scribe-data.readthedocs.io](https://scribe-data.readthedocs.io/en/latest/). Documentation is an invaluable way to contribute to coding projects as it allows others to more easily understand the project structure and contribute. Issues related to documentation are marked with the [`documentation`](https://github.com/scribe-org/Scribe-Data/labels/documentation) label.
 
+### Function Docstrings
+
+Scribe-Data generally follows [NumPy conventions](https://numpydoc.readthedocs.io/en/latest/format.html) for documenting functions and Python code in general. Function docstrings should have the following format:
+
+```py
+def example_function(argument: argument_type) -> return_type:
+    """
+    An example docstring for a function so others understand your work.
+
+    Parameters
+    ----------
+        argument : argument_type
+            Description of your argument.
+
+    Returns
+    -------
+        return_value : return_type
+            Description of your return value.
+    """
+
+    ...
+
+    return return_value
+```
+
+### Building the Docs
+
 Use the following commands to build the documentation locally:
 
 ```bash

diff --git a/docs/source/scribe_data/index.rst b/docs/source/scribe_data/index.rst
@@ -10,7 +10,6 @@ Scribe-Data
     unicode/index
     wikidata/index
     wikipedia/index
-    wiktionary/index
 
 .. toctree::
     :maxdepth: 1

diff --git a/docs/source/scribe_data/wiktionary/index.rst b/docs/source/scribe_data/wiktionary/index.rst
diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py
@@ -0,0 +1,282 @@
+"""
+Functions for downloading Wikidata lexeme dumps.
+
+.. raw:: html
+    <!--
+    * Copyright (C) 2024 Scribe
+    *
+    * This program is free software: you can redistribute it and/or modify
+    * it under the terms of the GNU General Public License as published by
+    * the Free Software Foundation, either version 3 of the License, or
+    * (at your option) any later version.
+    *
+    * This program is distributed in the hope that it will be useful,
+    * but WITHOUT ANY WARRANTY; without even the implied warranty of
+    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    * GNU General Public License for more details.
+    *
+    * You should have received a copy of the GNU General Public License
+    * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+    -->
+"""
+
+import contextlib
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import requests
+from rich import print as rprint
+from tqdm import tqdm
+
+from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download
+
+
+def parse_date(date_string):
+    """
+    Parses a date string into a `datetime.date` object.
+
+    Supported formats:
+        - YYYYMMDD
+        - YYYY/MM/DD
+        - YYYY-MM-DD
+
+    Parameters
+    ----------
+        date_string : str
+            The date string to be parsed.
+
+    Returns
+    -------
+        datetime.date : Parsed date object if the format is valid.
+        None : If the input is not a string in a supported format (a message is printed).
+    """
+    formats = ["%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"]
+    for fmt in formats:
+        try:
+            return datetime.strptime(date_string, fmt).date()
+        # TypeError covers non-string input such as None, which strptime rejects.
+        except (ValueError, TypeError):
+            continue
+
+    print(
+        f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD."
+    )
+    return None
+
+
+def available_closest_lexeme_dumpfile(
+    target_entity: str, other_old_dumps: list, check_wd_dump_exists
+):
+    """
+    Finds the closest available dump file based on the target date.
+
+    Parameters
+    ----------
+        target_entity : str
+            The target date for which the dump is requested (format: YYYY/MM/DD or similar).
+
+        other_old_dumps : list
+            Dump folder names to try in order; the search stops at the first existing dump on or after the target.
+
+        check_wd_dump_exists : function
+            A function to validate if the dump file exists.
+
+    Returns
+    -------
+        str : The closest available dump file date (as a string).
+        None : If the target date is invalid or no suitable dump is found.
+    """
+    target_date = parse_date(target_entity)
+    closest_date, closest_diff = None, None
+
+    if target_date:
+        for i in other_old_dumps:
+            if i == "..":
+                continue
+
+            with contextlib.suppress(requests.exceptions.HTTPError):
+                if check_wd_dump_exists(i):
+                    current_date = parse_date(i)
+                    # Skip folder names that are not dates; they cannot be compared.
+                    if current_date is None:
+                        continue
+
+                    diff = abs((current_date - target_date).days)
+                    if closest_diff is None or diff < closest_diff:
+                        closest_date = i
+                        closest_diff = diff
+
+                    if current_date >= target_date:
+                        break
+
+    return closest_date
+
+
+def download_wd_lexeme_dump(target_entity: str = "latest-lexemes"):
+    """
+    Finds the URL of a Wikidata lexeme dump for the specified target entity or date.
+
+    Parameters
+    ----------
+        target_entity : str, optional
+            The target dump to look up. Defaults to "latest-lexemes".
+
+            - If "latest-lexemes", looks up the latest dump.
+            - If a valid date (e.g., YYYYMMDD), looks up the dump for that date.
+
+    Returns
+    -------
+        str : The URL of the requested or closest available dump.
+        None : If no suitable dump is found, the user declines, or the request fails.
+    """
+    base_url = "https://dumps.wikimedia.org/wikidatawiki/entities"
+
+    def check_wd_dump_exists(target_entity):
+        """
+        Checks if the specified dump file exists for a target entity.
+
+        Parameters
+        ----------
+            target_entity : str
+                The target entity or date folder to check.
+
+        Returns
+        -------
+            str : The URL of the dump file if it exists.
+            None : If the dump file does not exist (HTTP errors are raised, not returned).
+        """
+        entity_url = f"{base_url}/{target_entity}/"
+        entity_response = requests.get(entity_url)
+        entity_response.raise_for_status()
+        dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text)
+
+        file_url = f"wikidata-{target_entity}-lexemes.json.bz2"
+
+        if file_url in dump_filenames:
+            return f"{base_url}/{target_entity}/{file_url}"
+
+    if target_entity != "latest-lexemes":
+        try:
+            if parse_date(target_entity):
+                target_entity = target_entity.replace("/", "").replace("-", "")
+                return check_wd_dump_exists(target_entity)
+
+        except requests.exceptions.HTTPError as http_err:
+            print(
+                f"HTTP error occurred: {http_err} Status code: {http_err.response.status_code}"
+            )
+            print("We could not find your requested Wikidata lexeme dump.")
+
+            response = requests.get(base_url)
+            other_old_dumps = re.findall(r'href="([^"]+)/"', response.text)
+
+            user_input = (
+                input("Do you want to see the closest available older dumps? [Y/n]")
+                .strip()
+                .lower()
+            )
+
+            # The [Y/n] prompt advertises "yes" as the default, so a blank answer accepts.
+            if user_input not in ("", "y", "yes"):
+                return None
+
+            if not other_old_dumps:
+                return None
+
+            closest_date = available_closest_lexeme_dumpfile(
+                target_entity, other_old_dumps, check_wd_dump_exists
+            )
+            # Nothing to report or return when no existing dump was found.
+            if not closest_date:
+                return None
+
+            print(
+                f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}"
+            )
+            return f"{base_url}/{closest_date}/wikidata-{closest_date}-lexemes.json.bz2"
+
+    try:
+        response = requests.get(base_url)
+        response.raise_for_status()
+        latest_dump = re.findall(r'href="([^"]+)"', response.text)
+        # Verify the file whose URL is returned, not the full-entity dump.
+        if "latest-lexemes.json.bz2" in latest_dump:
+            return f"{base_url}/latest-lexemes.json.bz2"
+
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+
+
+def wd_lexeme_dump_download_wrapper(
+    wikidata_dump: Optional[str] = None, output_dir: Optional[str] = None
+) -> None:
+    """
+    Download Wikidata lexeme dumps given user preferences.
+
+    Parameters
+    ----------
+        wikidata_dump : str
+            Optional date string in YYYYMMDD format for specific dumps.
+
+        output_dir : str
+            Optional directory path for the downloaded file (defaults to DEFAULT_DUMP_EXPORT_DIR).
+
+    Returns
+    -------
+        str : Path of the downloaded file, or the truthy result of the existing dump check.
+        False : If no dump URL could be determined.
+        None : If the user declines the download or an error is reported.
+    """
+    dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes")
+
+    if not dump_url:
+        rprint("[bold red]No dump URL found.[/bold red]")
+        return False
+
+    try:
+        output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Don't check for lexeme if date given.
+        if not wikidata_dump:
+            if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
+                return useable_file_dir
+
+        filename = dump_url.split("/")[-1]
+        output_path = str(Path(output_dir) / filename)
+
+        prompt = (
+            "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
+            "\nDo you want to proceed? (y/n): "
+        )
+        user_response = input(prompt).strip().lower()
+
+        if user_response == "y":
+            rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")
+
+            response = requests.get(dump_url, stream=True)
+            # Fail before writing so an HTTP error page is never saved as the dump.
+            response.raise_for_status()
+            total_size = int(response.headers.get("content-length", 0))
+
+            with open(output_path, "wb") as f:
+                with tqdm(
+                    total=total_size, unit="iB", unit_scale=True, desc=output_path
+                ) as pbar:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+                            pbar.update(len(chunk))
+
+            rprint("[bold green]Download completed successfully![/bold green]")
+
+            return output_path
+
+    except requests.exceptions.RequestException as e:
+        rprint(f"[bold red]Error downloading dump: {e}[/bold red]")
+
+    except Exception as e:
+        rprint(f"[bold red]An error occurred: {e}[/bold red]")