Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added download cli cmd #528

Merged
merged 17 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ scribe_data_csv_export/*
scribe_data_json_export/*
scribe_data_sqlite_export/*
scribe_data_tsv_export/*

# MARK: Wiki Dumps

*.json.bz2
1 change: 0 additions & 1 deletion docs/source/scribe_data/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Scribe-Data
unicode/index
wikidata/index
wikipedia/index
wiktionary/index

.. toctree::
:maxdepth: 1
Expand Down
6 changes: 0 additions & 6 deletions docs/source/scribe_data/wiktionary/index.rst

This file was deleted.

98 changes: 98 additions & 0 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
Functions for downloading Wikidata lexeme dumps.

.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import os
from pathlib import Path
from typing import Optional

import requests
from rich import print as rprint
from tqdm import tqdm

from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download
from scribe_data.wikidata.wikidata_utils import download_wiki_lexeme_dump


def download_wrapper(
    wikidata_dump: Optional[str] = None, output_dir: Optional[str] = None
) -> Optional[str]:
    """
    Download Wikidata lexeme dumps.

    Parameters
    ----------
    wikidata_dump : str, optional
        Date string in YYYYMMDD format for a specific dump. When omitted,
        the latest available lexeme dump is used.

    output_dir : str, optional
        Directory path for the downloaded file. Defaults to
        DEFAULT_DUMP_EXPORT_DIR when omitted.

    Returns
    -------
    str or None
        The path of the downloaded dump file (or of a pre-existing local dump
        the user chose to reuse), or None if the user declines the download
        or an error occurs.
    """
    dump_url = download_wiki_lexeme_dump(wikidata_dump or "latest-lexemes")

    if not dump_url:
        rprint("[bold red]No dump URL found.[/bold red]")
        return None  # consistent failure value with the other error paths

    try:
        output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR
        os.makedirs(output_dir, exist_ok=True)

        # Only offer to reuse an existing local dump when no explicit date
        # was requested; a dated request should always fetch that dump.
        if not wikidata_dump:
            if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
                return useable_file_dir

        filename = dump_url.split("/")[-1]
        output_path = str(Path(output_dir) / filename)

        user_response = (
            input(
                "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. "
                "Do you want to proceed? (y/n): "
            )
            .strip()
            .lower()
        )

        if user_response != "y":
            return None

        rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

        response = requests.get(dump_url, stream=True)
        # Surface HTTP errors (404, 5xx, ...) instead of writing an error
        # page to disk as if it were the dump.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))

        with open(output_path, "wb") as f, tqdm(
            total=total_size, unit="iB", unit_scale=True, desc=output_path
        ) as pbar:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))

        rprint("[bold green]Download completed successfully![/bold green]")
        return output_path

    except requests.exceptions.RequestException as e:
        rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

    except Exception as e:
        rprint(f"[bold red]An error occurred: {e}[/bold red]")
94 changes: 61 additions & 33 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
from pathlib import Path
from typing import List, Union

from rich import print as rprint

from scribe_data.cli.convert import convert_wrapper
from scribe_data.cli.download import download_wrapper
from scribe_data.unicode.generate_emoji_keywords import generate_emoji
from scribe_data.utils import (
DEFAULT_CSV_EXPORT_DIR,
Expand All @@ -46,6 +49,7 @@ def get_data(
all: bool = False,
interactive: bool = False,
identifier_case: str = "camel",
wikidata_dump: str = None,
) -> None:
"""
Function for controlling the data get process for the CLI.
Expand Down Expand Up @@ -79,6 +83,9 @@ def get_data(
identifier_case : str
The case format for identifiers. Default is "camel".

wikidata_dump : str
The local Wikidata dump that should be used to get data.

Returns
-------
The requested data saved locally given file type and location arguments.
Expand All @@ -99,41 +106,61 @@ def get_data(
subprocess_result = False

# MARK: Get All

if all:
if language:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language for {language.title()}")
query_data(
languages=[language_or_sub_language],
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all data types with specified language for {language.title()}."
)

elif data_type:
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
data_type=[data_type],
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all languages with specified data type for {data_type.capitalize()}."
)
# Using Wikidata lexeme based dumps.
if wikidata_dump:
print("wikidata_dump", wikidata_dump)
download_wrapper(None, wikidata_dump)

else:
print("Updating all languages and data types...")
query_data(
languages=None,
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print("Query completed for all languages and all data types.")
print("Using Wikidata lexeme dump...")
file_path = download_wrapper()
if isinstance(file_path, str) and file_path:
rprint(
"[bold green]We'll use the following lexeme dump[/bold green]",
file_path,
)
rprint(
"[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
)

# Using Wikidata Query Service based data extraction.

# if language:
# language_or_sub_language = language.split(" ")[0]
# print(f"Updating all data types for language for {language.title()}")
# query_data(
# languages=[language_or_sub_language],
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all data types with specified language for {language.title()}."
# )

# elif data_type:
# print(f"Updating all languages for data type: {data_type.capitalize()}")
# query_data(
# languages=None,
# data_type=[data_type],
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all languages with specified data type for {data_type.capitalize()}."
# )

# else:
# print("Updating all languages and data types...")
# query_data(
# languages=None,
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print("Query completed for all languages and all data types.")

subprocess_result = True

Expand Down Expand Up @@ -172,7 +199,8 @@ def get_data(
or isinstance(subprocess_result, bool)
and subprocess_result
):
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")
if not all:
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")

json_input_path = Path(output_dir) / f"{language}/{data_type}.json"

Expand Down
75 changes: 62 additions & 13 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from scribe_data.cli.total import total_wrapper
from scribe_data.cli.upgrade import upgrade_cli
from scribe_data.cli.version import get_version_message
from scribe_data.cli.download import download_wrapper

LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for."
GET_DESCRIPTION = (
Expand Down Expand Up @@ -159,6 +160,12 @@ def main() -> None:
default="camel",
help="The case format for identifiers in the output data (default: camel).",
)
get_parser.add_argument(
"-wd",
"--wikidata-dump-path",
type=str,
help="Path to a local Wikidata lexemes dump required for running with '--all'.",
)

# MARK: Total

Expand Down Expand Up @@ -265,22 +272,34 @@ def main() -> None:
help="Convert all languages and data types.",
)

# MARK: Setup CLI
# MARK: Download

args = parser.parse_args()

if args.data_type and isinstance(args.data_type, str):
args.data_type = args.data_type.replace("-", "_")
download_parser = subparsers.add_parser(
"download",
aliases=["d"],
help="Download Wikidata dumps.",
description="Download Wikidata dumps from dumps.wikimedia.org.",
epilog=CLI_EPILOG,
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60),
)
download_parser._actions[0].help = "Show this help message and exit."
download_parser.add_argument(
"-wd",
"--wikidata-dump-version",
nargs="?",
const="latest",
help="Download Wikidata dump. Optionally specify date in YYYYMMDD format.",
)
download_parser.add_argument(
"-od",
"--output-dir",
type=str,
help="The output directory path for the downloaded dump.",
)

try:
if args.language or args.data_type:
validate_language_and_data_type(
language=args.language, data_type=args.data_type
)
# MARK: Setup CLI

except ValueError as e:
print(f"Input validation failed with error: {e}")
return
args = parser.parse_args()

if args.upgrade:
upgrade_cli()
Expand All @@ -291,6 +310,27 @@ def main() -> None:
return

try:
# Only validate language and data_type for relevant commands
if args.command in ["list", "l", "get", "g", "total", "t", "convert", "c"]:
if (
hasattr(args, "data_type")
and args.data_type
and isinstance(args.data_type, str)
):
args.data_type = args.data_type.replace("-", "_")

if hasattr(args, "language") or hasattr(args, "data_type"):
try:
validate_language_and_data_type(
language=args.language if hasattr(args, "language") else None,
data_type=args.data_type
if hasattr(args, "data_type")
else None,
)
except ValueError as e:
print(f"Input validation failed with error: {e}")
return

if args.command in ["list", "l"]:
list_wrapper(
language=args.language, data_type=args.data_type, all_bool=args.all
Expand All @@ -314,6 +354,7 @@ def main() -> None:
overwrite=args.overwrite,
all=args.all,
identifier_case=args.identifier_case,
wiki_dump=args.wikidata_dump,
)

elif args.command in ["total", "t"]:
Expand Down Expand Up @@ -345,6 +386,14 @@ def main() -> None:
all=args.all,
)

elif args.command in ["download", "d"]:
download_wrapper(
wikidata_dump=args.wikidata_dump
if args.wikidata_dump != "latest"
else None,
output_dir=args.output_dir,
)

else:
parser.print_help()

Expand Down
Loading
Loading