Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added download cli cmd #528

Merged
merged 17 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ scribe_data_csv_export/*
scribe_data_json_export/*
scribe_data_sqlite_export/*
scribe_data_tsv_export/*

# MARK: Wiki Dumps

*.json.bz2
27 changes: 27 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,33 @@ Scribe does not accept direct edits to the grammar JSON files as they are source

The documentation for Scribe-Data can be found at [scribe-data.readthedocs.io](https://scribe-data.readthedocs.io/en/latest/). Documentation is an invaluable way to contribute to coding projects as it allows others to more easily understand the project structure and contribute. Issues related to documentation are marked with the [`documentation`](https://github.com/scribe-org/Scribe-Data/labels/documentation) label.

### Function Docstrings

Scribe-Data generally follows [NumPy conventions](https://numpydoc.readthedocs.io/en/latest/format.html) for documenting functions and Python code in general. Function docstrings should have the following format:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@axif0: Just added this to the contributing guide for directions on how to write docstrings that will be rendered properly in the docs. Can we ask you to familiarize yourself with them? :)


```py
def example_function(argument: argument_type) -> return_type:
"""
An example docstring for a function so others understand your work.

Parameters
----------
argument: argument_type
Description of your argument.

Returns
-------
return_value : return_type
Description of your return value.
"""

...

return return_value
```

### Building the Docs

Use the following commands to build the documentation locally:

```bash
Expand Down
1 change: 0 additions & 1 deletion docs/source/scribe_data/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Scribe-Data
unicode/index
wikidata/index
wikipedia/index
wiktionary/index

.. toctree::
:maxdepth: 1
Expand Down
6 changes: 0 additions & 6 deletions docs/source/scribe_data/wiktionary/index.rst

This file was deleted.

103 changes: 103 additions & 0 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
Functions for downloading Wikidata lexeme dumps.

.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import os
from pathlib import Path
from typing import Optional

import requests
from rich import print as rprint
from tqdm import tqdm

from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download
from scribe_data.wikidata.wikidata_utils import download_wiki_lexeme_dump


def download_wrapper(
    wikidata_dump: Optional[str] = None, output_dir: Optional[str] = None
) -> "str | bool | None":
    """
    Download Wikidata lexeme dumps given user preferences.

    Parameters
    ----------
    wikidata_dump : str
        Optional date string in YYYYMMDD format for specific dumps.

    output_dir : str
        Optional directory path for the downloaded file. Defaults to the
        'scribe_data_wikidumps' directory (DEFAULT_DUMP_EXPORT_DIR).

    Returns
    -------
    str | bool | None
        The path to the downloaded (or already available) dump file, False if
        no dump URL could be resolved, or None if the user declines the
        download or an error occurs.
    """
    dump_url = download_wiki_lexeme_dump(wikidata_dump or "latest-lexemes")

    if not dump_url:
        rprint("[bold red]No dump URL found.[/bold red]")
        return False

    try:
        output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR

        os.makedirs(output_dir, exist_ok=True)

        # Don't check for an existing lexeme dump if an explicit date is given.
        if not wikidata_dump:
            if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
                return useable_file_dir

        filename = dump_url.split("/")[-1]
        output_path = str(Path(output_dir) / filename)

        user_response = (
            input(
                "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. "
                "Do you want to proceed? (y/n): "
            )
            .strip()
            .lower()
        )

        if user_response == "y":
            rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

            response = requests.get(dump_url, stream=True)
            # Fail fast on HTTP errors (404, 5xx, ...) instead of silently
            # writing an error page to disk as if it were the dump.
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))

            with open(output_path, "wb") as f:
                with tqdm(
                    total=total_size, unit="iB", unit_scale=True, desc=output_path
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))

            rprint("[bold green]Download completed successfully![/bold green]")

            return output_path

        else:
            # User declined the download.
            return None

    except requests.exceptions.RequestException as e:
        rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

    except Exception as e:
        rprint(f"[bold red]An error occurred: {e}[/bold red]")
94 changes: 61 additions & 33 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
from pathlib import Path
from typing import List, Union

from rich import print as rprint

from scribe_data.cli.convert import convert_wrapper
from scribe_data.cli.download import download_wrapper
from scribe_data.unicode.generate_emoji_keywords import generate_emoji
from scribe_data.utils import (
DEFAULT_CSV_EXPORT_DIR,
Expand All @@ -46,6 +49,7 @@ def get_data(
all: bool = False,
interactive: bool = False,
identifier_case: str = "camel",
wikidata_dump: str = None,
) -> None:
"""
Function for controlling the data get process for the CLI.
Expand Down Expand Up @@ -79,6 +83,9 @@ def get_data(
identifier_case : str
The case format for identifiers. Default is "camel".

wikidata_dump : str
The local Wikidata dump that should be used to get data.

Returns
-------
The requested data saved locally given file type and location arguments.
Expand All @@ -99,41 +106,61 @@ def get_data(
subprocess_result = False

# MARK: Get All

if all:
if language:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language for {language.title()}")
query_data(
languages=[language_or_sub_language],
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all data types with specified language for {language.title()}."
)

elif data_type:
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
data_type=[data_type],
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all languages with specified data type for {data_type.capitalize()}."
)
# Using Wikidata lexeme based dumps.
if wikidata_dump:
print("wikidata_dump", wikidata_dump)
download_wrapper(None, wikidata_dump)

else:
print("Updating all languages and data types...")
query_data(
languages=None,
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print("Query completed for all languages and all data types.")
print("Using Wikidata lexeme dump...")
file_path = download_wrapper()
if isinstance(file_path, str) and file_path:
rprint(
"[bold green]We'll use the following lexeme dump[/bold green]",
file_path,
)
rprint(
"[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
)

# Using Wikidata Query Service based data extraction.

# if language:
# language_or_sub_language = language.split(" ")[0]
# print(f"Updating all data types for language for {language.title()}")
# query_data(
# languages=[language_or_sub_language],
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all data types with specified language for {language.title()}."
# )

# elif data_type:
# print(f"Updating all languages for data type: {data_type.capitalize()}")
# query_data(
# languages=None,
# data_type=[data_type],
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all languages with specified data type for {data_type.capitalize()}."
# )

# else:
# print("Updating all languages and data types...")
# query_data(
# languages=None,
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print("Query completed for all languages and all data types.")

subprocess_result = True

Expand Down Expand Up @@ -172,7 +199,8 @@ def get_data(
or isinstance(subprocess_result, bool)
and subprocess_result
):
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")
if not all:
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")

json_input_path = Path(output_dir) / f"{language}/{data_type}.json"

Expand Down
Loading
Loading