final

scribe-org · Dec 10, 2024 · 8ce7744 · 8ce7744
1 parent 1b0d6fa
commit 8ce7744
Show file tree

Hide file tree

Showing 5 changed files with 132 additions and 57 deletions.
diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py
@@ -46,35 +46,50 @@ def download_wrapper(
 
     if not dump_url:
         rprint("[bold red]No dump URL found.[/bold red]")
-        return
+        return False
 
     try:
         output_dir = output_dir if output_dir else DEFAULT_DUMP_EXPORT_DIR
 
         os.makedirs(output_dir, exist_ok=True)
 
-        # Check for existing .json.bz2 files
-        if check_lexeme_dump_prompt_download(output_dir):
-            return
+        # Don't check for lexeme if date given
+        if not wikidata_dump:
+            useable_file_dir = check_lexeme_dump_prompt_download(output_dir)
+
+            # Check for existing .json.bz2 files
+            if useable_file_dir:
+                return useable_file_dir
 
         filename = dump_url.split("/")[-1]
         output_path = str(Path(output_dir) / filename)
 
-        rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")
+        user_response = (
+            input(
+                "We'll using lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
+                "Do you want to Use it? (Yes/Cancel): "
+            )
+            .strip()
+            .lower()
+        )
+
+        if user_response == "yes" or user_response == "":
+            rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")
 
-        response = requests.get(dump_url, stream=True)
-        total_size = int(response.headers.get("content-length", 0))
+            response = requests.get(dump_url, stream=True)
+            total_size = int(response.headers.get("content-length", 0))
 
-        with open(output_path, "wb") as f:
-            with tqdm(
-                total=total_size, unit="iB", unit_scale=True, desc=output_path
-            ) as pbar:
-                for chunk in response.iter_content(chunk_size=8192):
-                    if chunk:
-                        f.write(chunk)
-                        pbar.update(len(chunk))
+            with open(output_path, "wb") as f:
+                with tqdm(
+                    total=total_size, unit="iB", unit_scale=True, desc=output_path
+                ) as pbar:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+                            pbar.update(len(chunk))
 
-        rprint("[bold green]Download completed successfully![/bold green]")
+            rprint("[bold green]Download completed successfully![/bold green]")
+            return output_path
 
     except requests.exceptions.RequestException as e:
         rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py
@@ -24,6 +24,7 @@
 import subprocess
 from pathlib import Path
 from typing import List, Union
+from rich import print as rprint
 
 from scribe_data.cli.convert import convert_wrapper
 from scribe_data.unicode.generate_emoji_keywords import generate_emoji
@@ -34,6 +35,7 @@
     DEFAULT_TSV_EXPORT_DIR,
 )
 from scribe_data.wikidata.query_data import query_data
+from scribe_data.cli.download import download_wrapper
 
 
 def get_data(
@@ -46,6 +48,7 @@ def get_data(
     all: bool = False,
     interactive: bool = False,
     identifier_case: str = "camel",
+    wiki_dump: str = None,
 ) -> None:
     """
     Function for controlling the data get process for the CLI.
@@ -100,40 +103,62 @@ def get_data(
 
     # MARK: Get All
     if all:
-        if language:
-            language_or_sub_language = language.split(" ")[0]
-            print(f"Updating all data types for language for {language.title()}")
-            query_data(
-                languages=[language_or_sub_language],
-                data_type=None,
-                output_dir=output_dir,
-                overwrite=overwrite,
-            )
-            print(
-                f"Query completed for all data types with specified language for {language.title()}."
-            )
-
-        elif data_type:
-            print(f"Updating all languages for data type: {data_type.capitalize()}")
-            query_data(
-                languages=None,
-                data_type=[data_type],
-                output_dir=output_dir,
-                overwrite=overwrite,
-            )
-            print(
-                f"Query completed for all languages with specified data type for {data_type.capitalize()}."
-            )
-
+        if wiki_dump:
+            print("wiki_dump", wiki_dump)
+            download_wrapper(None, wiki_dump)
         else:
-            print("Updating all languages and data types...")
-            query_data(
-                languages=None,
-                data_type=None,
-                output_dir=output_dir,
-                overwrite=overwrite,
-            )
-            print("Query completed for all languages and all data types.")
+            # user_response = input(
+            #     "We'll using lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
+            #     "Do you want to Use it? (Yes/Cancel): "
+            # ).strip().lower()
+            # if user_response == "yes" or user_response=="":
+            print("Using wikimedia lexeme dump...")
+            file_path = download_wrapper()
+            if file_path:
+                rprint("[bold green]we'll use this lexeme dump[/bold green]", file_path)
+                rprint(
+                    "[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
+                )
+            else:
+                print("Error occurred! Please check the dump file")
+        # else:
+        #     print("canceled...")
+        #     return
+
+        # if language:
+        #     language_or_sub_language = language.split(" ")[0]
+        #     print(f"Updating all data types for language for {language.title()}")
+        #     query_data(
+        #         languages=[language_or_sub_language],
+        #         data_type=None,
+        #         output_dir=output_dir,
+        #         overwrite=overwrite,
+        #     )
+        #     print(
+        #         f"Query completed for all data types with specified language for {language.title()}."
+        #     )
+
+        # elif data_type:
+        #     print(f"Updating all languages for data type: {data_type.capitalize()}")
+        #     query_data(
+        #         languages=None,
+        #         data_type=[data_type],
+        #         output_dir=output_dir,
+        #         overwrite=overwrite,
+        #     )
+        #     print(
+        #         f"Query completed for all languages with specified data type for {data_type.capitalize()}."
+        #     )
+
+        # else:
+        #     print("Updating all languages and data types...")
+        #     query_data(
+        #         languages=None,
+        #         data_type=None,
+        #         output_dir=output_dir,
+        #         overwrite=overwrite,
+        #     )
+        #     print("Query completed for all languages and all data types.")
 
         subprocess_result = True
 
@@ -172,7 +197,8 @@ def get_data(
         or isinstance(subprocess_result, bool)
         and subprocess_result
     ):
-        print(f"Updated data was saved in: {Path(output_dir).resolve()}.")
+        if not all:
+            print(f"Updated data was saved in: {Path(output_dir).resolve()}.")
 
         json_input_path = Path(output_dir) / f"{language}/{data_type}.json"
 

diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py
@@ -160,6 +160,12 @@ def main() -> None:
         default="camel",
         help="The case format for identifiers in the output data (default: camel).",
     )
+    get_parser.add_argument(
+        "-wd",
+        "--wikidata-dump",
+        type=str,
+        help="Path to a local Wikidata lexemes dump required for running with '--all'.",
+    )
 
     # MARK: Total
 
@@ -348,6 +354,7 @@ def main() -> None:
                     overwrite=args.overwrite,
                     all=args.all,
                     identifier_case=args.identifier_case,
+                    wiki_dump=args.wikidata_dump,
                 )
 
         elif args.command in ["total", "t"]:

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
@@ -29,6 +29,7 @@
 from pathlib import Path
 from typing import Any, Optional
 from rich import print as rprint
+from datetime import datetime
 
 # MARK: Utils Variables
 
@@ -634,16 +635,42 @@ def check_lexeme_dump_prompt_download(output_dir):
             rprint(f"  - {Path(output_dir)}/{dump.name}")
 
         user_input = input(
-            "\nDo you want to\n (d)elete existing dumps,\n (s)kip download,\n or download (n)ew version? [d/s/n]: "
+            "\nDo you want to\n (d)elete existing dumps,\n (s)kip download,\n (u)se existing latest dump\n or download (n)ew version? [d/s/u/n]: "
         ).lower()
         if user_input == "d":
             for dump in existing_dumps:
                 dump.unlink()
             rprint("[bold green]Existing dumps deleted.[/bold green]")
-            user_input = input("Do you want to download latest lexeme dump now?(y/N)")
-            if user_input == "y" or user_input == "":
-                return False
-            return True
+            user_input = input("Do you want to download latest lexeme dump? (y/N): ")
+            return user_input != "y"
+
+        elif user_input == "u":
+            # Check for the latest dump file
+            latest_dump = None
+            if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps):
+                latest_dump = Path(output_dir) / "latest-lexemes.json.bz2"
+            else:
+                # Extract dates from filenames using datetime validation
+                dated_dumps = []
+                for dump in existing_dumps:
+                    parts = dump.stem.split("-")
+                    if len(parts) > 1:
+                        try:
+                            date = datetime.strptime(parts[1], "%Y%m%d")
+                            dated_dumps.append((dump, date))
+                        except ValueError:
+                            continue  # Skip files without a valid date
+
+                if dated_dumps:
+                    # Find the dump with the most recent date
+                    latest_dump = max(dated_dumps, key=lambda x: x[1])[0]
+
+            if latest_dump:
+                rprint(f"[bold green]Using latest dump:[/bold green] {latest_dump}")
+                return latest_dump
+            else:
+                rprint("[bold red]No valid dumps found.[/bold red]")
+                return None
         else:
             rprint("[bold blue]Skipping download.[/bold blue]")
             return True
diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py
@@ -336,5 +336,5 @@ def query_data(
             )
 
 
-if __name__ == "__main__":
-    query_data()
+# if __name__ == "__main__":
+#     query_data()