Skip to content

Commit

Permalink
final
Browse files Browse the repository at this point in the history
  • Loading branch information
axif0 committed Dec 10, 2024
1 parent 1b0d6fa commit 8ce7744
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 57 deletions.
47 changes: 31 additions & 16 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,35 +46,50 @@ def download_wrapper(

if not dump_url:
rprint("[bold red]No dump URL found.[/bold red]")
return
return False

try:
output_dir = output_dir if output_dir else DEFAULT_DUMP_EXPORT_DIR

os.makedirs(output_dir, exist_ok=True)

# Check for existing .json.bz2 files
if check_lexeme_dump_prompt_download(output_dir):
return
# Don't check for lexeme if date given
if not wikidata_dump:
useable_file_dir = check_lexeme_dump_prompt_download(output_dir)

# Check for existing .json.bz2 files
if useable_file_dir:
return useable_file_dir

filename = dump_url.split("/")[-1]
output_path = str(Path(output_dir) / filename)

rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")
user_response = (
input(
"We'll using lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
"Do you want to Use it? (Yes/Cancel): "
)
.strip()
.lower()
)

if user_response == "yes" or user_response == "":
rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

response = requests.get(dump_url, stream=True)
total_size = int(response.headers.get("content-length", 0))
response = requests.get(dump_url, stream=True)
total_size = int(response.headers.get("content-length", 0))

with open(output_path, "wb") as f:
with tqdm(
total=total_size, unit="iB", unit_scale=True, desc=output_path
) as pbar:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
pbar.update(len(chunk))
with open(output_path, "wb") as f:
with tqdm(
total=total_size, unit="iB", unit_scale=True, desc=output_path
) as pbar:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
pbar.update(len(chunk))

rprint("[bold green]Download completed successfully![/bold green]")
rprint("[bold green]Download completed successfully![/bold green]")
return output_path

except requests.exceptions.RequestException as e:
rprint(f"[bold red]Error downloading dump: {e}[/bold red]")
Expand Down
94 changes: 60 additions & 34 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import subprocess
from pathlib import Path
from typing import List, Union
from rich import print as rprint

from scribe_data.cli.convert import convert_wrapper
from scribe_data.unicode.generate_emoji_keywords import generate_emoji
Expand All @@ -34,6 +35,7 @@
DEFAULT_TSV_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data
from scribe_data.cli.download import download_wrapper


def get_data(
Expand All @@ -46,6 +48,7 @@ def get_data(
all: bool = False,
interactive: bool = False,
identifier_case: str = "camel",
wiki_dump: str = None,
) -> None:
"""
Function for controlling the data get process for the CLI.
Expand Down Expand Up @@ -100,40 +103,62 @@ def get_data(

# MARK: Get All
if all:
if language:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language for {language.title()}")
query_data(
languages=[language_or_sub_language],
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all data types with specified language for {language.title()}."
)

elif data_type:
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
data_type=[data_type],
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all languages with specified data type for {data_type.capitalize()}."
)

if wiki_dump:
print("wiki_dump", wiki_dump)
download_wrapper(None, wiki_dump)
else:
print("Updating all languages and data types...")
query_data(
languages=None,
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print("Query completed for all languages and all data types.")
# user_response = input(
# "We'll using lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
# "Do you want to Use it? (Yes/Cancel): "
# ).strip().lower()
# if user_response == "yes" or user_response=="":
print("Using wikimedia lexeme dump...")
file_path = download_wrapper()
if file_path:
rprint("[bold green]we'll use this lexeme dump[/bold green]", file_path)
rprint(
"[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
)
else:
print("Error occurred! Please check the dump file")
# else:
# print("canceled...")
# return

# if language:
# language_or_sub_language = language.split(" ")[0]
# print(f"Updating all data types for language for {language.title()}")
# query_data(
# languages=[language_or_sub_language],
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all data types with specified language for {language.title()}."
# )

# elif data_type:
# print(f"Updating all languages for data type: {data_type.capitalize()}")
# query_data(
# languages=None,
# data_type=[data_type],
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all languages with specified data type for {data_type.capitalize()}."
# )

# else:
# print("Updating all languages and data types...")
# query_data(
# languages=None,
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print("Query completed for all languages and all data types.")

subprocess_result = True

Expand Down Expand Up @@ -172,7 +197,8 @@ def get_data(
or isinstance(subprocess_result, bool)
and subprocess_result
):
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")
if not all:
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")

json_input_path = Path(output_dir) / f"{language}/{data_type}.json"

Expand Down
7 changes: 7 additions & 0 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,12 @@ def main() -> None:
default="camel",
help="The case format for identifiers in the output data (default: camel).",
)
get_parser.add_argument(
"-wd",
"--wikidata-dump",
type=str,
help="Path to a local Wikidata lexemes dump required for running with '--all'.",
)

# MARK: Total

Expand Down Expand Up @@ -348,6 +354,7 @@ def main() -> None:
overwrite=args.overwrite,
all=args.all,
identifier_case=args.identifier_case,
wiki_dump=args.wikidata_dump,
)

elif args.command in ["total", "t"]:
Expand Down
37 changes: 32 additions & 5 deletions src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from pathlib import Path
from typing import Any, Optional
from rich import print as rprint
from datetime import datetime

# MARK: Utils Variables

Expand Down Expand Up @@ -634,16 +635,42 @@ def check_lexeme_dump_prompt_download(output_dir):
rprint(f" - {Path(output_dir)}/{dump.name}")

user_input = input(
"\nDo you want to\n (d)elete existing dumps,\n (s)kip download,\n or download (n)ew version? [d/s/n]: "
"\nDo you want to\n (d)elete existing dumps,\n (s)kip download,\n (u)se existing latest dump\n or download (n)ew version? [d/s/u/n]: "
).lower()
if user_input == "d":
for dump in existing_dumps:
dump.unlink()
rprint("[bold green]Existing dumps deleted.[/bold green]")
user_input = input("Do you want to download latest lexeme dump now?(y/N)")
if user_input == "y" or user_input == "":
return False
return True
user_input = input("Do you want to download latest lexeme dump? (y/N): ")
return user_input != "y"

elif user_input == "u":
# Check for the latest dump file
latest_dump = None
if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps):
latest_dump = Path(output_dir) / "latest-lexemes.json.bz2"
else:
# Extract dates from filenames using datetime validation
dated_dumps = []
for dump in existing_dumps:
parts = dump.stem.split("-")
if len(parts) > 1:
try:
date = datetime.strptime(parts[1], "%Y%m%d")
dated_dumps.append((dump, date))
except ValueError:
continue # Skip files without a valid date

if dated_dumps:
# Find the dump with the most recent date
latest_dump = max(dated_dumps, key=lambda x: x[1])[0]

if latest_dump:
rprint(f"[bold green]Using latest dump:[/bold green] {latest_dump}")
return latest_dump
else:
rprint("[bold red]No valid dumps found.[/bold red]")
return None
else:
rprint("[bold blue]Skipping download.[/bold blue]")
return True
4 changes: 2 additions & 2 deletions src/scribe_data/wikidata/query_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,5 +336,5 @@ def query_data(
)


if __name__ == "__main__":
query_data()
# if __name__ == "__main__":
# query_data()

0 comments on commit 8ce7744

Please sign in to comment.