Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added download cli cmd #528

Merged
merged 17 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ scribe_data_csv_export/*
scribe_data_json_export/*
scribe_data_sqlite_export/*
scribe_data_tsv_export/*

# MARK: Wiki Dumps

*.json.bz2
1 change: 0 additions & 1 deletion docs/source/scribe_data/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Scribe-Data
unicode/index
wikidata/index
wikipedia/index
wiktionary/index

.. toctree::
:maxdepth: 1
Expand Down
6 changes: 0 additions & 6 deletions docs/source/scribe_data/wiktionary/index.rst

This file was deleted.

98 changes: 98 additions & 0 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
Functions for downloading Wikidata lexeme dumps.

.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import os
from pathlib import Path
from typing import Optional

import requests
from rich import print as rprint
from tqdm import tqdm

from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download
from scribe_data.wikidata.wikidata_utils import download_wiki_lexeme_dump


def download_wrapper(
    wikidata_dump: Optional[str] = None, output_dir: Optional[str] = None
) -> Optional[str]:
    """
    Download Wikidata lexeme dumps.

    Parameters
    ----------
    wikidata_dump : str, optional
        Date string in YYYYMMDD format for a specific dump. When omitted,
        the latest available lexeme dump is used.

    output_dir : str, optional
        Directory path for the downloaded file. Defaults to
        DEFAULT_DUMP_EXPORT_DIR when omitted.

    Returns
    -------
    str or None
        The path of the downloaded dump file (or of a pre-existing local dump
        the user chose to reuse), or None if the user declines the download
        or an error occurs.
    """
    dump_url = download_wiki_lexeme_dump(wikidata_dump or "latest-lexemes")

    if not dump_url:
        rprint("[bold red]No dump URL found.[/bold red]")
        return None  # consistent failure value with the other error paths

    try:
        output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR
        os.makedirs(output_dir, exist_ok=True)

        # Only offer to reuse an existing local dump when no explicit date
        # was requested; a dated request should always fetch that dump.
        if not wikidata_dump:
            if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
                return useable_file_dir

        filename = dump_url.split("/")[-1]
        output_path = str(Path(output_dir) / filename)

        user_response = (
            input(
                "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. "
                "Do you want to proceed? (y/n): "
            )
            .strip()
            .lower()
        )

        if user_response != "y":
            return None

        rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

        response = requests.get(dump_url, stream=True)
        # Surface HTTP errors (404, 5xx, ...) instead of writing an error
        # page to disk as if it were the dump.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))

        with open(output_path, "wb") as f, tqdm(
            total=total_size, unit="iB", unit_scale=True, desc=output_path
        ) as pbar:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))

        rprint("[bold green]Download completed successfully![/bold green]")
        return output_path

    except requests.exceptions.RequestException as e:
        rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

    except Exception as e:
        rprint(f"[bold red]An error occurred: {e}[/bold red]")
94 changes: 61 additions & 33 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
from pathlib import Path
from typing import List, Union

from rich import print as rprint

from scribe_data.cli.convert import convert_wrapper
from scribe_data.cli.download import download_wrapper
from scribe_data.unicode.generate_emoji_keywords import generate_emoji
from scribe_data.utils import (
DEFAULT_CSV_EXPORT_DIR,
Expand All @@ -46,6 +49,7 @@ def get_data(
all: bool = False,
interactive: bool = False,
identifier_case: str = "camel",
wikidata_dump: str = None,
) -> None:
"""
Function for controlling the data get process for the CLI.
Expand Down Expand Up @@ -79,6 +83,9 @@ def get_data(
identifier_case : str
The case format for identifiers. Default is "camel".

wikidata_dump : str
The local Wikidata dump that should be used to get data.

Returns
-------
The requested data saved locally given file type and location arguments.
Expand All @@ -99,41 +106,61 @@ def get_data(
subprocess_result = False

# MARK: Get All

if all:
if language:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language for {language.title()}")
query_data(
languages=[language_or_sub_language],
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all data types with specified language for {language.title()}."
)

elif data_type:
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
data_type=[data_type],
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all languages with specified data type for {data_type.capitalize()}."
)
# Using Wikidata lexeme based dumps.
if wikidata_dump:
print("wikidata_dump", wikidata_dump)
download_wrapper(None, wikidata_dump)

else:
print("Updating all languages and data types...")
query_data(
languages=None,
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print("Query completed for all languages and all data types.")
print("Using Wikidata lexeme dump...")
file_path = download_wrapper()
if isinstance(file_path, str) and file_path:
rprint(
"[bold green]We'll use the following lexeme dump[/bold green]",
file_path,
)
rprint(
"[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
)

# Using Wikidata Query Service based data extraction.

# if language:
# language_or_sub_language = language.split(" ")[0]
# print(f"Updating all data types for language for {language.title()}")
# query_data(
# languages=[language_or_sub_language],
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all data types with specified language for {language.title()}."
# )

# elif data_type:
# print(f"Updating all languages for data type: {data_type.capitalize()}")
# query_data(
# languages=None,
# data_type=[data_type],
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all languages with specified data type for {data_type.capitalize()}."
# )

# else:
# print("Updating all languages and data types...")
# query_data(
# languages=None,
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print("Query completed for all languages and all data types.")

subprocess_result = True

Expand Down Expand Up @@ -172,7 +199,8 @@ def get_data(
or isinstance(subprocess_result, bool)
and subprocess_result
):
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")
if not all:
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")

json_input_path = Path(output_dir) / f"{language}/{data_type}.json"

Expand Down
75 changes: 62 additions & 13 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from scribe_data.cli.total import total_wrapper
from scribe_data.cli.upgrade import upgrade_cli
from scribe_data.cli.version import get_version_message
from scribe_data.cli.download import download_wrapper

LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for."
GET_DESCRIPTION = (
Expand Down Expand Up @@ -159,6 +160,12 @@ def main() -> None:
default="camel",
help="The case format for identifiers in the output data (default: camel).",
)
get_parser.add_argument(
"-wd",
"--wikidata-dump-path",
type=str,
help="Path to a local Wikidata lexemes dump required for running with '--all'.",
)

# MARK: Total

Expand Down Expand Up @@ -265,22 +272,34 @@ def main() -> None:
help="Convert all languages and data types.",
)

# MARK: Setup CLI
# MARK: Download

args = parser.parse_args()

if args.data_type and isinstance(args.data_type, str):
args.data_type = args.data_type.replace("-", "_")
download_parser = subparsers.add_parser(
"download",
aliases=["d"],
help="Download Wikidata dumps.",
description="Download Wikidata dumps from dumps.wikimedia.org.",
epilog=CLI_EPILOG,
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60),
)
download_parser._actions[0].help = "Show this help message and exit."
download_parser.add_argument(
"-wd",
"--wikidata-dump-version",
nargs="?",
const="latest",
help="Download Wikidata dump. Optionally specify date in YYYYMMDD format.",
)
download_parser.add_argument(
"-od",
"--output-dir",
type=str,
help="The output directory path for the downloaded dump.",
)

try:
if args.language or args.data_type:
validate_language_and_data_type(
language=args.language, data_type=args.data_type
)
# MARK: Setup CLI

except ValueError as e:
print(f"Input validation failed with error: {e}")
return
args = parser.parse_args()

if args.upgrade:
upgrade_cli()
Expand All @@ -291,6 +310,27 @@ def main() -> None:
return

try:
# Only validate language and data_type for relevant commands
if args.command in ["list", "l", "get", "g", "total", "t", "convert", "c"]:
if (
hasattr(args, "data_type")
and args.data_type
and isinstance(args.data_type, str)
):
args.data_type = args.data_type.replace("-", "_")

if hasattr(args, "language") or hasattr(args, "data_type"):
try:
validate_language_and_data_type(
language=args.language if hasattr(args, "language") else None,
data_type=args.data_type
if hasattr(args, "data_type")
else None,
)
except ValueError as e:
print(f"Input validation failed with error: {e}")
return

if args.command in ["list", "l"]:
list_wrapper(
language=args.language, data_type=args.data_type, all_bool=args.all
Expand All @@ -314,6 +354,7 @@ def main() -> None:
overwrite=args.overwrite,
all=args.all,
identifier_case=args.identifier_case,
wiki_dump=args.wikidata_dump,
)

elif args.command in ["total", "t"]:
Expand Down Expand Up @@ -345,6 +386,14 @@ def main() -> None:
all=args.all,
)

elif args.command in ["download", "d"]:
download_wrapper(
wikidata_dump=args.wikidata_dump
if args.wikidata_dump != "latest"
else None,
output_dir=args.output_dir,
)

else:
parser.print_help()

Expand Down
Loading
Loading