Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added download cli cmd #528

Merged
merged 17 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ scribe_data_csv_export/*
scribe_data_json_export/*
scribe_data_sqlite_export/*
scribe_data_tsv_export/*

# MARK: Wiki Dumps

*.json.bz2
27 changes: 27 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,33 @@ Scribe does not accept direct edits to the grammar JSON files as they are source

The documentation for Scribe-Data can be found at [scribe-data.readthedocs.io](https://scribe-data.readthedocs.io/en/latest/). Documentation is an invaluable way to contribute to coding projects as it allows others to more easily understand the project structure and contribute. Issues related to documentation are marked with the [`documentation`](https://github.com/scribe-org/Scribe-Data/labels/documentation) label.

### Function Docstrings

Scribe-Data generally follows [NumPy conventions](https://numpydoc.readthedocs.io/en/latest/format.html) for documenting functions and Python code in general. Function docstrings should have the following format:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@axif0: Just added this to the contributing guide for directions on how to write docstrings that will be rendered properly in the docs. Can we ask you to familiarize yourself with them? :)


```py
def example_function(argument: argument_type) -> return_type:
"""
An example docstring for a function so others understand your work.

Parameters
----------
argument: argument_type
Description of your argument.

Returns
-------
return_value : return_type
Description of your return value.
"""

...

return return_value
```

### Building the Docs

Use the following commands to build the documentation locally:

```bash
Expand Down
1 change: 0 additions & 1 deletion docs/source/scribe_data/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Scribe-Data
unicode/index
wikidata/index
wikipedia/index
wiktionary/index

.. toctree::
:maxdepth: 1
Expand Down
6 changes: 0 additions & 6 deletions docs/source/scribe_data/wiktionary/index.rst

This file was deleted.

103 changes: 103 additions & 0 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
Functions for downloading Wikidata lexeme dumps.

.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import os
from pathlib import Path
from typing import Optional

import requests
from rich import print as rprint
from tqdm import tqdm

from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download
from scribe_data.wikidata.wikidata_utils import download_wiki_lexeme_dump


def download_wrapper(
    wikidata_dump: Optional[str] = None, output_dir: Optional[str] = None
) -> "str | bool | None":
    """
    Download Wikidata lexeme dumps given user preferences.

    Parameters
    ----------
    wikidata_dump : str
        Optional date string in YYYYMMDD format for specific dumps.

    output_dir : str
        Optional directory path for the downloaded file. Defaults to the
        'scribe_data_wikidumps' directory (DEFAULT_DUMP_EXPORT_DIR).

    Returns
    -------
    str | bool | None
        The path to the downloaded (or already available) dump file, False if
        no dump URL could be resolved, or None if the user declines the
        download or an error occurs.
    """
    dump_url = download_wiki_lexeme_dump(wikidata_dump or "latest-lexemes")

    if not dump_url:
        rprint("[bold red]No dump URL found.[/bold red]")
        return False

    try:
        output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR

        os.makedirs(output_dir, exist_ok=True)

        # Don't check for an existing lexeme dump if an explicit date is given.
        if not wikidata_dump:
            if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
                return useable_file_dir

        filename = dump_url.split("/")[-1]
        output_path = str(Path(output_dir) / filename)

        user_response = (
            input(
                "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. "
                "Do you want to proceed? (y/n): "
            )
            .strip()
            .lower()
        )

        if user_response == "y":
            rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

            response = requests.get(dump_url, stream=True)
            # Fail fast on HTTP errors (404, 5xx, ...) instead of silently
            # writing an error page to disk as if it were the dump.
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))

            with open(output_path, "wb") as f:
                with tqdm(
                    total=total_size, unit="iB", unit_scale=True, desc=output_path
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))

            rprint("[bold green]Download completed successfully![/bold green]")

            return output_path

        else:
            # User declined the download.
            return None

    except requests.exceptions.RequestException as e:
        rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

    except Exception as e:
        rprint(f"[bold red]An error occurred: {e}[/bold red]")
94 changes: 61 additions & 33 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
from pathlib import Path
from typing import List, Union

from rich import print as rprint

from scribe_data.cli.convert import convert_wrapper
from scribe_data.cli.download import download_wrapper
from scribe_data.unicode.generate_emoji_keywords import generate_emoji
from scribe_data.utils import (
DEFAULT_CSV_EXPORT_DIR,
Expand All @@ -46,6 +49,7 @@ def get_data(
all: bool = False,
interactive: bool = False,
identifier_case: str = "camel",
wikidata_dump: str = None,
) -> None:
"""
Function for controlling the data get process for the CLI.
Expand Down Expand Up @@ -79,6 +83,9 @@ def get_data(
identifier_case : str
The case format for identifiers. Default is "camel".

wikidata_dump : str
The local Wikidata dump that should be used to get data.

Returns
-------
The requested data saved locally given file type and location arguments.
Expand All @@ -99,41 +106,61 @@ def get_data(
subprocess_result = False

# MARK: Get All

if all:
if language:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language for {language.title()}")
query_data(
languages=[language_or_sub_language],
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all data types with specified language for {language.title()}."
)

elif data_type:
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
data_type=[data_type],
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all languages with specified data type for {data_type.capitalize()}."
)
# Using Wikidata lexeme based dumps.
if wikidata_dump:
print("wikidata_dump", wikidata_dump)
download_wrapper(None, wikidata_dump)

else:
print("Updating all languages and data types...")
query_data(
languages=None,
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print("Query completed for all languages and all data types.")
print("Using Wikidata lexeme dump...")
file_path = download_wrapper()
if isinstance(file_path, str) and file_path:
rprint(
"[bold green]We'll use the following lexeme dump[/bold green]",
file_path,
)
rprint(
"[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
)

# Using Wikidata Query Service based data extraction.

# if language:
# language_or_sub_language = language.split(" ")[0]
# print(f"Updating all data types for language for {language.title()}")
# query_data(
# languages=[language_or_sub_language],
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all data types with specified language for {language.title()}."
# )

# elif data_type:
# print(f"Updating all languages for data type: {data_type.capitalize()}")
# query_data(
# languages=None,
# data_type=[data_type],
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print(
# f"Query completed for all languages with specified data type for {data_type.capitalize()}."
# )

# else:
# print("Updating all languages and data types...")
# query_data(
# languages=None,
# data_type=None,
# output_dir=output_dir,
# overwrite=overwrite,
# )
# print("Query completed for all languages and all data types.")

subprocess_result = True

Expand Down Expand Up @@ -172,7 +199,8 @@ def get_data(
or isinstance(subprocess_result, bool)
and subprocess_result
):
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")
if not all:
print(f"Updated data was saved in: {Path(output_dir).resolve()}.")

json_input_path = Path(output_dir) / f"{language}/{data_type}.json"

Expand Down
Loading
Loading