Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added download cli cmd #528

Merged
merged 17 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ scribe_data_csv_export/*
scribe_data_json_export/*
scribe_data_sqlite_export/*
scribe_data_tsv_export/*

# MARK: Wiki Dumps

*.json.bz2
27 changes: 27 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,33 @@ Scribe does not accept direct edits to the grammar JSON files as they are source

The documentation for Scribe-Data can be found at [scribe-data.readthedocs.io](https://scribe-data.readthedocs.io/en/latest/). Documentation is an invaluable way to contribute to coding projects as it allows others to more easily understand the project structure and contribute. Issues related to documentation are marked with the [`documentation`](https://github.com/scribe-org/Scribe-Data/labels/documentation) label.

### Function Docstrings

Scribe-Data generally follows [NumPy conventions](https://numpydoc.readthedocs.io/en/latest/format.html) for documenting functions and Python code in general. Function docstrings should have the following format:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@axif0: Just added this to the contributing guide for directions on how to write docstrings that will be rendered properly in the docs. Can we ask you to familiarize yourself with them? :)


```py
def example_function(argument: argument_type) -> return_type:
"""
An example docstring for a function so others understand your work.

Parameters
----------
argument : argument_type
Description of your argument.

Returns
-------
return_value : return_type
Description of your return value.
"""

...

return return_value
```

### Building the Docs

Use the following commands to build the documentation locally:

```bash
Expand Down
1 change: 0 additions & 1 deletion docs/source/scribe_data/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Scribe-Data
unicode/index
wikidata/index
wikipedia/index
wiktionary/index

.. toctree::
:maxdepth: 1
Expand Down
6 changes: 0 additions & 6 deletions docs/source/scribe_data/wiktionary/index.rst

This file was deleted.

281 changes: 281 additions & 0 deletions src/scribe_data/cli/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
"""
Functions for downloading Wikidata lexeme dumps.

.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import contextlib
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Optional

import requests
from rich import print as rprint
from tqdm import tqdm

from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download


def parse_date(date_string):
    """
    Convert a date string into a `datetime.date` object.

    Each supported layout is tried in turn and the first one that fits wins:

    - YYYYMMDD
    - YYYY/MM/DD
    - YYYY-MM-DD

    Parameters
    ----------
    date_string : str
        The text to interpret as a date.

    Returns
    -------
    datetime.date or None
        The parsed date, or None when no supported layout matches. In the
        latter case a message naming the expected formats is printed.
    """
    for layout in ("%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"):
        # A ValueError only means this layout does not fit; try the next one.
        with contextlib.suppress(ValueError):
            return datetime.strptime(date_string, layout).date()

    print(
        f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD."
    )
    return None


def available_closest_lexeme_dumpfile(
    target_entity: str, other_old_dumps: list, check_wd_dump_exists
):
    """
    Find the available dump folder whose date is closest to the target date.

    Parameters
    ----------
    target_entity : str
        The date the dump is requested for (YYYYMMDD, YYYY/MM/DD or YYYY-MM-DD).

    other_old_dumps : list
        Names of the candidate dump folders, as strings. The entry ".." is ignored.

    check_wd_dump_exists : function
        Called with a folder name; a truthy result means that folder holds a
        lexeme dump. A ``requests.exceptions.HTTPError`` raised by it causes the
        folder to be skipped.

    Returns
    -------
    str or None
        The name of the closest folder that holds a dump, or None when the
        target date cannot be parsed or no candidate qualifies.
    """
    target_date = parse_date(target_entity)
    if not target_date:
        return None

    closest_date = None
    closest_diff = None

    for dump_folder in other_old_dumps:
        if dump_folder == "..":
            continue

        # Folders that cannot be listed are skipped rather than aborting the search.
        with contextlib.suppress(requests.exceptions.HTTPError):
            if not check_wd_dump_exists(dump_folder):
                continue

            current_date = parse_date(dump_folder)
            if current_date is None:
                # Not a date-named folder, so it cannot be compared to the target.
                continue

            diff = abs((current_date - target_date).days)
            if closest_diff is None or diff < closest_diff:
                closest_date = dump_folder
                closest_diff = diff

            # Presumably the folders are listed in ascending date order, so once
            # a dump on or after the target has been seen nothing later can be
            # closer -- TODO confirm the ordering of the listing.
            if current_date >= target_date:
                break

    return closest_date


def download_wd_lexeme_dump(target_entity: str = "latest-lexemes"):
    """
    Resolve the URL of a Wikidata lexeme dump for a target entity or date.

    Despite its name, this function does not download the dump itself; it only
    queries the dump index pages and returns the URL to fetch.

    Parameters
    ----------
    target_entity : str, optional
        The dump to look for. Defaults to "latest-lexemes".

        - If "latest-lexemes", the URL of the latest dump is resolved.
        - If a valid date (YYYYMMDD, YYYY/MM/DD or YYYY-MM-DD), the dump folder
          for that date is checked. When that request fails with an HTTP error
          the user is offered the closest available dump instead.
        - Any other value is reported as an invalid date and the latest dump is
          resolved instead.

    Returns
    -------
    str or list or None
        The URL of the requested or closest available dump. None when no
        suitable dump is found, the user declines the alternative or the request
        for the latest dump fails. An empty list when the index page lists no
        dump folders at all.
    """
    base_url = "https://dumps.wikimedia.org/wikidatawiki/entities"

    def check_wd_dump_exists(dump_folder):
        """
        Check whether a dump folder contains a lexeme dump file.

        Parameters
        ----------
        dump_folder : str
            The name of the folder (normally a YYYYMMDD date) to inspect.

        Returns
        -------
        str or None
            The URL of the lexeme dump file if the folder lists it, else None.

        Raises
        ------
        requests.exceptions.HTTPError
            If the folder page responds with an error status.
        """
        entity_response = requests.get(f"{base_url}/{dump_folder}/")
        entity_response.raise_for_status()
        dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text)

        file_name = f"wikidata-{dump_folder}-lexemes.json.bz2"
        if file_name in dump_filenames:
            return f"{base_url}/{dump_folder}/{file_name}"

        return None

    if target_entity != "latest-lexemes":
        try:
            if parse_date(target_entity):
                # Dump folders are named YYYYMMDD, so drop any separators.
                target_entity = target_entity.replace("/", "").replace("-", "")
                return check_wd_dump_exists(target_entity)

        except requests.exceptions.HTTPError as http_err:
            print(
                f"HTTP error occurred: {http_err} Status code: {http_err.response.status_code}"
            )
            print("We could not find your requested Wikidata lexeme dump.")

            response = requests.get(base_url)
            other_old_dumps = re.findall(r'href="([^"]+)/"', response.text)

            user_input = (
                input("Do you want to see the closest available older dumps? [Y/n]")
                .strip()
                .lower()
            )

            # "[Y/n]" advertises yes as the default, so a bare Enter counts as yes.
            if user_input not in ("", "y", "yes"):
                return None

            if not other_old_dumps:
                return other_old_dumps

            closest_date = available_closest_lexeme_dumpfile(
                target_entity, other_old_dumps, check_wd_dump_exists
            )
            if not closest_date:
                print("No suitable older Wikidata lexeme dump could be found.")
                return None

            print(
                f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}"
            )
            return f"{base_url}/{closest_date}/wikidata-{closest_date}-lexemes.json.bz2"

    try:
        response = requests.get(base_url)
        response.raise_for_status()
        listed_files = re.findall(r'href="([^"]+)"', response.text)

        # NOTE(review): the listing is checked for "latest-all.json.bz2" while the
        # returned link is the lexemes dump -- confirm whether the check should
        # look for "latest-lexemes.json.bz2" instead.
        if "latest-all.json.bz2" in listed_files:
            return f"{base_url}/latest-lexemes.json.bz2"

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

    return None


def wd_lexeme_dump_download_wrapper(
    wikidata_dump: Optional[str] = None, output_dir: Optional[str] = None
):
    """
    Download a Wikidata lexeme dump given user preferences.

    Parameters
    ----------
    wikidata_dump : str, optional
        Date string (e.g. YYYYMMDD) selecting a specific dump. When omitted the
        latest dump is used and ``check_lexeme_dump_prompt_download`` is asked
        first whether a file in the output directory can be used instead.

    output_dir : str, optional
        Directory for the downloaded file. Defaults to ``DEFAULT_DUMP_EXPORT_DIR``.
        It is created if it does not exist.

    Returns
    -------
    str or bool or None
        The path of the downloaded file on success. Whatever truthy value
        ``check_lexeme_dump_prompt_download`` returns when it is used instead of
        downloading. False when no dump URL could be resolved. None when the
        user declines the download or an error occurs (the error is printed).
    """
    dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes")

    if not dump_url:
        rprint("[bold red]No dump URL found.[/bold red]")
        return False

    try:
        output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR
        os.makedirs(output_dir, exist_ok=True)

        # Don't check for lexeme if date given.
        if not wikidata_dump:
            if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
                return useable_file_dir

        filename = dump_url.split("/")[-1]
        output_path = str(Path(output_dir) / filename)

        user_response = (
            input(
                "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. "
                "Do you want to proceed? (y/n): "
            )
            .strip()
            .lower()
        )

        if user_response != "y":
            return None

        rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

        # The context manager releases the streamed connection even on failure.
        with requests.get(dump_url, stream=True) as response:
            # Fail before touching the disk so an error page is never saved as a dump.
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))

            with open(output_path, "wb") as f, tqdm(
                total=total_size, unit="iB", unit_scale=True, desc=output_path
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        rprint("[bold green]Download completed successfully![/bold green]")
        return output_path

    except requests.exceptions.RequestException as e:
        rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

    except Exception as e:
        rprint(f"[bold red]An error occurred: {e}[/bold red]")

    return None
Loading
Loading