Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added functionality to convert data to CSV/TSV and JSON and vice versa. #329

Closed
john-thuo1 wants to merge 25 commits into main from decouple_convert
Closed
Changes from 4 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
bc76779
fixing IncompleteRead issue
axif0 Oct 3, 2024
c6b7cae
feat: Implement functionality to convert data to CSV/TSV and JSON
john-thuo1 Oct 12, 2024
74b522b
adding a sparql file in Hebrew/adjectives for hebrew adjectives
OmarAI2003 Oct 12, 2024
d71b304
Renaming the query file to /Hebrew/adjectives/query_adjectives.sparql…
OmarAI2003 Oct 12, 2024
919f6c8
simple sparql query for fetching all the 4274 hebrew adjectives on wi…
OmarAI2003 Oct 12, 2024
aef6592
Merge branch 'main' into decouple_convert
andrewtavis Oct 12, 2024
3270e86
Fix tests and minor updates to cli main
andrewtavis Oct 12, 2024
40ab3fc
Minor docstring fixes and making all args in cli main explicit
andrewtavis Oct 12, 2024
dd40ccd
fix - interactive bug
axif0 Oct 12, 2024
b982d17
Merge branch 'main' into decouple_convert
andrewtavis Oct 12, 2024
18f377d
Expand query to optionally return all forms of Hebrew adjectives
OmarAI2003 Oct 13, 2024
0c326b6
Add optional retrieval of Hebrew adjective forms by gender and number
OmarAI2003 Oct 13, 2024
1de9dd3
fix interactive cli command
axif0 Oct 13, 2024
df7fa75
Add filter not exist to remove construct forms and filter he
andrewtavis Oct 13, 2024
8614915
Remove selection of lexeme URI
andrewtavis Oct 13, 2024
e93f8fb
Merge pull request #333 from OmarAI2003/Heb-adjectives
andrewtavis Oct 13, 2024
43fcb55
Merge branch 'main' into IncompleteRead
andrewtavis Oct 13, 2024
50289a1
Spacing and removing unused import
andrewtavis Oct 13, 2024
419576e
Merge pull request #221 from axif0/IncompleteRead
andrewtavis Oct 13, 2024
3869e75
Minor edits to the interactive mode setup / functionality
andrewtavis Oct 13, 2024
c4da4e9
Merge pull request #334 from axif0/inter
andrewtavis Oct 13, 2024
1571ce3
Merge branch 'decouple_convert' of github.com:john-thuo1/Scribe-Data …
john-thuo1 Oct 13, 2024
a59be57
removed required on convert arg --output dir
john-thuo1 Oct 13, 2024
eb349a6
Update convert tests(sqlite)
john-thuo1 Oct 13, 2024
aa9433c
Update Default Directories for convert functions
john-thuo1 Oct 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
@@ -40,6 +40,7 @@ def get_data(
overwrite: bool = False,
outputs_per_entry: int = None,
all: bool = False,
interactive: bool = False,
) -> None:
"""
Function for controlling the data get process for the CLI.
@@ -61,12 +62,15 @@ def get_data(
outputs_per_entry : int
How many outputs should be generated per data entry.

overwrite : bool
Whether to overwrite existing files (default: False).
overwrite : bool (default: False)
Whether to overwrite existing files.

all : bool
Get all languages and data types.

interactive : bool (default: False)
Whether it's running in interactive mode.

Returns
-------
The requested data saved locally given file type and location arguments.
@@ -125,6 +129,7 @@ def get_data(
data_type=data_type,
output_dir=output_dir,
overwrite=overwrite,
interactive=interactive,
)
subprocess_result = True

@@ -140,6 +145,8 @@ def get_data(
print(
f"Updated data was saved in: {Path(output_dir).resolve()}.",
)
if interactive:
return True

# The emoji keywords process has failed.
elif data_type in {"emoji-keywords", "emoji_keywords"}:
60 changes: 45 additions & 15 deletions src/scribe_data/cli/interactive.py
Original file line number Diff line number Diff line change
@@ -20,21 +20,33 @@
-->
"""

import logging
from pathlib import Path
from typing import List

import questionary
from questionary import Choice
from rich import print as rprint
from rich.console import Console
from rich.logging import RichHandler
from rich.table import Table
from tqdm import tqdm

from scribe_data.cli.cli_utils import data_type_metadata, language_metadata
from scribe_data.cli.get import get_data
from scribe_data.cli.version import get_version_message
from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR

# MARK: Config Setup

logging.basicConfig(
level=logging.INFO,
format="%(message)s",
datefmt="[%X]",
handlers=[RichHandler(markup=True)], # Enable markup for colors
)
console = Console()
logger = logging.getLogger("rich")


class ScribeDataConfig:
@@ -60,9 +72,11 @@ def display_summary():
"""
Displays a summary of the interactive mode request to run.
"""
table = Table(title="Scribe-Data Configuration Summary")
table = Table(
title="Scribe-Data Request Configuration Summary", style="bright_white"
)

table.add_column("Setting", style="cyan")
table.add_column("Setting", style="bold cyan", no_wrap=True)
table.add_column("Value(s)", style="magenta")

table.add_row("Languages", ", ".join(config.selected_languages) or "None")
@@ -71,7 +85,9 @@ def display_summary():
table.add_row("Output Directory", str(config.output_dir))
table.add_row("Overwrite", "Yes" if config.overwrite else "No")

console.print(table)
console.print("\n")
console.print(table, justify="left")
console.print("\n")


def configure_settings():
@@ -107,7 +123,7 @@ def configure_settings():
rprint(
"[yellow]No language selected. Please select at least one option with space followed by enter.[/yellow]"
)
if questionary.confirm("Continue?").ask():
if questionary.confirm("Continue?", default=True).ask():
return configure_settings()

else:
@@ -135,7 +151,7 @@ def configure_settings():
rprint(
"[yellow]No data type selected. Please select at least one option with space followed by enter.[/yellow]"
)
if questionary.confirm("Continue?").ask():
if questionary.confirm("Continue?", default=True).ask():
return configure_settings()

if data_type_selected:
@@ -166,27 +182,40 @@ def run_request():
rprint("[bold red]Error: Please configure languages and data types.[/bold red]")
return

# MARK: Export Data
# Calculate total operations
total_operations = len(config.selected_languages) * len(config.selected_data_types)

with console.status("[bold green]Exporting data...[/bold green]") as status:
# MARK: Export Data
with tqdm(
total=total_operations,
desc="Exporting data",
unit="operation",
) as pbar:
for language in config.selected_languages:
for data_type in config.selected_data_types:
status.update(
f"[bold green]Exporting {language} {data_type} data...[/bold green]"
)
pbar.set_description(f"Exporting {language} {data_type} data")

get_data(
if get_data(
language=language,
data_type=data_type,
output_type=config.output_type,
output_dir=str(config.output_dir),
overwrite=config.overwrite,
all=config.output_type,
)
interactive=True,
):
logger.info(
f"[green]✔ Exported {language} {data_type} data.[/green]"
)

else:
logger.info(
f"[red]✘ Failed to export {language} {data_type} data.[/red]"
)

rprint(f"\n[green]✔[/green] Exported {language} {data_type} data.")
pbar.update(1)

rprint("[bold green]Data export completed successfully![/bold green]")
if config.overwrite:
rprint("[bold green]Data request completed successfully![/bold green]")


# MARK: Start
@@ -219,6 +248,7 @@ def start_interactive_mode():
break

else:
rprint("[bold cyan]Thank you for using Scribe-Data![/bold cyan]")
break


44 changes: 24 additions & 20 deletions src/scribe_data/wikidata/query_data.py
Original file line number Diff line number Diff line change
@@ -87,6 +87,7 @@ def query_data(
data_type: str = None,
output_dir: str = None,
overwrite: bool = None,
interactive: bool = False,
):
"""
Queries language data from the Wikidata lexicographical data.
@@ -102,8 +103,8 @@ def query_data(
output_dir : str
The output directory path for results.

overwrite : bool
Whether to overwrite existing files (default: False).
overwrite : bool (default: False)
Whether to overwrite existing files.

Returns
-------
@@ -155,6 +156,8 @@ def query_data(
queries_to_run,
desc="Data updated",
unit="process",
disable=interactive,
colour="MAGENTA",
):
lang = q.parent.parent.name
target_type = q.parent.name
@@ -172,24 +175,25 @@ def query_data(
for file in existing_files:
file.unlink()
else:
print(
f"\nExisting file(s) found for {lang} {target_type} in the {output_dir} directory:\n"
)
for i, file in enumerate(existing_files, 1):
print(f"{i}. {file.name}")

# choice = input(
# "\nChoose an option:\n1. Overwrite existing (press 'o')\n2. Keep all (press 'k')\n3. Skip process (press anything else)\nEnter your choice: "
# )

choice = input(
"\nChoose an option:\n1. Overwrite existing data (press 'o')\n2. Skip process (press anything else)\nEnter your choice: "
)

if choice.lower() == "o":
print("Removing existing files ...")
for file in existing_files:
file.unlink()
if not interactive:
print(
f"\nExisting file(s) found for {lang} {target_type} in the {output_dir} directory:\n"
)
for i, file in enumerate(existing_files, 1):
print(f"{i}. {file.name}")

# choice = input(
# "\nChoose an option:\n1. Overwrite existing (press 'o')\n2. Keep all (press 'k')\n3. Skip process (press anything else)\nEnter your choice: "
# )

choice = input(
"\nChoose an option:\n1. Overwrite existing data (press 'o')\n2. Skip process (press anything else)\nEnter your choice: "
)

if choice.lower() == "o":
print("Removing existing files ...")
for file in existing_files:
file.unlink()

# elif choice in ["k", "K"]:
# timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
7 changes: 7 additions & 0 deletions tests/cli/test_get.py
Original file line number Diff line number Diff line change
@@ -57,6 +57,7 @@ def test_get_specific_language_and_data_type(self, mock_query_data):
data_type=["nouns"],
output_dir="./test_output",
overwrite=False,
interactive=False,
)

# MARK: Capitalized Language
@@ -69,6 +70,7 @@ def test_get_data_with_capitalized_language(self, mock_query_data):
data_type=["nouns"],
output_dir="scribe_data_json_export",
overwrite=False,
interactive=False,
)

# MARK: Lowercase Language
@@ -81,6 +83,7 @@ def test_get_data_with_lowercase_language(self, mock_query_data):
data_type=["nouns"],
output_dir="scribe_data_json_export",
overwrite=False,
interactive=False,
)

# MARK: Output Directory
@@ -95,6 +98,7 @@ def test_get_data_with_different_output_directory(self, mock_query_data):
data_type=["nouns"],
output_dir="./custom_output_test",
overwrite=False,
interactive=False,
)

# MARK: Overwrite is True
@@ -107,6 +111,7 @@ def test_get_data_with_overwrite_true(self, mock_query_data):
data_type=["verbs"],
output_dir="scribe_data_json_export",
overwrite=True,
interactive=False,
)

# MARK: Overwrite is False
@@ -118,10 +123,12 @@ def test_get_data_with_overwrite_false(self, mock_query_data):
data_type="verbs",
overwrite=False,
output_dir="./custom_output_test",
interactive=False,
)
mock_query_data.assert_called_once_with(
languages=["English"],
data_type=["verbs"],
output_dir="./custom_output_test",
overwrite=False,
interactive=False,
)