diff --git a/docs/source/scribe_data/cli.rst b/docs/source/scribe_data/cli.rst index c99eaed2..5df9ee92 100644 --- a/docs/source/scribe_data/cli.rst +++ b/docs/source/scribe_data/cli.rst @@ -143,14 +143,31 @@ Options: - ``-ot, --output-type {json,csv,tsv}``: The output file type. - ``-ope, --outputs-per-entry OUTPUTS_PER_ENTRY``: How many outputs should be generated per data entry. - ``-o, --overwrite``: Whether to overwrite existing files (default: False). -- ``-a, --all ALL``: Get all languages and data types. +- ``-a, --all``: Get all languages and data types. Can be combined with ``-dt`` to get all languages for a specific data type, or with ``-lang`` to get all data types for a specific language. - ``-i, --interactive``: Run in interactive mode. -Example: +Examples: + +.. code-block:: bash + + $ scribe-data get --all + Getting data for all languages and all data types... + +.. code-block:: bash + + $ scribe-data get --all -dt nouns + Getting all nouns for all languages... + +.. code-block:: bash + + $ scribe-data get --all -lang English + Getting all data types for English... .. code-block:: bash $ scribe-data get -l English --data-type verbs -od ~/path/for/output + Getting and formatting English verbs + Data updated: 100%|████████████████████████| 1/1 [00:XY<00:00, XY.Zs/process] Behavior and Output: ^^^^^^^^^^^^^^^^^^^^ @@ -180,7 +197,7 @@ Behavior and Output: .. code-block:: text Getting and formatting English verbs - Data updated: 100%|████████████████████████| 1/1 [00:29<00:00, 29.73s/process] + Data updated: 100%|████████████████████████| 1/1 [00:XY<00:00, XY.Zs/process] 4. If no data is found, you'll see a warning: @@ -242,30 +259,63 @@ Usage: Options: ^^^^^^^^ -- ``-lang, --language LANGUAGE``: The language(s) to check totals for. +- ``-lang, --language LANGUAGE``: The language(s) to check totals for. Can be a language name or QID. - ``-dt, --data-type DATA_TYPE``: The data type(s) to check totals for. 
-- ``-a, --all ALL``: Get totals for all languages and data types. +- ``-a, --all``: Get totals for all languages and data types. Examples: .. code-block:: text - $scribe-data total -dt nouns # verbs, adjectives, etc - Data type: nouns - Total number of lexemes: 123456 + $ scribe-data total --all + Total lexemes for all languages and data types: + ============================================== + Language Data Type Total Lexemes + ============================================== + English nouns 123,456 + verbs 234,567 + ... .. code-block:: text - $scribe-data total -lang English - Language: English - Total number of lexemes: 123456 + $ scribe-data total --language English + Returning total counts for English data types... + + Language Data Type Total Wikidata Lexemes + ================================================================ + English adjectives 12,345 + adverbs 23,456 + nouns 34,567 + ... .. code-block:: text - $scribe-data total -lang English -dt nouns # verbs, adjectives, etc + $ scribe-data total --language Q1860 + Wikidata QID Q1860 passed. Checking all data types. + + Language Data Type Total Wikidata Lexemes + ================================================================ + Q1860 adjectives 12,345 + adverbs 23,456 + articles 30 + conjunctions 40 + nouns 56,789 + personal pronouns 60 + ... + +.. code-block:: text + + $ scribe-data total --language English -dt nouns Language: English Data type: nouns - Total number of lexemes: 12345 + Total number of lexemes: 12,345 + +.. 
code-block:: text + + $ scribe-data total --language Q1860 -dt verbs + Language: Q1860 + Data type: verbs + Total number of lexemes: 23,456 Convert Command ~~~~~~~~~~~~~~~ diff --git a/src/scribe_data/check/check_pyicu.py b/src/scribe_data/check/check_pyicu.py index a1f24cd8..9e9bdd50 100644 --- a/src/scribe_data/check/check_pyicu.py +++ b/src/scribe_data/check/check_pyicu.py @@ -57,7 +57,7 @@ def get_python_version_and_architecture(): def fetch_wheel_releases(): """ - Fetch the release data for PyICU from GitHub. + Fetch the release data for PyICU from GitHub with error handling for rate limits. Returns ------- diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index 75637050..6e9d0745 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -34,7 +34,9 @@ from rich.table import Table from tqdm import tqdm +# from scribe_data.cli.list import list_wrapper from scribe_data.cli.get import get_data +from scribe_data.cli.total import total_wrapper from scribe_data.cli.version import get_version_message from scribe_data.utils import ( DEFAULT_JSON_EXPORT_DIR, @@ -53,6 +55,7 @@ ) console = Console() logger = logging.getLogger("rich") +THANK_YOU_MESSAGE = "[bold cyan]Thank you for using Scribe-Data![/bold cyan]" class ScribeDataConfig: @@ -64,6 +67,7 @@ def __init__(self): self.output_type: str = "json" self.output_dir: Path = Path(DEFAULT_JSON_EXPORT_DIR) self.overwrite: bool = False + self.configured: bool = False config = ScribeDataConfig() @@ -94,6 +98,70 @@ def display_summary(): console.print("\n") +# Helper function to create a WordCompleter. +def create_word_completer( + options: List[str], include_all: bool = False +) -> WordCompleter: + if include_all: + options = ["All"] + options + return WordCompleter(options, ignore_case=True) + + +# MARK: Language Selection + + +def prompt_for_languages(): + """ + Prompts for the language(s) to use and stores the valid selections in the config. 
+ """ + language_completer = create_word_completer(config.languages, include_all=True) + initial_language_selection = ", ".join(config.selected_languages) + selected_languages = prompt( + "Select languages (comma-separated or 'All'): ", + default=initial_language_selection, + completer=language_completer, + ) + if "All" in selected_languages: + config.selected_languages = config.languages + elif selected_languages.strip(): # check if input is not just whitespace + config.selected_languages = [ + lang.strip() + for lang in selected_languages.split(",") + if lang.strip() in config.languages + ] + + if not config.selected_languages: + rprint("[yellow]No language selected. Please try again.[/yellow]") + return prompt_for_languages() + + +# MARK: Data Type Selection + + +def prompt_for_data_types(): + data_type_completer = create_word_completer(config.data_types, include_all=True) + initial_data_type_selection = ", ".join(config.selected_data_types) + while True: + selected_data_types = prompt( + "Select data types (comma-separated or 'All'): ", + default=initial_data_type_selection, + completer=data_type_completer, + ) + if "All" in selected_data_types.capitalize(): + config.selected_data_types = config.data_types + break + elif selected_data_types.strip(): # check if input is not just whitespace + config.selected_data_types = [ + dt.strip() + for dt in selected_data_types.split(",") + if dt.strip() in config.data_types + ] + if config.selected_data_types: + break # exit loop if valid data types are selected + + rprint("[yellow]No data type selected. Please try again.[/yellow]") + + def configure_settings(): """ Configures the settings of the interactive mode request. @@ -108,51 +176,12 @@ def configure_settings(): rprint( "[cyan]Follow the prompts below. 
Press tab for completions and enter to select.[/cyan]" ) - # MARK: Languages - language_completer = WordCompleter(["All"] + config.languages, ignore_case=True) - if not config.selected_languages: - selected_languages = prompt( - "Select languages (comma-separated or type 'All'): ", - completer=language_completer, - ) - - if "All" in selected_languages: - config.selected_languages = config.languages - else: - config.selected_languages = [ - lang.strip() - for lang in selected_languages.split(",") - if lang.strip() in config.languages - ] - - if not config.selected_languages: - rprint("[yellow]No language selected. Please try again.[/yellow]") - return configure_settings() - - # MARK: Data Types - - data_type_completer = WordCompleter(["All"] + config.data_types, ignore_case=True) - selected_data_types = prompt( - "Select data types (comma-separated or type 'All'): ", - completer=data_type_completer, - ) - - if "All" in selected_data_types.capitalize(): - config.selected_data_types = config.data_types - else: - config.selected_data_types = [ - dt.strip() - for dt in selected_data_types.split(",") - if dt.strip() in config.data_types - ] - - if not config.selected_data_types: - rprint("[yellow]No data type selected. 
Please try again.[/yellow]") - return configure_settings() + prompt_for_languages() + prompt_for_data_types() # MARK: Output Type - output_type_completer = WordCompleter(["json", "csv", "tsv"], ignore_case=True) + output_type_completer = create_word_completer(["json", "csv", "tsv"]) config.output_type = prompt( "Select output type (json/csv/tsv): ", completer=output_type_completer ) @@ -163,19 +192,18 @@ def configure_settings(): ) # MARK: Output Directory - if output_dir := prompt(f"Enter output directory (default: {config.output_dir}): "): config.output_dir = Path(output_dir) # MARK: Overwrite Confirmation - - overwrite_completer = WordCompleter(["Y", "n"], ignore_case=True) + overwrite_completer = create_word_completer(["Y", "n"]) overwrite = ( prompt("Overwrite existing files? (Y/n): ", completer=overwrite_completer) or "y" ) config.overwrite = overwrite.lower() == "y" + config.configured = True display_summary() @@ -223,37 +251,123 @@ def run_request(): rprint("[bold green]Data request completed successfully![/bold green]") -# MARK: Start - - -def start_interactive_mode(): +def request_total_lexeme_loop(): """ - Provides base options and forwarding to other interactive mode functionality. + Continuously prompts for lexeme requests until exit. 
""" - rprint( - f"[bold cyan]Welcome to {get_version_message()} interactive mode![/bold cyan]" - ) - while True: choice = questionary.select( "What would you like to do?", choices=[ - Choice("Configure request", "configure"), - Choice("Run configured data request", "run"), + Choice("Configure total lexemes request", "total"), + Choice("Run total lexemes request", "run"), Choice("Exit", "exit"), ], ).ask() + if choice == "run": + total_wrapper( + language=config.selected_languages, + data_type=config.selected_data_types, + all_bool=False, + ) + config.selected_languages, config.selected_data_types = [], [] + rprint(THANK_YOU_MESSAGE) + break + elif choice == "exit": + return + else: + prompt_for_languages() + prompt_for_data_types() + + +# MARK: List + +# def see_list_languages(): +# """ +# See list of languages. +# """ + +# choice = questionary.select( +# "What would you like to list?", +# choices=[ +# Choice("All languages", "all_languages"), +# Choice("Languages for a specific data type", "languages_for_data_type"), +# Choice("Data types for a specific language", "data_types_for_language"), +# ], +# ).ask() + +# if choice == "all_languages": +# list_wrapper(all_bool=True) +# elif choice == "languages_for_data_type": +# list_wrapper(data_type=True) +# elif choice == "data_types_for_language": +# list_wrapper(language=True) + + +# MARK: Start +def start_interactive_mode(operation: str = None): + """ + Entry point for interactive mode. + + Parameters + ---------- + operation : str + The type of operation that interactive mode is being ran with. + """ + rprint( + f"[bold cyan]Welcome to {get_version_message()} interactive mode![/bold cyan]" + ) + while True: + # Check if both selected_languages and selected_data_types are empty. 
+ if not config.selected_languages and not config.selected_data_types: + if operation == "get": + choices = [ + Choice("Configure get data request", "configure"), + # Choice("See list of languages", "languages"), + Choice("Exit", "exit"), + ] + + elif operation == "total": + choices = [ + Choice("Configure total lexemes request", "total"), + # Choice("See list of languages", "languages"), + Choice("Exit", "exit"), + ] + + else: + choices = [ + Choice("Configure get data request", "configure"), + Choice("Exit", "exit"), + ] + if config.configured: + choices.insert(1, Choice("Request for get data", "run")) + + else: + choices.insert(1, Choice("Request for total lexeme", "total")) + + choice = questionary.select("What would you like to do?", choices=choices).ask() + if choice == "configure": configure_settings() + elif choice == "total": + prompt_for_languages() + prompt_for_data_types() + request_total_lexeme_loop() + break + + # elif choice == "languages": + # see_list_languages() + # break + elif choice == "run": run_request() - rprint("[bold cyan]Thank you for using Scribe-Data![/bold cyan]") + rprint(THANK_YOU_MESSAGE) break else: - rprint("[bold cyan]Thank you for using Scribe-Data![/bold cyan]") + rprint(THANK_YOU_MESSAGE) break diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 313ab74d..3104267b 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -178,6 +178,9 @@ def main() -> None: action=argparse.BooleanOptionalAction, help="Check for all languages and data types.", ) + total_parser.add_argument( + "-i", "--interactive", action="store_true", help="Run in interactive mode" + ) # MARK: Convert @@ -273,7 +276,7 @@ def main() -> None: elif args.command in ["get", "g"]: if args.interactive: - start_interactive_mode() + start_interactive_mode(operation="get") else: get_data( @@ -287,9 +290,12 @@ def main() -> None: ) elif args.command in ["total", "t"]: - total_wrapper( - language=args.language, 
data_type=args.data_type, all_bool=args.all - ) + if args.interactive: + start_interactive_mode(operation="total") + else: + total_wrapper( + language=args.language, data_type=args.data_type, all_bool=args.all + ) elif args.command in ["convert", "c"]: convert_wrapper( diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 466f0c73..989dbd25 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -21,6 +21,7 @@ """ from http.client import IncompleteRead +from typing import List, Union from urllib.error import HTTPError import requests @@ -30,7 +31,6 @@ LANGUAGE_DATA_EXTRACTION_DIR, data_type_metadata, format_sublanguage_name, - language_map, language_metadata, language_to_qid, list_all_languages, @@ -65,7 +65,7 @@ def get_qid_by_input(input_str): def get_datatype_list(language): """ - Get the data types for a given language based on the project directory structure. + Get the data types for a given language based on the project directory structure, including handling sub-languages. Parameters ---------- @@ -77,29 +77,49 @@ def get_datatype_list(language): data_types : list[str] or None A list of the corresponding data types. """ + language_key = language.strip().lower() # normalize input languages = list_all_languages(language_metadata) - if language.lower() in languages: - language_data = language_map.get(language.lower()) - languages = format_sublanguage_name(language, language_metadata) - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language + # Adjust language_key for sub-languages using the format_sublanguage_name function. 
+ formatted_language = format_sublanguage_name(language_key, language_metadata) + language_key = formatted_language.split("/")[ + 0 + ].lower() # use the main language part if formatted + + if language_key in languages: + if "sub_languages" in language_metadata[language_key]: + sub_languages = language_metadata[language_key]["sub_languages"] + data_types = [] + + for sub_lang_key in sub_languages: + sub_lang_dir = ( + LANGUAGE_DATA_EXTRACTION_DIR / sub_languages[sub_lang_key]["iso"] + ) + if sub_lang_dir.exists(): + data_types.extend( + [f.name for f in sub_lang_dir.iterdir() if f.is_dir()] + ) + + if not data_types: + raise ValueError( + f"No data types available for sub-languages of '{formatted_language}'." + ) + + return sorted(set(data_types)) # remove duplicates and sort - if not language_data: - raise ValueError(f"Language '{language}' is not recognized.") - - data_types = [f.name for f in language_dir.iterdir() if f.is_dir()] - if not data_types: - raise ValueError( - f"No data types available for language '{language.capitalize()}'." - ) + else: + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_key + if not language_dir.exists(): + raise ValueError(f"Directory '{language_dir}' does not exist.") - data_types = sorted(data_types) + data_types = [f.name for f in language_dir.iterdir() if f.is_dir()] - for t in ["autosuggestions", "emoji_keywords"]: - if t in data_types: - data_types.remove(t) + if not data_types: + raise ValueError( + f"No data types available for language '{formatted_language}'." + ) - return data_types + return sorted(data_types) else: # return all data types return data_type_metadata @@ -171,15 +191,16 @@ def print_total_lexemes(language: str = None): else: print(f"Returning total counts for {language} data types...\n") - def print_total_header(): + def print_total_header(language, dt, total_lexemes): """ Prints the header of the total command output. 
""" + language_display = ( + "All Languages" if language is None else language.capitalize() + ) print(f"{'Language':<20} {'Data Type':<25} {'Total Wikidata Lexemes':<25}") print("=" * 70) - print( - f"{language.capitalize():<20} {dt.replace('_', '-'): <25} {total_lexemes:<25}" - ) + print(f"{language_display:<20} {dt.replace('_', '-'): <25} {total_lexemes:<25}") if language is None: # all languages languages = list_all_languages(language_metadata) @@ -192,7 +213,7 @@ def print_total_header(): total_lexemes = get_total_lexemes(lang, dt, False) total_lexemes = f"{total_lexemes:,}" if first_row: - print_total_header() + print_total_header(lang, dt, total_lexemes) first_row = False else: @@ -215,7 +236,7 @@ def print_total_header(): total_lexemes = get_total_lexemes(language, dt, False) total_lexemes = f"{total_lexemes:,}" if first_row: - print_total_header() + print_total_header(language, dt, total_lexemes) first_row = False else: @@ -343,18 +364,21 @@ def get_total_lexemes(language, data_type, doPrint=True): def total_wrapper( - language: str = None, data_type: str = None, all_bool: bool = False + language: Union[str, List[str]] = None, + data_type: Union[str, List[str]] = None, + all_bool: bool = False, ) -> None: """ Conditionally provides the full functionality of the total command. + Now accepts lists for language and data type to output a table of total lexemes. Parameters ---------- - language : str - The language to potentially total data types for. + language : Union[str, List[str]] + The language(s) to potentially total data types for. - data_type : str - The data type to check for. + data_type : Union[str, List[str]] + The data type(s) to check for. all_bool : boolean Whether all languages and data types should be listed. 
@@ -363,6 +387,31 @@ def total_wrapper( if (not language and not data_type) and all_bool: print_total_lexemes() + elif isinstance(language, list) or isinstance(data_type, list): + languages = language if isinstance(language, list) else [language] + data_types = data_type if isinstance(data_type, list) else [data_type] + + print(f"{'Language':<20} {'Data Type':<25} {'Total Lexemes':<25}") + print("=" * 70) + + for lang in languages: + first_row = ( + True # flag to check if it's the first data type for the language + ) + for dt in data_types: + total_lexemes = get_total_lexemes(lang, dt, False) + total_lexemes = ( + f"{total_lexemes:,}" if total_lexemes is not None else "N/A" + ) + if first_row: + print(f"{lang:<20} {dt:<25} {total_lexemes:<25}") + first_row = False + else: + print( + f"{'':<20} {dt:<25} {total_lexemes:<25}" + ) # print empty space for language + print() + elif language is not None and data_type is None: print_total_lexemes(language) diff --git a/tests/cli/test_interactive.py b/tests/cli/test_interactive.py new file mode 100644 index 00000000..cf99997a --- /dev/null +++ b/tests/cli/test_interactive.py @@ -0,0 +1,180 @@ +""" +Tests for the interactive mode functions. + +.. raw:: html + +""" + +import unittest +from pathlib import Path +from unittest.mock import MagicMock, call, patch + +from scribe_data.cli.interactive import ( + ScribeDataConfig, + configure_settings, + display_summary, + prompt_for_data_types, + prompt_for_languages, + run_request, +) + + +class TestScribeDataInteractive(unittest.TestCase): + def setUp(self): + """Set up test fixtures before each test method.""" + self.config = ScribeDataConfig() + # Mock the language_metadata and data_type_metadata. 
+ self.config.languages = ["english", "spanish", "french"] + self.config.data_types = ["nouns", "verbs"] + + def test_scribe_data_config_initialization(self): + """Test ScribeDataConfig initialization.""" + self.assertEqual(self.config.selected_languages, []) + self.assertEqual(self.config.selected_data_types, []) + self.assertEqual(self.config.output_type, "json") + self.assertIsInstance(self.config.output_dir, Path) + self.assertFalse(self.config.overwrite) + self.assertFalse(self.config.configured) + + @patch("scribe_data.cli.interactive.prompt") + @patch("scribe_data.cli.interactive.rprint") + def test_configure_settings_all_languages(self, mock_rprint, mock_prompt): + """Test configure_settings with 'All' languages selection.""" + # Set up mock responses. + responses = iter( + [ + "All", # languages + "nouns", # data types + "json", # output type + "", # output directory (default) + "y", # overwrite + ] + ) + mock_prompt.side_effect = lambda *args, **kwargs: next(responses) + + with patch("scribe_data.cli.interactive.config", self.config): + with patch("scribe_data.cli.interactive.display_summary"): + configure_settings() + + self.assertEqual(self.config.selected_languages, self.config.languages) + self.assertEqual(self.config.selected_data_types, ["nouns"]) + self.assertEqual(self.config.output_type, "json") + self.assertTrue(self.config.configured) + + @patch("scribe_data.cli.interactive.prompt") + @patch("scribe_data.cli.interactive.rprint") + def test_configure_settings_specific_languages(self, mock_rprint, mock_prompt): + """Test configure_settings with specific language selection.""" + # Set up mock responses. 
+ responses = iter( + [ + "english, spanish", # languages + "nouns, verbs", # data types + "csv", # output type + "/custom/path", # output directory + "n", # overwrite + ] + ) + mock_prompt.side_effect = lambda *args, **kwargs: next(responses) + + with patch("scribe_data.cli.interactive.config", self.config): + with patch("scribe_data.cli.interactive.display_summary"): + configure_settings() + + self.assertEqual(self.config.selected_languages, ["english", "spanish"]) + self.assertEqual(self.config.selected_data_types, ["nouns", "verbs"]) + self.assertEqual(self.config.output_type, "csv") + self.assertEqual(self.config.output_dir.as_posix(), "/custom/path") + self.assertFalse(self.config.overwrite) + + @patch("scribe_data.cli.interactive.get_data") + @patch("scribe_data.cli.interactive.tqdm") + @patch("scribe_data.cli.interactive.logger") + def test_run_request(self, mock_logger, mock_tqdm, mock_get_data): + """Test run_request functionality.""" + self.config.selected_languages = ["english"] + self.config.selected_data_types = ["nouns"] + self.config.configured = True + + mock_get_data.return_value = True + mock_progress = MagicMock() + mock_tqdm.return_value.__enter__.return_value = mock_progress + + with patch("scribe_data.cli.interactive.config", self.config): + run_request() + + mock_get_data.assert_called_once_with( + language="english", + data_type="nouns", + output_type=self.config.output_type, + output_dir=str(self.config.output_dir), + overwrite=self.config.overwrite, + interactive=True, + ) + + @patch("scribe_data.cli.interactive.prompt") + @patch("scribe_data.cli.interactive.rprint") + def test_request_total_lexeme(self, mock_rprint, mock_prompt): + """Test request_total_lexeme functionality.""" + # Set up mock responses. 
+ mock_prompt.side_effect = [ + "english, french", # first call for languages + "nouns", # first call for data types + ] + + with patch("scribe_data.cli.interactive.config", self.config): + with patch( + "scribe_data.cli.interactive.list_all_languages", + return_value=["english", "french"], + ): + prompt_for_languages() + prompt_for_data_types() + + # Verify the config was updated correctly. + self.assertEqual(self.config.selected_languages, ["english", "french"]) + self.assertEqual(self.config.selected_data_types, ["nouns"]) + + # Verify prompt was called with correct arguments. + expected_calls = [ + call( + "Select languages (comma-separated or 'All'): ", + completer=unittest.mock.ANY, + default="", + ), + call( + "Select data types (comma-separated or 'All'): ", + completer=unittest.mock.ANY, + default="", + ), + ] + mock_prompt.assert_has_calls(expected_calls, any_order=False) + + @patch("rich.console.Console.print") + def test_display_summary(self, mock_print): + """Test display_summary functionality.""" + self.config.selected_languages = ["english"] + self.config.selected_data_types = ["nouns"] + self.config.output_type = "json" + + with patch("scribe_data.cli.interactive.config", self.config): + display_summary() + mock_print.assert_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index f601c26d..0bbe340f 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -24,8 +24,11 @@ from unittest.mock import MagicMock, call, patch from scribe_data.cli.total import ( + check_qid_is_language, + get_datatype_list, get_qid_by_input, get_total_lexemes, + total_wrapper, ) @@ -133,6 +136,42 @@ def test_get_total_lexemes_various_data_types(self, mock_query, mock_get_qid): ] mock_print.assert_has_calls(expected_calls) + @patch("scribe_data.cli.total.get_qid_by_input") + @patch("scribe_data.cli.total.sparql.query") + @patch("scribe_data.cli.total.LANGUAGE_DATA_EXTRACTION_DIR") + def 
test_get_total_lexemes_sub_languages(self, mock_dir, mock_query, mock_get_qid): + # Setup for sub-languages. + mock_get_qid.side_effect = lambda x: { + "bokmål": "Q25167", + "nynorsk": "Q25164", + }.get(x.lower()) + mock_results = MagicMock() + mock_results.convert.return_value = { + "results": {"bindings": [{"total": {"value": "30"}}]} + } + mock_query.return_value = mock_results + + # Mocking directory paths and contents. + mock_dir.__truediv__.return_value.exists.return_value = True + mock_dir.__truediv__.return_value.iterdir.return_value = [ + MagicMock(name="verbs", is_dir=lambda: True), + MagicMock(name="nouns", is_dir=lambda: True), + ] + + with patch("builtins.print") as mock_print: + get_total_lexemes("Norwegian", "verbs") + get_total_lexemes("Norwegian", "nouns") + + expected_calls = [ + call( + "\nLanguage: Norwegian\nData type: verbs\nTotal number of lexemes: 30\n" + ), + call( + "\nLanguage: Norwegian\nData type: nouns\nTotal number of lexemes: 30\n" + ), + ] + mock_print.assert_has_calls(expected_calls) + class TestGetQidByInput(unittest.TestCase): def setUp(self): @@ -154,3 +193,70 @@ def test_get_qid_by_input_invalid(self, mock_data_type_metadata): mock_data_type_metadata.update(self.valid_data_types) self.assertIsNone(get_qid_by_input("invalid_data_type")) + + +class TestGetDatatypeList(unittest.TestCase): + @patch("scribe_data.cli.total.LANGUAGE_DATA_EXTRACTION_DIR") + def test_get_datatype_list_invalid_language(self, mock_dir): + mock_dir.__truediv__.return_value.exists.return_value = False + + with self.assertRaises(ValueError): + get_datatype_list("InvalidLanguage") + + @patch("scribe_data.cli.total.LANGUAGE_DATA_EXTRACTION_DIR") + def test_get_datatype_list_no_data_types(self, mock_dir): + mock_dir.__truediv__.return_value.exists.return_value = True + mock_dir.__truediv__.return_value.iterdir.return_value = [] + + with self.assertRaises(ValueError): + get_datatype_list("English") + + +class TestCheckQidIsLanguage(unittest.TestCase): + 
@patch("scribe_data.cli.total.requests.get") + def test_check_qid_is_language_valid(self, mock_get): + mock_response = MagicMock() + mock_response.json.return_value = { + "statements": {"P31": [{"value": {"content": "Q34770"}}]}, + "labels": {"en": "English"}, + } + mock_get.return_value = mock_response + + with patch("builtins.print") as mock_print: + result = check_qid_is_language("Q1860") + + self.assertEqual(result, "English") + mock_print.assert_called_once_with("English (Q1860) is a language.\n") + + @patch("scribe_data.cli.total.requests.get") + def test_check_qid_is_language_invalid(self, mock_get): + mock_response = MagicMock() + mock_response.json.return_value = { + "statements": {"P31": [{"value": {"content": "Q5"}}]}, + "labels": {"en": "Human"}, + } + mock_get.return_value = mock_response + + with self.assertRaises(ValueError): + check_qid_is_language("Q5") + + +class TestTotalWrapper(unittest.TestCase): + @patch("scribe_data.cli.total.print_total_lexemes") + def test_total_wrapper_all_bool(self, mock_print_total_lexemes): + total_wrapper(all_bool=True) + mock_print_total_lexemes.assert_called_once_with() + + @patch("scribe_data.cli.total.print_total_lexemes") + def test_total_wrapper_language_only(self, mock_print_total_lexemes): + total_wrapper(language="English") + mock_print_total_lexemes.assert_called_once_with("English") + + @patch("scribe_data.cli.total.get_total_lexemes") + def test_total_wrapper_language_and_data_type(self, mock_get_total_lexemes): + total_wrapper(language="English", data_type="nouns") + mock_get_total_lexemes.assert_called_once_with("English", "nouns") + + def test_total_wrapper_invalid_input(self): + with self.assertRaises(ValueError): + total_wrapper()