diff --git a/docs/source/scribe_data/cli.rst b/docs/source/scribe_data/cli.rst index c99eaed2..5218891c 100644 --- a/docs/source/scribe_data/cli.rst +++ b/docs/source/scribe_data/cli.rst @@ -143,14 +143,31 @@ Options: - ``-ot, --output-type {json,csv,tsv}``: The output file type. - ``-ope, --outputs-per-entry OUTPUTS_PER_ENTRY``: How many outputs should be generated per data entry. - ``-o, --overwrite``: Whether to overwrite existing files (default: False). -- ``-a, --all ALL``: Get all languages and data types. +- ``-a, --all``: Get all languages and data types. Can be combined with ``-dt`` to get all languages for a specific data type, or with ``-lang`` to get all data types for a specific language. - ``-i, --interactive``: Run in interactive mode. -Example: +Examples: + +.. code-block:: bash + + $ scribe-data get --all + Getting data for all languages and all data types... + +.. code-block:: bash + + $ scribe-data get --all -dt nouns + Getting all nouns for all languages... + +.. code-block:: bash + + $ scribe-data get --all -lang English + Getting all data types for English... .. code-block:: bash $ scribe-data get -l English --data-type verbs -od ~/path/for/output + Getting and formatting English verbs + Data updated: 100%|████████████████████████| 1/1 [00:29<00:00, 29.73s/process] Behavior and Output: ^^^^^^^^^^^^^^^^^^^^ @@ -210,23 +227,30 @@ Interactive Mode .. code-block:: text $ scribe-data get -i - Welcome to Scribe-Data interactive mode! - Language options: - 1. English - 2. French - 3. German - ... - - Please enter the languages to get data for, their numbers or (a) for all languages: 1 - - Data type options: - 1. autosuggestions - 2. emoji_keywords - 3. nouns - 4. prepositions - 5. verbs - - ... + Welcome to Scribe-Data v3.3.0 interactive mode! + ? What would you like to do? Configure request + Follow the prompts below. Press tab for completions and enter to select. 
+ Select languages (comma-separated or type 'All'): english + Select data types (comma-separated or type 'All'): nouns + Select output type (json/csv/tsv): json + Enter output directory (default: scribe_data_json_export): + Overwrite existing files? (Y/n): + + Scribe-Data Request Configuration Summary + ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Setting ┃ Value(s) ┃ + ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ Languages │ english │ + │ Data Types │ nouns │ + │ Output Type │ json │ + │ Output Directory │ scribe_data_json_export │ + │ Overwrite │ Yes │ + └──────────────────┴─────────────────────────┘ + + ? What would you like to do? (Use arrow keys) + » Configure request + Run configured data request + Exit Total Command ~~~~~~~~~~~~~ @@ -242,31 +266,64 @@ Usage: Options: ^^^^^^^^ -- ``-lang, --language LANGUAGE``: The language(s) to check totals for. +- ``-lang, --language LANGUAGE``: The language(s) to check totals for. Can be a language name or QID. - ``-dt, --data-type DATA_TYPE``: The data type(s) to check totals for. -- ``-a, --all ALL``: Get totals for all languages and data types. +- ``-a, --all``: Get totals for all languages and data types. Examples: .. code-block:: text - $scribe-data total -dt nouns # verbs, adjectives, etc - Data type: nouns - Total number of lexemes: 123456 + $ scribe-data total --all + Total lexemes for all languages and data types: + ============================================== + Language Data Type Total Lexemes + ============================================== + English nouns 123456 + verbs 234567 + ... .. code-block:: text - $scribe-data total -lang English - Language: English - Total number of lexemes: 123456 + $ scribe-data total --language English + Returning total counts for English data types... + + Language Data Type Total Wikidata Lexemes + ================================================================ + English adjectives 12,848 + adverbs 19,998 + nouns 30,786 + ... .. 
code-block:: text - $scribe-data total -lang English -dt nouns # verbs, adjectives, etc + $ scribe-data total --language Q1860 + Wikidata QID Q1860 passed. Checking all data types. + + Language Data Type Total Wikidata Lexemes + ================================================================ + Q1860 adjectives 12,848 + adverbs 19,998 + articles 0 + conjunctions 72 + nouns 30,786 + personal pronouns 32 + ... + +.. code-block:: text + + $ scribe-data total --language English -dt nouns Language: English Data type: nouns Total number of lexemes: 12345 +.. code-block:: text + + $ scribe-data total --language Q1860 -dt verbs + Language: Q1860 + Data type: verbs + Total number of lexemes: 23456 + Convert Command ~~~~~~~~~~~~~~~ diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 466f0c73..1f4f2d25 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -29,8 +29,6 @@ from scribe_data.utils import ( LANGUAGE_DATA_EXTRACTION_DIR, data_type_metadata, - format_sublanguage_name, - language_map, language_metadata, language_to_qid, list_all_languages, @@ -65,7 +63,7 @@ def get_qid_by_input(input_str): def get_datatype_list(language): """ - Get the data types for a given language based on the project directory structure. + Get the data types for a given language based on the project directory structure, including handling sub-languages. Parameters ---------- @@ -77,29 +75,45 @@ def get_datatype_list(language): data_types : list[str] or None A list of the corresponding data types. 
""" + language_key = language.strip().lower() # Normalize input languages = list_all_languages(language_metadata) - if language.lower() in languages: - language_data = language_map.get(language.lower()) - languages = format_sublanguage_name(language, language_metadata) - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language - - if not language_data: - raise ValueError(f"Language '{language}' is not recognized.") - - data_types = [f.name for f in language_dir.iterdir() if f.is_dir()] - if not data_types: - raise ValueError( - f"No data types available for language '{language.capitalize()}'." - ) - - data_types = sorted(data_types) - - for t in ["autosuggestions", "emoji_keywords"]: - if t in data_types: - data_types.remove(t) - - return data_types + # Adjust language_key for sub-languages + for lang, data in language_metadata.items(): + if "sub_languages" in data: + for sub_lang in data["sub_languages"]: + if sub_lang.lower() == language_key: + language_key = lang.lower() + break + + if language_key in languages: + # language_data = language_map.get(language_key) + if "sub_languages" in language_metadata[language_key]: + sub_languages = language_metadata[language_key]["sub_languages"] + data_types = [] + for sub_lang_key in sub_languages: + sub_lang_dir = ( + LANGUAGE_DATA_EXTRACTION_DIR / sub_languages[sub_lang_key]["iso"] + ) + if sub_lang_dir.exists(): + data_types.extend( + [f.name for f in sub_lang_dir.iterdir() if f.is_dir()] + ) + if not data_types: + raise ValueError( + f"No data types available for sub-languages of '{language.capitalize()}'." + ) + return sorted(set(data_types)) # Remove duplicates and sort + else: + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_key + if not language_dir.exists(): + raise ValueError(f"Directory '{language_dir}' does not exist.") + data_types = [f.name for f in language_dir.iterdir() if f.is_dir()] + if not data_types: + raise ValueError( + f"No data types available for language '{language.capitalize()}'." 
+ ) + return sorted(data_types) else: # return all data types return data_type_metadata diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index f601c26d..1a5d254c 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -133,6 +133,42 @@ def test_get_total_lexemes_various_data_types(self, mock_query, mock_get_qid): ] mock_print.assert_has_calls(expected_calls) + @patch("scribe_data.cli.total.get_qid_by_input") + @patch("scribe_data.cli.total.sparql.query") + @patch("scribe_data.cli.total.LANGUAGE_DATA_EXTRACTION_DIR") + def test_get_total_lexemes_sub_languages(self, mock_dir, mock_query, mock_get_qid): + # Setup for sub-languages + mock_get_qid.side_effect = lambda x: { + "bokmål": "Q25167", + "nynorsk": "Q25164", + }.get(x.lower()) + mock_results = MagicMock() + mock_results.convert.return_value = { + "results": {"bindings": [{"total": {"value": "30"}}]} + } + mock_query.return_value = mock_results + + # Mocking directory paths and contents + mock_dir.__truediv__.return_value.exists.return_value = True + mock_dir.__truediv__.return_value.iterdir.return_value = [ + MagicMock(name="verbs", is_dir=lambda: True), + MagicMock(name="nouns", is_dir=lambda: True), + ] + + with patch("builtins.print") as mock_print: + get_total_lexemes("Norwegian", "verbs") + get_total_lexemes("Norwegian", "nouns") + + expected_calls = [ + call( + "\nLanguage: Norwegian\nData type: verbs\nTotal number of lexemes: 30\n" + ), + call( + "\nLanguage: Norwegian\nData type: nouns\nTotal number of lexemes: 30\n" + ), + ] + mock_print.assert_has_calls(expected_calls) + class TestGetQidByInput(unittest.TestCase): def setUp(self):