sub_languages logic fix in total and update rst doc #497

Closed
wants to merge 3 commits into from
113 changes: 85 additions & 28 deletions docs/source/scribe_data/cli.rst
@@ -143,14 +143,31 @@ Options:
- ``-ot, --output-type {json,csv,tsv}``: The output file type.
- ``-ope, --outputs-per-entry OUTPUTS_PER_ENTRY``: How many outputs should be generated per data entry.
- ``-o, --overwrite``: Whether to overwrite existing files (default: False).
- ``-a, --all ALL``: Get all languages and data types.
- ``-a, --all``: Get all languages and data types. Can be combined with ``-dt`` to get all languages for a specific data type, or with ``-lang`` to get all data types for a specific language.
- ``-i, --interactive``: Run in interactive mode.

Example:
Examples:

.. code-block:: bash

$ scribe-data get --all
Getting data for all languages and all data types...

.. code-block:: bash

$ scribe-data get --all -dt nouns
Getting all nouns for all languages...

.. code-block:: bash

$ scribe-data get --all -lang English
Getting all data types for English...

.. code-block:: bash

$ scribe-data get -l English --data-type verbs -od ~/path/for/output
Getting and formatting English verbs
Data updated: 100%|████████████████████████| 1/1 [00:29<00:00, 29.73s/process]

Behavior and Output:
^^^^^^^^^^^^^^^^^^^^
@@ -210,23 +227,30 @@ Interactive Mode
.. code-block:: text

$ scribe-data get -i
Welcome to Scribe-Data interactive mode!
Language options:
1. English
2. French
3. German
...

Please enter the languages to get data for, their numbers or (a) for all languages: 1

Data type options:
1. autosuggestions
2. emoji_keywords
3. nouns
4. prepositions
5. verbs

...
Welcome to Scribe-Data v3.3.0 interactive mode!
? What would you like to do? Configure request
Follow the prompts below. Press tab for completions and enter to select.
Select languages (comma-separated or type 'All'): english
Select data types (comma-separated or type 'All'): nouns
Select output type (json/csv/tsv): json
Enter output directory (default: scribe_data_json_export):
Overwrite existing files? (Y/n):

Scribe-Data Request Configuration Summary
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Setting          ┃ Value(s)                ┃
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ Languages        │ english                 │
│ Data Types       │ nouns                   │
│ Output Type      │ json                    │
│ Output Directory │ scribe_data_json_export │
│ Overwrite        │ Yes                     │
└──────────────────┴─────────────────────────┘

? What would you like to do? (Use arrow keys)
» Configure request
Run configured data request
Exit
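
The arrow-key menu and boxed summary table above are the kind of output produced by terminal prompt and rendering libraries. Purely as an illustration (questionary and rich are assumptions here, not a statement about scribe-data's implementation), similar output can be produced as follows:

.. code-block:: python

    # Illustrative sketch only: an arrow-key menu and a summary table like the ones above.
    import questionary
    from rich.console import Console
    from rich.table import Table

    choice = questionary.select(
        "What would you like to do?",
        choices=["Configure request", "Run configured data request", "Exit"],
    ).ask()

    table = Table(title="Scribe-Data Request Configuration Summary")
    table.add_column("Setting")
    table.add_column("Value(s)")
    table.add_row("Languages", "english")
    table.add_row("Data Types", "nouns")
    table.add_row("Output Type", "json")
    table.add_row("Output Directory", "scribe_data_json_export")
    table.add_row("Overwrite", "Yes")
    Console().print(table)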

Total Command
~~~~~~~~~~~~~
@@ -242,31 +266,64 @@ Usage:
Options:
^^^^^^^^

- ``-lang, --language LANGUAGE``: The language(s) to check totals for.
- ``-lang, --language LANGUAGE``: The language(s) to check totals for. Can be a language name or QID.
- ``-dt, --data-type DATA_TYPE``: The data type(s) to check totals for.
- ``-a, --all ALL``: Get totals for all languages and data types.
- ``-a, --all``: Get totals for all languages and data types.

Examples:

.. code-block:: text

$scribe-data total -dt nouns # verbs, adjectives, etc
Data type: nouns
Total number of lexemes: 123456
$ scribe-data total --all
Total lexemes for all languages and data types:
==============================================
Language        Data Type       Total Lexemes
==============================================
English         nouns           123456
                verbs           234567
...

.. code-block:: text

$scribe-data total -lang English
Language: English
Total number of lexemes: 123456
$ scribe-data total --language English
Returning total counts for English data types...

Language    Data Type            Total Wikidata Lexemes
================================================================
English     adjectives           12,848
            adverbs              19,998
            nouns                30,786
...

.. code-block:: text

$scribe-data total -lang English -dt nouns # verbs, adjectives, etc
$ scribe-data total --language Q1860
Wikidata QID Q1860 passed. Checking all data types.

Language    Data Type            Total Wikidata Lexemes
================================================================
Q1860       adjectives           12,848
            adverbs              19,998
            articles             0
            conjunctions         72
            nouns                30,786
            personal pronouns    32
...

.. code-block:: text

$ scribe-data total --language English -dt nouns
Language: English
Data type: nouns
Total number of lexemes: 12345

.. code-block:: text

$ scribe-data total --language Q1860 -dt verbs
Language: Q1860
Data type: verbs
Total number of lexemes: 23456
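
The totals reported above are counts of lexemes on Wikidata. As a rough illustration of how such a count can be reproduced directly, the following sketch queries the public Wikidata SPARQL endpoint with SPARQLWrapper; it is not scribe-data's internal query, and the lexical-category QID for nouns (Q1084) is an assumption here.

.. code-block:: python

    # Illustrative sketch only: count English (Q1860) nouns (Q1084, assumed) on Wikidata.
    from SPARQLWrapper import JSON, SPARQLWrapper

    sparql = SPARQLWrapper(
        "https://query.wikidata.org/sparql", agent="scribe-data docs example"
    )
    sparql.setReturnFormat(JSON)
    sparql.setQuery(
        """
        SELECT (COUNT(DISTINCT ?lexeme) AS ?total) WHERE {
            ?lexeme dct:language wd:Q1860 ;
                    wikibase:lexicalCategory wd:Q1084 .
        }
        """
    )

    results = sparql.query().convert()
    total = results["results"]["bindings"][0]["total"]["value"]
    print(f"Total number of lexemes: {total}")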

Convert Command
~~~~~~~~~~~~~~~

62 changes: 38 additions & 24 deletions src/scribe_data/cli/total.py
@@ -29,8 +29,6 @@
from scribe_data.utils import (
    LANGUAGE_DATA_EXTRACTION_DIR,
    data_type_metadata,
    format_sublanguage_name,
    language_map,
    language_metadata,
    language_to_qid,
    list_all_languages,
@@ -65,7 +63,7 @@ def get_qid_by_input(input_str):

def get_datatype_list(language):
"""
Get the data types for a given language based on the project directory structure.
Get the data types for a given language based on the project directory structure, including handling sub-languages.

Parameters
----------
@@ -77,29 +75,45 @@ def get_datatype_list(language):
    data_types : list[str] or None
        A list of the corresponding data types.
    """
    language_key = language.strip().lower()  # Normalize input
    languages = list_all_languages(language_metadata)

    if language.lower() in languages:
        language_data = language_map.get(language.lower())
        languages = format_sublanguage_name(language, language_metadata)
        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language

        if not language_data:
            raise ValueError(f"Language '{language}' is not recognized.")

        data_types = [f.name for f in language_dir.iterdir() if f.is_dir()]
        if not data_types:
            raise ValueError(
                f"No data types available for language '{language.capitalize()}'."
            )

        data_types = sorted(data_types)

        for t in ["autosuggestions", "emoji_keywords"]:
            if t in data_types:
                data_types.remove(t)

        return data_types
    # Adjust language_key for sub-languages
    for lang, data in language_metadata.items():
        if "sub_languages" in data:
            for sub_lang in data["sub_languages"]:
                if sub_lang.lower() == language_key:
                    language_key = lang.lower()
                    break

    if language_key in languages:
        # language_data = language_map.get(language_key)
        if "sub_languages" in language_metadata[language_key]:
            sub_languages = language_metadata[language_key]["sub_languages"]
            data_types = []
            for sub_lang_key in sub_languages:
                sub_lang_dir = (
                    LANGUAGE_DATA_EXTRACTION_DIR / sub_languages[sub_lang_key]["iso"]
                )
                if sub_lang_dir.exists():
                    data_types.extend(
                        [f.name for f in sub_lang_dir.iterdir() if f.is_dir()]
                    )
            if not data_types:
                raise ValueError(
                    f"No data types available for sub-languages of '{language.capitalize()}'."
                )
            return sorted(set(data_types))  # Remove duplicates and sort
        else:
            language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_key
            if not language_dir.exists():
                raise ValueError(f"Directory '{language_dir}' does not exist.")
            data_types = [f.name for f in language_dir.iterdir() if f.is_dir()]
            if not data_types:
                raise ValueError(
                    f"No data types available for language '{language.capitalize()}'."
                )
            return sorted(data_types)

    else:  # return all data types
        return data_type_metadata
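
To make the sub-language branch above concrete, here is a small self-contained sketch of the same idea. The metadata shape (a sub_languages mapping whose entries carry an iso code) follows the diff, but the Norwegian values, ISO codes, and directory path below are illustrative assumptions rather than copies of what scribe_data.utils actually contains.

from pathlib import Path

# Illustrative metadata only; the real language_metadata lives in scribe_data.utils.
language_metadata = {
    "norwegian": {
        "sub_languages": {
            "bokmål": {"iso": "nb", "qid": "Q25167"},
            "nynorsk": {"iso": "nn", "qid": "Q25164"},
        }
    },
    "english": {"iso": "en", "qid": "Q1860"},
}

# Assumed path for the sketch; the real constant is imported from scribe_data.utils.
LANGUAGE_DATA_EXTRACTION_DIR = Path("scribe_data/language_data_extraction")


def resolve_language_key(language: str) -> str:
    """Map a sub-language name (e.g. 'Bokmål') to its parent key (e.g. 'norwegian')."""
    language_key = language.strip().lower()
    for lang, data in language_metadata.items():
        if language_key in (s.lower() for s in data.get("sub_languages", {})):
            return lang
    return language_key


def sub_language_data_types(language_key: str) -> list[str]:
    """Collect data type directories across all of a language's sub-language ISO folders."""
    sub_languages = language_metadata[language_key].get("sub_languages", {})
    data_types: set[str] = set()
    for sub_lang in sub_languages.values():
        sub_lang_dir = LANGUAGE_DATA_EXTRACTION_DIR / sub_lang["iso"]
        if sub_lang_dir.exists():
            data_types.update(f.name for f in sub_lang_dir.iterdir() if f.is_dir())
    return sorted(data_types)


print(resolve_language_key("Bokmål"))  # norwegian
print(sub_language_data_types("norwegian"))  # e.g. ['nouns', 'verbs'] if those dirs exist
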
36 changes: 36 additions & 0 deletions tests/cli/test_total.py
@@ -133,6 +133,42 @@ def test_get_total_lexemes_various_data_types(self, mock_query, mock_get_qid):
        ]
        mock_print.assert_has_calls(expected_calls)

    @patch("scribe_data.cli.total.get_qid_by_input")
    @patch("scribe_data.cli.total.sparql.query")
    @patch("scribe_data.cli.total.LANGUAGE_DATA_EXTRACTION_DIR")
    def test_get_total_lexemes_sub_languages(self, mock_dir, mock_query, mock_get_qid):
        # Setup for sub-languages
        mock_get_qid.side_effect = lambda x: {
            "bokmål": "Q25167",
            "nynorsk": "Q25164",
        }.get(x.lower())
        mock_results = MagicMock()
        mock_results.convert.return_value = {
            "results": {"bindings": [{"total": {"value": "30"}}]}
        }
        mock_query.return_value = mock_results

        # Mocking directory paths and contents
        mock_dir.__truediv__.return_value.exists.return_value = True
        mock_dir.__truediv__.return_value.iterdir.return_value = [
            MagicMock(name="verbs", is_dir=lambda: True),
            MagicMock(name="nouns", is_dir=lambda: True),
        ]

        with patch("builtins.print") as mock_print:
            get_total_lexemes("Norwegian", "verbs")
            get_total_lexemes("Norwegian", "nouns")

        expected_calls = [
            call(
                "\nLanguage: Norwegian\nData type: verbs\nTotal number of lexemes: 30\n"
            ),
            call(
                "\nLanguage: Norwegian\nData type: nouns\nTotal number of lexemes: 30\n"
            ),
        ]
        mock_print.assert_has_calls(expected_calls)
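
One detail worth noting about directory mocks like the ones above: unittest.mock reserves the name argument of MagicMock for the mock's repr, so MagicMock(name="verbs").name is a child mock rather than the string "verbs". If a test needs f.name to compare equal to a directory name, a sketch like the following sets the attribute explicitly:

# Sketch: make .name return a real string, since the name kwarg is reserved by Mock.
from unittest.mock import MagicMock

mock_verbs_dir = MagicMock(is_dir=lambda: True)
mock_verbs_dir.name = "verbs"  # set after construction

mock_nouns_dir = MagicMock()
mock_nouns_dir.configure_mock(name="nouns", **{"is_dir.return_value": True})

assert mock_verbs_dir.name == "verbs" and mock_verbs_dir.is_dir()
assert mock_nouns_dir.name == "nouns" and mock_nouns_dir.is_dir()
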


class TestGetQidByInput(unittest.TestCase):
    def setUp(self):