Edits to language metadata and supporting functions + pr checklist

scribe-org · Oct 18, 2024 · ad61c66 · ad61c66
1 parent 0b75b4e
commit ad61c66
Show file tree

Hide file tree

Showing 9 changed files with 158 additions and 211 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -7,6 +7,7 @@ Thank you for your pull request! 🚀
 <!-- Please replace the empty checkboxes [] below with checked ones [x] accordingly. -->
 
 - [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch
+- [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing)
 
 ---
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u
 - [First steps as a contributor](#first-steps)
 - [Learning the tech stack](#learning-the-tech)
 - [Development environment](#dev-env)
+- [Testing](#testing)
 - [Issues and projects](#issues-projects)
 - [Bug reports](#bug-reports)
 - [Feature requests](#feature-requests)
@@ -171,6 +172,16 @@ pip install -e .
 > [!NOTE]
 > Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment set up!
 
+<a id="testing"></a>
+
+## Testing [`⇧`](#contents)
+
+In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up in the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. Please run the following in the project root:
+
+```bash
+pytest
+```
+
 <a id="issues-projects"></a>
 
 ## Issues and projects [`⇧`](#contents)

diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
@@ -27,6 +27,8 @@
 
 from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
 
+# MARK: CLI Variables
+
 LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"
 
 LANGUAGE_METADATA_FILE = (
@@ -56,20 +58,21 @@
 language_map = {}
 language_to_qid = {}
 
-# Process each language and its potential sub-languages in one pass
-for lang_key, lang_data in language_metadata.items():
-    lang_key_lower = lang_key.lower()
+# Process each language and its potential sub-languages in one pass.
+for lang, lang_data in language_metadata.items():
+    lang_lower = lang.lower()
 
-    # Handle sub-languages if they exist
+    # Handle sub-languages if they exist.
     if "sub_languages" in lang_data:
-        for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items():
-            sub_lang_key_lower = sub_lang_key.lower()
-            language_map[sub_lang_key_lower] = sub_lang_data
-            language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"]
+        for sub_lang, sub_lang_data in lang_data["sub_languages"].items():
+            sub_lang_lower = sub_lang.lower()
+            language_map[sub_lang_lower] = sub_lang_data
+            language_to_qid[sub_lang_lower] = sub_lang_data["qid"]
+
     else:
-        # Handle the main language directly
-        language_map[lang_key_lower] = lang_data
-        language_to_qid[lang_key_lower] = lang_data["qid"]
+        # Handle the main language directly.
+        language_map[lang_lower] = lang_data
+        language_to_qid[lang_lower] = lang_data["qid"]
 
 
 # MARK: Correct Inputs
@@ -112,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None:
     if isinstance(data, dict):
         max_key_length = max((len(key) for key in data.keys()), default=0)
 
-        if data_type == "autosuggestions":
-            for key, value in data.items():
+        for key, value in data.items():
+            if data_type == "autosuggestions":
                 print(f"{key:<{max_key_length}} : {', '.join(value)}")
 
-        elif data_type == "emoji_keywords":
-            for key, value in data.items():
+            elif data_type == "emoji_keywords":
                 emojis = [item["emoji"] for item in value]
                 print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
 
-        elif data_type in {"prepositions"}:
-            for key, value in data.items():
+            elif data_type in {"prepositions"}:
                 print(f"{key:<{max_key_length}} : {value}")
 
-        else:
-            for key, value in data.items():
-                if isinstance(value, dict):
-                    print(f"{key:<{max_key_length}} : ")
-                    max_sub_key_length = max(
-                        (len(sub_key) for sub_key in value.keys()), default=0
-                    )
-                    for sub_key, sub_value in value.items():
-                        print(f"  {sub_key:<{max_sub_key_length}} : {sub_value}")
-
-                elif isinstance(value, list):
-                    print(f"{key:<{max_key_length}} : ")
-                    for item in value:
-                        if isinstance(item, dict):
-                            for sub_key, sub_value in item.items():
-                                print(f"  {sub_key:<{max_key_length}} : {sub_value}")
-
-                        else:
-                            print(f"  {item}")
-
-                else:
-                    print(f"{key:<{max_key_length}} : {value}")
+            elif isinstance(value, dict):
+                print(f"{key:<{max_key_length}} : ")
+                max_sub_key_length = max(
+                    (len(sub_key) for sub_key in value.keys()), default=0
+                )
+                for sub_key, sub_value in value.items():
+                    print(f"  {sub_key:<{max_sub_key_length}} : {sub_value}")
+
+            elif isinstance(value, list):
+                print(f"{key:<{max_key_length}} : ")
+                for item in value:
+                    if isinstance(item, dict):
+                        for sub_key, sub_value in item.items():
+                            print(f"  {sub_key:<{max_key_length}} : {sub_value}")
+
+                    else:
+                        print(f"  {item}")
+
+            else:
+                print(f"{key:<{max_key_length}} : {value}")
 
     elif isinstance(data, list):
         for item in data:
@@ -211,12 +210,12 @@ def validate_single_item(item, valid_options, item_type):
         ):
             closest_match = difflib.get_close_matches(item, valid_options, n=1)
             closest_match_str = (
-                f" The closest matching {item_type} is {closest_match[0]}."
+                f" The closest matching {item_type} is '{closest_match[0]}'."
                 if closest_match
                 else ""
             )
 
-            return f"Invalid {item_type} {item}.{closest_match_str}"
+            return f"Invalid {item_type} '{item}'.{closest_match_str}"
 
         return None
 

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
@@ -21,16 +21,16 @@
 """
 
 from scribe_data.cli.cli_utils import (
+    LANGUAGE_DATA_EXTRACTION_DIR,
     correct_data_type,
-    language_metadata,
     language_map,
-    LANGUAGE_DATA_EXTRACTION_DIR,
+    language_metadata,
 )
 from scribe_data.utils import (
-    list_all_languages,
+    format_sublanguage_name,
     get_language_iso,
     get_language_qid,
-    format_sublanguage_name,
+    list_all_languages,
 )
 
 
@@ -39,7 +39,6 @@ def list_languages() -> None:
     Generates a table of languages, their ISO-2 codes and their Wikidata QIDs.
     """
     languages = list_all_languages(language_metadata)
-    languages.sort()
 
     language_col_width = max(len(lang) for lang in languages) + 2
     iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2

diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
@@ -29,8 +29,8 @@
     language_metadata,
     language_to_qid,
 )
+from scribe_data.utils import format_sublanguage_name, list_all_languages
 from scribe_data.wikidata.wikidata_utils import sparql
-from scribe_data.utils import list_all_languages, format_sublanguage_name
 
 
 def get_qid_by_input(input_str):
@@ -73,9 +73,8 @@ def get_datatype_list(language):
             A list of the corresponding data types.
     """
     languages = list_all_languages(language_metadata)
-    language_list = [lang for lang in languages]
 
-    if language.lower() in language_list:
+    if language.lower() in languages:
         language_data = language_map.get(language.lower())
         language_capitalized = format_sublanguage_name(
             language, language_metadata
@@ -134,13 +133,9 @@ def print_total_lexemes(language: str = None):
     print("=" * 64)
 
     if language is None:  # all languages
-        languages = list_all_languages(
-            language_metadata
-        )  # this returns a list of language names
-        language_list = languages  # sorts the list in place
-        language_list.sort()
+        languages = list_all_languages(language_metadata)
 
-        for lang in language_list:
+        for lang in languages:
             data_types = get_datatype_list(lang)
 
             first_row = True

diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
@@ -11,6 +11,14 @@
     "iso": "bn",
     "qid": "Q9610"
   },
+  "chinese": {
+    "sub_languages": {
+      "mandarin": {
+        "iso": "zh",
+        "qid": "Q727694"
+      }
+    }
+  },
   "czech": {
     "iso": "cs",
     "qid": "Q9056"
@@ -95,23 +103,15 @@
     "iso": "ml",
     "qid": "Q36236"
   },
-  "chinese": {
-    "sub_languages": {
-      "mandarin": {
-        "iso": "zh",
-        "qid": "Q727694"
-      }
-    }
-  },
   "norwegian": {
     "sub_languages": {
-      "nynorsk": {
-        "iso": "nn",
-        "qid": "Q25164"
-      },
       "bokmål": {
         "iso": "nb",
         "qid": "Q25167"
+      },
+      "nynorsk": {
+        "iso": "nn",
+        "qid": "Q25164"
       }
     }
   },
@@ -133,13 +133,13 @@
   },
   "punjabi": {
     "sub_languages": {
-      "shahmukhi": {
-        "iso": "pnb",
-        "qid": "Q58635"
-      },
       "gurmukhi": {
         "iso": "pa",
         "qid": "Q58635"
+      },
+      "shahmukhi": {
+        "iso": "pnb",
+        "qid": "Q58635"
       }
     }
   },
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,6 +7,7 @@ Thank you for your pull request! 🚀 @@
     <!-- Please replace the empty checkboxes [] below with checked ones [x] accordingly. -->
     - [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch
+    - [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing)
     ---
@@ Expand Down @@