Skip to content

Commit

Permalink
Edits to language metadata and supporting functions + pr checklist
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis authored and DeleMike committed Oct 18, 2024
1 parent 0b75b4e commit ad61c66
Show file tree
Hide file tree
Showing 9 changed files with 158 additions and 211 deletions.
1 change: 1 addition & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Thank you for your pull request! 🚀
<!-- Please replace the empty checkboxes [] below with checked ones [x] accordingly. -->

- [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch
- [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing)

---

Expand Down
11 changes: 11 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u
- [First steps as a contributor](#first-steps)
- [Learning the tech stack](#learning-the-tech)
- [Development environment](#dev-env)
- [Testing](#testing)
- [Issues and projects](#issues-projects)
- [Bug reports](#bug-reports)
- [Feature requests](#feature-requests)
Expand Down Expand Up @@ -171,6 +172,16 @@ pip install -e .
> [!NOTE]
> Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment set up!
<a id="testing"></a>

## Testing [`⇧`](#contents)

In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up in the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. Please run the following in the project root:

```bash
pytest
```

<a id="issues-projects"></a>

## Issues and projects [`⇧`](#contents)
Expand Down
81 changes: 40 additions & 41 deletions src/scribe_data/cli/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR

# MARK: CLI Variables

LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"

LANGUAGE_METADATA_FILE = (
Expand Down Expand Up @@ -56,20 +58,21 @@
language_map = {}
language_to_qid = {}

# Process each language and its potential sub-languages in one pass
for lang_key, lang_data in language_metadata.items():
lang_key_lower = lang_key.lower()
# Process each language and its potential sub-languages in one pass.
for lang, lang_data in language_metadata.items():
lang_lower = lang.lower()

# Handle sub-languages if they exist
# Handle sub-languages if they exist.
if "sub_languages" in lang_data:
for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items():
sub_lang_key_lower = sub_lang_key.lower()
language_map[sub_lang_key_lower] = sub_lang_data
language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"]
for sub_lang, sub_lang_data in lang_data["sub_languages"].items():
sub_lang_lower = sub_lang.lower()
language_map[sub_lang_lower] = sub_lang_data
language_to_qid[sub_lang_lower] = sub_lang_data["qid"]

else:
# Handle the main language directly
language_map[lang_key_lower] = lang_data
language_to_qid[lang_key_lower] = lang_data["qid"]
# Handle the main language directly.
language_map[lang_lower] = lang_data
language_to_qid[lang_lower] = lang_data["qid"]


# MARK: Correct Inputs
Expand Down Expand Up @@ -112,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None:
if isinstance(data, dict):
max_key_length = max((len(key) for key in data.keys()), default=0)

if data_type == "autosuggestions":
for key, value in data.items():
for key, value in data.items():
if data_type == "autosuggestions":
print(f"{key:<{max_key_length}} : {', '.join(value)}")

elif data_type == "emoji_keywords":
for key, value in data.items():
elif data_type == "emoji_keywords":
emojis = [item["emoji"] for item in value]
print(f"{key:<{max_key_length}} : {' '.join(emojis)}")

elif data_type in {"prepositions"}:
for key, value in data.items():
elif data_type in {"prepositions"}:
print(f"{key:<{max_key_length}} : {value}")

else:
for key, value in data.items():
if isinstance(value, dict):
print(f"{key:<{max_key_length}} : ")
max_sub_key_length = max(
(len(sub_key) for sub_key in value.keys()), default=0
)
for sub_key, sub_value in value.items():
print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")

elif isinstance(value, list):
print(f"{key:<{max_key_length}} : ")
for item in value:
if isinstance(item, dict):
for sub_key, sub_value in item.items():
print(f" {sub_key:<{max_key_length}} : {sub_value}")

else:
print(f" {item}")

else:
print(f"{key:<{max_key_length}} : {value}")
elif isinstance(value, dict):
print(f"{key:<{max_key_length}} : ")
max_sub_key_length = max(
(len(sub_key) for sub_key in value.keys()), default=0
)
for sub_key, sub_value in value.items():
print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")

elif isinstance(value, list):
print(f"{key:<{max_key_length}} : ")
for item in value:
if isinstance(item, dict):
for sub_key, sub_value in item.items():
print(f" {sub_key:<{max_key_length}} : {sub_value}")

else:
print(f" {item}")

else:
print(f"{key:<{max_key_length}} : {value}")

elif isinstance(data, list):
for item in data:
Expand Down Expand Up @@ -211,12 +210,12 @@ def validate_single_item(item, valid_options, item_type):
):
closest_match = difflib.get_close_matches(item, valid_options, n=1)
closest_match_str = (
f" The closest matching {item_type} is {closest_match[0]}."
f" The closest matching {item_type} is '{closest_match[0]}'."
if closest_match
else ""
)

return f"Invalid {item_type} {item}.{closest_match_str}"
return f"Invalid {item_type} '{item}'.{closest_match_str}"

return None

Expand Down
9 changes: 4 additions & 5 deletions src/scribe_data/cli/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,16 @@
"""

from scribe_data.cli.cli_utils import (
LANGUAGE_DATA_EXTRACTION_DIR,
correct_data_type,
language_metadata,
language_map,
LANGUAGE_DATA_EXTRACTION_DIR,
language_metadata,
)
from scribe_data.utils import (
list_all_languages,
format_sublanguage_name,
get_language_iso,
get_language_qid,
format_sublanguage_name,
list_all_languages,
)


Expand All @@ -39,7 +39,6 @@ def list_languages() -> None:
Generates a table of languages, their ISO-2 codes and their Wikidata QIDs.
"""
languages = list_all_languages(language_metadata)
languages.sort()

language_col_width = max(len(lang) for lang in languages) + 2
iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2
Expand Down
13 changes: 4 additions & 9 deletions src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
language_metadata,
language_to_qid,
)
from scribe_data.utils import format_sublanguage_name, list_all_languages
from scribe_data.wikidata.wikidata_utils import sparql
from scribe_data.utils import list_all_languages, format_sublanguage_name


def get_qid_by_input(input_str):
Expand Down Expand Up @@ -73,9 +73,8 @@ def get_datatype_list(language):
A list of the corresponding data types.
"""
languages = list_all_languages(language_metadata)
language_list = [lang for lang in languages]

if language.lower() in language_list:
if language.lower() in languages:
language_data = language_map.get(language.lower())
language_capitalized = format_sublanguage_name(
language, language_metadata
Expand Down Expand Up @@ -134,13 +133,9 @@ def print_total_lexemes(language: str = None):
print("=" * 64)

if language is None: # all languages
languages = list_all_languages(
language_metadata
) # this returns a list of language names
language_list = languages # sorts the list in place
language_list.sort()
languages = list_all_languages(language_metadata)

for lang in language_list:
for lang in languages:
data_types = get_datatype_list(lang)

first_row = True
Expand Down
32 changes: 16 additions & 16 deletions src/scribe_data/resources/language_metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
"iso": "bn",
"qid": "Q9610"
},
"chinese": {
"sub_languages": {
"mandarin": {
"iso": "zh",
"qid": "Q727694"
}
}
},
"czech": {
"iso": "cs",
"qid": "Q9056"
Expand Down Expand Up @@ -95,23 +103,15 @@
"iso": "ml",
"qid": "Q36236"
},
"chinese": {
"sub_languages": {
"mandarin": {
"iso": "zh",
"qid": "Q727694"
}
}
},
"norwegian": {
"sub_languages": {
"nynorsk": {
"iso": "nn",
"qid": "Q25164"
},
"bokmål": {
"iso": "nb",
"qid": "Q25167"
},
"nynorsk": {
"iso": "nn",
"qid": "Q25164"
}
}
},
Expand All @@ -133,13 +133,13 @@
},
"punjabi": {
"sub_languages": {
"shahmukhi": {
"iso": "pnb",
"qid": "Q58635"
},
"gurmukhi": {
"iso": "pa",
"qid": "Q58635"
},
"shahmukhi": {
"iso": "pnb",
"qid": "Q58635"
}
}
},
Expand Down
Loading

0 comments on commit ad61c66

Please sign in to comment.