Skip to content

Commit

Permalink
Merge pull request scribe-org#402 from OmarAI2003/refactor-languages_…
Browse files Browse the repository at this point in the history
…metadata.json-and-rework-references

Moving from Old Language Metadata Structure to Support Sub-languages and Simplified JSON
  • Loading branch information
andrewtavis authored Oct 18, 2024
2 parents eec4622 + 661b131 commit 9df1756
Show file tree
Hide file tree
Showing 13 changed files with 568 additions and 369 deletions.
1 change: 1 addition & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Thank you for your pull request! 🚀
<!-- Please replace the empty checkboxes [] below with checked ones [x] accordingly. -->

- [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch
- [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing)

---

Expand Down
11 changes: 11 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u
- [First steps as a contributor](#first-steps)
- [Learning the tech stack](#learning-the-tech)
- [Development environment](#dev-env)
- [Testing](#testing)
- [Issues and projects](#issues-projects)
- [Bug reports](#bug-reports)
- [Feature requests](#feature-requests)
Expand Down Expand Up @@ -171,6 +172,16 @@ pip install -e .
> [!NOTE]
> Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment set up!
<a id="testing"></a>

## Testing [``](#contents)

In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up during the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. Please run the following in the project root:

```bash
pytest
```

<a id="issues-projects"></a>

## Issues and projects [``](#contents)
Expand Down
82 changes: 45 additions & 37 deletions src/scribe_data/cli/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR

# MARK: CLI Variables

LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"

LANGUAGE_METADATA_FILE = (
Expand All @@ -53,14 +55,24 @@
print(f"Error reading data type metadata: {e}")


language_map = {
lang["language"].lower(): lang for lang in language_metadata["languages"]
}
language_map = {}
language_to_qid = {}

# Process each language and its potential sub-languages in one pass.
for lang, lang_data in language_metadata.items():
lang_lower = lang.lower()

# Create language_to_qid dictionary.
language_to_qid = {
lang["language"].lower(): lang["qid"] for lang in language_metadata["languages"]
}
# Handle sub-languages if they exist.
if "sub_languages" in lang_data:
for sub_lang, sub_lang_data in lang_data["sub_languages"].items():
sub_lang_lower = sub_lang.lower()
language_map[sub_lang_lower] = sub_lang_data
language_to_qid[sub_lang_lower] = sub_lang_data["qid"]

else:
# Handle the main language directly.
language_map[lang_lower] = lang_data
language_to_qid[lang_lower] = lang_data["qid"]


# MARK: Correct Inputs
Expand Down Expand Up @@ -103,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None:
if isinstance(data, dict):
max_key_length = max((len(key) for key in data.keys()), default=0)

if data_type == "autosuggestions":
for key, value in data.items():
for key, value in data.items():
if data_type == "autosuggestions":
print(f"{key:<{max_key_length}} : {', '.join(value)}")

elif data_type == "emoji_keywords":
for key, value in data.items():
elif data_type == "emoji_keywords":
emojis = [item["emoji"] for item in value]
print(f"{key:<{max_key_length}} : {' '.join(emojis)}")

elif data_type in {"prepositions"}:
for key, value in data.items():
elif data_type in {"prepositions"}:
print(f"{key:<{max_key_length}} : {value}")

else:
for key, value in data.items():
if isinstance(value, dict):
print(f"{key:<{max_key_length}} : ")
max_sub_key_length = max(
(len(sub_key) for sub_key in value.keys()), default=0
)
for sub_key, sub_value in value.items():
print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")

elif isinstance(value, list):
print(f"{key:<{max_key_length}} : ")
for item in value:
if isinstance(item, dict):
for sub_key, sub_value in item.items():
print(f" {sub_key:<{max_key_length}} : {sub_value}")

else:
print(f" {item}")

else:
print(f"{key:<{max_key_length}} : {value}")
elif isinstance(value, dict):
print(f"{key:<{max_key_length}} : ")
max_sub_key_length = max(
(len(sub_key) for sub_key in value.keys()), default=0
)
for sub_key, sub_value in value.items():
print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")

elif isinstance(value, list):
print(f"{key:<{max_key_length}} : ")
for item in value:
if isinstance(item, dict):
for sub_key, sub_value in item.items():
print(f" {sub_key:<{max_key_length}} : {sub_value}")

else:
print(f" {item}")

else:
print(f"{key:<{max_key_length}} : {value}")

elif isinstance(data, list):
for item in data:
Expand Down Expand Up @@ -202,12 +210,12 @@ def validate_single_item(item, valid_options, item_type):
):
closest_match = difflib.get_close_matches(item, valid_options, n=1)
closest_match_str = (
f" The closest matching {item_type} is {closest_match[0]}."
f" The closest matching {item_type} is '{closest_match[0]}'."
if closest_match
else ""
)

return f"Invalid {item_type} {item}.{closest_match_str}"
return f"Invalid {item_type} '{item}'.{closest_match_str}"

return None

Expand Down
6 changes: 2 additions & 4 deletions src/scribe_data/cli/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from scribe_data.cli.cli_utils import data_type_metadata, language_metadata
from scribe_data.cli.get import get_data
from scribe_data.cli.version import get_version_message
from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR, list_all_languages

# MARK: Config Setup

Expand All @@ -51,9 +51,7 @@

class ScribeDataConfig:
def __init__(self):
self.languages = [
lang["language"].capitalize() for lang in language_metadata["languages"]
]
self.languages = list_all_languages(language_metadata)
self.data_types = list(data_type_metadata.keys())
self.selected_languages: List[str] = []
self.selected_data_types: List[str] = []
Expand Down
40 changes: 26 additions & 14 deletions src/scribe_data/cli/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,28 @@
"""

from scribe_data.cli.cli_utils import (
LANGUAGE_DATA_EXTRACTION_DIR,
correct_data_type,
language_metadata,
language_map,
LANGUAGE_DATA_EXTRACTION_DIR,
language_metadata,
)
from scribe_data.utils import (
format_sublanguage_name,
get_language_iso,
get_language_qid,
list_all_languages,
)


def list_languages() -> None:
"""
Generates a table of languages, their ISO-2 codes and their Wikidata QIDs.
"""
languages = list(language_metadata["languages"])
languages.sort(key=lambda x: x["language"])
languages = list_all_languages(language_metadata)

language_col_width = max(len(lang["language"]) for lang in languages) + 2
iso_col_width = max(len(lang["iso"]) for lang in languages) + 2
qid_col_width = max(len(lang["qid"]) for lang in languages) + 2
language_col_width = max(len(lang) for lang in languages) + 2
iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2
qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2

table_line_length = language_col_width + iso_col_width + qid_col_width

Expand All @@ -49,7 +54,7 @@ def list_languages() -> None:

for lang in languages:
print(
f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}"
f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
)

print("-" * table_line_length)
Expand All @@ -65,7 +70,9 @@ def list_data_types(language: str = None) -> None:
language : str
The language to potentially list data types for.
"""
languages = list_all_languages(language_metadata)
if language:
language = format_sublanguage_name(language, language_metadata)
language_data = language_map.get(language.lower())
language_capitalized = language.capitalize()
language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized
Expand All @@ -83,8 +90,11 @@ def list_data_types(language: str = None) -> None:

else:
data_types = set()
for lang in language_metadata["languages"]:
language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize()
for lang in languages:
language_dir = (
LANGUAGE_DATA_EXTRACTION_DIR
/ format_sublanguage_name(lang, language_metadata).capitalize()
)
if language_dir.is_dir():
data_types.update(f.name for f in language_dir.iterdir() if f.is_dir())

Expand Down Expand Up @@ -122,13 +132,15 @@ def list_languages_for_data_type(data_type: str) -> None:
The data type to check for.
"""
data_type = correct_data_type(data_type=data_type)
all_languages = list_all_languages(language_metadata)
available_languages = []
for lang in language_metadata["languages"]:
language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize()
for lang in all_languages:
lang = format_sublanguage_name(lang, language_metadata)
language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang
if language_dir.is_dir():
dt_path = language_dir / data_type
if dt_path.exists():
available_languages.append(lang["language"])
available_languages.append(lang)

available_languages.sort()
table_header = f"Available languages: {data_type}"
Expand All @@ -141,7 +153,7 @@ def list_languages_for_data_type(data_type: str) -> None:
print("-" * table_line_length)

for lang in available_languages:
print(f"{lang.capitalize()}")
print(f"{lang}")

print("-" * table_line_length)
print()
Expand Down
16 changes: 8 additions & 8 deletions src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
language_metadata,
language_to_qid,
)
from scribe_data.utils import format_sublanguage_name, list_all_languages
from scribe_data.wikidata.wikidata_utils import sparql


Expand Down Expand Up @@ -71,12 +72,13 @@ def get_datatype_list(language):
data_types : list[str] or None
A list of the corresponding data types.
"""
languages = list(language_metadata["languages"])
language_list = [lang["language"] for lang in languages]
languages = list_all_languages(language_metadata)

if language.lower() in language_list:
if language.lower() in languages:
language_data = language_map.get(language.lower())
language_capitalized = language.capitalize()
language_capitalized = format_sublanguage_name(
language, language_metadata
).capitalize()
language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized

if not language_data:
Expand Down Expand Up @@ -131,11 +133,9 @@ def print_total_lexemes(language: str = None):
print("=" * 64)

if language is None: # all languages
languages = list(language_metadata["languages"])
languages.sort(key=lambda x: x["language"])
language_list = [lang["language"] for lang in languages]
languages = list_all_languages(language_metadata)

for lang in language_list:
for lang in languages:
data_types = get_datatype_list(lang)

first_row = True
Expand Down
4 changes: 2 additions & 2 deletions src/scribe_data/load/data_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
DEFAULT_SQLITE_EXPORT_DIR,
get_language_iso,
)
from scribe_data.utils import list_all_languages


def data_to_sqlite(
Expand All @@ -52,8 +53,7 @@ def data_to_sqlite(
current_language_data = json.load(f_languages)
data_types = json.load(f_data_types).keys()

current_languages = [d["language"] for d in current_language_data["languages"]]

current_languages = list_all_languages(current_language_data)
if not languages:
languages = current_languages

Expand Down
Loading

0 comments on commit 9df1756

Please sign in to comment.