diff --git a/.github/workflows/check_project_structure.yaml b/.github/workflows/check_project_structure.yaml new file mode 100644 index 000000000..6c131e0d8 --- /dev/null +++ b/.github/workflows/check_project_structure.yaml @@ -0,0 +1,23 @@ +name: Check Project Structure +on: + push: + branches: [main] + pull_request: + branches: [main] + types: [opened, reopened, synchronize] + +jobs: + structure-check: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run check_project_structure.py + working-directory: ./src/scribe_data/check + run: python check_project_structure.py + + - name: Post-run status + if: failure() + run: echo "Project structure check failed. Please fix the reported errors." diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 780da47da..3a601fe60 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -1,4 +1,4 @@ -name: check_query_identifiers +name: Check Query Identifiers on: push: branches: [main] diff --git a/.github/workflows/pr_ci.yaml b/.github/workflows/pr_ci.yaml index 0f317ee6a..9946fb02e 100644 --- a/.github/workflows/pr_ci.yaml +++ b/.github/workflows/pr_ci.yaml @@ -1,4 +1,4 @@ -name: pr_ci +name: CI on: push: branches: [main] diff --git a/.github/workflows/pr_maintainer_checklist.yaml b/.github/workflows/pr_maintainer_checklist.yaml index bee8e4f41..61566ce9c 100644 --- a/.github/workflows/pr_maintainer_checklist.yaml +++ b/.github/workflows/pr_maintainer_checklist.yaml @@ -1,4 +1,4 @@ -name: pr_maintainer_checklist +name: PR Maintainer Checklist on: pull_request_target: branches: diff --git a/src/scribe_data/check/check_project_structure.py b/src/scribe_data/check/check_project_structure.py new file mode 100644 index 000000000..4dcb21e32 --- /dev/null +++ b/src/scribe_data/check/check_project_structure.py @@ -0,0 +1,162 @@ +import os + +# Expected languages and 
# Names accepted for the language directories found directly under the base directory.
LANGUAGES = {
    "Arabic",
    "Basque",
    "Bengali",
    "Chinese",
    "Czech",
    "Danish",
    "English",
    "Esperanto",
    "Estonian",
    "Finnish",
    "French",
    "German",
    "Greek",
    "Hausa",
    "Hebrew",
    "Hindustani",
    "Indonesian",
    "Italian",
    "Japanese",
    "Korean",
    "Kurmanji",
    "Latin",
    "Malay",
    "Malayalam",
    "Norwegian",
    "Pidgin",
    "Polish",
    "Portuguese",
    "Punjabi",
    "Russian",
    "Slovak",
    "Spanish",
    "Swahili",
    "Swedish",
    "Tajik",
    "Tamil",
    "Ukrainian",
    "Yoruba",
}

# Names accepted for the data type directories inside a language (or language variant) directory.
DATA_TYPES = {
    "adjectives",
    "adverbs",
    "articles",
    "autosuggestions",
    "conjunctions",
    "emoji_keywords",
    "nouns",
    "personal_pronouns",
    "postpositions",
    "prepositions",
    "pronouns",
    "proper_nouns",
    "verbs",
}

# Languages whose directory holds variant sub-subdirectories instead of data types,
# mapped to the variant directory names that are expected.
SUB_DIRECTORIES = {
    "Chinese": ["Mandarin"],
    "Hindustani": ["Urdu", "Hindi"],
    "Norwegian": ["Nynorsk", "Bokmål"],
    "Pidgin": ["Nigerian"],
    "Punjabi": ["Shahmukhi", "Gurmukhi"],
}


# Base directory path.
BASE_DIR = "../language_data_extraction"  # Relative path: resolved against the current working directory.


def _split_entries(directory):
    """
    Return the names of the regular files and of the subdirectories of a directory.

    Parameters
    ----------
    directory : str
        Path of an existing directory.

    Returns
    -------
    tuple of (list of str, list of str)
        The file names and the subdirectory names, each sorted by name.
    """
    files = []
    subdirs = []
    # os.listdir returns entries in arbitrary order; sort so errors are reported in a stable order.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if os.path.isfile(path):
            files.append(name)
        elif os.path.isdir(path):
            subdirs.append(name)

    return files, subdirs


def _unexpected_file_errors(files, where):
    """
    Build one error message for each file other than ``__init__.py``.

    Parameters
    ----------
    files : list of str
        File names found in one directory.

    where : str
        Description of that directory used in the message.

    Returns
    -------
    list of str
        The error messages, in the order of ``files``.
    """
    return [
        f"Unexpected file found in {where}: {name}"
        for name in files
        if name != "__init__.py"
    ]


def _find_structure_errors(base_dir, languages, data_types, sub_directories):
    """
    Collect every structure violation found below a base directory.

    Parameters
    ----------
    base_dir : str
        Existing directory that contains one subdirectory per language.

    languages : set of str
        Accepted language directory names.

    data_types : set of str
        Accepted data type directory names.

    sub_directories : dict
        Maps a language name to the variant directory names expected inside it.

    Returns
    -------
    list of str
        One message per violation; empty when the structure is valid.
    """
    root_files, language_dirs = _split_entries(base_dir)
    errors = _unexpected_file_errors(root_files, "BASE_DIR")

    for language in language_dirs:
        if language not in languages:
            errors.append(f"Unexpected language directory: {language}")
            continue

        language_path = os.path.join(base_dir, language)
        files, subdirs = _split_entries(language_path)
        errors.extend(_unexpected_file_errors(files, f"{language} directory"))
        found_subdirs = set(subdirs)

        if language not in sub_directories:
            # Ordinary language: its subdirectories must all be data types.
            if unexpected_data_types := found_subdirs - data_types:
                errors.append(
                    f"Unexpected subdirectories in '{language}': {unexpected_data_types}"
                )
            continue

        # Language with variants: its subdirectories must be exactly the expected variants.
        expected_subdirs = set(sub_directories[language])
        unexpected_subdirs = found_subdirs - expected_subdirs
        missing_subdirs = expected_subdirs - found_subdirs

        if unexpected_subdirs:
            errors.append(
                f"Unexpected sub-subdirectories in '{language}': {unexpected_subdirs}"
            )
        if missing_subdirs:
            errors.append(
                f"Missing sub-subdirectories in '{language}': {missing_subdirs}"
            )

        for subdir in sorted(expected_subdirs):
            subdir_path = os.path.join(language_path, subdir)
            # isdir, not exists: an expected name that is a regular file cannot be listed
            # and has already been reported above as missing and as an unexpected file.
            if not os.path.isdir(subdir_path):
                continue

            variant_files, variant_dirs = _split_entries(subdir_path)
            errors.extend(
                _unexpected_file_errors(variant_files, f"{language}/{subdir}")
            )
            errors.extend(
                f"Unexpected directory found in {language}/{subdir}: {name}"
                for name in variant_dirs
                if name not in data_types
            )

    return errors


def validate_project_structure(
    base_dir=None, languages=None, data_types=None, sub_directories=None
):
    """
    Validate that all directories follow the expected project structure.

    Every entry directly under the base directory must be an ``__init__.py``
    file or a known language directory. A language directory may hold
    ``__init__.py`` and either data type directories or, for languages listed
    in the sub-directory mapping, exactly the expected variant directories,
    which in turn may hold ``__init__.py`` and data type directories.

    Parameters
    ----------
    base_dir : str, optional
        Directory to check. Defaults to ``BASE_DIR``.

    languages : iterable of str, optional
        Accepted language directory names. Defaults to ``LANGUAGES``.

    data_types : iterable of str, optional
        Accepted data type directory names. Defaults to ``DATA_TYPES``.

    sub_directories : dict, optional
        Maps a language to its expected variant directory names.
        Defaults to ``SUB_DIRECTORIES``.

    Returns
    -------
    None
        A success message is printed when no problem is found.

    Raises
    ------
    SystemExit
        With exit code 1, after printing the problems, when the base directory
        is not an existing directory or when any violation is found.
    """
    # Defaults are resolved at call time so the module constants stay the single source of truth.
    base_dir = BASE_DIR if base_dir is None else base_dir
    languages = set(LANGUAGES if languages is None else languages)
    data_types = set(DATA_TYPES if data_types is None else data_types)
    sub_directories = SUB_DIRECTORIES if sub_directories is None else sub_directories

    if not os.path.isdir(base_dir):
        print(f"Error: Base directory '{base_dir}' does not exist.")
        # The bare exit() builtin is injected by the site module and is not always available.
        raise SystemExit(1)

    errors = _find_structure_errors(base_dir, languages, data_types, sub_directories)

    if errors:
        print("Errors found:")
        for error in errors:
            print(f" - {error}")
        raise SystemExit(1)

    print(
        "All directories and files are correctly named and organized, and no unexpected files or directories were found."
    )


if __name__ == "__main__":
    validate_project_structure()
a/src/scribe_data/language_data_extraction/Esperanto/emoji_keywords/__init__.py b/src/scribe_data/language_data_extraction/Esperanto/emoji_keywords/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/scribe_data/language_data_extraction/Esperanto/Emoji_Keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Esperanto/emoji_keywords/generate_emoji_keywords.py similarity index 100% rename from src/scribe_data/language_data_extraction/Esperanto/Emoji_Keywords/generate_emoji_keywords.py rename to src/scribe_data/language_data_extraction/Esperanto/emoji_keywords/generate_emoji_keywords.py diff --git a/src/scribe_data/language_data_extraction/Hausa/Emoji_keywords/__init__.py b/src/scribe_data/language_data_extraction/Hausa/Emoji_keywords/__init__.py deleted file mode 100644 index 8b1378917..000000000 --- a/src/scribe_data/language_data_extraction/Hausa/Emoji_keywords/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/scribe_data/language_data_extraction/Hausa/emoji_keywords/__init__.py b/src/scribe_data/language_data_extraction/Hausa/emoji_keywords/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/scribe_data/language_data_extraction/Hausa/Emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Hausa/emoji_keywords/generate_emoji_keywords.py similarity index 99% rename from src/scribe_data/language_data_extraction/Hausa/Emoji_keywords/generate_emoji_keywords.py rename to src/scribe_data/language_data_extraction/Hausa/emoji_keywords/generate_emoji_keywords.py index fbe6f657f..476fab10c 100644 --- a/src/scribe_data/language_data_extraction/Hausa/Emoji_keywords/generate_emoji_keywords.py +++ b/src/scribe_data/language_data_extraction/Hausa/emoji_keywords/generate_emoji_keywords.py @@ -1,4 +1,3 @@ - """ Generates keyword-emoji relationships from a selection of Hausa words. 
diff --git a/src/scribe_data/language_data_extraction/Korean/postposition/query_postpositions.sparql b/src/scribe_data/language_data_extraction/Korean/postpositions/query_postpositions.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Korean/postposition/query_postpositions.sparql rename to src/scribe_data/language_data_extraction/Korean/postpositions/query_postpositions.sparql diff --git a/src/scribe_data/language_data_extraction/Kurmanji/adjective/query_adjective.sparql b/src/scribe_data/language_data_extraction/Kurmanji/adjectives/query_adjective.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Kurmanji/adjective/query_adjective.sparql rename to src/scribe_data/language_data_extraction/Kurmanji/adjectives/query_adjective.sparql diff --git a/src/scribe_data/language_data_extraction/Kurmanji/adverb/query_adverb.sparql b/src/scribe_data/language_data_extraction/Kurmanji/adverbs/query_adverb.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Kurmanji/adverb/query_adverb.sparql rename to src/scribe_data/language_data_extraction/Kurmanji/adverbs/query_adverb.sparql diff --git a/src/scribe_data/language_data_extraction/Kurmanji/preposition/query_preposition.sparql b/src/scribe_data/language_data_extraction/Kurmanji/prepositions/query_preposition.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Kurmanji/preposition/query_preposition.sparql rename to src/scribe_data/language_data_extraction/Kurmanji/prepositions/query_preposition.sparql diff --git a/src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_1.sparql b/src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_1.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_1.sparql rename to src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_1.sparql diff --git 
a/src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_2.sparql b/src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_2.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_2.sparql rename to src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_2.sparql diff --git a/src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_3.sparql b/src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_3.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_3.sparql rename to src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_3.sparql diff --git a/src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_4.sparql b/src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_4.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_4.sparql rename to src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_4.sparql diff --git a/src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_5.sparql b/src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_5.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_5.sparql rename to src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_5.sparql diff --git a/src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_6.sparql b/src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_6.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Slovak/adjecives/query_adjectives_6.sparql rename to src/scribe_data/language_data_extraction/Slovak/adjectives/query_adjectives_6.sparql diff --git 
a/src/scribe_data/language_data_extraction/Tajik/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Tajik/adverbs/query_adverbs.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Tajik/Adverbs/query_adverbs.sparql rename to src/scribe_data/language_data_extraction/Tajik/adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Yoruba/adjective/query_adjective.sparql b/src/scribe_data/language_data_extraction/Yoruba/adjectives/query_adjective.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Yoruba/adjective/query_adjective.sparql rename to src/scribe_data/language_data_extraction/Yoruba/adjectives/query_adjective.sparql diff --git a/src/scribe_data/language_data_extraction/Yoruba/verb/query_verb.sparql b/src/scribe_data/language_data_extraction/Yoruba/verbs/query_verb.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Yoruba/verb/query_verb.sparql rename to src/scribe_data/language_data_extraction/Yoruba/verbs/query_verb.sparql