Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add missing file check function #432

Merged
merged 5 commits into from
Oct 19, 2024
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 81 additions & 37 deletions src/scribe_data/check/check_project_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,32 @@
BASE_DIR = "../language_data_extraction"


def check_data_type_folders(path, language, subdir, errors):
def check_for_sparql_files(folder_path, data_type, language, subdir, missing_queries):
"""
Check if a data-type folder contains at least one .sparql file.

Args:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@axif0 Can you please remember to format your docstrings in a way that's consistent with the rest of the project? As written, this one won't be parsed into the documentation.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry about this, and thank you. I will remember this in the future.

folder_path (str): The path to the data-type folder.
data_type (str): The name of the data type being checked.
language (str): The name of the language being processed.
subdir (str or None): The name of the sub-directory (for languages with sub-dialects), or None.
missing_queries (list): A list to which missing SPARQL query files will be appended.

Returns:
bool: True if at least one .sparql file is found, False otherwise.
"""
sparql_files = [f for f in os.listdir(folder_path) if f.endswith(".sparql")]
if not sparql_files:
missing_queries.append(
f"{language}/{subdir or ''}/{data_type}/query_{data_type}.sparql"
)
return False
return True


def check_data_type_folders(
path, language, subdir, errors, missing_folders, missing_queries
):
"""
Validate the contents of data type folders within a language directory.

Expand Down Expand Up @@ -104,35 +129,35 @@ def check_data_type_folders(path, language, subdir, errors):

Any files not matching these patterns (except '__init__.py') are reported as unexpected.
"""
for item in os.listdir(path):
existing_data_types = set(os.listdir(path)) - {"__init__.py"}
missing_data_types = DATA_TYPES - existing_data_types - {"emoji_keywords"}

for missing_type in missing_data_types:
missing_folders.append(f"{language}/{subdir or ''}/{missing_type}")

for item in existing_data_types:
item_path = os.path.join(path, item)
if os.path.isfile(item_path) and item != "__init__.py":
if os.path.isfile(item_path):
errors.append(f"Unexpected file found in {language}/{subdir or ''}: {item}")
elif os.path.isdir(item_path):
if item not in DATA_TYPES:
errors.append(
f"Unexpected directory found in {language}/{subdir or ''}: {item}"
)
else:
# Skip validation for emoji_keywords.
if item == "emoji_keywords":
continue

# Check for correctly formatted files.
valid_files = [
f
for f in os.listdir(item_path)
if (f.startswith(f"query_{item}") and f.endswith(".sparql"))
or f == f"format_{item}.py"
or f == f"{item}_queried.json"
]

for file in os.listdir(item_path):
if file not in valid_files and file != "__init__.py":
error_subdir = f"{subdir}/" or ""
errors.append(
f"Unexpected file in {language}/{error_subdir}{item}: {file}"
)
elif item not in DATA_TYPES:
errors.append(
f"Unexpected directory found in {language}/{subdir or ''}: {item}"
)
else:
if item == "emoji_keywords":
continue

check_for_sparql_files(item_path, item, language, subdir, missing_queries)

valid_files = [
f for f in os.listdir(item_path) if f.endswith(".sparql")
] + [f"format_{item}.py", f"{item}_queried.json", "__init__.py"]

for file in os.listdir(item_path):
if file not in valid_files:
errors.append(
f"Unexpected file in {language}/{subdir or ''}/{item}: {file}"
)


def validate_project_structure():
Expand All @@ -141,6 +166,8 @@ def validate_project_structure():
Also validate SPARQL query file names in data_type folders and SUBDIRECTORIES.
"""
errors = []
missing_folders = []
missing_queries = []

if not os.path.exists(BASE_DIR):
print(f"Error: Base directory '{BASE_DIR}' does not exist.")
Expand Down Expand Up @@ -190,21 +217,38 @@ def validate_project_structure():
f"Missing sub-subdirectories in '{language}': {missing_subdirs}"
)

# Check contents of expected sub-subdirectories
# Check contents of expected sub-subdirectories.
for subdir in expected_subdirs:
subdir_path = os.path.join(language_path, subdir)
if os.path.exists(subdir_path):
check_data_type_folders(subdir_path, language, subdir, errors)
check_data_type_folders(
subdir_path,
language,
subdir,
errors,
missing_folders,
missing_queries,
)

else:
check_data_type_folders(language_path, language, None, errors)

if errors:
print("Errors found:")
for error in errors:
print(f" - {error}")
check_data_type_folders(
language_path, language, None, errors, missing_folders, missing_queries
)

if errors or missing_folders or missing_queries:
if errors:
print("Errors found:")
for error in errors:
print(f" - {error}")
if missing_folders:
print("\nMissing data type folders:")
for folder in missing_folders:
print(f" - {folder}")
if missing_queries:
print("\nMissing SPARQL query files:")
for query in missing_queries:
print(f" - {query}")
exit(1)

else:
print(
"All directories and files are correctly named and organized, and no unexpected files or directories were found."
Expand Down
Loading