Skip to content

Commit

Permalink
Add optional QID order validation and refactor label decomposition
Browse files Browse the repository at this point in the history
- Introduced check_optional_qid_order function to validate the order of QIDs in optional statements within SPARQL queries.
- Extracted the label decomposition logic into a new decompose_label_features function, which is now shared by check_forms_order and check_optional_qid_order.
- Updated check_query_forms to include validation for the correct order of QIDs in OPTIONAL clauses.
  • Loading branch information
OmarAI2003 committed Nov 13, 2024
1 parent a0e0692 commit 39815f9
Showing 1 changed file with 87 additions and 19 deletions.
106 changes: 87 additions & 19 deletions src/scribe_data/check/check_query_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
if "label" in sub_value
)

# Lookup table from a form label to its QID, pairing the two module-level lists by position.
# NOTE(review): zip stops at the shorter list, so this assumes both lists are index-aligned — confirm.
qid_label_dict = {
    form_label: form_qid
    for form_label, form_qid in zip(lexeme_form_labels_order, lexeme_form_qid_order)
}


# MARK: Extract Forms


Expand Down Expand Up @@ -107,6 +110,44 @@ def extract_form_rep_label(form_text: str):
return label_match[1].strip()


# MARK: Decompose Label


def decompose_label_features(label):
    """
    Decomposes a concatenated grammatical label into a list of individual features.

    The label is first split in front of every uppercase letter. Adjacent pieces are
    then merged back together for as long as the merged text either is not a known
    label yet or could still be extended into a longer known label by the next piece.

    Parameters
    ----------
    label : str
        The concatenated label string composed of several grammatical features.

    Returns
    -------
    list
        A list of grammatical features extracted from the label in their original order.
        Every piece is capitalized; the trailing remainder is always included, even if
        it does not match a known label.
    """
    # Each component starts at a letter and runs up to (not including) the next uppercase letter.
    components = re.findall(r"[A-Za-z][^A-Z]*", label)

    # Build the case-insensitive lookup once instead of re-lowering every label per iteration.
    known_labels_lower = {known_label.lower() for known_label in lexeme_form_labels_order}

    valid_components = []
    temp_component = ""
    last_index = len(components) - 1

    for index, component in enumerate(components):
        temp_component += component.capitalize()

        # Emit the accumulated text when it is a known label and gluing on the next raw
        # component would not produce another known label. The last component is never
        # emitted here; it is flushed after the loop.
        if (
            index < last_index
            and temp_component.lower() in known_labels_lower
            and temp_component + components[index + 1] not in lexeme_form_labels_order
        ):
            valid_components.append(temp_component)
            temp_component = ""

    # Whatever is left over (at minimum the final component) becomes the last feature.
    if temp_component:
        valid_components.append(temp_component)

    return valid_components


# MARK: Extract QIDs


Expand Down Expand Up @@ -399,25 +440,7 @@ def check_forms_order(query_text):
# Split each column label into components.
split_vars = []
for col in set(select_vars) - set(labeling_service_cols):
components = re.findall(r"[A-Za-z][^A-Z]*", col)
valid_components = []
temp_component = ""

for index, component in enumerate(components):
temp_component += component.capitalize()

# Append valid components in lexeme_form_labels_order.
if index + 1 != len(components) and (
temp_component.lower() in map(str.lower, lexeme_form_labels_order)
and temp_component + components[index + 1]
not in lexeme_form_labels_order
):
valid_components.append(temp_component)
temp_component = ""

if temp_component:
valid_components.append(temp_component)

valid_components = decompose_label_features(col)
split_vars.append(valid_components)

# Create a map for fast component position lookup.
Expand Down Expand Up @@ -463,6 +486,46 @@ def compare_key(components):
return sorted_lower == select_lower


# MARK: Optional Validation


def check_optional_qid_order(query_file: str) -> str:
    """
    Checks the order of QIDs in optional statements within a SPARQL query file to ensure they
    align with the expected sequence based on label features.

    Parameters
    ----------
    query_file : str
        The path to the SPARQL query file to be checked.

    Returns
    -------
    str
        A formatted string with one line group per problem found: either the QIDs of an
        optional statement are not in the order implied by its label, or the label contains
        a feature that has no entry in ``qid_label_dict``. An empty string is returned if
        no problems were found.
    """
    # NOTE(review): the caller truth-tests this helper's result, so it is presumably allowed
    # to return None or an empty value — fall back to an empty list so iteration is safe.
    forms = extract_forms_from_sparql(query_file) or []
    error_messages = []

    # Statements are numbered from 1 and every extracted form counts, including skipped ones.
    for statement_index, form_text in enumerate(forms, start=1):
        if "ontolex:lexicalForm" not in form_text or "ontolex:representation" not in form_text:
            continue

        # Extract the actual QIDs and label for the current form.
        actual_qids = extract_form_qids(form_text=form_text)
        form_label = extract_form_rep_label(form_text)
        if not form_label:
            # Without a label there is no expected order to compare against.
            # NOTE(review): assumes a missing label is reported by another check — confirm.
            continue

        label_components = decompose_label_features(form_label)

        # Report unknown features instead of letting the dictionary lookup raise KeyError
        # and abort the check for every remaining query.
        unknown_components = [key for key in label_components if key not in qid_label_dict]
        if unknown_components:
            error_messages.append(
                f"Label features in optional statement number {statement_index} have no known QID: "
                + ", ".join(unknown_components)
            )
            continue

        expected_qids = [qid_label_dict[key] for key in label_components]

        # Check if the actual QIDs match the expected order.
        if actual_qids != expected_qids:
            formatted_qids = ", ".join(f"wd:{qid}" for qid in expected_qids) + " ."
            error_messages.append(
                f"QIDs in optional statement number {statement_index} should be ordered this way: \n {formatted_qids}"
            )

    # Joining an empty list already yields the empty string.
    return "\n".join(error_messages)


# MARK: Main Validation


Expand Down Expand Up @@ -495,6 +558,11 @@ def check_query_forms() -> None:
error_output += f"\n{index}. {query_file_str}:\n - {forms_order_and_definition_check}\n"
index += 1

# Check that all variables in the OPTIONAL clauses have their QIDs in the correct order.
if labels_qids_order_check := check_optional_qid_order(query_file_str):
error_output += f"\n{index}. {query_file_str}:\n{labels_qids_order_check}\n"
index += 1

if extract_forms_from_sparql(query_file):
query_form_check_dict = {}
for form_text in extract_forms_from_sparql(query_file):
Expand Down

0 comments on commit 39815f9

Please sign in to comment.