From d09572ae82a5d9303da096c75428046d03631cc0 Mon Sep 17 00:00:00 2001 From: Veronica Waiganjo Date: Fri, 25 Oct 2024 22:29:58 +0300 Subject: [PATCH 1/2] Adding whitespace error for comma and non whitespace error for periods and semicolons --- src/scribe_data/check/check_query_forms.py | 51 ++++++++++++++++--- .../persian/verbs/query_verbs_1.sparql | 14 ++--- .../persian/verbs/query_verbs_2.sparql | 18 +++---- 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/src/scribe_data/check/check_query_forms.py b/src/scribe_data/check/check_query_forms.py index 58202e907..5897c4205 100644 --- a/src/scribe_data/check/check_query_forms.py +++ b/src/scribe_data/check/check_query_forms.py @@ -162,6 +162,34 @@ def extract_form_qids(form_text: str): return [q.split("wd:")[1].split(" .")[0] for q in match[0].split(", ")] +# MARK: Correct Panctuation + + +def check_query_formatting(form_text: str): + """ + Checks the formatting of the given SPARQL query text for common issues. + + Parameters + ---------- + query_text : str + The SPARQL query text to check. + + Returns + ------- + str + A message indicating formatting issues, if any. 
+ """ + # Check for spaces before commas + if re.search(r"\s+[,]", form_text): + return False + + # Check for spaces before periods and semicolons + if re.search(r"\S(?=[.;])", form_text): + return False + + return True + + # MARK: Correct Label @@ -450,6 +478,7 @@ def check_query_forms() -> None: "ontolex:lexicalForm" in form_text and "ontolex:representation" in form_text ): + correct_form_spacing = check_query_formatting(form_text=form_text) form_rep_label = extract_form_rep_label(form_text=form_text) check = check_form_label(form_text=form_text) qids = extract_form_qids(form_text=form_text) @@ -457,6 +486,7 @@ def check_query_forms() -> None: query_form_check_dict[form_rep_label] = { "form_rep_match": check, + "correct_form_spacing": correct_form_spacing, "qids": qids, "correct_form_rep_label": correct_form_rep_label, } @@ -464,15 +494,22 @@ def check_query_forms() -> None: if query_form_check_dict: incorrect_query_labels = [] for k in query_form_check_dict: - if k != query_form_check_dict[k]["correct_form_rep_label"]: - incorrect_query_labels.append( - (k, query_form_check_dict[k]["correct_form_rep_label"]) - ) - - elif query_form_check_dict[k]["form_rep_match"] is False: + if k != query_form_check_dict[k]["correct_form_spacing"] is False: incorrect_query_labels.append( - (k, "Form and representation labels don't match") + ( + k, + "Invalid query formatting found - please put spaces before all periods and semicolons and also remove spaces before commas.", + ) ) + else: + if k != query_form_check_dict[k]["correct_form_rep_label"]: + incorrect_query_labels.append( + (k, query_form_check_dict[k]["correct_form_rep_label"]) + ) + elif query_form_check_dict[k]["form_rep_match"] is False: + incorrect_query_labels.append( + (k, "Form and representation labels don't match") + ) if incorrect_query_labels: current_rep_label_to_correct_label_str = [ diff --git a/src/scribe_data/wikidata/language_data_extraction/persian/verbs/query_verbs_1.sparql 
b/src/scribe_data/wikidata/language_data_extraction/persian/verbs/query_verbs_1.sparql index f2d6841ec..6c2b8c0df 100644 --- a/src/scribe_data/wikidata/language_data_extraction/persian/verbs/query_verbs_1.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/persian/verbs/query_verbs_1.sparql @@ -11,9 +11,9 @@ SELECT ?pastWordStem WHERE { - ?lexeme dct:language wd:Q9168; - wikibase:lexicalCategory wd:Q24905; - wikibase:lemma ?infinitive. + ?lexeme dct:language wd:Q9168 ; + wikibase:lexicalCategory wd:Q24905 ; + wikibase:lemma ?infinitive . #MARK: Past Participle @@ -21,14 +21,14 @@ WHERE { ?lexeme ontolex:lexicalForm ?presentParticipleForm . ?presentParticipleForm ontolex:representation ?presentParticiple ; wikibase:grammaticalFeature wd:Q192613, wd:Q814722 . - FILTER(lang(?presentParticiple) = "fa"). + FILTER(lang(?presentParticiple) = "fa") . } OPTIONAL { ?lexeme ontolex:lexicalForm ?pastParticipleForm . ?pastParticipleForm ontolex:representation ?pastParticiple ; wikibase:grammaticalFeature wd:Q814722, wd:Q1994301 . - FILTER(lang(?pastParticiple) = "fa"). + FILTER(lang(?pastParticiple) = "fa") . } #MARK: Word Stem @@ -37,13 +37,13 @@ WHERE { ?lexeme ontolex:lexicalForm ?presentWordStemForm . ?presentWordStemForm ontolex:representation ?presentWordStem ; wikibase:grammaticalFeature wd:Q192613, wd:Q210523 . - FILTER(lang(?presentWordStem) = "fa"). + FILTER(lang(?presentWordStem) = "fa") . } OPTIONAL { ?lexeme ontolex:lexicalForm ?pastWordStemForm . ?pastWordStemForm ontolex:representation ?pastWordStem ; wikibase:grammaticalFeature wd:Q1994301, wd:Q210523 . - FILTER(lang(?pastWordStem) = "fa"). + FILTER(lang(?pastWordStem) = "fa") . 
} } diff --git a/src/scribe_data/wikidata/language_data_extraction/persian/verbs/query_verbs_2.sparql b/src/scribe_data/wikidata/language_data_extraction/persian/verbs/query_verbs_2.sparql index f729d67c1..f0339e122 100644 --- a/src/scribe_data/wikidata/language_data_extraction/persian/verbs/query_verbs_2.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/persian/verbs/query_verbs_2.sparql @@ -13,9 +13,9 @@ SELECT ?indicativeThirdPersonAoristPlural WHERE { - ?lexeme dct:language wd:Q9168; - wikibase:lexicalCategory wd:Q24905; - wikibase:lemma ?infinitive. + ?lexeme dct:language wd:Q9168 ; + wikibase:lexicalCategory wd:Q24905 ; + wikibase:lemma ?infinitive . #MARK: Indicative Aorist @@ -23,41 +23,41 @@ WHERE { ?lexeme ontolex:lexicalForm ?indicativeFirstPersonAoristSingularForm . ?indicativeFirstPersonAoristSingularForm ontolex:representation ?indicativeFirstPersonAoristSingular ; wikibase:grammaticalFeature wd:Q21714344, wd:Q110786, wd:Q682111, wd:Q216497 . - FILTER(lang(?indicativeFirstPersonAoristSingular) = "fa"). + FILTER(lang(?indicativeFirstPersonAoristSingular) = "fa") . } OPTIONAL { ?lexeme ontolex:lexicalForm ?indicativeSecondPersonAoristSingularForm . ?indicativeSecondPersonAoristSingularForm ontolex:representation ?indicativeSecondPersonAoristSingular ; wikibase:grammaticalFeature wd:Q51929049, wd:Q110786, wd:Q682111, wd:Q216497 . - FILTER(lang(?indicativeSecondPersonAoristSingular) = "fa"). + FILTER(lang(?indicativeSecondPersonAoristSingular) = "fa") . } OPTIONAL { ?lexeme ontolex:lexicalForm ?indicativeThirdPersonAoristSingularForm . ?indicativeThirdPersonAoristSingularForm ontolex:representation ?indicativeThirdPersonAoristSingular ; wikibase:grammaticalFeature wd:Q51929074, wd:Q110786, wd:Q682111, wd:Q216497 . - FILTER(lang(?indicativeThirdPersonAoristSingular) = "fa"). + FILTER(lang(?indicativeThirdPersonAoristSingular) = "fa") . } OPTIONAL { ?lexeme ontolex:lexicalForm ?indicativeFirstPersonAoristPluralForm . 
?indicativeFirstPersonAoristPluralForm ontolex:representation ?indicativeFirstPersonAoristPlural ; wikibase:grammaticalFeature wd:Q21714344, wd:Q146786, wd:Q682111, wd:Q216497 . - FILTER(lang(?indicativeFirstPersonAoristPlural) = "fa"). + FILTER(lang(?indicativeFirstPersonAoristPlural) = "fa") . } OPTIONAL { ?lexeme ontolex:lexicalForm ?indicativeSecondPersonAoristPluralForm . ?indicativeSecondPersonAoristPluralForm ontolex:representation ?indicativeSecondPersonAoristPlural ; wikibase:grammaticalFeature wd:Q51929049, wd:Q146786, wd:Q682111, wd:Q216497 . - FILTER(lang(?indicativeSecondPersonAoristPlural) = "fa"). + FILTER(lang(?indicativeSecondPersonAoristPlural) = "fa") . } OPTIONAL { ?lexeme ontolex:lexicalForm ?indicativeThirdPersonAoristPluralForm . ?indicativeThirdPersonAoristPluralForm ontolex:representation ?indicativeThirdPersonAoristPlural ; wikibase:grammaticalFeature wd:Q51929074, wd:Q146786, wd:Q682111, wd:Q216497 . - FILTER(lang(?indicativeThirdPersonAoristPlural) = "fa"). + FILTER(lang(?indicativeThirdPersonAoristPlural) = "fa") . } } From 9d953c05b71eb363862d82c71706f161795d8a2d Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 25 Oct 2024 22:37:33 +0200 Subject: [PATCH 2/2] Minor formatting and edits to formatting check code --- src/scribe_data/check/check_query_forms.py | 45 +++++++++++----------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/src/scribe_data/check/check_query_forms.py b/src/scribe_data/check/check_query_forms.py index 5897c4205..b4ab72d07 100644 --- a/src/scribe_data/check/check_query_forms.py +++ b/src/scribe_data/check/check_query_forms.py @@ -162,29 +162,29 @@ def extract_form_qids(form_text: str): return [q.split("wd:")[1].split(" .")[0] for q in match[0].split(", ")] -# MARK: Correct Panctuation +# MARK: Punctuation def check_query_formatting(form_text: str): """ - Checks the formatting of the given SPARQL query text for common issues. 
+ Checks the formatting of the given SPARQL query text for common formatting issues. Parameters ---------- - query_text : str - The SPARQL query text to check. + form_text : str + The SPARQL query text to check. Returns ------- - str - A message indicating formatting issues, if any. + bool + Whether the query is free of formatting errors (False if any are found). """ - # Check for spaces before commas - if re.search(r"\s+[,]", form_text): + # Check for spaces before commas that should not exist. + if re.search(r"\s,", form_text): return False - # Check for spaces before periods and semicolons - if re.search(r"\S(?=[.;])", form_text): + # Check for periods and semicolons that are not preceded by whitespace. + if re.search(r"\S[.;]", form_text): return False return True @@ -478,7 +478,7 @@ def check_query_forms() -> None: "ontolex:lexicalForm" in form_text and "ontolex:representation" in form_text ): - correct_form_spacing = check_query_formatting(form_text=form_text) + correct_formatting = check_query_formatting(form_text=form_text) form_rep_label = extract_form_rep_label(form_text=form_text) check = check_form_label(form_text=form_text) qids = extract_form_qids(form_text=form_text) @@ -486,30 +486,29 @@ def check_query_forms() -> None: query_form_check_dict[form_rep_label] = { "form_rep_match": check, - "correct_form_spacing": correct_form_spacing, + "correct_formatting": correct_formatting, "qids": qids, "correct_form_rep_label": correct_form_rep_label, } if query_form_check_dict: incorrect_query_labels = [] - for k in query_form_check_dict: - if k != query_form_check_dict[k]["correct_form_spacing"] is False: + for k, v in query_form_check_dict.items(): + if k != v["correct_formatting"] is False: incorrect_query_labels.append( ( k, "Invalid query formatting found - please put spaces before all periods and semicolons and also remove spaces before commas.", ) ) - else: - if k != query_form_check_dict[k]["correct_form_rep_label"]: - incorrect_query_labels.append( - (k, 
query_form_check_dict[k]["correct_form_rep_label"]) - ) - elif query_form_check_dict[k]["form_rep_match"] is False: - incorrect_query_labels.append( - (k, "Form and representation labels don't match") - ) + elif k != query_form_check_dict[k]["correct_form_rep_label"]: + incorrect_query_labels.append( + (k, query_form_check_dict[k]["correct_form_rep_label"]) + ) + elif query_form_check_dict[k]["form_rep_match"] is False: + incorrect_query_labels.append( + (k, "Form and representation labels don't match") + ) if incorrect_query_labels: current_rep_label_to_correct_label_str = [