From ad54e296edf2a6caf5e6448678d884b9d883b690 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 00:16:40 +0100 Subject: [PATCH 01/63] complete workflow to check sparql queries --- .../workflows/check_query_identifiers.yaml | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 3a601fe60..b1e71b6bd 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -25,21 +25,20 @@ jobs: - name: Checkout uses: actions/checkout@v3 - # - name: Set up Python ${{ matrix.python-version }} - # uses: actions/setup-python@v4 - # with: - # python-version: ${{ matrix.python-version }} - - # - name: Install dependencies - # run: | - # python -m pip install --upgrade uv - # uv venv - # uv pip install -r requirements.txt - - # - name: Activate virtualenv - # run: | - # . .venv/bin/activate - # echo PATH=$PATH >> $GITHUB_ENV - - # - name: Run Python script - # run: python src/scribe_data/check/check_query_identifiers.py + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade uv + uv venv + uv pip install -r requirements.txt + + - name: Run check_query_identifiers.py + run: python src/scribe_data/check/check_query_identifiers.py + + - name: Post-run status + if: failure() + run: echo "Project SPARQL queries check failed. Please fix the reported errors." 
From 5faa2f48b362e0f701599d38a9f6e0605115e080 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 00:21:18 +0100 Subject: [PATCH 02/63] add function call to check queries --- src/scribe_data/check/check_query_identifiers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index 5f8276e4d..885792c41 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -133,6 +133,5 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: return data_type_qid == expected_data_type_qid -# Run the check_queries function -# MARK: TODO: Remove Call -# check_queries() +if __name__ == "__main__": + check_queries() From c9c50d9544b850254c8109b3d61fe0de6068a3d9 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 00:28:10 +0100 Subject: [PATCH 03/63] update check_query_identifiers workflow file: activate virtual environment --- .github/workflows/check_query_identifiers.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index b1e71b6bd..8c2a4a7c2 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -36,6 +36,11 @@ jobs: uv venv uv pip install -r requirements.txt + - name: Activate virtualenv + run: | + . 
.venv/bin/activate + echo PATH=$PATH >> $GITHUB_ENV + - name: Run check_query_identifiers.py run: python src/scribe_data/check/check_query_identifiers.py From 1e04e4b65634902c34148bec875d2de94505fc62 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 00:29:47 +0100 Subject: [PATCH 04/63] add working directory --- .github/workflows/check_query_identifiers.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 8c2a4a7c2..df4fe97e1 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -42,6 +42,7 @@ jobs: echo PATH=$PATH >> $GITHUB_ENV - name: Run check_query_identifiers.py + working-directory: ./src/scribe_data/check run: python src/scribe_data/check/check_query_identifiers.py - name: Post-run status From 97f3243b306a0b71f52178f22f850ef8c34c82c0 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 00:31:09 +0100 Subject: [PATCH 05/63] update workflow: fix file path --- .github/workflows/check_query_identifiers.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index df4fe97e1..8a3f45e9c 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -43,7 +43,7 @@ jobs: - name: Run check_query_identifiers.py working-directory: ./src/scribe_data/check - run: python src/scribe_data/check/check_query_identifiers.py + run: python check_query_identifiers.py - name: Post-run status if: failure() From 2ee16bb044c2986b6524222badedb5f9aef8866a Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 00:34:48 +0100 Subject: [PATCH 06/63] reduce dependencies --- .../workflows/check_query_identifiers.yaml | 20 ++----------------- 1 file changed, 2 insertions(+), 18 deletions(-) diff --git 
a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 8a3f45e9c..b9d3e3bb4 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -22,24 +22,8 @@ jobs: name: Run Check Query Identifiers steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade uv - uv venv - uv pip install -r requirements.txt - - - name: Activate virtualenv - run: | - . .venv/bin/activate - echo PATH=$PATH >> $GITHUB_ENV + - name: Checkout repository + uses: actions/checkout@v4 - name: Run check_query_identifiers.py working-directory: ./src/scribe_data/check From 92e4ad97f75b4eab2f4a25944105640093d0762d Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 00:40:02 +0100 Subject: [PATCH 07/63] add pythonpath dependencies --- .github/workflows/check_query_identifiers.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index b9d3e3bb4..00234ac4c 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -25,6 +25,19 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Add project root to PYTHONPATH + run: echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Run check_query_identifiers.py working-directory: ./src/scribe_data/check run: python check_query_identifiers.py From 042958e6f65ad3216e9110ca9dc80f467c732db1 Mon Sep 17 
00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 05:39:45 +0100 Subject: [PATCH 08/63] add workflow fix --- .github/workflows/check_query_identifiers.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 00234ac4c..d486394a9 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -31,7 +31,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Add project root to PYTHONPATH - run: echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV + run: echo "PYTHONPATH=$(pwd)/src" >> $GITHUB_ENV - name: Install dependencies run: | From ac4a2ba3af0ebbcc55b26eb7106c709bb3392896 Mon Sep 17 00:00:00 2001 From: Ebeleokolo Date: Wed, 16 Oct 2024 23:35:55 -0400 Subject: [PATCH 09/63] Add Finnish verbs query --- .../Finnish/verbs/query_verbs.sparql | 133 +++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql index 949500ea2..b1a44c354 100644 --- a/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql @@ -1,13 +1,144 @@ +PREFIX wd: +PREFIX wikibase: +PREFIX dct: +PREFIX ontolex: # tool: scribe-data -# All Finnish (Q1412) verbs and the given forms. +# All Finnish (Q1412) verbs and their forms. # Enter this query at https://query.wikidata.org/. SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?verb + ?infinitiveI + ?presIndSg1 + ?imperativeSg2 + ?passivePresent WHERE { ?lexeme dct:language wd:Q1412 ; wikibase:lexicalCategory wd:Q24905 ; wikibase:lemma ?verb . + + # Infinitives + OPTIONAL { + ?lexeme ontolex:lexicalForm ?infinitiveIForm . 
+ ?infinitiveIForm ontolex:representation ?infinitiveI ; + wikibase:grammaticalFeature wd:Q179230 . + } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?infinitiveIIForm . + ?infinitiveIIForm ontolex:representation ?infinitiveII ; + wikibase:grammaticalFeature wd:Q179230 ; + wikibase:grammaticalFeature wd:Q66596723 . + } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?infinitiveIIIForm . + ?infinitiveIIIForm ontolex:representation ?infinitiveIII ; + wikibase:grammaticalFeature wd:Q179230 ; + wikibase:grammaticalFeature wd:Q66596786 . + } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?infinitiveIVForm . + ?infinitiveIVForm ontolex:representation ?infinitiveIV ; + wikibase:grammaticalFeature wd:Q179230 ; + wikibase:grammaticalFeature wd:Q66596828 . + } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?infinitiveVForm . + ?infinitiveVForm ontolex:representation ?infinitiveV ; + wikibase:grammaticalFeature wd:Q179230 ; + wikibase:grammaticalFeature wd:Q66596870 . + } + + # Present Indicative + OPTIONAL { + ?lexeme ontolex:lexicalForm ?presIndSg1Form . + ?presIndSg1Form ontolex:representation ?presIndSg1 ; + wikibase:grammaticalFeature wd:Q192613 ; + wikibase:grammaticalFeature wd:Q21714344 ; + wikibase:grammaticalFeature wd:Q110786 . + } + + # Past Indicative + OPTIONAL { + ?lexeme ontolex:lexicalForm ?pastIndSg1Form . + ?pastIndSg1Form ontolex:representation ?pastIndSg1 ; + wikibase:grammaticalFeature wd:Q1240211 ; + wikibase:grammaticalFeature wd:Q21714344 ; + wikibase:grammaticalFeature wd:Q110786 . + } + + # Conditional + OPTIONAL { + ?lexeme ontolex:lexicalForm ?conditionalSg1Form . + ?conditionalSg1Form ontolex:representation ?conditionalSg1 ; + wikibase:grammaticalFeature wd:Q52824793 ; + wikibase:grammaticalFeature wd:Q21714344 ; + wikibase:grammaticalFeature wd:Q110786 . + } + + # Potential + OPTIONAL { + ?lexeme ontolex:lexicalForm ?potentialSg1Form . 
+ ?potentialSg1Form ontolex:representation ?potentialSg1 ; + wikibase:grammaticalFeature wd:Q696092 ; + wikibase:grammaticalFeature wd:Q21714344 ; + wikibase:grammaticalFeature wd:Q110786 . + } + + # Imperative + OPTIONAL { + ?lexeme ontolex:lexicalForm ?imperativeSg2Form . + ?imperativeSg2Form ontolex:representation ?imperativeSg2 ; + wikibase:grammaticalFeature wd:Q22716 ; + wikibase:grammaticalFeature wd:Q51929049 ; + wikibase:grammaticalFeature wd:Q110786 . + } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?imperativePl2Form . + ?imperativePl2Form ontolex:representation ?imperativePl2 ; + wikibase:grammaticalFeature wd:Q22716 ; + wikibase:grammaticalFeature wd:Q51929049 ; + wikibase:grammaticalFeature wd:Q146786 . + } + + # Participles + OPTIONAL { + ?lexeme ontolex:lexicalForm ?activePresParticipleForm . + ?activePresParticipleForm ontolex:representation ?activePresParticiple ; + wikibase:grammaticalFeature wd:Q814722 ; + wikibase:grammaticalFeature wd:Q1317831 . + } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?activePastParticipleForm . + ?activePastParticipleForm ontolex:representation ?activePastParticiple ; + wikibase:grammaticalFeature wd:Q12612262 ; + wikibase:grammaticalFeature wd:Q1317831 . + } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?passivePresParticipleForm . + ?passivePresParticipleForm ontolex:representation ?passivePresParticiple ; + wikibase:grammaticalFeature wd:Q814722 ; + wikibase:grammaticalFeature wd:Q1194697 . + } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?passivePastParticipleForm . + ?passivePastParticipleForm ontolex:representation ?passivePastParticiple ; + wikibase:grammaticalFeature wd:Q12612262 ; + wikibase:grammaticalFeature wd:Q1194697 . + } + + # Passive forms + OPTIONAL { + ?lexeme ontolex:lexicalForm ?passivePresentForm . + ?passivePresentForm ontolex:representation ?passivePresent ; + wikibase:grammaticalFeature wd:Q192613 ; + wikibase:grammaticalFeature wd:Q1194697 . 
+ } + OPTIONAL { + ?lexeme ontolex:lexicalForm ?passivePastForm . + ?passivePastForm ontolex:representation ?passivePast ; + wikibase:grammaticalFeature wd:Q1240211 ; + wikibase:grammaticalFeature wd:Q1194697 . + } } From ee5b03435e5e1c8364b80b4e5f87b311d18f68a9 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 17 Oct 2024 09:34:19 +0200 Subject: [PATCH 10/63] Updates to Finnish verbs query --- .../Finnish/verbs/query_verbs.sparql | 72 +++++++------------ 1 file changed, 26 insertions(+), 46 deletions(-) diff --git a/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql index b1a44c354..3af067d84 100644 --- a/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Finnish/verbs/query_verbs.sparql @@ -1,18 +1,11 @@ -PREFIX wd: -PREFIX wikibase: -PREFIX dct: -PREFIX ontolex: # tool: scribe-data -# All Finnish (Q1412) verbs and their forms. +# All Finnish (Q1412) verbs and the given forms. # Enter this query at https://query.wikidata.org/. SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?verb ?infinitiveI - ?presIndSg1 - ?imperativeSg2 - ?passivePresent WHERE { ?lexeme dct:language wd:Q1412 ; @@ -25,120 +18,107 @@ WHERE { ?infinitiveIForm ontolex:representation ?infinitiveI ; wikibase:grammaticalFeature wd:Q179230 . } + OPTIONAL { ?lexeme ontolex:lexicalForm ?infinitiveIIForm . ?infinitiveIIForm ontolex:representation ?infinitiveII ; - wikibase:grammaticalFeature wd:Q179230 ; - wikibase:grammaticalFeature wd:Q66596723 . + wikibase:grammaticalFeature wd:Q179230, wd:Q66596723 . } + OPTIONAL { ?lexeme ontolex:lexicalForm ?infinitiveIIIForm . ?infinitiveIIIForm ontolex:representation ?infinitiveIII ; - wikibase:grammaticalFeature wd:Q179230 ; - wikibase:grammaticalFeature wd:Q66596786 . + wikibase:grammaticalFeature wd:Q179230, wd:Q66596786 . 
} + OPTIONAL { ?lexeme ontolex:lexicalForm ?infinitiveIVForm . ?infinitiveIVForm ontolex:representation ?infinitiveIV ; - wikibase:grammaticalFeature wd:Q179230 ; - wikibase:grammaticalFeature wd:Q66596828 . + wikibase:grammaticalFeature wd:Q179230, wd:Q66596828 . } + OPTIONAL { ?lexeme ontolex:lexicalForm ?infinitiveVForm . ?infinitiveVForm ontolex:representation ?infinitiveV ; - wikibase:grammaticalFeature wd:Q179230 ; - wikibase:grammaticalFeature wd:Q66596870 . + wikibase:grammaticalFeature wd:Q179230, wd:Q66596870 . } # Present Indicative OPTIONAL { ?lexeme ontolex:lexicalForm ?presIndSg1Form . ?presIndSg1Form ontolex:representation ?presIndSg1 ; - wikibase:grammaticalFeature wd:Q192613 ; - wikibase:grammaticalFeature wd:Q21714344 ; - wikibase:grammaticalFeature wd:Q110786 . + wikibase:grammaticalFeature wd:Q192613, wd:Q21714344, wd:Q110786 . } # Past Indicative OPTIONAL { ?lexeme ontolex:lexicalForm ?pastIndSg1Form . ?pastIndSg1Form ontolex:representation ?pastIndSg1 ; - wikibase:grammaticalFeature wd:Q1240211 ; - wikibase:grammaticalFeature wd:Q21714344 ; - wikibase:grammaticalFeature wd:Q110786 . + wikibase:grammaticalFeature wd:Q1240211, wd:Q21714344, wd:Q110786 . } # Conditional OPTIONAL { ?lexeme ontolex:lexicalForm ?conditionalSg1Form . ?conditionalSg1Form ontolex:representation ?conditionalSg1 ; - wikibase:grammaticalFeature wd:Q52824793 ; - wikibase:grammaticalFeature wd:Q21714344 ; - wikibase:grammaticalFeature wd:Q110786 . + wikibase:grammaticalFeature wd:Q52824793, wd:Q21714344, wd:Q110786 . } # Potential OPTIONAL { ?lexeme ontolex:lexicalForm ?potentialSg1Form . ?potentialSg1Form ontolex:representation ?potentialSg1 ; - wikibase:grammaticalFeature wd:Q696092 ; - wikibase:grammaticalFeature wd:Q21714344 ; - wikibase:grammaticalFeature wd:Q110786 . + wikibase:grammaticalFeature wd:Q696092, wd:Q21714344, wd:Q110786 . } # Imperative OPTIONAL { ?lexeme ontolex:lexicalForm ?imperativeSg2Form . 
?imperativeSg2Form ontolex:representation ?imperativeSg2 ; - wikibase:grammaticalFeature wd:Q22716 ; - wikibase:grammaticalFeature wd:Q51929049 ; - wikibase:grammaticalFeature wd:Q110786 . + wikibase:grammaticalFeature wd:Q22716, wd:Q51929049, wd:Q110786 . } + OPTIONAL { ?lexeme ontolex:lexicalForm ?imperativePl2Form . ?imperativePl2Form ontolex:representation ?imperativePl2 ; - wikibase:grammaticalFeature wd:Q22716 ; - wikibase:grammaticalFeature wd:Q51929049 ; - wikibase:grammaticalFeature wd:Q146786 . + wikibase:grammaticalFeature wd:Q22716, wd:Q51929049, wd:Q146786 . } # Participles OPTIONAL { ?lexeme ontolex:lexicalForm ?activePresParticipleForm . ?activePresParticipleForm ontolex:representation ?activePresParticiple ; - wikibase:grammaticalFeature wd:Q814722 ; - wikibase:grammaticalFeature wd:Q1317831 . + wikibase:grammaticalFeature wd:Q814722, wd:Q1317831 . } + OPTIONAL { ?lexeme ontolex:lexicalForm ?activePastParticipleForm . ?activePastParticipleForm ontolex:representation ?activePastParticiple ; - wikibase:grammaticalFeature wd:Q12612262 ; - wikibase:grammaticalFeature wd:Q1317831 . + wikibase:grammaticalFeature wd:Q12612262, wd:Q1317831 . } + OPTIONAL { ?lexeme ontolex:lexicalForm ?passivePresParticipleForm . ?passivePresParticipleForm ontolex:representation ?passivePresParticiple ; - wikibase:grammaticalFeature wd:Q814722 ; - wikibase:grammaticalFeature wd:Q1194697 . + wikibase:grammaticalFeature wd:Q814722, wd:Q1194697 . } + OPTIONAL { ?lexeme ontolex:lexicalForm ?passivePastParticipleForm . ?passivePastParticipleForm ontolex:representation ?passivePastParticiple ; - wikibase:grammaticalFeature wd:Q12612262 ; - wikibase:grammaticalFeature wd:Q1194697 . + wikibase:grammaticalFeature wd:Q12612262, wd:Q1194697 . } # Passive forms OPTIONAL { ?lexeme ontolex:lexicalForm ?passivePresentForm . ?passivePresentForm ontolex:representation ?passivePresent ; - wikibase:grammaticalFeature wd:Q192613 ; - wikibase:grammaticalFeature wd:Q1194697 . 
+ wikibase:grammaticalFeature wd:Q192613, wd:Q1194697 . } + OPTIONAL { ?lexeme ontolex:lexicalForm ?passivePastForm . ?passivePastForm ontolex:representation ?passivePast ; - wikibase:grammaticalFeature wd:Q1240211 ; - wikibase:grammaticalFeature wd:Q1194697 . + wikibase:grammaticalFeature wd:Q1240211, wd:Q1194697 . } } From 3b9a61a5f0fb01311cf4faaec9d6298c929db504 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 14:13:36 +0100 Subject: [PATCH 11/63] throw error if invalid QIDs are found --- src/scribe_data/check/check_query_identifiers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index 885792c41..2d3a40b16 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -1,4 +1,5 @@ import re +import sys from pathlib import Path from scribe_data.cli.cli_utils import ( @@ -66,14 +67,14 @@ def check_queries(): for file in incorrect_languages: print(f"- {file}") - print("\n----------------------------------------------------------------\n") - if incorrect_data_types: print("Incorrect Data Type QIDs found in the following files:") for file in incorrect_data_types: print(f"- {file}") - print("\n----------------------------------------------------------------\n") + # Exit with an error code if any incorrect QIDs are found + if incorrect_languages or incorrect_data_types: + sys.exit(1) def is_valid_language(query_file: Path, lang_qid: str) -> bool: From 10e7a50ecb6a361b595fa4ce19c58179f2eac02d Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 14:35:57 +0100 Subject: [PATCH 12/63] post comment if workflow fails --- .github/workflows/check_query_identifiers.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 
d486394a9..3757feb68 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -42,6 +42,22 @@ jobs: working-directory: ./src/scribe_data/check run: python check_query_identifiers.py + # If the previous step fails, post a comment + - name: Notify PR Author of invalid queries + if: failure() + uses: actions/github-script@v6 + with: + script: | + const prAuthor = context.payload.pull_request.user.login; + const issueNumber = context.payload.pull_request.number; + github.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body: `Hey @${prAuthor}, please review your queries. Please fix the reported errors.` + }) + + - name: Post-run status if: failure() run: echo "Project SPARQL queries check failed. Please fix the reported errors." From 1d6668b1fd238a9745a67b66dfea160e54de563c Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 14:42:50 +0100 Subject: [PATCH 13/63] fix async block in workflow --- .github/workflows/check_query_identifiers.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 3757feb68..a6e093297 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -50,7 +50,7 @@ jobs: script: | const prAuthor = context.payload.pull_request.user.login; const issueNumber = context.payload.pull_request.number; - github.issues.createComment({ + await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, From 2cdcc01be10fbc9a11e6b8d78ed8686c143a9334 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 14:47:22 +0100 Subject: [PATCH 14/63] give gh actions write access --- .github/workflows/check_query_identifiers.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index a6e093297..f14c529ae 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -7,6 +7,10 @@ on: - main types: [opened, reopened, synchronize] +permissions: + pull-requests: write + issues: write + jobs: format_check: strategy: From eb0e3f2b86892387b282e6ad23583dcd3404ead3 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Thu, 17 Oct 2024 15:01:39 +0100 Subject: [PATCH 15/63] remove pr comment steps --- .../workflows/check_query_identifiers.yaml | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index f14c529ae..d486394a9 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -7,10 +7,6 @@ on: - main types: [opened, reopened, synchronize] -permissions: - pull-requests: write - issues: write - jobs: format_check: strategy: @@ -46,22 +42,6 @@ jobs: working-directory: ./src/scribe_data/check run: python check_query_identifiers.py - # If the previous step fails, post a comment - - name: Notify PR Author of invalid queries - if: failure() - uses: actions/github-script@v6 - with: - script: | - const prAuthor = context.payload.pull_request.user.login; - const issueNumber = context.payload.pull_request.number; - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: issueNumber, - body: `Hey @${prAuthor}, please review your queries. Please fix the reported errors.` - }) - - - name: Post-run status if: failure() run: echo "Project SPARQL queries check failed. Please fix the reported errors." 
From 0a2d5746588728c5bf95a40a833c98f20fc798e2 Mon Sep 17 00:00:00 2001 From: gicharuelvis Date: Fri, 18 Oct 2024 01:08:30 +0300 Subject: [PATCH 16/63] Added Swedish Adjectives --- .../Swedish/adjectives/query_adjectives.sparql | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql diff --git a/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql new file mode 100644 index 000000000..0949450ba --- /dev/null +++ b/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql @@ -0,0 +1,18 @@ +# tool: scribe-data +# All Swedish (Q9027) adjectives and the given forms. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adjective + +WHERE { + ?lexeme dct:language wd:Q9027 ; + wikibase:lexicalCategory wd:Q34698 ; + wikibase:lemma ?lemma . + + SERVICE wikibase:label { + bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". + ?lemma rdfs:label ?adjective . + } +} From 8f3425a6bfbb8a84488c971bf2596352f460291a Mon Sep 17 00:00:00 2001 From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com> Date: Thu, 17 Oct 2024 14:44:07 +0100 Subject: [PATCH 17/63] Create query_verbs.sparql I noticed that there was no folder for Igbo. 
--- .../Igbo/verbs/query_verbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Igbo/verbs/query_verbs.sparql diff --git a/src/scribe_data/language_data_extraction/Igbo/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Igbo/verbs/query_verbs.sparql new file mode 100644 index 000000000..6b59644f3 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Igbo/verbs/query_verbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Igbo (Q33578) verbs and the given forms. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?verb + +WHERE { + ?lexeme dct:language wd:Q33578 ; + wikibase:lexicalCategory wd:Q24905 ; + wikibase:lemma ?verb . + } From 5ffafb07234578c8883da2b118b320b79d84a035 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 17 Oct 2024 21:07:11 +0200 Subject: [PATCH 18/63] Add Igbo to the languages check --- src/scribe_data/check/check_project_structure.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scribe_data/check/check_project_structure.py b/src/scribe_data/check/check_project_structure.py index 4c58478a8..3313d0350 100644 --- a/src/scribe_data/check/check_project_structure.py +++ b/src/scribe_data/check/check_project_structure.py @@ -40,6 +40,7 @@ "Malay", "Punjabi", "Tajik", + "Igbo", } DATA_TYPES = { From cac8dd618bdfe9124ad760daca87fd3e9b174b1a Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 18 Oct 2024 00:33:25 +0200 Subject: [PATCH 19/63] Remove label service from adjectives query --- .../Swedish/adjectives/query_adjectives.sparql | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql index 0949450ba..0bef8ebab 100644 --- 
a/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql +++ b/src/scribe_data/language_data_extraction/Swedish/adjectives/query_adjectives.sparql @@ -9,10 +9,5 @@ SELECT WHERE { ?lexeme dct:language wd:Q9027 ; wikibase:lexicalCategory wd:Q34698 ; - wikibase:lemma ?lemma . - - SERVICE wikibase:label { - bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". - ?lemma rdfs:label ?adjective . - } + wikibase:lemma ?adjective . } From 34d84d258d96d8bebb3f4a99ccd346860c101f2f Mon Sep 17 00:00:00 2001 From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:33:41 +0100 Subject: [PATCH 20/63] Update query_adverbs.sparql added comparative --- .../Spanish/adverbs/query_adverbs.sparql | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql index 2abb5033f..8188fc5e8 100644 --- a/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql @@ -7,6 +7,7 @@ SELECT ?adverb ?diminutive ?superlative + ?comparative WHERE { ?lexeme dct:language wd:Q1321 ; @@ -28,4 +29,12 @@ WHERE { ?superlativeForm ontolex:representation ?superlative ; wikibase:grammaticalFeature wd:Q1817208 . } + + # MARK: Comparative + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?comparativeForm . + ?comparativeForm ontolex:representation ?comparative ; + wikibase:grammaticalFeature wd:Q14169499 . 
+ } } From b5be3e670a584d6ed6bd8ed56a90093fbc34948f Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 18 Oct 2024 00:45:43 +0200 Subject: [PATCH 21/63] Remove forms that were accidentally added --- .../Spanish/adverbs/query_adverbs.sparql | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql index 8188fc5e8..084da843f 100644 --- a/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Spanish/adverbs/query_adverbs.sparql @@ -5,36 +5,9 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb - ?diminutive - ?superlative - ?comparative WHERE { ?lexeme dct:language wd:Q1321 ; wikibase:lexicalCategory wd:Q380057 ; wikibase:lemma ?adverb . - - # MARK: Diminutive - - OPTIONAL { - ?lexeme ontolex:lexicalForm ?diminutiveForm . - ?diminutiveForm ontolex:representation ?diminutive ; - wikibase:grammaticalFeature wd:Q108709 . - } - - # MARK: Superlative - - OPTIONAL { - ?lexeme ontolex:lexicalForm ?superlativeForm . - ?superlativeForm ontolex:representation ?superlative ; - wikibase:grammaticalFeature wd:Q1817208 . - } - - # MARK: Comparative - - OPTIONAL { - ?lexeme ontolex:lexicalForm ?comparativeForm . - ?comparativeForm ontolex:representation ?comparative ; - wikibase:grammaticalFeature wd:Q14169499 . 
- } } From ca119c940ea115b582f7a0c9847438f3d38dcff1 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 18 Oct 2024 00:49:42 +0200 Subject: [PATCH 22/63] Minor changes to unicode setup docs --- src/scribe_data/unicode/UNICODE_INSTALLTION.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/scribe_data/unicode/UNICODE_INSTALLTION.md b/src/scribe_data/unicode/UNICODE_INSTALLTION.md index dfb4e1e4f..67d4ffb83 100644 --- a/src/scribe_data/unicode/UNICODE_INSTALLTION.md +++ b/src/scribe_data/unicode/UNICODE_INSTALLTION.md @@ -4,7 +4,9 @@ The Scribe-Data Unicode process is powered by [cldr-json](https://github.com/uni Please see the [installation guide for PyICU](https://gitlab.pyicu.org/main/pyicu#installing-pyicu) as the extension must be linked to ICU on your machine to work properly. -Note that some of the commands may be incorrect. On macOS you may need to do the following: +## macOS Support + +Note that some of the commands in the installation guide may be incorrect. On macOS you may need to do the following: ```bash # Instead of: @@ -16,7 +18,7 @@ echo "/opt/homebrew/opt/icu4c/bin:/opt/homebrew/opt/icu4c/sbin:$PATH" echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/opt/homebrew/opt/icu4c/lib/pkgconfig" ``` -# Installing PyICU for Emoji Support on Windows +## Windows Support This guide provides step-by-step instructions on how to install the PyICU library, which is essential for proper emoji support on Windows. @@ -25,7 +27,7 @@ This guide provides step-by-step instructions on how to install the PyICU librar 1. Visit the [PyICU Release Page](https://github.com/cgohlke/pyicu-build/releases). 2. Locate and download the wheel (`.whl`) file that matches your Python version. Make sure to select the correct architecture (e.g., `win_amd64` for 64-bit Python). 
-## Set Up a Virtual Environment +### Set Up a Virtual Environment If you haven't already, You can do this with the following command: @@ -37,7 +39,7 @@ python -m venv venv venv\Scripts\activate ``` -## Install PyICU +### Install PyICU ```bash # Replace 'PyICU-2.13-cp312-cp312-win_amd64.whl' with the actual filename you downloaded From 3ee79abf9c2a9157e9b3578e5409175f091f6add Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 18 Oct 2024 00:50:18 +0200 Subject: [PATCH 23/63] Minor header change to unicode docs headers --- src/scribe_data/unicode/UNICODE_INSTALLTION.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/unicode/UNICODE_INSTALLTION.md b/src/scribe_data/unicode/UNICODE_INSTALLTION.md index 67d4ffb83..2dbe323be 100644 --- a/src/scribe_data/unicode/UNICODE_INSTALLTION.md +++ b/src/scribe_data/unicode/UNICODE_INSTALLTION.md @@ -22,7 +22,7 @@ echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/opt/homebrew/opt/icu4c/lib/pkgconfig" This guide provides step-by-step instructions on how to install the PyICU library, which is essential for proper emoji support on Windows. -## Download the PyICU Wheel File +### Download the PyICU Wheel File 1. Visit the [PyICU Release Page](https://github.com/cgohlke/pyicu-build/releases). 2. Locate and download the wheel (`.whl`) file that matches your Python version. Make sure to select the correct architecture (e.g., `win_amd64` for 64-bit Python). From 6620ec5625f7c4eb1d304d6b580bccdcb1fb02b1 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 16:44:10 +0300 Subject: [PATCH 24/63] Simplified language metadata JSON by removing unnecessary nesting and keys. - Removed 'description', 'entry', and 'languages' keys. - Flattened structure to include only 'language', 'iso', and 'qid' at the top level. 
--- .../resources/language_metadata.json | 98 ++++++------------- 1 file changed, 31 insertions(+), 67 deletions(-) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index e6d7de8a6..b5400c697 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -1,70 +1,34 @@ { - "used by": "Scribe-Data/src/scribe_data/utils.py", - "description": { - "entry": { - "language": "the supported language. All lowercase", - "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. All lowercase", - "qid": "the unique identifier of 'language' on Wikidata. 'Q' followed by one or more digits. See https://www.wikidata.org/wiki/Q43649390", - "remove-words": "words that should not be included as autosuggestions for the given language.", - "ignore-words": "words that should be removed from the autosuggestion generation process." - } + "english": { + "iso": "en", + "qid": "Q1860" }, - "languages": [ - { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"] - }, - { - "language": "german", - "iso": "de", - "qid": "Q188", - "remove-words": ["of", "the", "The", "and", "NeinJa", "et", "redirect"], - "ignore-words": ["Gemeinde", "Familienname"] - }, - { - "language": "italian", - "iso": "it", - "qid": "Q652", - "remove-words": ["of", "the", "The", "and", "text", "from"], - "ignore-words": ["The", "ATP"] - }, - { - "language": "portuguese", - "iso": "pt", - "qid": "Q5146", - "remove-words": ["of", "the", "The", "and", "jbutadptflora"], - "ignore-words": [] - }, - { - "language": "russian", - "iso": "ru", - "qid": "Q7737", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "spanish", - 
"iso": "es", - "qid": "Q1321", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "swedish", - "iso": "sv", - "qid": "Q9027", - "remove-words": ["of", "the", "The", "and", "Checklist", "Catalogue"], - "ignore-words": ["databasdump"] - } - ] + "french": { + "iso": "fr", + "qid": "Q150" + }, + "german": { + "iso": "de", + "qid": "Q188" + }, + "italian": { + "iso": "it", + "qid": "Q652" + }, + "portuguese": { + "iso": "pt", + "qid": "Q5146" + }, + "russian": { + "iso": "ru", + "qid": "Q7737" + }, + "spanish": { + "iso": "es", + "qid": "Q1321" + }, + "swedish": { + "iso": "sv", + "qid": "Q9027" + } } From 8666c0273898e10b20d026fbe9e04d582777eff7 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 16:50:34 +0300 Subject: [PATCH 25/63] Refactored _load_json function to handle simplified JSON structure. - Removed 'root' parameter since the JSON is now flat. - Updated function to return the entire contents of the JSON directly. --- src/scribe_data/utils.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 9d94485ab..05ac770d3 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -36,7 +36,7 @@ DEFAULT_SQLITE_EXPORT_DIR = "scribe_data_sqlite_export" -def _load_json(package_path: str, file_name: str, root: str) -> Any: +def _load_json(package_path: str, file_name: str) -> Any: """ Loads a JSON resource from a package into a python entity. @@ -48,25 +48,19 @@ def _load_json(package_path: str, file_name: str, root: str) -> Any: file_name : str The name of the file (resource) that contains the JSON data. - root : str - The root node of the JSON document. - Returns ------- - A python entity starting at 'root'. + A python entity representing the JSON content. 
""" - with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" ) as in_stream: contents = json.load(in_stream) - return contents[root] + return contents # No need for 'root' _languages = _load_json( - package_path="scribe_data.resources", - file_name="language_metadata.json", - root="languages", + package_path="scribe_data.resources", file_name="language_metadata.json" ) From 3dce46dcdcddf14abf1d9a0f75ddc63d0d4b3578 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 18:25:15 +0300 Subject: [PATCH 26/63] =?UTF-8?q?Refactor=20language=20metadata=20structur?= =?UTF-8?q?e:=20Include=20all=20languages=20with=20Norwegian=20having=20su?= =?UTF-8?q?b-languags=20-=20Removed=20unnecessary=20top-level=20keys=20-?= =?UTF-8?q?=20Organized=20Norwegian=20with=20its=20sub-languages=20(Nynors?= =?UTF-8?q?k=20and=20Bokm=C3=A5l)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../resources/language_metadata.json | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index b5400c697..dd85cdc91 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -1,8 +1,40 @@ { + "arabic": { + "iso": "ar", + "qid": "Q13955" + }, + "basque": { + "iso": "eu", + "qid": "Q8752" + }, + "bengali": { + "iso": "bn", + "qid": "Q9610" + }, + "czech": { + "iso": "cs", + "qid": "Q9056" + }, + "danish": { + "iso": "da", + "qid": "Q9035" + }, "english": { "iso": "en", "qid": "Q1860" }, + "esperanto": { + "iso": "eo", + "qid": "Q143" + }, + "estonian": { + "iso": "et", + "qid": "Q9072" + }, + "finnish": { + "iso": "fi", + "qid": "Q1412" + }, "french": { "iso": "fr", "qid": "Q150" @@ -11,24 +43,116 @@ "iso": "de", "qid": "Q188" }, + "greek": { + "iso": "el", + "qid": "Q36510" + }, + "hausa": { + "iso": "ha", + "qid": "Q56475" + }, + "hebrew": { + "iso": 
"he", + "qid": "Q9288" + }, + "hindustani": { + "iso": "hi", + "qid": "Q11051" + }, + "indonesian": { + "iso": "id", + "qid": "Q9240" + }, "italian": { "iso": "it", "qid": "Q652" }, + "japanese": { + "iso": "ja", + "qid": "Q5287" + }, + "kurmanji": { + "iso": "kmr", + "qid": "Q36163" + }, + "latin": { + "iso": "la", + "qid": "Q397" + }, + "malay": { + "iso": "ms", + "qid": "Q9237" + }, + "malayalam": { + "iso": "ml", + "qid": "Q36236" + }, + "mandarin": { + "iso": "zh", + "qid": "Q727694" + }, + "norwegian": { + "sub_languages": { + "nynorsk": { + "iso": "nn", + "qid": "Q25164" + }, + "bokmål": { + "iso": "nb", + "qid": "Q9043" + } + } + }, + "pidgin": { + "iso": "pi", + "qid": "Q33655" + }, + "polish": { + "iso": "pl", + "qid": "Q809" + }, "portuguese": { "iso": "pt", "qid": "Q5146" }, + "punjabi": { + "iso": "pa", + "qid": "Q58635" + }, "russian": { "iso": "ru", "qid": "Q7737" }, + "slovak": { + "iso": "sk", + "qid": "Q9058" + }, "spanish": { "iso": "es", "qid": "Q1321" }, + "swahili": { + "iso": "sw", + "qid": "Q7838" + }, "swedish": { "iso": "sv", "qid": "Q9027" + }, + "tajik": { + "iso": "tg", + "qid": "Q9260" + }, + "tamil": { + "iso": "ta", + "qid": "Q5885" + }, + "ukrainian": { + "iso": "ua", + "qid": "Q8798" + }, + "yoruba": { + "iso": "yo", + "qid": "Q34311" } } From 5b51483b1a8148925767ba6f3aa1df2e2f35d27a Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 20:43:17 +0300 Subject: [PATCH 27/63] Refactor _find function to handle languages with sub-languages - Enhanced the function to check for both regular languages and their sub-languages. - Added error handling for cases where a language has only sub-languages, providing informative messages. - Updated the function's docstring to reflect changes in behavior and usage. 
--- src/scribe_data/utils.py | 48 ++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 05ac770d3..8f4726012 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -66,28 +66,20 @@ def _load_json(package_path: str, file_name: str) -> Any: def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -> Any: """ - Each 'language', (english, german,..., etc) is a dictionary of key/value pairs: + Finds a target value based on a source key/value pair from the language metadata. - entry = { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": [...], - "ignore-words": [...] - } - - Given a key/value pair, the 'source' and the 'target' key get the 'target' value. + This version handles both regular languages and those with sub-languages (e.g., Norwegian). Parameters ---------- source_value : str - The source value to find equivalents for (e.g. 'english'). + The source value to find equivalents for (e.g., 'english', 'nynorsk'). source_key : str - The source key to reference (e.g. 'language'). + The source key to reference (e.g., 'language'). target_key : str - The key to target (e.g. 'iso'). + The key to target (e.g., 'qid'). error_msg : str The message displayed when a value cannot be found. @@ -98,18 +90,30 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - Raises ------ - ValueError : when a source_value is not supported. + ValueError : when a source_value is not supported or the language only has sub-languages. 
""" norm_source_value = source_value.lower() - if target_value := [ - entry[target_key] - for entry in _languages - if entry[source_key] == norm_source_value - ]: - assert len(target_value) == 1, f"More than one entry for '{norm_source_value}'" - return target_value[0] - + # Check if we're searching by language name + if source_key == "language": + # First, check the main language entries (e.g., mandarin, french, etc.) + for language, entry in _languages.items(): + # If the language name matches the top-level key, return the target value + if language.lower() == norm_source_value: + if "sub_languages" in entry: + sub_languages = ", ".join(entry["sub_languages"].keys()) + raise ValueError( + f"'{language}' has sub-languages, but is not queryable directly. Available sub-languages: {sub_languages}" + ) + return entry.get(target_key) + + # If there are sub-languages, check them too + if "sub_languages" in entry: + for sub_language, sub_entry in entry["sub_languages"].items(): + if sub_language.lower() == norm_source_value: + return sub_entry.get(target_key) + + # If no match was found, raise an error raise ValueError(error_msg) From a68b08c1946fe278e4329859f6ca17ac785a48e5 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 20:46:10 +0300 Subject: [PATCH 28/63] Update get_scribe_languages to handle sub-languages in JSON structure - Adjusted the function to return both main languages and their sub-languages. - Ensured that languages like Norwegian are represented by their sub-languages only. - Enhanced compatibility with the new JSON format. 
--- src/scribe_data/utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 8f4726012..494a2d1bf 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -120,8 +120,22 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - def get_scribe_languages() -> list[str]: """ Returns the list of currently implemented Scribe languages. + This version handles both regular languages and those with sub-languages (e.g., Norwegian). """ - return sorted(entry["language"].capitalize() for entry in _languages) + languages = [] + + for language, entry in _languages.items(): + # Add the main language (if it's directly queryable) + if "sub_languages" not in entry: + languages.append(language.capitalize()) + + # If there are sub-languages, add them instead + if "sub_languages" in entry: + languages.extend( + sub_language.capitalize() for sub_language in entry["sub_languages"] + ) + + return sorted(languages) def get_language_qid(language: str) -> str: From d44769804f704473bc5fb70b6ebc245a08148b05 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sun, 13 Oct 2024 18:00:29 +0300 Subject: [PATCH 29/63] Remove get_language_words_to_remove and get_language_words_to_ignore due to new language_metadata.json structure --- src/scribe_data/utils.py | 44 ---------------------------------------- 1 file changed, 44 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 494a2d1bf..03e356870 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -206,50 +206,6 @@ def get_language_from_iso(iso: str) -> str: return language_name -def get_language_words_to_remove(language: str) -> list[str]: - """ - Returns the words that should be removed during the data cleaning process for the given language. - - Parameters - ---------- - language : str - The language the words should be returned for. 
- - Returns - ------- - list[str] - The words that that be removed during the data cleaning process for the given language. - """ - return _find( - "language", - language, - "remove-words", - f"{language.capitalize()} is currently not a supported language.", - ) - - -def get_language_words_to_ignore(language: str) -> list[str]: - """ - Returns the words that should not be included as autosuggestions for the given language. - - Parameters - ---------- - language : str - The language the words should be returned for. - - Returns - ------- - list[str] - The words that should not be included as autosuggestions for the given language. - """ - return _find( - "language", - language, - "ignore-words", - f"{language.capitalize()} is currently not a supported language.", - ) - - def load_queried_data( file_path: str, language: str, data_type: str ) -> tuple[Any, bool, str]: From 86cd59d1df2dbf737e7ab9c4fd7c5e2c18a48f56 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sun, 13 Oct 2024 23:59:31 +0300 Subject: [PATCH 30/63] Refactor language_map and language_to_qid generation to handle new JSON structure - Updated the logic for building language_map and language_to_qid to handle languages with sub-languages. - Both main languages and sub-languages are now processed in a single pass, ensuring that: - language_map includes all metadata for main and sub-languages. - language_to_qid correctly maps both main and sub-languages to their QIDs. --- src/scribe_data/cli/cli_utils.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index 4f59a65ef..be2fa0f79 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -53,14 +53,23 @@ print(f"Error reading data type metadata: {e}") -language_map = { - lang["language"].lower(): lang for lang in language_metadata["languages"] -} - -# Create language_to_qid dictionary. 
-language_to_qid = { - lang["language"].lower(): lang["qid"] for lang in language_metadata["languages"] -} +language_map = {} +language_to_qid = {} + +# Process each language and its potential sub-languages in one pass +for lang_key, lang_data in language_metadata.items(): + lang_key_lower = lang_key.lower() + + # Handle sub-languages if they exist + if "sub_languages" in lang_data: + for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items(): + sub_lang_key_lower = sub_lang_key.lower() + language_map[sub_lang_key_lower] = sub_lang_data + language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"] + else: + # Handle the main language directly + language_map[lang_key_lower] = lang_data + language_to_qid[lang_key_lower] = lang_data["qid"] # MARK: Correct Inputs From d53ce37abc143c7b764a66b7e71c45ab66bfbb12 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 00:40:16 +0300 Subject: [PATCH 31/63] Fix: Update language extraction to match new JSON structure by removing the 'languages' key reference --- src/scribe_data/cli/interactive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index 4e95f34b0..cefaa6bbe 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -52,7 +52,7 @@ class ScribeDataConfig: def __init__(self): self.languages = [ - lang["language"].capitalize() for lang in language_metadata["languages"] + [lang_key.capitalize() for lang_key in language_metadata.keys()] ] self.data_types = list(data_type_metadata.keys()) self.selected_languages: List[str] = [] From e8d82d0070644d8a887681ed8ecb5004778ba032 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 00:48:56 +0300 Subject: [PATCH 32/63] Refactor language extraction to use direct keys from language_metadata. Removed dependency on the 'languages' key in JSON structure. 
--- src/scribe_data/wikidata/query_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index 4da51b4f6..6ab730792 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -103,7 +103,7 @@ def query_data( SCRIBE_DATA_SRC_PATH / "language_data_extraction" ) languages = [lang.capitalize() for lang in languages] - current_languages = list(language_metadata["languages"]) + current_languages = list(language_metadata.keys()) current_data_type = ["nouns", "verbs", "prepositions"] # Assign current_languages and current_data_type if no arguments have been passed. From 5cd6087ac0acdbffb1844ab84a04de78511b41f9 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 14:13:54 +0300 Subject: [PATCH 33/63] Added format_sublanguage_name function to format sub-language names as 'mainlang/sublang' - Implemented the function to check if a language is a sub-language and format its name as 'mainlang/sublang' for easier searching in language_data_extraction. - Returns the original language name if it's not a sub-language. - Added detailed docstring for clarity and usage examples. --- src/scribe_data/utils.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 03e356870..33fc3763e 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -487,3 +487,39 @@ def order_annotations(annotation: str) -> str: annotation_split = sorted(list(set(filter(None, annotation.split("/"))))) return "/".join(annotation_split) + + +def format_sublanguage_name(lang, language_metadata): + """ + Formats the name of a sub-language by appending its main language + in the format 'mainlang/sublang'. If the language is not a sub-language, + the original language name is returned as-is. 
+ + Args: + lang (str): The name of the language or sub-language to format. + language_metadata (dict): The metadata containing information about + main languages and their sub-languages. + + Returns: + str: The formatted language name if it's a sub-language + (e.g., 'norwegian/nynorsk'), otherwise the original name. + + Example: + format_sublanguage_name("nynorsk", language_metadata) + 'norwegian/nynorsk' + + format_sublanguage_name("english", language_metadata) + 'english' + """ + # Iterate through the main languages in the metadata + for main_lang, lang_data in language_metadata.items(): + # Check if the main language has sub-languages + if "sub_languages" in lang_data: + # Check if the provided language is a sub-language + for sub_lang in lang_data["sub_languages"]: + if lang.lower() == sub_lang.lower(): + # Return the formatted name mainlang/sublang + return f"{main_lang}/{sub_lang}" + + # If it's not a sub-language, return the original name + return lang From 74d7f4781f2b4086a0d4b6ff0242e82497173070 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 14:22:11 +0300 Subject: [PATCH 34/63] Refactor: Apply format_sublanguage_name to handle sub-language - Wrapped 'lang' variable with format_sublanguage_name to ensure sub-languages are formatted as 'mainlang/sublang' during data extraction. - This ensures proper directory creation and querying for a sub-languages, aligning with the new language metadata structure. 
--- src/scribe_data/wikidata/query_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index 6ab730792..c833dd7a2 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -33,6 +33,7 @@ from scribe_data.cli.cli_utils import ( language_metadata, ) +from scribe_data.utils import format_sublanguage_name from scribe_data.wikidata.wikidata_utils import sparql @@ -147,7 +148,7 @@ def query_data( disable=interactive, colour="MAGENTA", ): - lang = q.parent.parent.name + lang = format_sublanguage_name(q.parent.parent.name, language_metadata) target_type = q.parent.name updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir From 51e847d0d98cb7df43db041225b6faf79aad8265 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 14:31:45 +0300 Subject: [PATCH 35/63] Removed dependency on the 'languages' key based on the old json structure in cli/total.py file --- src/scribe_data/cli/total.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index fe1382707..1a05eb724 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -71,8 +71,8 @@ def get_datatype_list(language): data_types : list[str] or None A list of the corresponding data types. 
""" - languages = list(language_metadata["languages"]) - language_list = [lang["language"] for lang in languages] + languages = list(language_metadata.keys()) + language_list = [lang for lang in languages] if language.lower() in language_list: language_data = language_map.get(language.lower()) From 4c8fe1e01a4185f97074c78ae1533f0f257b6298 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 15:12:19 +0300 Subject: [PATCH 36/63] Add function to list all languages from language metadata loaded json - Created list_all_languages function to extract both main languages and sub-languages - The function checks for sub-languages and compiles a complete list for easier access. - Updated example usage to demonstrate the new functionality. --- src/scribe_data/utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 33fc3763e..1df502ad6 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -523,3 +523,20 @@ def format_sublanguage_name(lang, language_metadata): # If it's not a sub-language, return the original name return lang + + +def list_all_languages(language_metadata): + """List all languages from the provided metadata dictionary, including sub-languages.""" + current_languages = [] + + # Iterate through the language metadata + for lang_key, lang_data in language_metadata.items(): + # Check if there are sub-languages + if "sub_languages" in lang_data: + # Add the sub-languages to current_languages + current_languages.extend(lang_data["sub_languages"].keys()) + else: + # If no sub-languages, add the main language + current_languages.append(lang_key) + + return current_languages From 1fdb70372260ba0d8e018e13114589f98a0dbc76 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 15:14:37 +0300 Subject: [PATCH 37/63] Refactor to use list_all_languages function for language extraction - Replaced old extraction method with a centralized function. 
--- src/scribe_data/load/data_to_sqlite.py | 4 ++-- src/scribe_data/wikidata/query_data.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 79d19e39b..aec1f9560 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -35,6 +35,7 @@ DEFAULT_SQLITE_EXPORT_DIR, get_language_iso, ) +from scribe_data.utils import list_all_languages def data_to_sqlite( @@ -52,8 +53,7 @@ def data_to_sqlite( current_language_data = json.load(f_languages) data_types = json.load(f_data_types).keys() - current_languages = [d["language"] for d in current_language_data["languages"]] - + current_languages = list_all_languages(current_language_data) if not languages: languages = current_languages diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index c833dd7a2..a9dba0b9f 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -33,7 +33,7 @@ from scribe_data.cli.cli_utils import ( language_metadata, ) -from scribe_data.utils import format_sublanguage_name +from scribe_data.utils import format_sublanguage_name, list_all_languages from scribe_data.wikidata.wikidata_utils import sparql @@ -104,7 +104,7 @@ def query_data( SCRIBE_DATA_SRC_PATH / "language_data_extraction" ) languages = [lang.capitalize() for lang in languages] - current_languages = list(language_metadata.keys()) + current_languages = list_all_languages(language_metadata) current_data_type = ["nouns", "verbs", "prepositions"] # Assign current_languages and current_data_type if no arguments have been passed. From 4e50cbb67dbe323f85aec66ed8fcf1d7409cfea2 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 16:39:14 +0300 Subject: [PATCH 38/63] Enhance language handling by importing utility functions - Imported list_all_languages and format_sublanguage_name from scribe_data.utils.
- Updated get_datatype_list and print_total_lexemes to improve language name retrieval and formatting. --- src/scribe_data/cli/total.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 1a05eb724..5530ef5db 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -30,6 +30,7 @@ language_to_qid, ) from scribe_data.wikidata.wikidata_utils import sparql +from scribe_data.utils import list_all_languages, format_sublanguage_name def get_qid_by_input(input_str): @@ -71,12 +72,14 @@ def get_datatype_list(language): data_types : list[str] or None A list of the corresponding data types. """ - languages = list(language_metadata.keys()) + languages = list_all_languages(language_metadata) language_list = [lang for lang in languages] if language.lower() in language_list: language_data = language_map.get(language.lower()) - language_capitalized = language.capitalize() + language_capitalized = format_sublanguage_name( + language, language_metadata + ).capitalize() language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized if not language_data: @@ -131,9 +134,11 @@ def print_total_lexemes(language: str = None): print("=" * 64) if language is None: # all languages - languages = list(language_metadata["languages"]) - languages.sort(key=lambda x: x["language"]) - language_list = [lang["language"] for lang in languages] + languages = list_all_languages( + language_metadata + ) # this returns a list of language names + language_list = languages # sorts the list in place + language_list.sort() for lang in language_list: data_types = get_datatype_list(lang) From 761f8eed474382610dfae6d8cfc0406c73490737 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 19:35:34 +0300 Subject: [PATCH 39/63] Update get_language_iso function: - Refactored to use the user-defined _find function. 
- Removed the try-except block as error handling is already implemented in _find. - Removed the InvalidLanguageValue module as it was imported but unused. --- src/scribe_data/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 1df502ad6..9898f2449 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -27,7 +27,7 @@ from typing import Any, Optional from iso639 import Lang -from iso639.exceptions import DeprecatedLanguageValue, InvalidLanguageValue +from iso639.exceptions import DeprecatedLanguageValue PROJECT_ROOT = "Scribe-Data" DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export" @@ -174,12 +174,13 @@ def get_language_iso(language: str) -> str: str The ISO code for the language. """ - try: - iso_code = str(Lang(language.capitalize()).pt1) - except InvalidLanguageValue: - raise ValueError( - f"{language.capitalize()} is currently not a supported language for ISO conversion." - ) from None + + iso_code = _find( + "language", + language, + "iso", + f"{language.upper()} is currently not a supported language for ISO conversion.", + ) return iso_code From bc65e0da7f1f46d0caca89ed78eeec315b869c62 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 19:55:09 +0300 Subject: [PATCH 40/63] Handle sub-languages in language table generation - Utilized already built helper functions to support sub-languages when retrieving ISO and QID values. - Updated table printing to correctly format and display both main languages and sub-languages.
--- src/scribe_data/cli/list.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 5d16b4413..6f8f2358e 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -26,18 +26,19 @@ language_map, LANGUAGE_DATA_EXTRACTION_DIR, ) +from scribe_data.utils import list_all_languages, get_language_iso, get_language_qid def list_languages() -> None: """ Generates a table of languages, their ISO-2 codes and their Wikidata QIDs. """ - languages = list(language_metadata["languages"]) - languages.sort(key=lambda x: x["language"]) + languages = list_all_languages(language_metadata) + languages.sort() - language_col_width = max(len(lang["language"]) for lang in languages) + 2 - iso_col_width = max(len(lang["iso"]) for lang in languages) + 2 - qid_col_width = max(len(lang["qid"]) for lang in languages) + 2 + language_col_width = max(len(lang) for lang in languages) + 2 + iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 + qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2 table_line_length = language_col_width + iso_col_width + qid_col_width @@ -49,7 +50,7 @@ def list_languages() -> None: for lang in languages: print( - f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}" + f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}" ) print("-" * table_line_length) From 47ff4f80845ec0179cda8fbfa642e31b886c0798 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 21:27:17 +0300 Subject: [PATCH 41/63] adding new languages and their dialects to the language_metadata.json file --- .../resources/language_metadata.json | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/scribe_data/resources/language_metadata.json 
b/src/scribe_data/resources/language_metadata.json index dd85cdc91..d7d8100cd 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -56,8 +56,16 @@ "qid": "Q9288" }, "hindustani": { - "iso": "hi", - "qid": "Q11051" + "sub_languages": { + "hindi": { + "iso": "hi", + "qid": "Q11051" + }, + "urdu": { + "iso": "ur", + "qid": "Q11051" + } + } }, "indonesian": { "iso": "id", @@ -104,8 +112,12 @@ } }, "pidgin": { - "iso": "pi", - "qid": "Q33655" + "sub_languages": { + "nigerian": { + "iso": "pi", + "qid": "Q33655" + } + } }, "polish": { "iso": "pl", @@ -116,8 +128,16 @@ "qid": "Q5146" }, "punjabi": { - "iso": "pa", - "qid": "Q58635" + "sub_languages": { + "gurmukhi": { + "iso": "pan", + "qid": "Q58635" + }, + "shahmukhi": { + "iso": "pnp", + "qid": "Q58635" + } + } }, "russian": { "iso": "ru", From f1f892885fede116e4bd8641e2b5b882a452071b Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 21:52:40 +0300 Subject: [PATCH 42/63] Modified the loop that searches languages in the list_data_types function to reflect the new JSON structure, ensuring only data types are printed and no sub-languages unlike before. --- src/scribe_data/cli/list.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 6f8f2358e..6b9ec295c 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -26,7 +26,12 @@ language_map, LANGUAGE_DATA_EXTRACTION_DIR, ) -from scribe_data.utils import list_all_languages, get_language_iso, get_language_qid +from scribe_data.utils import ( + list_all_languages, + get_language_iso, + get_language_qid, + format_sublanguage_name, +) def list_languages() -> None: @@ -66,6 +71,7 @@ def list_data_types(language: str = None) -> None: language : str The language to potentially list data types for. 
""" + languages = list_all_languages(language_metadata) if language: language_data = language_map.get(language.lower()) language_capitalized = language.capitalize() @@ -84,8 +90,11 @@ def list_data_types(language: str = None) -> None: else: data_types = set() - for lang in language_metadata["languages"]: - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() + for lang in languages: + language_dir = ( + LANGUAGE_DATA_EXTRACTION_DIR + / format_sublanguage_name(lang, language_metadata).capitalize() + ) if language_dir.is_dir(): data_types.update(f.name for f in language_dir.iterdir() if f.is_dir()) From 5a4f7217784a62ade73cdfab9be3751f1402fb25 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 22:24:19 +0300 Subject: [PATCH 43/63] Capitalize the languages returned by the function 'format_sublanguage_name' to align with the directory structure in the language_data_extraction directory. --- src/scribe_data/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 9898f2449..b4da68647 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -520,10 +520,10 @@ def format_sublanguage_name(lang, language_metadata): for sub_lang in lang_data["sub_languages"]: if lang.lower() == sub_lang.lower(): # Return the formatted name mainlang/sublang - return f"{main_lang}/{sub_lang}" + return f"{main_lang.capitalize()}/{sub_lang.capitalize()}" # If it's not a sub-language, return the original name - return lang + return lang.capitalize() def list_all_languages(language_metadata): From eaf89e497786bdde8688d3f5bf8497def4a08cde Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 22:29:02 +0300 Subject: [PATCH 44/63] Implemented minor fixes by utilizing the format_sublanguage_name function to handle sub_language folders. 
--- src/scribe_data/cli/list.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 6b9ec295c..447d59060 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -73,6 +73,7 @@ def list_data_types(language: str = None) -> None: """ languages = list_all_languages(language_metadata) if language: + language = format_sublanguage_name(language, language_metadata) language_data = language_map.get(language.lower()) language_capitalized = language.capitalize() language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized @@ -132,9 +133,11 @@ def list_languages_for_data_type(data_type: str) -> None: The data type to check for. """ data_type = correct_data_type(data_type=data_type) + all_languages = list_all_languages(language_metadata) available_languages = [] - for lang in language_metadata["languages"]: - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() + for lang in all_languages: + lang = format_sublanguage_name(lang, language_metadata) + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang.capitalize() if language_dir.is_dir(): dt_path = language_dir / data_type if dt_path.exists(): From 661d7234a56dace69adc78b85a341bac71e5aadb Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Tue, 15 Oct 2024 19:26:18 +0300 Subject: [PATCH 45/63] Updated the instance variable self.languages in ScribeDataConfig to use list_all_languages, assigning a complete list of all languages. 
--- src/scribe_data/cli/interactive.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index cefaa6bbe..6ba7a1f55 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -35,7 +35,7 @@ from scribe_data.cli.cli_utils import data_type_metadata, language_metadata from scribe_data.cli.get import get_data from scribe_data.cli.version import get_version_message -from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR +from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR, list_all_languages # MARK: Config Setup @@ -51,9 +51,7 @@ class ScribeDataConfig: def __init__(self): - self.languages = [ - [lang_key.capitalize() for lang_key in language_metadata.keys()] - ] + self.languages = list_all_languages(language_metadata) self.data_types = list(data_type_metadata.keys()) self.selected_languages: List[str] = [] self.selected_data_types: List[str] = [] From dffb9f70a597782be22574cd450cf7f1365416f9 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Wed, 16 Oct 2024 17:22:25 +0300 Subject: [PATCH 46/63] adding mandarin as a sub language under chinese and updating some qids --- .../resources/language_metadata.json | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index d7d8100cd..00a8d405c 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -95,9 +95,13 @@ "iso": "ml", "qid": "Q36236" }, - "mandarin": { - "iso": "zh", - "qid": "Q727694" + "chinese": { + "sub_languages": { + "mandarin": { + "iso": "zh", + "qid": "Q727694" + } + } }, "norwegian": { "sub_languages": { @@ -107,7 +111,7 @@ }, "bokmål": { "iso": "nb", - "qid": "Q9043" + "qid": "Q25167" } } }, @@ -129,12 +133,12 @@ }, "punjabi": { "sub_languages": { - "gurmukhi": { - "iso": "pan", + 
"shahmukhi": { + "iso": "pnb", "qid": "Q58635" }, - "shahmukhi": { - "iso": "pnp", + "gurmukhi": { + "iso": "pa", "qid": "Q58635" } } From 4a204c0fbd97e2b65671790d112b12f2caac46df Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Wed, 16 Oct 2024 17:46:53 +0300 Subject: [PATCH 47/63] Update test_list_languages to match updated output format --- tests/cli/test_list.py | 54 +++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 03172e077..eb6a29462 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -39,17 +39,49 @@ def test_list_languages(self, mock_print): list_languages() expected_calls = [ call(), - call("Language ISO QID "), - call("-----------------------"), - call("English en Q1860 "), - call("French fr Q150 "), - call("German de Q188 "), - call("Italian it Q652 "), - call("Portuguese pt Q5146 "), - call("Russian ru Q7737 "), - call("Spanish es Q1321 "), - call("Swedish sv Q9027 "), - call("-----------------------"), + call("Language ISO QID "), + call("--------------------------"), + call("Arabic ar Q13955 "), + call("Basque eu Q8752 "), + call("Bengali bn Q9610 "), + call("Bokmål nb Q25167 "), + call("Czech cs Q9056 "), + call("Danish da Q9035 "), + call("English en Q1860 "), + call("Esperanto eo Q143 "), + call("Estonian et Q9072 "), + call("Finnish fi Q1412 "), + call("French fr Q150 "), + call("German de Q188 "), + call("Greek el Q36510 "), + call("Gurmukhi pa Q58635 "), + call("Hausa ha Q56475 "), + call("Hebrew he Q9288 "), + call("Hindi hi Q11051 "), + call("Indonesian id Q9240 "), + call("Italian it Q652 "), + call("Japanese ja Q5287 "), + call("Kurmanji kmr Q36163 "), + call("Latin la Q397 "), + call("Malay ms Q9237 "), + call("Malayalam ml Q36236 "), + call("Mandarin zh Q727694 "), + call("Nigerian pi Q33655 "), + call("Nynorsk nn Q25164 "), + call("Polish pl Q809 "), + call("Portuguese pt Q5146 "), + call("Russian ru Q7737 "), + 
call("Shahmukhi pnb Q58635 "), + call("Slovak sk Q9058 "), + call("Spanish es Q1321 "), + call("Swahili sw Q7838 "), + call("Swedish sv Q9027 "), + call("Tajik tg Q9260 "), + call("Tamil ta Q5885 "), + call("Ukrainian ua Q8798 "), + call("Urdu ur Q11051 "), + call("Yoruba yo Q34311 "), + call("--------------------------"), call(), ] mock_print.assert_has_calls(expected_calls) From 0249c9643df36b5e5fd7276b4bd4c5603c284b95 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Wed, 16 Oct 2024 20:28:44 +0300 Subject: [PATCH 48/63] removing .capitalize method since it's already implemented inside laguages listing functions --- src/scribe_data/cli/list.py | 6 ++--- tests/cli/test_list.py | 52 ++++++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 447d59060..ee3311ede 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -137,11 +137,11 @@ def list_languages_for_data_type(data_type: str) -> None: available_languages = [] for lang in all_languages: lang = format_sublanguage_name(lang, language_metadata) - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang.capitalize() + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang if language_dir.is_dir(): dt_path = language_dir / data_type if dt_path.exists(): - available_languages.append(lang["language"]) + available_languages.append(lang) available_languages.sort() table_header = f"Available languages: {data_type}" @@ -154,7 +154,7 @@ def list_languages_for_data_type(data_type: str) -> None: print("-" * table_line_length) for lang in available_languages: - print(f"{lang.capitalize()}") + print(f"{lang}") print("-" * table_line_length) print() diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index eb6a29462..8f6d1b86e 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -98,6 +98,8 @@ def test_list_data_types_all_languages(self, mock_print): call("adverbs"), 
call("emoji-keywords"), call("nouns"), + call("personal-pronouns"), + call("postpositions"), call("prepositions"), call("proper-nouns"), call("verbs"), @@ -179,16 +181,48 @@ def test_list_languages_for_data_type_valid(self, mock_print): list_languages_for_data_type("nouns") expected_calls = [ call(), - call("Available languages: nouns"), + call("Language ISO QID "), call("--------------------------"), - call("English"), - call("French"), - call("German"), - call("Italian"), - call("Portuguese"), - call("Russian"), - call("Spanish"), - call("Swedish"), + call("Arabic ar Q13955 "), + call("Basque eu Q8752 "), + call("Bengali bn Q9610 "), + call("Bokmål nb Q25167 "), + call("Czech cs Q9056 "), + call("Danish da Q9035 "), + call("English en Q1860 "), + call("Esperanto eo Q143 "), + call("Estonian et Q9072 "), + call("Finnish fi Q1412 "), + call("French fr Q150 "), + call("German de Q188 "), + call("Greek el Q36510 "), + call("Gurmukhi pa Q58635 "), + call("Hausa ha Q56475 "), + call("Hebrew he Q9288 "), + call("Hindi hi Q11051 "), + call("Indonesian id Q9240 "), + call("Italian it Q652 "), + call("Japanese ja Q5287 "), + call("Kurmanji kmr Q36163 "), + call("Latin la Q397 "), + call("Malay ms Q9237 "), + call("Malayalam ml Q36236 "), + call("Mandarin zh Q727694 "), + call("Nigerian pi Q33655 "), + call("Nynorsk nn Q25164 "), + call("Polish pl Q809 "), + call("Portuguese pt Q5146 "), + call("Russian ru Q7737 "), + call("Shahmukhi pnb Q58635 "), + call("Slovak sk Q9058 "), + call("Spanish es Q1321 "), + call("Swahili sw Q7838 "), + call("Swedish sv Q9027 "), + call("Tajik tg Q9260 "), + call("Tamil ta Q5885 "), + call("Ukrainian ua Q8798 "), + call("Urdu ur Q11051 "), + call("Yoruba yo Q34311 "), call("--------------------------"), call(), ] From a5847493692312540796b9294db7574699ff6371 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Wed, 16 Oct 2024 21:35:09 +0300 Subject: [PATCH 49/63] Updating test cases in test_list.py file to match newly added languages --- 
tests/cli/test_list.py | 82 +++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 8f6d1b86e..6fb4bf791 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -181,48 +181,48 @@ def test_list_languages_for_data_type_valid(self, mock_print): list_languages_for_data_type("nouns") expected_calls = [ call(), - call("Language ISO QID "), + call("Available languages: nouns"), call("--------------------------"), - call("Arabic ar Q13955 "), - call("Basque eu Q8752 "), - call("Bengali bn Q9610 "), - call("Bokmål nb Q25167 "), - call("Czech cs Q9056 "), - call("Danish da Q9035 "), - call("English en Q1860 "), - call("Esperanto eo Q143 "), - call("Estonian et Q9072 "), - call("Finnish fi Q1412 "), - call("French fr Q150 "), - call("German de Q188 "), - call("Greek el Q36510 "), - call("Gurmukhi pa Q58635 "), - call("Hausa ha Q56475 "), - call("Hebrew he Q9288 "), - call("Hindi hi Q11051 "), - call("Indonesian id Q9240 "), - call("Italian it Q652 "), - call("Japanese ja Q5287 "), - call("Kurmanji kmr Q36163 "), - call("Latin la Q397 "), - call("Malay ms Q9237 "), - call("Malayalam ml Q36236 "), - call("Mandarin zh Q727694 "), - call("Nigerian pi Q33655 "), - call("Nynorsk nn Q25164 "), - call("Polish pl Q809 "), - call("Portuguese pt Q5146 "), - call("Russian ru Q7737 "), - call("Shahmukhi pnb Q58635 "), - call("Slovak sk Q9058 "), - call("Spanish es Q1321 "), - call("Swahili sw Q7838 "), - call("Swedish sv Q9027 "), - call("Tajik tg Q9260 "), - call("Tamil ta Q5885 "), - call("Ukrainian ua Q8798 "), - call("Urdu ur Q11051 "), - call("Yoruba yo Q34311 "), + call("Arabic"), + call("Basque"), + call("Bengali"), + call("Chinese/Mandarin"), + call("Czech"), + call("Danish"), + call("English"), + call("Esperanto"), + call("Estonian"), + call("Finnish"), + call("French"), + call("German"), + call("Greek"), + call("Hausa"), + call("Hebrew"), + 
call("Hindustani/Hindi"), call("Hindustani/Urdu"), call("Indonesian"), call("Italian"), call("Japanese"), call("Kurmanji"), call("Latin"), call("Malay"), call("Malayalam"), call("Norwegian/Bokmål"), call("Norwegian/Nynorsk"), call("Pidgin/Nigerian"), call("Polish"), call("Portuguese"), call("Punjabi/Gurmukhi"), call("Punjabi/Shahmukhi"), call("Russian"), call("Slovak"), call("Spanish"), call("Swahili"), call("Swedish"), call("Tajik"), call("Tamil"), call("Ukrainian"), call("Yoruba"), call("--------------------------"), call(), ] From 4ef0c229a8583f9a61a9a0d4b8e59b298d5893a8 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Thu, 17 Oct 2024 00:31:59 +0300 Subject: [PATCH 50/63] Update test cases to include sub-languages - Updated all test cases to account for sub-languages. - Removed tests for test_get_language_words_to_remove and test_get_language_words_to_ignore, as these functions were deleted from utils.py and the languages metadata files --- tests/load/test_update_utils.py | 123 ++++++++++---------------------- 1 file changed, 36 insertions(+), 87 deletions(-) diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index 638ee09dd..489abc4b8 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -38,14 +38,46 @@ def test_get_scribe_languages(): test_case.assertCountEqual( utils.get_scribe_languages(), [ + "Arabic", + "Basque", + "Bengali", + "Bokmål", + "Czech", + "Danish", "English", + "Esperanto", + "Estonian", + "Finnish", "French", "German", + "Greek", + "Gurmukhi", + "Hausa", + "Hebrew", + "Hindi", + "Indonesian", "Italian", + "Japanese", + "Kurmanji", + "Latin", + "Malay", + "Malayalam", + "Mandarin", + "Nigerian", + "Nynorsk", + "Polish", "Portuguese", "Russian", + "Shahmukhi", + "Slovak", "Spanish", + "Swahili", "Swedish", + "Tajik", + "Tamil", + "Ukrainian", + "Urdu", + "Yoruba", ], ) @@ -61,6 +93,7 @@ def test_get_scribe_languages(): ("russian", "Q7737"), 
("spanish", "Q1321"), ("swedish", "Q9027"), + ("bokmål", "Q25167"), ], ) def test_get_language_qid_positive(language, qid_code): @@ -88,6 +121,7 @@ def test_get_language_qid_negative(): ("russian", "ru"), ("spanish", "es"), ("SwedisH", "sv"), + ("bokmål", "nb"), ], ) def test_get_language_iso_positive(language, iso_code): @@ -100,7 +134,7 @@ def test_get_language_iso_negative(): assert ( str(excp.value) - == "Gibberish is currently not a supported language for ISO conversion." + == "GIBBERISH is currently not a supported language for ISO conversion." ) @@ -115,6 +149,7 @@ def test_get_language_iso_negative(): ("ru", "Russian"), ("es", "Spanish"), ("sv", "Swedish"), + ("nb", "Bokmål"), ], ) def test_get_language_from_iso_positive(iso_code, language): @@ -128,92 +163,6 @@ def test_get_language_from_iso_negative(): assert str(excp.value) == "IXI is currently not a supported ISO language." -@pytest.mark.parametrize( - "language, remove_words", - [ - ( - "english", - [ - "of", - "the", - "The", - "and", - ], - ), - ( - "french", - [ - "of", - "the", - "The", - "and", - ], - ), - ("german", ["of", "the", "The", "and", "NeinJa", "et", "redirect"]), - ("italian", ["of", "the", "The", "and", "text", "from"]), - ("portuguese", ["of", "the", "The", "and", "jbutadptflora"]), - ( - "russian", - [ - "of", - "the", - "The", - "and", - ], - ), - ("spanish", ["of", "the", "The", "and"]), - ("swedish", ["of", "the", "The", "and", "Checklist", "Catalogue"]), - ], -) -def test_get_language_words_to_remove(language, remove_words): - test_case = unittest.TestCase() - - # ignore order, only content matters - test_case.assertCountEqual( - utils.get_language_words_to_remove(language), remove_words - ) - - -def test_get_language_words_to_remove_negative(): - with pytest.raises(ValueError) as excp: - _ = utils.get_language_words_to_remove("python") - - assert str(excp.value) == "Python is currently not a supported language." 
- - -@pytest.mark.parametrize( - "language, ignore_words", - [ - ( - "french", - [ - "XXe", - ], - ), - ("german", ["Gemeinde", "Familienname"]), - ("italian", ["The", "ATP"]), - ("portuguese", []), - ("russian", []), - ("spanish", []), - ("swedish", ["databasdump"]), - ], -) -def test_get_language_words_to_ignore(language, ignore_words): - test_case = unittest.TestCase() - - # ignore order, only content matters - test_case.assertCountEqual( - utils.get_language_words_to_ignore(language), ignore_words - ) - - -def test_get_language_words_to_ignore_negative(): - with pytest.raises(ValueError) as excp: - _ = utils.get_language_words_to_ignore("JAVA") - - assert str(excp.value) == "Java is currently not a supported language." - - def test_get_ios_data_path(): assert ( utils.get_ios_data_path("suomi") From 775fb24fd7805be5a859e5fb139b8cb974c4917d Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Thu, 17 Oct 2024 01:37:28 +0300 Subject: [PATCH 51/63] Updated the get_language_from_iso function to depend on the JSON file. Made the language_metadata parameter optional in two functions. Added a ValueError exception when a language is not found. --- src/scribe_data/utils.py | 47 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index b4da68647..df22a9a9a 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -26,8 +26,6 @@ from pathlib import Path from typing import Any, Optional -from iso639 import Lang -from iso639.exceptions import DeprecatedLanguageValue PROJECT_ROOT = "Scribe-Data" DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export" @@ -198,13 +196,20 @@ def get_language_from_iso(iso: str) -> str: str The name for the language which has an ISO value of iso. """ - try: - language_name = str(Lang(iso.lower()).name) - except DeprecatedLanguageValue as e: - raise ValueError( - f"{iso.upper()} is currently not a supported ISO language." 
- ) from e - return language_name + # Iterate over the languages and their properties + for language, properties in _languages.items(): + # Check if the current language's ISO matches the provided ISO + if properties.get("iso") == iso: + return language.capitalize() + + # If there are sub-languages, check those as well + if "sub_languages" in properties: + for sub_lang, sub_properties in properties["sub_languages"].items(): + if sub_properties.get("iso") == iso: + return sub_lang.capitalize() + + # If no match is found, raise a ValueError + raise ValueError(f"{iso.upper()} is currently not a supported ISO language.") def load_queried_data( @@ -490,10 +495,10 @@ def order_annotations(annotation: str) -> str: return "/".join(annotation_split) -def format_sublanguage_name(lang, language_metadata): +def format_sublanguage_name(lang, language_metadata=_languages): """ Formats the name of a sub-language by appending its main language - in the format 'mainlang/sublang'. If the language is not a sub-language, + in the format 'Mainlang/Sublang'. If the language is not a sub-language, the original language name is returned as-is. Args: @@ -503,30 +508,36 @@ def format_sublanguage_name(lang, language_metadata): Returns: str: The formatted language name if it's a sub-language - (e.g., 'norwegian/nynorsk'), otherwise the original name. + (e.g., 'Norwegian/Nynorsk'), otherwise the original name. + + Raises: + ValueError: If the provided language or sub-language is not found. 
Example: format_sublanguage_name("nynorsk", language_metadata) - 'norwegian/nynorsk' + 'Norwegian/Nynorsk' format_sublanguage_name("english", language_metadata) - 'english' + 'English' """ # Iterate through the main languages in the metadata for main_lang, lang_data in language_metadata.items(): + # If it's not a sub-language, return the original name + if main_lang == lang.lower(): + return lang.capitalize() # Check if the main language has sub-languages if "sub_languages" in lang_data: # Check if the provided language is a sub-language for sub_lang in lang_data["sub_languages"]: if lang.lower() == sub_lang.lower(): - # Return the formatted name mainlang/sublang + # Return the formatted name Mainlang/Sublang return f"{main_lang.capitalize()}/{sub_lang.capitalize()}" - # If it's not a sub-language, return the original name - return lang.capitalize() + # Raise ValueError if no match is found + raise ValueError(f"{lang.upper()} is not a valid language or sub-language.") -def list_all_languages(language_metadata): +def list_all_languages(language_metadata=_languages): """List all languages from the provided metadata dictionary, including sub-languages.""" current_languages = [] From 0b75b4e46728c4a3f43849b5d1b44e8e36609f2f Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Thu, 17 Oct 2024 01:39:25 +0300 Subject: [PATCH 52/63] Add unit tests for language formatting and listing: - Positive and negative tests for format_sublanguage_name - Test to validate the output of list_all_languages --- tests/load/test_update_utils.py | 66 +++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index 489abc4b8..df37317a3 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -163,6 +163,72 @@ def test_get_language_from_iso_negative(): assert str(excp.value) == "IXI is currently not a supported ISO language." 
+@pytest.mark.parametrize( + "lang, expected_output", + [ + ("nynorsk", "Norwegian/Nynorsk"), + ("bokmål", "Norwegian/Bokmål"), + ("english", "English"), + ], +) +def test_format_sublanguage_name_positive(lang, expected_output): + assert utils.format_sublanguage_name(lang) == expected_output + + +def test_format_sublanguage_name_negative(): + with pytest.raises(ValueError) as excp: + _ = utils.format_sublanguage_name("soccer") + + assert str(excp.value) == "SOCCER is not a valid language or sub-language." + + +def test_list_all_languages(): + expected_languages = [ + "arabic", + "basque", + "bengali", + "czech", + "danish", + "english", + "esperanto", + "estonian", + "finnish", + "french", + "german", + "greek", + "hausa", + "hebrew", + "hindi", + "urdu", + "indonesian", + "italian", + "japanese", + "kurmanji", + "latin", + "malay", + "malayalam", + "mandarin", + "nynorsk", + "bokmål", + "nigerian", + "polish", + "portuguese", + "shahmukhi", + "gurmukhi", + "russian", + "slovak", + "spanish", + "swahili", + "swedish", + "tajik", + "tamil", + "ukrainian", + "yoruba", + ] + + assert utils.list_all_languages() == expected_languages + + def test_get_ios_data_path(): assert ( utils.get_ios_data_path("suomi") From ad61c66033c37184d91696309f4a94ae7b77bcfc Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 18 Oct 2024 03:05:02 +0200 Subject: [PATCH 53/63] Edits to language metadata and supporting functions + pr checklist --- .github/PULL_REQUEST_TEMPLATE.md | 1 + CONTRIBUTING.md | 11 ++ src/scribe_data/cli/cli_utils.py | 81 +++++----- src/scribe_data/cli/list.py | 9 +- src/scribe_data/cli/total.py | 13 +- .../resources/language_metadata.json | 32 ++-- src/scribe_data/utils.py | 150 +++++++++--------- tests/cli/test_utils.py | 10 +- tests/load/test_update_utils.py | 62 +------- 9 files changed, 158 insertions(+), 211 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index bab97a1a8..17c07e1c1 100644 --- 
a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,6 +7,7 @@ Thank you for your pull request! 🚀 - [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch +- [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing) --- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 376a954a7..2e44c618e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u - [First steps as a contributor](#first-steps) - [Learning the tech stack](#learning-the-tech) - [Development environment](#dev-env) +- [Testing](#testing) - [Issues and projects](#issues-projects) - [Bug reports](#bug-reports) - [Feature requests](#feature-requests) @@ -171,6 +172,16 @@ pip install -e . > [!NOTE] > Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment setup! + + +## Testing [`⇧`](#contents) + +In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up during the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. 
Please run the following in the project root: + +```bash +pytest +``` + ## Issues and projects [`⇧`](#contents) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index be2fa0f79..e39e1621d 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -27,6 +27,8 @@ from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR +# MARK: CLI Variables + LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction" LANGUAGE_METADATA_FILE = ( @@ -56,20 +58,21 @@ language_map = {} language_to_qid = {} -# Process each language and its potential sub-languages in one pass -for lang_key, lang_data in language_metadata.items(): - lang_key_lower = lang_key.lower() +# Process each language and its potential sub-languages in one pass. +for lang, lang_data in language_metadata.items(): + lang_lower = lang.lower() - # Handle sub-languages if they exist + # Handle sub-languages if they exist. if "sub_languages" in lang_data: - for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items(): - sub_lang_key_lower = sub_lang_key.lower() - language_map[sub_lang_key_lower] = sub_lang_data - language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"] + for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): + sub_lang_lower = sub_lang.lower() + language_map[sub_lang_lower] = sub_lang_data + language_to_qid[sub_lang_lower] = sub_lang_data["qid"] + else: - # Handle the main language directly - language_map[lang_key_lower] = lang_data - language_to_qid[lang_key_lower] = lang_data["qid"] + # Handle the main language directly. 
+ language_map[lang_lower] = lang_data + language_to_qid[lang_lower] = lang_data["qid"] # MARK: Correct Inputs @@ -112,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None: if isinstance(data, dict): max_key_length = max((len(key) for key in data.keys()), default=0) - if data_type == "autosuggestions": - for key, value in data.items(): + for key, value in data.items(): + if data_type == "autosuggestions": print(f"{key:<{max_key_length}} : {', '.join(value)}") - elif data_type == "emoji_keywords": - for key, value in data.items(): + elif data_type == "emoji_keywords": emojis = [item["emoji"] for item in value] print(f"{key:<{max_key_length}} : {' '.join(emojis)}") - elif data_type in {"prepositions"}: - for key, value in data.items(): + elif data_type in {"prepositions"}: print(f"{key:<{max_key_length}} : {value}") - else: - for key, value in data.items(): - if isinstance(value, dict): - print(f"{key:<{max_key_length}} : ") - max_sub_key_length = max( - (len(sub_key) for sub_key in value.keys()), default=0 - ) - for sub_key, sub_value in value.items(): - print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") - - elif isinstance(value, list): - print(f"{key:<{max_key_length}} : ") - for item in value: - if isinstance(item, dict): - for sub_key, sub_value in item.items(): - print(f" {sub_key:<{max_key_length}} : {sub_value}") - - else: - print(f" {item}") - - else: - print(f"{key:<{max_key_length}} : {value}") + elif isinstance(value, dict): + print(f"{key:<{max_key_length}} : ") + max_sub_key_length = max( + (len(sub_key) for sub_key in value.keys()), default=0 + ) + for sub_key, sub_value in value.items(): + print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") + + elif isinstance(value, list): + print(f"{key:<{max_key_length}} : ") + for item in value: + if isinstance(item, dict): + for sub_key, sub_value in item.items(): + print(f" {sub_key:<{max_key_length}} : {sub_value}") + + else: + print(f" {item}") + + else: + 
print(f"{key:<{max_key_length}} : {value}") elif isinstance(data, list): for item in data: @@ -211,12 +210,12 @@ def validate_single_item(item, valid_options, item_type): ): closest_match = difflib.get_close_matches(item, valid_options, n=1) closest_match_str = ( - f" The closest matching {item_type} is {closest_match[0]}." + f" The closest matching {item_type} is '{closest_match[0]}'." if closest_match else "" ) - return f"Invalid {item_type} {item}.{closest_match_str}" + return f"Invalid {item_type} '{item}'.{closest_match_str}" return None diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index ee3311ede..762d3bfca 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -21,16 +21,16 @@ """ from scribe_data.cli.cli_utils import ( + LANGUAGE_DATA_EXTRACTION_DIR, correct_data_type, - language_metadata, language_map, - LANGUAGE_DATA_EXTRACTION_DIR, + language_metadata, ) from scribe_data.utils import ( - list_all_languages, + format_sublanguage_name, get_language_iso, get_language_qid, - format_sublanguage_name, + list_all_languages, ) @@ -39,7 +39,6 @@ def list_languages() -> None: Generates a table of languages, their ISO-2 codes and their Wikidata QIDs. """ languages = list_all_languages(language_metadata) - languages.sort() language_col_width = max(len(lang) for lang in languages) + 2 iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 5530ef5db..885d9b3e9 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -29,8 +29,8 @@ language_metadata, language_to_qid, ) +from scribe_data.utils import format_sublanguage_name, list_all_languages from scribe_data.wikidata.wikidata_utils import sparql -from scribe_data.utils import list_all_languages, format_sublanguage_name def get_qid_by_input(input_str): @@ -73,9 +73,8 @@ def get_datatype_list(language): A list of the corresponding data types. 
""" languages = list_all_languages(language_metadata) - language_list = [lang for lang in languages] - if language.lower() in language_list: + if language.lower() in languages: language_data = language_map.get(language.lower()) language_capitalized = format_sublanguage_name( language, language_metadata @@ -134,13 +133,9 @@ def print_total_lexemes(language: str = None): print("=" * 64) if language is None: # all languages - languages = list_all_languages( - language_metadata - ) # this returns a list of language names - language_list = languages # sorts the list in place - language_list.sort() + languages = list_all_languages(language_metadata) - for lang in language_list: + for lang in languages: data_types = get_datatype_list(lang) first_row = True diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index 00a8d405c..7ab2145bf 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -11,6 +11,14 @@ "iso": "bn", "qid": "Q9610" }, + "chinese": { + "sub_languages": { + "mandarin": { + "iso": "zh", + "qid": "Q727694" + } + } + }, "czech": { "iso": "cs", "qid": "Q9056" @@ -95,23 +103,15 @@ "iso": "ml", "qid": "Q36236" }, - "chinese": { - "sub_languages": { - "mandarin": { - "iso": "zh", - "qid": "Q727694" - } - } - }, "norwegian": { "sub_languages": { - "nynorsk": { - "iso": "nn", - "qid": "Q25164" - }, "bokmål": { "iso": "nb", "qid": "Q25167" + }, + "nynorsk": { + "iso": "nn", + "qid": "Q25164" } } }, @@ -133,13 +133,13 @@ }, "punjabi": { "sub_languages": { - "shahmukhi": { - "iso": "pnb", - "qid": "Q58635" - }, "gurmukhi": { "iso": "pa", "qid": "Q58635" + }, + "shahmukhi": { + "iso": "pnb", + "qid": "Q58635" } } }, diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index df22a9a9a..3c2007640 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -26,7 +26,6 @@ from pathlib import Path from typing import Any, 
Optional - PROJECT_ROOT = "Scribe-Data" DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export" DEFAULT_CSV_EXPORT_DIR = "scribe_data_csv_export" @@ -53,8 +52,7 @@ def _load_json(package_path: str, file_name: str) -> Any: with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" ) as in_stream: - contents = json.load(in_stream) - return contents # No need for 'root' + return json.load(in_stream) _languages = _load_json( @@ -90,13 +88,13 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - ------ ValueError : when a source_value is not supported or the language only has sub-languages. """ - norm_source_value = source_value.lower() - - # Check if we're searching by language name + # Check if we're searching by language name. if source_key == "language": - # First, check the main language entries (e.g., mandarin, french, etc.) + norm_source_value = source_value.lower() + + # First, check the main language entries (e.g., mandarin, french, etc.). for language, entry in _languages.items(): - # If the language name matches the top-level key, return the target value + # If the language name matches the top-level key, return the target value. if language.lower() == norm_source_value: if "sub_languages" in entry: sub_languages = ", ".join(entry["sub_languages"].keys()) @@ -105,37 +103,16 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - ) return entry.get(target_key) - # If there are sub-languages, check them too + # If there are sub-languages, check them too. if "sub_languages" in entry: for sub_language, sub_entry in entry["sub_languages"].items(): if sub_language.lower() == norm_source_value: return sub_entry.get(target_key) - # If no match was found, raise an error + # If no match was found, raise an error. raise ValueError(error_msg) -def get_scribe_languages() -> list[str]: - """ - Returns the list of currently implemented Scribe languages. 
- This version handles both regular languages and those with sub-languages (e.g., Norwegian). - """ - languages = [] - - for language, entry in _languages.items(): - # Add the main language (if it's directly queryable) - if "sub_languages" not in entry: - languages.append(language.capitalize()) - - # If there are sub-languages, add them instead - if "sub_languages" in entry: - languages.extend( - sub_language.capitalize() for sub_language in entry["sub_languages"] - ) - - return sorted(languages) - - def get_language_qid(language: str) -> str: """ Returns the QID of the given language. @@ -173,13 +150,12 @@ def get_language_iso(language: str) -> str: The ISO code for the language. """ - iso_code = _find( + return _find( "language", language, "iso", f"{language.upper()} is currently not a supported language for ISO conversion.", ) - return iso_code def get_language_from_iso(iso: str) -> str: @@ -433,20 +409,25 @@ def map_genders(wikidata_gender: str) -> str: ---------- wikidata_gender : str The gender of the noun that was queried from WikiData. + + Returns + ------- + The gender value corrected in case the Wikidata ID was queried. """ gender_map = { - "masculine": "M", - "Q499327": "M", - "feminine": "F", - "Q1775415": "F", - "common gender": "C", - "Q1305037": "C", - "neuter": "N", - "Q1775461": "N", + "masculine": "masculine", + "Q499327": "masculine", + "feminine": "feminine", + "Q1775415": "feminine", + "common": "common", + "common gender": "common", + "Q1305037": "common", + "neuter": "neuter", + "Q1775461": "neuter", } return gender_map.get( - wikidata_gender, "" + wikidata_gender.lower(), "" ) # nouns could have a gender that is not a valid attribute @@ -458,20 +439,24 @@ def map_cases(wikidata_case: str) -> str: ---------- wikidata_case : str The case of the noun that was queried from WikiData. + + Returns + ------- + The case value corrected in case the Wikidata ID was queried. 
""" case_map = { - "accusative": "Acc", - "Q146078": "Acc", - "dative": "Dat", - "Q145599": "Dat", - "genitive": "Gen", - "Q146233": "Gen", - "instrumental": "Ins", - "Q192997": "Ins", - "prepositional": "Pre", - "Q2114906": "Pre", - "locative": "Loc", - "Q202142": "Loc", + "accusative": "accusative", + "Q146078": "accusative", + "dative": "dative", + "Q145599": "dative", + "genitive": "genitive", + "Q146233": "genitive", + "instrumental": "instrumental", + "Q192997": "instrumental", + "prepositional": "prepositional", + "Q2114906": "prepositional", + "locative": "locative", + "Q202142": "locative", } case = wikidata_case.split(" case")[0] return case_map.get(case, "") @@ -498,57 +483,66 @@ def order_annotations(annotation: str) -> str: def format_sublanguage_name(lang, language_metadata=_languages): """ Formats the name of a sub-language by appending its main language - in the format 'Mainlang/Sublang'. If the language is not a sub-language, + in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language, the original language name is returned as-is. - Args: - lang (str): The name of the language or sub-language to format. - language_metadata (dict): The metadata containing information about - main languages and their sub-languages. + Parameters + ---------- + lang : str + The name of the language or sub-language to format. - Returns: - str: The formatted language name if it's a sub-language - (e.g., 'Norwegian/Nynorsk'), otherwise the original name. + language_metadata : dict + The metadata containing information about main languages and their sub-languages. - Raises: + Returns + ------- + str + The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk'). + Otherwise the original name. + + Raises + ------ ValueError: If the provided language or sub-language is not found. 
- Example: - format_sublanguage_name("nynorsk", language_metadata) + Example + ------- + > format_sublanguage_name("nynorsk", language_metadata) 'Norwegian/Nynorsk' - format_sublanguage_name("english", language_metadata) + > format_sublanguage_name("english", language_metadata) 'English' """ - # Iterate through the main languages in the metadata for main_lang, lang_data in language_metadata.items(): - # If it's not a sub-language, return the original name + # If it's not a sub-language, return the original name. if main_lang == lang.lower(): return lang.capitalize() - # Check if the main language has sub-languages + + # Check if the main language has sub-languages. if "sub_languages" in lang_data: - # Check if the provided language is a sub-language + # Check if the provided language is a sub-language. for sub_lang in lang_data["sub_languages"]: if lang.lower() == sub_lang.lower(): - # Return the formatted name Mainlang/Sublang + # Return the formatted name MAIN_LANG/SUB_LANG. return f"{main_lang.capitalize()}/{sub_lang.capitalize()}" - # Raise ValueError if no match is found + # Raise ValueError if no match is found. raise ValueError(f"{lang.upper()} is not a valid language or sub-language.") def list_all_languages(language_metadata=_languages): - """List all languages from the provided metadata dictionary, including sub-languages.""" + """ + Returns a sorted list of all languages from the provided metadata dictionary, including sub-languages. + """ current_languages = [] - # Iterate through the language metadata + # Iterate through the language metadata. for lang_key, lang_data in language_metadata.items(): - # Check if there are sub-languages + # Check if there are sub-languages. if "sub_languages" in lang_data: - # Add the sub-languages to current_languages + # Add the sub-languages to current_languages. 
current_languages.extend(lang_data["sub_languages"].keys()) else: - # If no sub-languages, add the main language + # If no sub-languages, add the main language. current_languages.append(lang_key) - return current_languages + return sorted(current_languages) diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py index a827666a2..333c3b7d7 100644 --- a/tests/cli/test_utils.py +++ b/tests/cli/test_utils.py @@ -187,7 +187,7 @@ def test_validate_language_and_data_type_invalid_language(self, mock_get_qid): language=language_qid, data_type=data_type_qid ) - self.assertEqual(str(context.exception), "Invalid language InvalidLanguage.") + self.assertEqual(str(context.exception), "Invalid language 'InvalidLanguage'.") @patch("scribe_data.cli.total.get_qid_by_input") def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid): @@ -201,7 +201,7 @@ def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid): language=language_qid, data_type=data_type_qid ) - self.assertEqual(str(context.exception), "Invalid data-type InvalidDataType.") + self.assertEqual(str(context.exception), "Invalid data-type 'InvalidDataType'.") @patch("scribe_data.cli.total.get_qid_by_input") def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): @@ -217,7 +217,7 @@ def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): self.assertEqual( str(context.exception), - "Invalid language InvalidLanguage.\nInvalid data-type InvalidDataType.", + "Invalid language 'InvalidLanguage'.\nInvalid data-type 'InvalidDataType'.", ) def test_validate_language_and_data_type_with_list(self): @@ -248,5 +248,5 @@ def test_validate_language_and_data_type_mixed_validity_in_lists(self): data_types = ["nouns", "InvalidDataType"] with self.assertRaises(ValueError) as context: validate_language_and_data_type(languages, data_types) - self.assertIn("Invalid language InvalidLanguage", str(context.exception)) - self.assertIn("Invalid data-type 
InvalidDataType", str(context.exception)) + self.assertIn("Invalid language 'InvalidLanguage'", str(context.exception)) + self.assertIn("Invalid data-type 'InvalidDataType'", str(context.exception)) diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index df37317a3..43eaa2038 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -21,7 +21,6 @@ """ import sys -import unittest from pathlib import Path import pytest @@ -31,57 +30,6 @@ from scribe_data import utils -def test_get_scribe_languages(): - test_case = unittest.TestCase() - - # test for content, not order - test_case.assertCountEqual( - utils.get_scribe_languages(), - [ - "Arabic", - "Basque", - "Bengali", - "Bokmål", - "Czech", - "Danish", - "English", - "Esperanto", - "Estonian", - "Finnish", - "French", - "German", - "Greek", - "Gurmukhi", - "Hausa", - "Hebrew", - "Hindi", - "Indonesian", - "Italian", - "Japanese", - "Kurmanji", - "Latin", - "Malay", - "Malayalam", - "Mandarin", - "Nigerian", - "Nynorsk", - "Polish", - "Portuguese", - "Russian", - "Shahmukhi", - "Slovak", - "Spanish", - "Swahili", - "Swedish", - "Tajik", - "Tamil", - "Ukrainian", - "Urdu", - "Yoruba", - ], - ) - - @pytest.mark.parametrize( "language, qid_code", [ @@ -187,6 +135,7 @@ def test_list_all_languages(): "arabic", "basque", "bengali", + "bokmål", "czech", "danish", "english", @@ -196,10 +145,10 @@ def test_list_all_languages(): "french", "german", "greek", + "gurmukhi", "hausa", "hebrew", "hindi", - "urdu", "indonesian", "italian", "japanese", @@ -208,14 +157,12 @@ def test_list_all_languages(): "malay", "malayalam", "mandarin", - "nynorsk", - "bokmål", "nigerian", + "nynorsk", "polish", "portuguese", - "shahmukhi", - "gurmukhi", "russian", + "shahmukhi", "slovak", "spanish", "swahili", @@ -223,6 +170,7 @@ def test_list_all_languages(): "tajik", "tamil", "ukrainian", + "urdu", "yoruba", ] From 3fe55283abddd4f901b186df7be973f567da5489 Mon Sep 17 00:00:00 2001 From: Omar 
Agiez Date: Sun, 13 Oct 2024 23:59:31 +0300 Subject: [PATCH 54/63] Refactor language_map and language_to_qid generation to handle new JSON structure - Updated the logic for building language_map and language_to_qid to handle languages with sub-languages. - Both main languages and sub-languages are now processed in a single pass, ensuring that: - language_map includes all metadata for main and sub-languages. - language_to_qid correctly maps both main and sub-languages to their QIDs. --- src/scribe_data/cli/cli_utils.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index e39e1621d..f3994e3c1 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -51,31 +51,23 @@ with DATA_TYPE_METADATA_FILE.open("r", encoding="utf-8") as file: data_type_metadata = json.load(file) -except (IOError, json.JSONDecodeError) as e: - print(f"Error reading data type metadata: {e}") - - language_map = {} language_to_qid = {} -# Process each language and its potential sub-languages in one pass. -for lang, lang_data in language_metadata.items(): - lang_lower = lang.lower() +# Process each language and its potential sub-languages in one pass +for lang_key, lang_data in language_metadata.items(): + lang_key_lower = lang_key.lower() - # Handle sub-languages if they exist. + # Handle sub-languages if they exist if "sub_languages" in lang_data: - for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): - sub_lang_lower = sub_lang.lower() - language_map[sub_lang_lower] = sub_lang_data - language_to_qid[sub_lang_lower] = sub_lang_data["qid"] - + for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items(): + sub_lang_key_lower = sub_lang_key.lower() + language_map[sub_lang_key_lower] = sub_lang_data + language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"] else: - # Handle the main language directly. 
- language_map[lang_lower] = lang_data - language_to_qid[lang_lower] = lang_data["qid"] - - -# MARK: Correct Inputs + # Handle the main language directly + language_map[lang_key_lower] = lang_data + language_to_qid[lang_key_lower] = lang_data["qid"] def correct_data_type(data_type: str) -> str: From efb1f647b31930173c7b57f9866f99168f282bce Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Wed, 16 Oct 2024 20:28:44 +0300 Subject: [PATCH 55/63] removing .capitalize method since it's already implemented inside laguages listing functions --- tests/cli/test_list.py | 84 +++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 6fb4bf791..e32c1973b 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -100,6 +100,8 @@ def test_list_data_types_all_languages(self, mock_print): call("nouns"), call("personal-pronouns"), call("postpositions"), + call("personal-pronouns"), + call("postpositions"), call("prepositions"), call("proper-nouns"), call("verbs"), @@ -181,48 +183,48 @@ def test_list_languages_for_data_type_valid(self, mock_print): list_languages_for_data_type("nouns") expected_calls = [ call(), - call("Available languages: nouns"), + call("Language ISO QID "), call("--------------------------"), - call("Arabic"), - call("Basque"), - call("Bengali"), - call("Chinese/Mandarin"), - call("Czech"), - call("Danish"), - call("English"), - call("Esperanto"), - call("Estonian"), - call("Finnish"), - call("French"), - call("German"), - call("Greek"), - call("Hausa"), - call("Hebrew"), - call("Hindustani/Hindi"), - call("Hindustani/Urdu"), - call("Indonesian"), - call("Italian"), - call("Japanese"), - call("Kurmanji"), - call("Latin"), - call("Malay"), - call("Malayalam"), - call("Norwegian/Bokmål"), - call("Norwegian/Nynorsk"), - call("Pidgin/Nigerian"), - call("Polish"), - call("Portuguese"), - call("Punjabi/Gurmukhi"), - call("Punjabi/Shahmukhi"), - 
call("Russian"), - call("Slovak"), - call("Spanish"), - call("Swahili"), - call("Swedish"), - call("Tajik"), - call("Tamil"), - call("Ukrainian"), - call("Yoruba"), + call("Arabic ar Q13955 "), + call("Basque eu Q8752 "), + call("Bengali bn Q9610 "), + call("Bokmål nb Q25167 "), + call("Czech cs Q9056 "), + call("Danish da Q9035 "), + call("English en Q1860 "), + call("Esperanto eo Q143 "), + call("Estonian et Q9072 "), + call("Finnish fi Q1412 "), + call("French fr Q150 "), + call("German de Q188 "), + call("Greek el Q36510 "), + call("Gurmukhi pa Q58635 "), + call("Hausa ha Q56475 "), + call("Hebrew he Q9288 "), + call("Hindi hi Q11051 "), + call("Indonesian id Q9240 "), + call("Italian it Q652 "), + call("Japanese ja Q5287 "), + call("Kurmanji kmr Q36163 "), + call("Latin la Q397 "), + call("Malay ms Q9237 "), + call("Malayalam ml Q36236 "), + call("Mandarin zh Q727694 "), + call("Nigerian pi Q33655 "), + call("Nynorsk nn Q25164 "), + call("Polish pl Q809 "), + call("Portuguese pt Q5146 "), + call("Russian ru Q7737 "), + call("Shahmukhi pnb Q58635 "), + call("Slovak sk Q9058 "), + call("Spanish es Q1321 "), + call("Swahili sw Q7838 "), + call("Swedish sv Q9027 "), + call("Tajik tg Q9260 "), + call("Tamil ta Q5885 "), + call("Ukrainian ua Q8798 "), + call("Urdu ur Q11051 "), + call("Yoruba yo Q34311 "), call("--------------------------"), call(), ] From 048c84f6c3e9e1eb349b1fd44cb912b53be7be29 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Fri, 18 Oct 2024 09:54:22 +0100 Subject: [PATCH 56/63] adjust is_valid_language function to suit new JSON structure --- src/scribe_data/check/check_query_identifiers.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index 2d3a40b16..90b06263f 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -94,12 +94,18 @@ def 
is_valid_language(query_file: Path, lang_qid: str) -> bool: True if the language QID is valid, otherwise False. """ lang_directory_name = query_file.parent.parent.name.lower() - languages = language_metadata.get( - "languages" + language_entry = language_metadata.get( + lang_directory_name ) # might not work since language_metadata file is not fully updated - language_entry = next( - (lang for lang in languages if lang["language"] == lang_directory_name), None - ) + + if not language_entry: + # Look for sub-languages + for lang, details in language_metadata.items(): + if "sub_languages" in details: + sub_language_entry = details["sub_languages"].get(lang_directory_name) + if sub_language_entry: + language_entry = sub_language_entry + break if not language_entry: return False From 1f8c9da3fe7aa90cc42d6d9531055c78759fa1af Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sun, 13 Oct 2024 23:59:31 +0300 Subject: [PATCH 57/63] Refactor language_map and language_to_qid generation to handle new JSON structure - Updated the logic for building language_map and language_to_qid to handle languages with sub-languages. - Both main languages and sub-languages are now processed in a single pass, ensuring that: - language_map includes all metadata for main and sub-languages. - language_to_qid correctly maps both main and sub-languages to their QIDs. 
--- src/scribe_data/cli/cli_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index f3994e3c1..396a890d4 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -51,6 +51,9 @@ with DATA_TYPE_METADATA_FILE.open("r", encoding="utf-8") as file: data_type_metadata = json.load(file) +except (IOError, json.JSONDecodeError) as e: + print(f"Error reading datatype metadata: {e}") + language_map = {} language_to_qid = {} From f1e227f1050dfc42753cd41ed7149c370192a630 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sun, 13 Oct 2024 23:59:31 +0300 Subject: [PATCH 58/63] Refactor language_map and language_to_qid generation to handle new JSON structure - Updated the logic for building language_map and language_to_qid to handle languages with sub-languages. - Both main languages and sub-languages are now processed in a single pass, ensuring that: - language_map includes all metadata for main and sub-languages. - language_to_qid correctly maps both main and sub-languages to their QIDs. 
--- src/scribe_data/cli/cli_utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index 396a890d4..a74f39b64 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -52,21 +52,28 @@ data_type_metadata = json.load(file) except (IOError, json.JSONDecodeError) as e: - print(f"Error reading datatype metadata: {e}") - + print(f"Error reading data type metadata: {e}") language_map = {} language_to_qid = {} +# Process each language and its potential sub-languages in one pass +for lang_key, lang_data in language_metadata.items(): + lang_key_lower = lang_key.lower() # Process each language and its potential sub-languages in one pass for lang_key, lang_data in language_metadata.items(): lang_key_lower = lang_key.lower() + # Handle sub-languages if they exist # Handle sub-languages if they exist if "sub_languages" in lang_data: for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items(): sub_lang_key_lower = sub_lang_key.lower() language_map[sub_lang_key_lower] = sub_lang_data language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"] + for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items(): + sub_lang_key_lower = sub_lang_key.lower() + language_map[sub_lang_key_lower] = sub_lang_data + language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"] else: # Handle the main language directly language_map[lang_key_lower] = lang_data From d814ecb3a20bf20005f69a268ae41b96dbb53528 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Fri, 18 Oct 2024 12:37:16 +0100 Subject: [PATCH 59/63] fix failing tests and update docs --- .../check/check_query_identifiers.py | 43 ++++++--- src/scribe_data/cli/list.py | 35 ++++---- src/scribe_data/utils.py | 33 +++++++ tests/cli/test_list.py | 87 ++++++++++--------- 4 files changed, 123 insertions(+), 75 deletions(-) diff --git a/src/scribe_data/check/check_query_identifiers.py 
b/src/scribe_data/check/check_query_identifiers.py index 90b06263f..4a984be65 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -15,16 +15,26 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract the QID. + file_path : Path + The path to the SPARQL query file from which to extract the QID. - pattern : str - The regex pattern used to match the QID (either for language or data type). + pattern : str + The regex pattern used to match the QID (either for language or data type). Returns ------- - str - The extracted QID if found, otherwise None. + str + The extracted QID if found, otherwise None. + + Raises + ------ + FileNotFoundError + If the specified file does not exist. + + Example + ------- + > extract_qid_from_sparql(Path("path/to/query.sparql"), r"\?lexeme dct:language wd:Q\d+") + 'Q123456' """ try: with open(file_path, "r", encoding="utf-8") as file: @@ -38,7 +48,7 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: return None -def check_queries(): +def check_queries() -> None: """ Validates SPARQL queries in the specified directory to check for correct language and data type QIDs. @@ -92,11 +102,14 @@ def is_valid_language(query_file: Path, lang_qid: str) -> bool: ------- bool True if the language QID is valid, otherwise False. 
+ + Example + ------- + > is_valid_language(Path("path/to/query.sparql"), "Q123456") + True """ lang_directory_name = query_file.parent.parent.name.lower() - language_entry = language_metadata.get( - lang_directory_name - ) # might not work since language_metadata file is not fully updated + language_entry = language_metadata.get(lang_directory_name) if not language_entry: # Look for sub-languages @@ -112,10 +125,7 @@ def is_valid_language(query_file: Path, lang_qid: str) -> bool: expected_language_qid = language_entry["qid"] - if lang_qid != expected_language_qid: - return False - - return True + return lang_qid == expected_language_qid def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: @@ -133,6 +143,11 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: ------- bool True if the data type QID is valid, otherwise False. + + Example + ------- + > is_valid_data_type(Path("path/to/query.sparql"), "Q654321") + True """ directory_name = query_file.parent.name # e.g., "nouns" or "verbs" expected_data_type_qid = data_type_metadata.get(directory_name) diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 762d3bfca..eca602b06 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -31,6 +31,7 @@ get_language_iso, get_language_qid, list_all_languages, + list_languages_with_metadata_for_data_type, ) @@ -132,28 +133,26 @@ def list_languages_for_data_type(data_type: str) -> None: The data type to check for. 
""" data_type = correct_data_type(data_type=data_type) - all_languages = list_all_languages(language_metadata) - available_languages = [] - for lang in all_languages: - lang = format_sublanguage_name(lang, language_metadata) - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang - if language_dir.is_dir(): - dt_path = language_dir / data_type - if dt_path.exists(): - available_languages.append(lang) - - available_languages.sort() - table_header = f"Available languages: {data_type}" - table_line_length = max( - len(table_header), max(len(lang) for lang in available_languages) - ) + all_languages = list_languages_with_metadata_for_data_type(language_metadata) + # Set column widths for consistent formatting + language_col_width = max(len(lang["name"]) for lang in all_languages) + 2 + iso_col_width = max(len(lang["iso"]) for lang in all_languages) + 2 + qid_col_width = max(len(lang["qid"]) for lang in all_languages) + 2 + + table_line_length = language_col_width + iso_col_width + qid_col_width + # Print table header print() - print(table_header) + print( + f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" + ) print("-" * table_line_length) - for lang in available_languages: - print(f"{lang}") + # Iterate through the list of languages and format each row + for lang in all_languages: + print( + f"{lang['name'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}" + ) print("-" * table_line_length) print() diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 3c2007640..c7f64e0c6 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -546,3 +546,36 @@ def list_all_languages(language_metadata=_languages): current_languages.append(lang_key) return sorted(current_languages) + + +def list_languages_with_metadata_for_data_type(language_metadata=_languages): + """ + Returns a sorted list of languages and their metadata (name, iso, qid) for a specific data type. 
+ The list includes sub-languages where applicable. + """ + current_languages = [] + + # Iterate through the language metadata. + for lang_key, lang_data in language_metadata.items(): + # Check if there are sub-languages. + if "sub_languages" in lang_data: + # Add the sub-languages to current_languages with metadata. + for sub_key, sub_data in lang_data["sub_languages"].items(): + current_languages.append( + { + "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}", + "iso": sub_data.get("iso", ""), + "qid": sub_data.get("qid", ""), + } + ) + else: + # If no sub-languages, add the main language with metadata. + current_languages.append( + { + "name": lang_data.get("name", lang_key), + "iso": lang_data.get("iso", ""), + "qid": lang_data.get("qid", ""), + } + ) + + return sorted(current_languages, key=lambda x: x["name"]) diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 8f6d1b86e..a15ec5c90 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -181,51 +181,52 @@ def test_list_languages_for_data_type_valid(self, mock_print): list_languages_for_data_type("nouns") expected_calls = [ call(), - call("Language ISO QID "), - call("--------------------------"), - call("Arabic ar Q13955 "), - call("Basque eu Q8752 "), - call("Bengali bn Q9610 "), - call("Bokmål nb Q25167 "), - call("Czech cs Q9056 "), - call("Danish da Q9035 "), - call("English en Q1860 "), - call("Esperanto eo Q143 "), - call("Estonian et Q9072 "), - call("Finnish fi Q1412 "), - call("French fr Q150 "), - call("German de Q188 "), - call("Greek el Q36510 "), - call("Gurmukhi pa Q58635 "), - call("Hausa ha Q56475 "), - call("Hebrew he Q9288 "), - call("Hindi hi Q11051 "), - call("Indonesian id Q9240 "), - call("Italian it Q652 "), - call("Japanese ja Q5287 "), - call("Kurmanji kmr Q36163 "), - call("Latin la Q397 "), - call("Malay ms Q9237 "), - call("Malayalam ml Q36236 "), - call("Mandarin zh Q727694 "), - call("Nigerian pi Q33655 "), - call("Nynorsk nn 
Q25164 "), - call("Polish pl Q809 "), - call("Portuguese pt Q5146 "), - call("Russian ru Q7737 "), - call("Shahmukhi pnb Q58635 "), - call("Slovak sk Q9058 "), - call("Spanish es Q1321 "), - call("Swahili sw Q7838 "), - call("Swedish sv Q9027 "), - call("Tajik tg Q9260 "), - call("Tamil ta Q5885 "), - call("Ukrainian ua Q8798 "), - call("Urdu ur Q11051 "), - call("Yoruba yo Q34311 "), - call("--------------------------"), + call("Language ISO QID "), + call("---------------------------------"), + call("Arabic ar Q13955 "), + call("Basque eu Q8752 "), + call("Bengali bn Q9610 "), + call("Chinese/mandarin zh Q727694 "), + call("Czech cs Q9056 "), + call("Danish da Q9035 "), + call("English en Q1860 "), + call("Esperanto eo Q143 "), + call("Estonian et Q9072 "), + call("Finnish fi Q1412 "), + call("French fr Q150 "), + call("German de Q188 "), + call("Greek el Q36510 "), + call("Hausa ha Q56475 "), + call("Hebrew he Q9288 "), + call("Hindustani/hindi hi Q11051 "), + call("Hindustani/urdu ur Q11051 "), + call("Indonesian id Q9240 "), + call("Italian it Q652 "), + call("Japanese ja Q5287 "), + call("Kurmanji kmr Q36163 "), + call("Latin la Q397 "), + call("Malay ms Q9237 "), + call("Malayalam ml Q36236 "), + call("Norwegian/bokmål nb Q25167 "), + call("Norwegian/nynorsk nn Q25164 "), + call("Pidgin/nigerian pi Q33655 "), + call("Polish pl Q809 "), + call("Portuguese pt Q5146 "), + call("Punjabi/gurmukhi pa Q58635 "), + call("Punjabi/shahmukhi pnb Q58635 "), + call("Russian ru Q7737 "), + call("Slovak sk Q9058 "), + call("Spanish es Q1321 "), + call("Swahili sw Q7838 "), + call("Swedish sv Q9027 "), + call("Tajik tg Q9260 "), + call("Tamil ta Q5885 "), + call("Ukrainian ua Q8798 "), + call("Yoruba yo Q34311 "), + call("---------------------------------"), call(), ] + mock_print.assert_has_calls(expected_calls) @patch("scribe_data.cli.list.list_languages") From c8214ffb4c25e73d5dac36801bb64a3f5e45b5d6 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Sat, 19 Oct 2024 
11:51:45 +0100 Subject: [PATCH 60/63] fix failing workflow: add languages to workflow and update failing test cases. --- src/scribe_data/check/check_query_identifiers.py | 8 ++------ src/scribe_data/resources/language_metadata.json | 8 ++++++++ tests/cli/test_list.py | 4 ++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index 4a984be65..14c151267 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -31,10 +31,6 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: FileNotFoundError If the specified file does not exist. - Example - ------- - > extract_qid_from_sparql(Path("path/to/query.sparql"), r"\?lexeme dct:language wd:Q\d+") - 'Q123456' """ try: with open(file_path, "r", encoding="utf-8") as file: @@ -155,5 +151,5 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: return data_type_qid == expected_data_type_qid -if __name__ == "__main__": - check_queries() +# if __name__ == "__main__": +check_queries() diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index 7ab2145bf..7c6840457 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -75,6 +75,10 @@ } } }, + "igbo":{ + "iso":"ig", + "qid": "Q33578" + }, "indonesian": { "iso": "id", "qid": "Q9240" @@ -87,6 +91,10 @@ "iso": "ja", "qid": "Q5287" }, + "korean":{ + "iso":"ko", + "qid":"Q9176" + }, "kurmanji": { "iso": "kmr", "qid": "Q36163" diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index a15ec5c90..fc607dec3 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -58,9 +58,11 @@ def test_list_languages(self, mock_print): call("Hausa ha Q56475 "), call("Hebrew he Q9288 "), call("Hindi hi Q11051 "), + call("Igbo ig Q33578 "), call("Indonesian id 
Q9240 "), call("Italian it Q652 "), call("Japanese ja Q5287 "), + call("Korean ko Q9176 "), call("Kurmanji kmr Q36163 "), call("Latin la Q397 "), call("Malay ms Q9237 "), @@ -200,9 +202,11 @@ def test_list_languages_for_data_type_valid(self, mock_print): call("Hebrew he Q9288 "), call("Hindustani/hindi hi Q11051 "), call("Hindustani/urdu ur Q11051 "), + call("Igbo ig Q33578 "), call("Indonesian id Q9240 "), call("Italian it Q652 "), call("Japanese ja Q5287 "), + call("Korean ko Q9176 "), call("Kurmanji kmr Q36163 "), call("Latin la Q397 "), call("Malay ms Q9237 "), From 6517ffe31ede0898c9f095b05080ddf05cf8e099 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Sat, 19 Oct 2024 11:59:16 +0100 Subject: [PATCH 61/63] fix failing tests --- tests/load/test_update_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index 43eaa2038..71c0daa78 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -149,9 +149,11 @@ def test_list_all_languages(): "hausa", "hebrew", "hindi", + "igbo", "indonesian", "italian", "japanese", + "korean", "kurmanji", "latin", "malay", From 8586625541e799864797619b4e97b238f21a9ecc Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 19 Oct 2024 16:49:13 +0200 Subject: [PATCH 62/63] Add Latvian to language metadata file --- src/scribe_data/resources/language_metadata.json | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index b6320f835..088cd7552 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -95,9 +95,9 @@ "iso": "ja", "qid": "Q5287" }, - "korean":{ - "iso":"ko", - "qid":"Q9176" + "korean": { + "iso": "ko", + "qid": "Q9176" }, "kurmanji": { "iso": "kmr", @@ -107,6 +107,10 @@ "iso": "la", "qid": "Q397" }, + "latvian": { + "iso": 
"lv", + "qid": "Q9078" + }, "malay": { "iso": "ms", "qid": "Q9237" From a975a6bd59640f24e79930b6a92f979651b0ddd6 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 19 Oct 2024 16:54:17 +0200 Subject: [PATCH 63/63] Add spacing and Latvian to testing --- .../check/check_query_identifiers.py | 23 +++++++++---------- src/scribe_data/cli/cli_utils.py | 2 ++ src/scribe_data/cli/list.py | 7 +++--- tests/load/test_update_utils.py | 1 + 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index a0364e261..754827165 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -41,22 +41,21 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract the QID. + file_path : Path + The path to the SPARQL query file from which to extract the QID. - pattern : str - The regex pattern used to match the QID (either for language or data type). + pattern : str + The regex pattern used to match the QID (either for language or data type). Returns ------- - str - The extracted QID if found, otherwise None. + str + The extracted QID if found, otherwise None. Raises ------ - FileNotFoundError - If the specified file does not exist. - + FileNotFoundError + If the specified file does not exist. """ try: with open(file_path, "r", encoding="utf-8") as file: @@ -104,7 +103,7 @@ def check_queries() -> None: for file in incorrect_data_types: print(f"- {file}") - # Exit with an error code if any incorrect QIDs are found + # Exit with an error code if any incorrect QIDs are found. 
if incorrect_languages or incorrect_data_types: sys.exit(1) @@ -177,5 +176,5 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: return data_type_qid == expected_data_type_qid -# if __name__ == "__main__": -check_queries() +if __name__ == "__main__": + check_queries() diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index ddc9731a5..4bfbb58c6 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -84,6 +84,8 @@ # MARK: Correct Inputs + + def correct_data_type(data_type: str) -> str: """ Corrects common versions of data type arguments so users can choose between them. diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index eca602b06..8dd912b7a 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -134,21 +134,22 @@ def list_languages_for_data_type(data_type: str) -> None: """ data_type = correct_data_type(data_type=data_type) all_languages = list_languages_with_metadata_for_data_type(language_metadata) - # Set column widths for consistent formatting + + # Set column widths for consistent formatting. language_col_width = max(len(lang["name"]) for lang in all_languages) + 2 iso_col_width = max(len(lang["iso"]) for lang in all_languages) + 2 qid_col_width = max(len(lang["qid"]) for lang in all_languages) + 2 table_line_length = language_col_width + iso_col_width + qid_col_width - # Print table header + # Print table header. print() print( f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" ) print("-" * table_line_length) - # Iterate through the list of languages and format each row + # Iterate through the list of languages and format each row. 
for lang in all_languages: print( f"{lang['name'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}" diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index 3f4599475..6f232846d 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -157,6 +157,7 @@ def test_list_all_languages(): "korean", "kurmanji", "latin", + "latvian", "malay", "malayalam", "mandarin",