diff --git a/.gitignore b/.gitignore index 87f4d25..be8ee48 100644 --- a/.gitignore +++ b/.gitignore @@ -17,12 +17,14 @@ docs/scripts # This is a symbolic link to data/ to allow local preview of the site. docs/data -scripts/data-external/* -!scripts/data-external/.gitkeep -!scripts/data-external/iso15924__sample.csv -!scripts/data-external/iso15924__sample.txt -!scripts/data-external/iso-639-3__sample.csv -!scripts/data-external/iso-639-3__sample.tab +# scripts/data-external/* +# !scripts/data-external/.gitkeep +# !scripts/data-external/iso15924__sample.csv +# !scripts/data-external/iso15924__sample.txt +# !scripts/data-external/iso-639-3__sample.csv +# !scripts/data-external/iso-639-3__sample.tab +# !scripts/data-external/cldr/aliases__sample.json +# !scripts/data-external/cldr/likelySubtags__sample.json # # temp # data/original/terminology/tico-19-terminology-facebook.csv diff --git a/docs/eng-Latn/index.adoc b/docs/eng-Latn/index.adoc index 77d016d..6aeb5ac 100644 --- a/docs/eng-Latn/index.adoc +++ b/docs/eng-Latn/index.adoc @@ -1,4 +1,4 @@ -= TICO-19 in HXLTM (draft) += TICO-19 in HXLTM (public working draft) // EticaAI, Collaborators_of ; Rocha, Emerson :toc: 1 :toclevels: 4 @@ -36,6 +36,11 @@ TODO: Public domain datasets of the https://tico-19.github.io[Translation Initiative for COVID-19] on the format HXLTM (Multilingual Terminology in Humanitarian Language Exchange). 
+== Versions +- Online: https://tico-19-hxltm.etica.ai/ +- PDF: https://tico-19-hxltm.etica.ai/tico-19-hxltm_eng-Latn.pdf[] +- Ebook EPUB: https://tico-19-hxltm.etica.ai/tico-19-hxltm_eng-Latn.epub[] + == Tables [%header,format=csv] @@ -46,6 +51,7 @@ include::../scripts/data-info/tico19_tm.csv[] == Quick explanations == Original data + minor changes +//// === `data/original/terminology/google/` and `data/original/tico-19-terminology-google.csv` @@ -54,7 +60,7 @@ include::../scripts/data-info/tico19_tm.csv[] === `data/original/terminology/facebook/` and `data/original/tico-19-terminology-google.csv` (...) - +//// == Appendix diff --git a/docs/robots.txt b/docs/robots.txt new file mode 100644 index 0000000..f8115f1 --- /dev/null +++ b/docs/robots.txt @@ -0,0 +1,5 @@ +User-agent: * + +# These files are redundant. No need to index +Disallow: /scripts/data-external/ +Disallow: /scripts/data-info/ \ No newline at end of file diff --git a/scripts/_run-all-data-scripts.sh b/scripts/_run-all-data-scripts.sh index 4a7eb29..3805be9 100755 --- a/scripts/_run-all-data-scripts.sh +++ b/scripts/_run-all-data-scripts.sh @@ -56,4 +56,8 @@ set -e # - tables render bad on ebooks, convert then to image and put ifelse # on .adoc # - https://stackoverflow.com/questions/26357137/csv-to-image-in-python -# - https://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image \ No newline at end of file +# - https://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image +# - https://github.com/datasets/language-codes/blob/master/ietf-lanGen.php +# - https://github.com/unicode-org/cldr-json/blob/main/cldr-json/cldr-core/defaultContent.json +# - Territory +# - https://github.com/unicode-org/cldr-json/blob/main/cldr-json/cldr-core/supplemental/territoryInfo.json \ No newline at end of file diff --git a/scripts/data-external-prepare.sh b/scripts/data-external-prepare.sh index cdb4702..7f44982 100755 --- a/scripts/data-external-prepare.sh +++ 
b/scripts/data-external-prepare.sh @@ -30,8 +30,8 @@ PWD_NOW=$(pwd) if [ ! -f scripts/data-external/iso-639-3.tab ]; then curl https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab --output scripts/data-external/iso-639-3.tab mlr --itsv --ocsv cat scripts/data-external/iso-639-3.tab > scripts/data-external/iso-639-3.csv - head scripts/data-external/iso-639-3.csv > scripts/data-external/iso-639-3__sample.csv - head scripts/data-external/iso-639-3.tab > scripts/data-external/iso-639-3__sample.tab + head scripts/data-external/iso-639-3.csv > scripts/data-external/iso-639-3.sample.csv + head scripts/data-external/iso-639-3.tab > scripts/data-external/iso-639-3.sample.tab fi if [ ! -f scripts/data-external/iso15924.txt ]; then curl https://www.unicode.org/iso15924/iso15924.txt --output scripts/data-external/iso15924.txt @@ -43,10 +43,22 @@ if [ ! -f scripts/data-external/iso15924.txt ]; then scripts/data-external/iso15924_no-comments.tsv \ > scripts/data-external/iso15924.csv - head scripts/data-external/iso15924.csv > scripts/data-external/iso15924__sample.csv - head scripts/data-external/iso15924.txt > scripts/data-external/iso15924__sample.txt + head scripts/data-external/iso15924.csv > scripts/data-external/iso15924.sample.csv + head scripts/data-external/iso15924.txt > scripts/data-external/iso15924.sample.txt fi +if [ ! -f scripts/data-external/cldr/likelySubtags.json ]; then + curl https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-core/supplemental/likelySubtags.json --output scripts/data-external/cldr/likelySubtags.json + head -n 15 scripts/data-external/cldr/likelySubtags.json > scripts/data-external/cldr/likelySubtags.sample.json +fi + +if [ ! 
-f scripts/data-external/cldr/aliases.json ]; then + curl https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-core/supplemental/aliases.json --output scripts/data-external/cldr/aliases.json + head -n 15 scripts/data-external/cldr/aliases.json > scripts/data-external/cldr/aliases.sample.json +fi + +# https://github.com/unicode-org/cldr-json/blob/main/cldr-json/cldr-core/scriptMetadata.json + # mlr --irs '|' --implicit-csv-header cat scripts/data-external/iso15924_no-comments-pipe.txt # mlr --itsv --ocsv cat scripts/data-external/iso15924_no-comments.tsv diff --git a/scripts/data-external/.gitkeep b/scripts/data-external/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/data-external/README.adoc b/scripts/data-external/README.adoc new file mode 100644 index 0000000..eaa032b --- /dev/null +++ b/scripts/data-external/README.adoc @@ -0,0 +1,7 @@ +== scripts/data-external/README.adoc + +Data here is generated using the link:../data-external-prepare.sh[]. +The versions without `.sample` are the _de facto_ used for several other internal scripts. +It's recommended (if part of this code is reused by others) to re-download the files. 
+ +The real data, without `.sample` in the file name, is not committed to the repository (both for copyright reasons and to avoid redundancy). \ No newline at end of file diff --git a/scripts/data-external/cldr/aliases.sample.json b/scripts/data-external/cldr/aliases.sample.json new file mode 100644 index 0000000..cf060ec --- /dev/null +++ b/scripts/data-external/cldr/aliases.sample.json @@ -0,0 +1,15 @@ +{ + "supplemental": { + "version": { + "_unicodeVersion": "14.0.0", + "_cldrVersion": "40" + }, + "likelySubtags": { + "aa": "aa-Latn-ET", + "aai": "aai-Latn-ZZ", + "aak": "aak-Latn-ZZ", + "aau": "aau-Latn-ZZ", + "ab": "ab-Cyrl-GE", + "abi": "abi-Latn-ZZ", + "abq": "abq-Cyrl-ZZ", + "abr": "abr-Latn-GH", diff --git a/scripts/data-external/cldr/likelySubtags.sample.json b/scripts/data-external/cldr/likelySubtags.sample.json new file mode 100644 index 0000000..cf060ec --- /dev/null +++ b/scripts/data-external/cldr/likelySubtags.sample.json @@ -0,0 +1,15 @@ +{ + "supplemental": { + "version": { + "_unicodeVersion": "14.0.0", + "_cldrVersion": "40" + }, + "likelySubtags": { + "aa": "aa-Latn-ET", + "aai": "aai-Latn-ZZ", + "aak": "aak-Latn-ZZ", + "aau": "aau-Latn-ZZ", + "ab": "ab-Cyrl-GE", + "abi": "abi-Latn-ZZ", + "abq": "abq-Cyrl-ZZ", + "abr": "abr-Latn-GH", diff --git a/scripts/data-external/iso-639-3__sample.csv b/scripts/data-external/iso-639-3.sample.csv similarity index 100% rename from scripts/data-external/iso-639-3__sample.csv rename to scripts/data-external/iso-639-3.sample.csv diff --git a/scripts/data-external/iso-639-3__sample.tab b/scripts/data-external/iso-639-3.sample.tab similarity index 100% rename from scripts/data-external/iso-639-3__sample.tab rename to scripts/data-external/iso-639-3.sample.tab diff --git a/scripts/data-external/iso15924__sample.csv b/scripts/data-external/iso15924.sample.csv similarity index 100% rename from scripts/data-external/iso15924__sample.csv rename to scripts/data-external/iso15924.sample.csv diff --git 
#!/usr/bin/python3
# ==============================================================================
#
#          FILE:  cldr_cli.py
#
#         USAGE:  ./scripts/fn/cldr_cli.py [command] [parameters]
#                 ./scripts/fn/cldr_cli.py --help
#
#   DESCRIPTION:  CLDR aliases via command line. It will return the JSON part
#                 of each file if exists.
#
#       OPTIONS:  ---
#
#  REQUIREMENTS:  - python3
#          BUGS:  ---
#         NOTES:  ---
#       AUTHORS:  Emerson Rocha
# COLLABORATORS:  <@TODO: put additional non-anonymous names here>
#       COMPANY:  EticaAI
#       LICENSE:  Public Domain dedication OR Zero-Clause BSD
#                 SPDX-License-Identifier: Unlicense OR 0BSD
#       VERSION:  v1.0
#       CREATED:  2021-11-20 10:37 UTC
# ==============================================================================

import sys
import os
import json


# /workspace/git/EticaAI/tico-19-hxltm/scripts/data-external/cldr
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py languageAlias por
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py territoryAlias 076
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py likelySubtags por

# Where the CLDR supplemental JSON files can be downloaded from. Only used to
# build the "Try <url>" hint when a local file is missing.
CLDR_SUPPLEMENTAL_URL = (
    'https://raw.githubusercontent.com/unicode-org/'
    'cldr-json/main/cldr-json/cldr-core/supplemental/'
)

# Files that must exist inside CLDR_BASE, checked in this order.
CLDR_FILES = ('aliases.json', 'likelySubtags.json')

# command -> (file inside CLDR_BASE, chain of keys leading to the lookup table)
COMMANDS = {
    'languageAlias': (
        'aliases.json',
        ('supplemental', 'metadata', 'alias', 'languageAlias'),
    ),
    'territoryAlias': (
        'aliases.json',
        ('supplemental', 'metadata', 'alias', 'territoryAlias'),
    ),
    'likelySubtags': (
        'likelySubtags.json',
        ('supplemental', 'likelySubtags'),
    ),
}

# Values of CLDR_CLI_DEBUG (case-insensitive) that keep debug output off.
DEBUG_OFF_VALUES = frozenset(('', '0', 'false', 'no', 'off'))


def is_debug_enabled(value):
    """Return True when a CLDR_CLI_DEBUG value asks for debug output.

    ``bool('0')`` is True (any non-empty string is truthy), so the flag must
    be compared against explicit "off" spellings instead of cast to bool.

    :param value: raw environment value, or None when the variable is unset.
    :returns: False for None, '', '0', 'false', 'no', 'off'; True otherwise.
    """
    if value is None:
        return False
    return value.strip().lower() not in DEBUG_OFF_VALUES


def print_usage(program):
    """Print the help text to stdout, using *program* as the command name."""
    lines = [
        'usage: ' + program + ' [command] [parameters]',
        'example: ',
        ' ' + program + ' languageAlias zzz',
        ' ' + program + ' languageAlias por',
        '',
        ' ' + program + ' territoryAlias zzz',
        ' ' + program + ' territoryAlias 076',
        '',
        ' ' + program + ' likelySubtags zz',
        ' ' + program + ' likelySubtags pt',
        '',
        ' CLDR_CLI_DEBUG=1 ' + program + ' [command] [parameters]',
        '',
        'NOTE: ',
        ' CLDR_BASE, environment variable with path to the ' +
        'CLDR JSON files,\n' +
        ' must be already defined to run this script. For example: \n',
        ' CLDR_BASE="~/Downloads/cldr/" ' + program + ' languageAlias por',
    ]
    print('\n'.join(lines))


def load_table(path, key_path):
    """Load the JSON file at *path* and return the dict found under *key_path*.

    :param path: path of a CLDR JSON file.
    :param key_path: sequence of keys walked from the document root.
    :returns: the dict stored at the end of *key_path*.
    :raises ValueError: the file is not valid JSON / not valid UTF-8.
    :raises KeyError: one of the keys in *key_path* is missing.
    :raises TypeError: a node on the way (or the result) is not a dict.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        node = json.load(handle)
    for name in key_path:
        node = node[name]
    if not isinstance(node, dict):
        raise TypeError('not a JSON object')
    return node


def main(argv=None, environ=None):
    """Run the command line interface.

    :param argv: argument list including the program name (default sys.argv).
    :param environ: mapping with CLDR_BASE and optionally CLDR_CLI_DEBUG
        (default os.environ).
    :returns: 0 on success (including "not found"), or an error message
        string. The value is meant to be handed to ``sys.exit``, which prints
        a string to stderr and exits with status 1.
    """
    argv = sys.argv if argv is None else argv
    environ = os.environ if environ is None else environ
    program = argv[0] if argv else 'cldr_cli.py'

    if len(argv) < 2 or argv[1] in ('-h', '--help'):
        print_usage(program)
        return 0

    if 'CLDR_BASE' not in environ:
        return 'ERROR! CLDR_BASE undefined. See ' + program + ' --help'

    # A quoted "~/..." (as in the usage example) is not expanded by the
    # shell, so expand it here.
    base = os.path.expanduser(environ['CLDR_BASE'])

    # Both files are required up front, whatever the command is.
    paths = {}
    for file_name in CLDR_FILES:
        paths[file_name] = os.path.join(base, file_name)
        if not os.path.exists(paths[file_name]):
            return ('ERROR! No [' + paths[file_name] +
                    ']. Try ' + CLDR_SUPPLEMENTAL_URL + file_name)

    command = argv[1]
    if command not in COMMANDS:
        return ('unknow command [' + command +
                '] . See ' + program + ' --help')

    if len(argv) < 3:
        return ('ERROR! Missing parameter for [' + command +
                ']. See ' + program + ' --help')
    key = argv[2]

    file_name, key_path = COMMANDS[command]
    path = paths[file_name]
    try:
        table = load_table(path, key_path)
    except ValueError as err:
        return 'ERROR! Invalid JSON on [' + path + ']: ' + str(err)
    except (KeyError, TypeError):
        return ('ERROR! No [' + '/'.join(key_path) +
                '] section on [' + path + ']')

    if key in table:
        # NOTE(review): object values are printed with str(), i.e. as a Python
        # repr with single quotes, not strict JSON. Kept unchanged because
        # callers may already parse this output; confirm before switching to
        # json.dumps().
        print(str(table[key]))
    elif is_debug_enabled(environ.get('CLDR_CLI_DEBUG')):
        # json.dumps keeps the message valid JSON even if the key or the path
        # contains quotes or backslashes.
        print(json.dumps(
            {'msg': 'Not found [' + key + '] on [' + path + ']'},
            ensure_ascii=False))
    return 0


if __name__ == '__main__':
    sys.exit(main())