Skip to content

Commit

Permalink
#2, #4: started cldr_cli.py
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed Nov 20, 2021
1 parent 6562df5 commit 360e780
Show file tree
Hide file tree
Showing 14 changed files with 197 additions and 13 deletions.
14 changes: 8 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@ docs/scripts
# This is a symbolic link to data/ to allow local preview of the site.
docs/data

scripts/data-external/*
!scripts/data-external/.gitkeep
!scripts/data-external/iso15924__sample.csv
!scripts/data-external/iso15924__sample.txt
!scripts/data-external/iso-639-3__sample.csv
!scripts/data-external/iso-639-3__sample.tab
# scripts/data-external/*
# !scripts/data-external/.gitkeep
# !scripts/data-external/iso15924__sample.csv
# !scripts/data-external/iso15924__sample.txt
# !scripts/data-external/iso-639-3__sample.csv
# !scripts/data-external/iso-639-3__sample.tab
# !scripts/data-external/cldr/aliases__sample.json
# !scripts/data-external/cldr/likelySubtags__sample.json

# # temp
# data/original/terminology/tico-19-terminology-facebook.csv
Expand Down
10 changes: 8 additions & 2 deletions docs/eng-Latn/index.adoc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
= TICO-19 in HXLTM (draft)
= TICO-19 in HXLTM (public working draft)
// EticaAI, Collaborators_of <[email protected]>; Rocha, Emerson <[email protected]>
:toc: 1
:toclevels: 4
Expand Down Expand Up @@ -36,6 +36,11 @@ TODO:
Public domain datasets of the https://tico-19.github.io[Translation Initiative for COVID-19] on the format HXLTM (Multilingual Terminology in Humanitarian Language Exchange).
== Versions
- Online: https://tico-19-hxltm.etica.ai/
- PDF: https://tico-19-hxltm.etica.ai/tico-19-hxltm_eng-Latn.pdf[]
- Ebook EPUB: https://tico-19-hxltm.etica.ai/tico-19-hxltm_eng-Latn.epub[]
== Tables
[%header,format=csv]
Expand All @@ -46,6 +51,7 @@ include::../scripts/data-info/tico19_tm.csv[]
== Quick explanations
== Original data + minor changes
////

=== `data/original/terminology/google/` and `data/original/tico-19-terminology-google.csv`

Expand All @@ -54,7 +60,7 @@ include::../scripts/data-info/tico19_tm.csv[]
=== `data/original/terminology/facebook/` and `data/original/tico-19-terminology-facebook.csv`

(...)
////
== Appendix
Expand Down
5 changes: 5 additions & 0 deletions docs/robots.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
User-agent: *

# These files are redundant. No need to index
Disallow: /scripts/data-external/
Disallow: /scripts/data-info/
6 changes: 5 additions & 1 deletion scripts/_run-all-data-scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,8 @@ set -e
# - tables render badly on ebooks; convert them to images and add an if/else
# on the .adoc
# - https://stackoverflow.com/questions/26357137/csv-to-image-in-python
# - https://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image
# - https://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image
# - https://github.com/datasets/language-codes/blob/master/ietf-lanGen.php
# - https://github.com/unicode-org/cldr-json/blob/main/cldr-json/cldr-core/defaultContent.json
# - Territory
# - https://github.com/unicode-org/cldr-json/blob/main/cldr-json/cldr-core/supplemental/territoryInfo.json
20 changes: 16 additions & 4 deletions scripts/data-external-prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ PWD_NOW=$(pwd)
if [ ! -f scripts/data-external/iso-639-3.tab ]; then
curl https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab --output scripts/data-external/iso-639-3.tab
mlr --itsv --ocsv cat scripts/data-external/iso-639-3.tab > scripts/data-external/iso-639-3.csv
head scripts/data-external/iso-639-3.csv > scripts/data-external/iso-639-3__sample.csv
head scripts/data-external/iso-639-3.tab > scripts/data-external/iso-639-3__sample.tab
head scripts/data-external/iso-639-3.csv > scripts/data-external/iso-639-3.sample.csv
head scripts/data-external/iso-639-3.tab > scripts/data-external/iso-639-3.sample.tab
fi
if [ ! -f scripts/data-external/iso15924.txt ]; then
curl https://www.unicode.org/iso15924/iso15924.txt --output scripts/data-external/iso15924.txt
Expand All @@ -43,10 +43,22 @@ if [ ! -f scripts/data-external/iso15924.txt ]; then
scripts/data-external/iso15924_no-comments.tsv \
> scripts/data-external/iso15924.csv

head scripts/data-external/iso15924.csv > scripts/data-external/iso15924__sample.csv
head scripts/data-external/iso15924.txt > scripts/data-external/iso15924__sample.txt
head scripts/data-external/iso15924.csv > scripts/data-external/iso15924.sample.csv
head scripts/data-external/iso15924.txt > scripts/data-external/iso15924.sample.txt
fi

# Download the CLDR likelySubtags table only when it is not cached locally,
# then keep its first 15 lines as a small sample file next to it.
if ! test -f scripts/data-external/cldr/likelySubtags.json; then
    curl --output scripts/data-external/cldr/likelySubtags.json \
        https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-core/supplemental/likelySubtags.json
    head -n 15 scripts/data-external/cldr/likelySubtags.json \
        > scripts/data-external/cldr/likelySubtags.sample.json
fi

# Download the CLDR aliases table only when it is not cached locally, then
# keep its first 15 lines as a small sample file next to it.
# NOTE: the sample must be cut from aliases.json itself (not from
# likelySubtags.json), otherwise aliases.sample.json holds unrelated data.
if [ ! -f scripts/data-external/cldr/aliases.json ]; then
    curl https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-core/supplemental/aliases.json --output scripts/data-external/cldr/aliases.json
    head -n 15 scripts/data-external/cldr/aliases.json > scripts/data-external/cldr/aliases.sample.json
fi

# https://github.com/unicode-org/cldr-json/blob/main/cldr-json/cldr-core/scriptMetadata.json

# mlr --irs '|' --implicit-csv-header cat scripts/data-external/iso15924_no-comments-pipe.txt

# mlr --itsv --ocsv cat scripts/data-external/iso15924_no-comments.tsv
Expand Down
Empty file removed scripts/data-external/.gitkeep
Empty file.
7 changes: 7 additions & 0 deletions scripts/data-external/README.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
== scripts/data-external/README.adoc

Data here is generated using the link:../data-external-prepare.sh[].
The versions without `.sample` in their names are the ones actually used by several other internal scripts.
It's recommended (if part of this code is reused by others) to re-download the files.

The real data, without the `.sample` infix in the file name, is not committed to the repository (both for copyright reasons and to avoid redundancy).
15 changes: 15 additions & 0 deletions scripts/data-external/cldr/aliases.sample.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"supplemental": {
"version": {
"_unicodeVersion": "14.0.0",
"_cldrVersion": "40"
},
"likelySubtags": {
"aa": "aa-Latn-ET",
"aai": "aai-Latn-ZZ",
"aak": "aak-Latn-ZZ",
"aau": "aau-Latn-ZZ",
"ab": "ab-Cyrl-GE",
"abi": "abi-Latn-ZZ",
"abq": "abq-Cyrl-ZZ",
"abr": "abr-Latn-GH",
15 changes: 15 additions & 0 deletions scripts/data-external/cldr/likelySubtags.sample.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"supplemental": {
"version": {
"_unicodeVersion": "14.0.0",
"_cldrVersion": "40"
},
"likelySubtags": {
"aa": "aa-Latn-ET",
"aai": "aai-Latn-ZZ",
"aak": "aak-Latn-ZZ",
"aau": "aau-Latn-ZZ",
"ab": "ab-Cyrl-GE",
"abi": "abi-Latn-ZZ",
"abq": "abq-Cyrl-ZZ",
"abr": "abr-Latn-GH",
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
118 changes: 118 additions & 0 deletions scripts/fn/cldr_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/python3
# ==============================================================================
#
# FILE: cldr_cli.py
#
# USAGE: ./scripts/fn/cldr_cli.py [command] [parameter]
#
# DESCRIPTION: Query CLDR aliases and likely subtags via command line. It
# prints the matching entry of the CLDR JSON files, if it exists.
#
# OPTIONS: ---
#
# REQUIREMENTS: - python3
# BUGS: ---
# NOTES: ---
# AUTHORS: Emerson Rocha <rocha[at]ieee.org>
# COLLABORATORS: <@TODO: put additional non-anonymous names here>
# COMPANY: EticaAI
# LICENSE: Public Domain dedication OR Zero-Clause BSD
# SPDX-License-Identifier: Unlicense OR 0BSD
# VERSION: v1.0
# CREATED: 2021-11-20 10:37 UTC
# ==============================================================================

import sys
import os
import json


# /workspace/git/EticaAI/tico-19-hxltm/scripts/data-external/cldr
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py languageAlias por
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py territoryAlias 076
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py likelySubtags por

def _env_flag(name):
    """Return True when environment variable *name* holds a truthy value.

    Unset, empty, '0', 'false', 'no' and 'off' (case-insensitive) are False;
    anything else is True. A plain ``bool()`` on the raw value cannot be used
    here because every non-empty string, including '0', is truthy.
    """
    value = os.environ.get(name, '')
    return value.strip().lower() not in ('', '0', 'false', 'no', 'off')


def _print_usage(program):
    """Print the command line help, using *program* as the script name."""
    print('usage: ' + program + ' [command] [parameters]')
    print('example: ')
    print(' ' + program + ' languageAlias zzz')
    print(' ' + program + ' languageAlias por')
    print('')
    print(' ' + program + ' territoryAlias zzz')
    print(' ' + program + ' territoryAlias 076')
    print('')
    print(' ' + program + ' likelySubtags zz')
    print(' ' + program + ' likelySubtags pt')
    print('')
    print(' CLDR_CLI_DEBUG=1 ' + program + ' [command] [parameters]')
    print('')
    print('NOTE: ')
    print(' CLDR_BASE, environment variable with path to the ' +
          'CLDR JSON files,\n' +
          ' must be already defined to run this script. For example: \n')
    print(' CLDR_BASE="~/Downloads/cldr/" ' +
          program + ' languageAlias por')


def _load_json(path):
    """Parse and return the JSON document stored at *path* (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as _file:
        return json.load(_file)


def _format_entry(value):
    """Return *value* as the text to print for a found entry.

    Plain strings (e.g. likelySubtags results) are returned unchanged; any
    other value (e.g. the alias objects) is serialized as real JSON instead
    of the Python repr, so the output can be parsed by other tools.
    """
    if isinstance(value, str):
        return value
    return json.dumps(value, ensure_ascii=False)


def main(argv):
    """Run the CLI and return the value to hand to ``sys.exit``.

    Args:
        argv: Full argument vector, ``argv[0]`` being the program name,
            ``argv[1]`` the command (languageAlias, territoryAlias or
            likelySubtags) and ``argv[2]`` the key to look up.

    Returns:
        None on success (exit status 0), or an error message string, which
        ``sys.exit`` prints to stderr before exiting with status 1.

    Environment:
        CLDR_BASE: directory holding aliases.json and likelySubtags.json.
        CLDR_CLI_DEBUG: when truthy, a JSON message is printed if the key
            is not found (otherwise nothing is printed).
    """
    program = argv[0] if argv else 'cldr_cli.py'

    if len(argv) < 2 or argv[1] in ('-h', '--help'):
        _print_usage(program)
        return None

    if 'CLDR_BASE' not in os.environ:
        return 'ERROR! CLDR_BASE undefined. See ' + program + ' --help'

    is_debug = _env_flag('CLDR_CLI_DEBUG')
    # expanduser: the help text suggests CLDR_BASE="~/...", and a quoted
    # tilde is not expanded by the shell.
    cldr_base = os.path.expanduser(os.environ['CLDR_BASE'])
    cldr_alias_path = os.path.join(cldr_base, 'aliases.json')
    cldr_likelySubtags_path = os.path.join(cldr_base, 'likelySubtags.json')
    # Base already ends with '.../main/cldr-json/'; only the path below it
    # is appended when building the download hints.
    repo_cldr_json_base = 'https://raw.githubusercontent.com/unicode-org/' + \
        'cldr-json/main/cldr-json/'

    if not os.path.exists(cldr_alias_path):
        return ('ERROR! No [' + cldr_alias_path +
                ']. Try ' + repo_cldr_json_base +
                'cldr-core/supplemental/aliases.json')
    if not os.path.exists(cldr_likelySubtags_path):
        return ('ERROR! No [' + cldr_likelySubtags_path +
                ']. Try ' + repo_cldr_json_base +
                'cldr-core/supplemental/likelySubtags.json')

    # command -> (file to read, chain of keys leading to the lookup table)
    sources = {
        'languageAlias': (
            cldr_alias_path,
            ('supplemental', 'metadata', 'alias', 'languageAlias')),
        'territoryAlias': (
            cldr_alias_path,
            ('supplemental', 'metadata', 'alias', 'territoryAlias')),
        'likelySubtags': (
            cldr_likelySubtags_path,
            ('supplemental', 'likelySubtags')),
    }

    command = argv[1]
    if command not in sources:
        return ('unknow command [' + command +
                '] . See ' + program + ' --help')

    if len(argv) < 3:
        # Report a usable error instead of failing with an IndexError.
        return ('ERROR! Missing parameter for [' + command +
                ']. See ' + program + ' --help')

    lookup_key = argv[2]
    path, key_chain = sources[command]
    table = _load_json(path)
    for part in key_chain:
        table = table[part]

    if lookup_key in table:
        print(_format_entry(table[lookup_key]))
    elif is_debug:
        # json.dumps keeps the message valid JSON even if the key or the
        # path contains quotes or backslashes.
        print(json.dumps(
            {'msg': 'Not found [' + lookup_key + '] on [' + path + ']'},
            ensure_ascii=False))
    return None


if __name__ == '__main__':
    sys.exit(main(sys.argv))

0 comments on commit 360e780

Please sign in to comment.