-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
197 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
= TICO-19 in HXLTM (draft) | ||
= TICO-19 in HXLTM (public working draft) | ||
// EticaAI, Collaborators_of <[email protected]>; Rocha, Emerson <[email protected]> | ||
:toc: 1 | ||
:toclevels: 4 | ||
|
@@ -36,6 +36,11 @@ TODO: | |
Public domain datasets of the https://tico-19.github.io[Translation Initiative for COVID-19] in the HXLTM format (Multilingual Terminology in Humanitarian Language Exchange). | ||
== Versions | ||
- Online: https://tico-19-hxltm.etica.ai/ | ||
- PDF: https://tico-19-hxltm.etica.ai/tico-19-hxltm_eng-Latn.pdf[] | ||
- Ebook EPUB: https://tico-19-hxltm.etica.ai/tico-19-hxltm_eng-Latn.epub[] | ||
== Tables | ||
[%header,format=csv] | ||
|
@@ -46,6 +51,7 @@ include::../scripts/data-info/tico19_tm.csv[] | |
== Quick explanations | ||
== Original data + minor changes | ||
//// | ||
|
||
=== `data/original/terminology/google/` and `data/original/tico-19-terminology-google.csv` | ||
|
||
|
@@ -54,7 +60,7 @@ include::../scripts/data-info/tico19_tm.csv[] | |
=== `data/original/terminology/facebook/` and `data/original/tico-19-terminology-facebook.csv` | ||
|
||
(...) | ||
//// | ||
== Appendix | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
User-agent: * | ||
|
||
# These files are redundant. No need to index | ||
Disallow: /scripts/data-external/ | ||
Disallow: /scripts/data-info/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
== scripts/data-external/README.adoc | ||
|
||
Data here is generated using link:../data-external-prepare.sh[]. | ||
The versions without `.sample` are the ones actually used by several other internal scripts. | ||
It's recommended (if part of this code is reused by others) to re-download the files. | ||
|
||
The real data, without `.sample` in the file name, is not committed to the repository (both for copyright reasons and to avoid redundancy). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"supplemental": { | ||
"version": { | ||
"_unicodeVersion": "14.0.0", | ||
"_cldrVersion": "40" | ||
}, | ||
"likelySubtags": { | ||
"aa": "aa-Latn-ET", | ||
"aai": "aai-Latn-ZZ", | ||
"aak": "aak-Latn-ZZ", | ||
"aau": "aau-Latn-ZZ", | ||
"ab": "ab-Cyrl-GE", | ||
"abi": "abi-Latn-ZZ", | ||
"abq": "abq-Cyrl-ZZ", | ||
"abr": "abr-Latn-GH", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"supplemental": { | ||
"version": { | ||
"_unicodeVersion": "14.0.0", | ||
"_cldrVersion": "40" | ||
}, | ||
"likelySubtags": { | ||
"aa": "aa-Latn-ET", | ||
"aai": "aai-Latn-ZZ", | ||
"aak": "aak-Latn-ZZ", | ||
"aau": "aau-Latn-ZZ", | ||
"ab": "ab-Cyrl-GE", | ||
"abi": "abi-Latn-ZZ", | ||
"abq": "abq-Cyrl-ZZ", | ||
"abr": "abr-Latn-GH", |
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
#!/usr/bin/python3 | ||
# ============================================================================== | ||
# | ||
# FILE: cldr_cli.py | ||
# | ||
# USAGE: ./scripts/fn/cldr_cli.py | ||
# | ||
# DESCRIPTION: CLDR aliases via command line. It will return the JSON part | ||
# of each file if it exists. | ||
# | ||
# OPTIONS: --- | ||
# | ||
# REQUIREMENTS: - python3 | ||
# BUGS: --- | ||
# NOTES: --- | ||
# AUTHORS: Emerson Rocha <rocha[at]ieee.org> | ||
# COLLABORATORS: <@TODO: put additional non-anonymous names here> | ||
# COMPANY: EticaAI | ||
# LICENSE: Public Domain dedication OR Zero-Clause BSD | ||
# SPDX-License-Identifier: Unlicense OR 0BSD | ||
# VERSION: v1.0 | ||
# CREATED: 2021-11-20 10:37 UTC | ||
# ============================================================================== | ||
|
||
import sys | ||
import os | ||
import json | ||
|
||
|
||
# /workspace/git/EticaAI/tico-19-hxltm/scripts/data-external/cldr | ||
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py languageAlias por | ||
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py territoryAlias 076 | ||
# CLDR_BASE="scripts/data-external/cldr" CLDR_CLI_DEBUG=1 ./scripts/fn/cldr_cli.py likelySubtags por | ||
|
||
# Base URL of the raw CLDR JSON files; relative paths are appended to it when
# telling the user where a missing file can be downloaded from.
REPO_CLDR_JSON_BASE = 'https://raw.githubusercontent.com/unicode-org/' + \
    'cldr-json/main/cldr-json/'

# command name -> (file name inside CLDR_BASE, key path inside that JSON file)
_COMMANDS = {
    'languageAlias': (
        'aliases.json',
        ('supplemental', 'metadata', 'alias', 'languageAlias')),
    'territoryAlias': (
        'aliases.json',
        ('supplemental', 'metadata', 'alias', 'territoryAlias')),
    'likelySubtags': (
        'likelySubtags.json',
        ('supplemental', 'likelySubtags')),
}


def _env_flag(value):
    """Return True when an environment-variable string means "enabled".

    A plain ``bool()`` cannot be used here: ``bool('0')`` is True, which
    would turn debug output on even when it is explicitly set to 0.
    """
    return str(value).strip().lower() not in ('', '0', 'false', 'no', 'off')


def _print_help(program):
    """Print the usage text, using `program` as the command name."""
    print('usage: ' + program + ' [command] [parameters]')
    print('example: ')
    print(' ' + program + ' languageAlias zzz')
    print(' ' + program + ' languageAlias por')
    print('')
    print(' ' + program + ' territoryAlias zzz')
    print(' ' + program + ' territoryAlias 076')
    print('')
    print(' ' + program + ' likelySubtags zz')
    print(' ' + program + ' likelySubtags pt')
    print('')
    print(' CLDR_CLI_DEBUG=1 ' + program + ' [command] [parameters]')
    print('')
    print('NOTE: ')
    print(' CLDR_BASE, environment variable with path to the ' +
          'CLDR JSON files,\n' +
          ' must be already defined to run this script. For example: \n')
    print(' CLDR_BASE="~/Downloads/cldr/" ' +
          program + ' languageAlias por')


def _load_section(path, keys):
    """Load the JSON file at `path` and return the object under `keys`."""
    with open(path, 'r', encoding='utf-8') as _file:
        section = json.load(_file)
    for key in keys:
        section = section[key]
    return section


def main(argv=None, environ=None):
    """Run the CLI and return the value to hand to ``sys.exit()``.

    Parameters:
        argv: argument list shaped like ``sys.argv`` (program name first).
            Defaults to ``sys.argv``.
        environ: mapping read for ``CLDR_BASE`` and ``CLDR_CLI_DEBUG``.
            Defaults to ``os.environ``.

    Returns:
        ``None`` on success (including "entry not found"), or an error
        message string, which ``sys.exit()`` prints to stderr with exit
        status 1.
    """
    argv = sys.argv if argv is None else argv
    environ = os.environ if environ is None else environ
    program = argv[0] if argv else 'cldr_cli.py'

    if len(argv) < 2 or argv[1] in ('-h', '--help'):
        _print_help(program)
        return None

    if 'CLDR_BASE' not in environ:
        return 'ERROR! CLDR_BASE undefined. See ' + program + ' --help'

    is_debug = _env_flag(environ.get('CLDR_CLI_DEBUG', '0'))
    # The help text suggests CLDR_BASE="~/...". A quoted tilde is not
    # expanded by the shell, so expand it here.
    cldr_base = os.path.expanduser(environ['CLDR_BASE'])

    # Both data files are required up front, whichever command is requested.
    for file_name in ('aliases.json', 'likelySubtags.json'):
        file_path = os.path.join(cldr_base, file_name)
        if not os.path.exists(file_path):
            return ('ERROR! No [' + file_path + ']. Try ' +
                    REPO_CLDR_JSON_BASE + 'cldr-core/supplemental/' +
                    file_name)

    command = argv[1]
    if command not in _COMMANDS:
        return ('unknown command [' + command +
                '] . See ' + program + ' --help')

    if len(argv) < 3:
        # Without this guard argv[2] would raise IndexError.
        return ('ERROR! Missing parameter for [' + command +
                ']. See ' + program + ' --help')

    parameter = argv[2]
    file_name, keys = _COMMANDS[command]
    file_path = os.path.join(cldr_base, file_name)
    section = _load_section(file_path, keys)

    if parameter in section:
        # NOTE(review): str() prints a Python repr for dict values (single
        # quotes), not JSON. Kept as-is because callers may parse this
        # output; confirm before switching to json.dumps().
        print(str(section[parameter]))
    elif is_debug:
        # json.dumps keeps the message valid JSON even when the parameter
        # contains quotes or backslashes.
        print(json.dumps(
            {'msg': 'Not found [' + parameter + '] on [' + file_path + ']'},
            ensure_ascii=False))
    return None


if __name__ == '__main__':
    sys.exit(main())