From 90883235499d899ce1585ad9d74c9d4e269b538d Mon Sep 17 00:00:00 2001 From: froggleston Date: Tue, 17 Sep 2024 14:38:32 +0100 Subject: [PATCH] Add support for externally sorting terms --- .gitignore | 1 - _includes/glossary.html | 6 +- utils/sort-glossary.py | 308 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 312 insertions(+), 3 deletions(-) create mode 100644 utils/sort-glossary.py diff --git a/.gitignore b/.gitignore index b1b6c9f1..6ccf1ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,3 @@ assets/css/to_delete.css .DS_Store _config.yml _data/ -count.py diff --git a/_includes/glossary.html b/_includes/glossary.html index 51c412f6..66e0fb8e 100644 --- a/_includes/glossary.html +++ b/_includes/glossary.html @@ -1,5 +1,7 @@ -{% assign gloss = site.data.glossary %} {% assign language = page.permalink | replace: '/', '' %} + +{% assign gloss = site.data.[language].glossary %} + {% assign direction = 'ltr' %} {% if page.direction %} {% assign direction = page.direction %} @@ -13,7 +15,7 @@ - 'actual' is a list of slugs sorted by terms. {%- endcomment -%} {%- capture defined -%}{%- for item in gloss -%}{%- if item[language] -%}{{item[language].term | downcase}}IN_ITEM{{item.slug}}BETWEEN_ITEMS{%- endif -%}{%- endfor -%}{%- endcapture -%} -{%- assign sorted = defined | split: 'BETWEEN_ITEMS' | sort -%} +{%- assign sorted = defined | split: 'BETWEEN_ITEMS' -%} {%- capture ordered -%}{%- for item in sorted -%}{{item | split: 'IN_ITEM' | last}}BETWEEN_ITEMS{%- endfor -%}{%- endcapture -%} {%- assign actual = ordered | split: 'BETWEEN_ITEMS' -%} diff --git a/utils/sort-glossary.py b/utils/sort-glossary.py new file mode 100644 index 00000000..299490ce --- /dev/null +++ b/utils/sort-glossary.py @@ -0,0 +1,308 @@ +import pprint +import yaml + +from collections import OrderedDict +from pathlib import Path + +import icu + +languages = [ + ('aa', 'Afar'), + ('ab', 'Abkhazian'), + ('af', 'Afrikaans'), + ('ak', 'Akan'), + ('sq', 'Albanian'), + ('am', 'Amharic'), + ('ar', 'Arabic'), + ('an', 'Aragonese'), + ('hy', 'Armenian'), + ('as', 'Assamese'), + ('av', 'Avaric'), + ('ae', 'Avestan'), + ('ay', 'Aymara'), + ('az', 'Azerbaijani'), + ('ba', 'Bashkir'), + ('bm', 'Bambara'), + ('eu', 'Basque'), + ('be', 'Belarusian'), + ('bn', 'Bengali'), + ('bh', 'Bihari languages'), + ('bi', 'Bislama'), + ('bo', 'Tibetan'), + ('bs', 'Bosnian'), + ('br', 'Breton'), + ('bg', 'Bulgarian'), + ('my', 'Burmese'), + ('ca', 'Catalan; Valencian'), + ('cs', 'Czech'), + ('ch', 'Chamorro'), + ('ce', 'Chechen'), + ('zh', 'Chinese'), + ('cu', 'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic'), + ('cv', 'Chuvash'), + ('kw', 'Cornish'), + ('co', 'Corsican'), + ('cr', 'Cree'), + ('cy', 'Welsh'), + ('cs', 'Czech'), + ('da', 'Danish'), + ('de', 'German'), + ('dv', 'Divehi; Dhivehi; Maldivian'), + ('nl', 'Dutch; Flemish'), + ('dz', 'Dzongkha'), + ('el', 'Greek, Modern (1453-)'), + ('en', 'English'), + ('eo', 'Esperanto'), + ('et', 'Estonian'), + ('eu', 'Basque'), + ('ee', 'Ewe'), + ('fo', 'Faroese'), + ('fa', 'Persian'), + ('fj', 'Fijian'), + ('fi', 'Finnish'), + ('fr', 'French'), + ('fy', 'Western Frisian'), + ('ff', 'Fulah'), + ('Ga', 'Georgian'), + ('de', 'German'), + ('gd', 'Gaelic; Scottish Gaelic'), + ('ga', 'Irish'), + ('gl', 'Galician'), + ('gv', 'Manx'), + ('el', 'Greek, Modern (1453-)'), + ('gn', 'Guarani'), + ('gu', 'Gujarati'), + ('ht', 'Haitian; Haitian Creole'), + ('ha', 'Hausa'), + ('he', 'Hebrew'), + ('hz', 'Herero'), + ('hi', 'Hindi'), + ('ho', 'Hiri Motu'), + ('hr', 'Croatian'), + ('hu', 'Hungarian'), + ('hy', 'Armenian'), + ('ig', 'Igbo'), + ('is', 'Icelandic'), + ('io', 'Ido'), + ('ii', 'Sichuan Yi; Nuosu'), + ('iu', 'Inuktitut'), + ('ie', 'Interlingue; Occidental'), + ('ia', 'Interlingua (International Auxiliary Language Association)'), + ('id', 'Indonesian'), + ('ik', 'Inupiaq'), + ('is', 'Icelandic'), + ('it', 'Italian'), + ('jv', 'Javanese'), + ('ja', 'Japanese'), + ('kl', 'Kalaallisut; Greenlandic'), + ('kn', 'Kannada'), + ('ks', 'Kashmiri'), + ('ka', 'Georgian'), + ('kr', 'Kanuri'), + ('kk', 'Kazakh'), + ('km', 'Central Khmer'), + ('ki', 'Kikuyu; Gikuyu'), + ('rw', 'Kinyarwanda'), + ('ky', 'Kirghiz; Kyrgyz'), + ('kv', 'Komi'), + ('kg', 'Kongo'), + ('ko', 'Korean'), + ('kj', 'Kuanyama; Kwanyama'), + ('ku', 'Kurdish'), + ('lo', 'Lao'), + ('la', 'Latin'), + ('lv', 'Latvian'), + ('li', 'Limburgan; Limburger; Limburgish'), + ('ln', 'Lingala'), + ('lt', 'Lithuanian'), + ('lb', 'Luxembourgish; Letzeburgesch'), + ('lu', 'Luba-Katanga'), + ('lg', 'Ganda'), + ('mk', 'Macedonian'), + ('mh', 'Marshallese'), + ('ml', 'Malayalam'), + ('mi', 'Maori'), + ('mr', 'Marathi'), + ('ms', 'Malay'), + ('Mi', 'Micmac'), + ('mk', 'Macedonian'), + ('mg', 'Malagasy'), + ('mt', 'Maltese'), + ('mn', 'Mongolian'), + ('mi', 'Maori'), + ('ms', 'Malay'), + ('my', 'Burmese'), + ('na', 'Nauru'), + ('nv', 'Navajo; Navaho'), + ('nr', 'Ndebele, South; South Ndebele'), + ('nd', 'Ndebele, North; North Ndebele'), + ('ng', 'Ndonga'), + ('ne', 'Nepali'), + ('nl', 'Dutch; Flemish'), + ('nn', 'Norwegian Nynorsk; Nynorsk, Norwegian'), + ('nb', 'Bokmål, Norwegian; Norwegian Bokmål'), + ('no', 'Norwegian'), + ('oc', 'Occitan (post 1500)'), + ('oj', 'Ojibwa'), + ('or', 'Oriya'), + ('om', 'Oromo'), + ('os', 'Ossetian; Ossetic'), + ('pa', 'Panjabi; Punjabi'), + ('fa', 'Persian'), + ('pi', 'Pali'), + ('pl', 'Polish'), + ('pt', 'Portuguese'), + ('ps', 'Pushto; Pashto'), + ('qu', 'Quechua'), + ('rm', 'Romansh'), + ('ro', 'Romanian; Moldavian; Moldovan'), + ('ro', 'Romanian; Moldavian; Moldovan'), + ('rn', 'Rundi'), + ('ru', 'Russian'), + ('sg', 'Sango'), + ('sa', 'Sanskrit'), + ('si', 'Sinhala; Sinhalese'), + ('sk', 'Slovak'), + ('sk', 'Slovak'), + ('sl', 'Slovenian'), + ('se', 'Northern Sami'), + ('sm', 'Samoan'), + ('sn', 'Shona'), + ('sd', 'Sindhi'), + ('so', 'Somali'), + ('st', 'Sotho, Southern'), + ('es', 'Spanish; Castilian'), + ('sq', 'Albanian'), + ('sc', 'Sardinian'), + ('sr', 'Serbian'), + ('ss', 'Swati'), + ('su', 'Sundanese'), + ('sw', 'Swahili'), + ('sv', 'Swedish'), + ('ty', 'Tahitian'), + ('ta', 'Tamil'), + ('tt', 'Tatar'), + ('te', 'Telugu'), + ('tg', 'Tajik'), + ('tl', 'Tagalog'), + ('th', 'Thai'), + ('bo', 'Tibetan'), + ('ti', 'Tigrinya'), + ('to', 'Tonga (Tonga Islands)'), + ('tn', 'Tswana'), + ('ts', 'Tsonga'), + ('tk', 'Turkmen'), + ('tr', 'Turkish'), + ('tw', 'Twi'), + ('ug', 'Uighur; Uyghur'), + ('uk', 'Ukrainian'), + ('ur', 'Urdu'), + ('uz', 'Uzbek'), + ('ve', 'Venda'), + ('vi', 'Vietnamese'), + ('vo', 'Volapük'), + ('cy', 'Welsh'), + ('wa', 'Walloon'), + ('wo', 'Wolof'), + ('xh', 'Xhosa'), + ('yi', 'Yiddish'), + ('yo', 'Yoruba'), + ('za', 'Zhuang; Chuang'), + ('zh', 'Chinese'), + ('zu', 'Zulu') +] + +def _sort_terms(count_dict): + # sort and reassign terms + for lang in count_dict: + # std_lang = standardize_tag(lang) + # print(f"{lang} -> {std_lang} -> {Language.get(std_lang).to_alpha3()}") + + # create a locale from the language code and sort the terms with a collator + icu_locale = icu.Locale(lang) + collator = icu.Collator.createInstance(icu_locale) + + # only create directories for languages with terms + if count_dict[lang]["count"] > 0: + lang_path = data_path.joinpath(lang) + lang_path.mkdir(parents=True, exist_ok=True) + + # sort + sorted_terms = sorted(count_dict[lang]["terms"], key=collator.getSortKey) + count_dict[lang]["sorted_terms"] = sorted_terms + return count_dict + +def _setup_dict(glossary): + count_dict = {} + lang_codes = [] + + for cc in languages: + count_dict[cc[0]] = {} + count_dict[cc[0]]["count"] = 0 + count_dict[cc[0]]["name"] = cc[1] + count_dict[cc[0]]["terms"] = [] + count_dict[cc[0]]["sorted_terms"] = [] + count_dict[cc[0]]["term_entry_map"] = {} + lang_codes.append(cc[0]) + + # total number of glossary terms + # print(len(glos)) + + for slug in glossary: + for lang in slug.keys(): + if lang in lang_codes: + count_dict[lang]["count"] += 1 + count_dict[lang]["terms"].append(slug[lang]["term"]) + count_dict[lang]["term_entry_map"][slug[lang]["term"]] = dict( + { + "slug": slug["slug"], + "def": slug[lang]["def"] + } + ) + return _sort_terms(count_dict) + +def _build_lang_glossary(count_dict): + glossary_by_lang = {} + for lang in count_dict: + sorted_glossary = [] + for sorted_term in count_dict[lang]["sorted_terms"]: + if sorted_term in count_dict[lang]["term_entry_map"]: + term_map = count_dict[lang]["term_entry_map"][sorted_term] + slug = term_map["slug"] + _def = term_map["def"] + + sorted_glossary.append(OrderedDict({ + "slug": slug, + lang: { + "term": sorted_term, + "def": _def + } + })) + if sorted_glossary: + glossary_by_lang[lang] = sorted_glossary + return glossary_by_lang + +def setup_yaml(): + """ https://stackoverflow.com/a/8661021 """ + def represent_dict_order(self, data): + return self.represent_mapping('tag:yaml.org,2002:map', data.items()) + yaml.add_representer(OrderedDict, represent_dict_order) + +# load main glossary file +glos = yaml.safe_load(Path('glossary.yml').read_text()) +data_path = Path("_data/") + +# sort terms +count_dict = _setup_dict(glos) + +# rebuild glossary per language +sorted_glossary_by_lang = _build_lang_glossary(count_dict) + +# setup yaml for outputting +setup_yaml() +for lang in sorted_glossary_by_lang: + pprint.pprint(sorted_glossary_by_lang[lang]) + yaml.dump(sorted_glossary_by_lang[lang], Path(f'_data/{lang}/glossary.yml').open('w')) + +# output counts +# pprint.pprint(count_dict)