diff --git a/scripts/maintenance/unidata.py b/scripts/maintenance/unidata.py
new file mode 100644
index 0000000000..cf20dca746
--- /dev/null
+++ b/scripts/maintenance/unidata.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""Script to update :mod:`pywikibot.tools._unidata`.
+
+This script is for updating ``_first_upper_exception_dict``. Run this
+module multiple times using different Python versions.
+
+.. note:: It seems that running under the latest version of Python gives
+    a superset of the older versions and should be enough, but this is
+    not tested completely.
+"""
+#
+# (C) Pywikibot team, 2018-2023
+#
+# Distributed under the terms of the MIT license.
+#
+from sys import maxunicode
+from re import findall
+from json import dump, load
+from queue import Queue
+from threading import Thread
+
+from scripts.maintenance.wikimedia_sites import families_list
+from pywikibot.family import Family
+from pywikibot import Site
+from pywikibot.comms.http import session
+
+
+NUMBER_OF_THREADS = 26
+FILEPATH = '/data/firstup_excepts.json'
+
+
+def chars_uppers_wikilinks():
+    """Retrieve upper chars from MediaWiki using page titles."""
+    n = 0
+    chars = []
+    uppers = []
+    wikilinks = ''
+    for i in range(0, maxunicode + 1):
+        c = chr(i)
+        uc = c.upper()
+        if uc != c:
+            n += 1
+            chars.append(c)
+            uppers.append(uc)
+            # MediaWiki is first-letter case
+            wikilinks += '[[MediaWiki:' + c + ']]\n'
+    return chars, uppers, wikilinks
+
+
+def process_site(fam_name, site_code):
+    """Process titles for a single site."""
+    j = session.post(
+        f'https://{site_code}.{fam_name}.org/w/api.php?'
+        f'action=parse&contentmodel=wikitext&prop=text'
+        f'&format=json&utf8',
+        data={'text': wikilinks},
+        timeout=10,
+    ).json()
+    parsed_text = j['parse']['text']['*']
+    titles = findall(r'title="[^:]*:(.)', parsed_text)
+    site_excepts = {}
+    for i, original_char in enumerate(chars):
+        title_char = titles[i]
+        if uppers[i] != title_char:
+            site_excepts[original_char] = title_char
+    return site_excepts
+
+
+def threads_target(q):
+    """Thread processing a single site."""
+    global families_excepts
+    while True:
+        try:
+            fam, code = q.get()
+        except TypeError:  # non-iterable NoneType object
+            break
+        site_excepts = process_site(fam, code)
+        families_excepts[fam].setdefault(code, {}).update(site_excepts)
+        q.task_done()
+
+
+def spawn_threads(q):
+    """Prepare several threads."""
+    # TODO: use ThreadList instead
+    threads = []
+    for i in range(NUMBER_OF_THREADS):
+        t = Thread(target=threads_target, args=(q,))
+        t.start()
+        threads.append(t)
+    return threads
+
+
+def stop_threads(q, threads):
+    """Stop threads."""
+    for i in range(NUMBER_OF_THREADS):
+        q.put(None)
+    for t in threads:
+        t.join()
+
+
+def main():
+    """Main loop processing sites."""
+    global families_excepts
+    q = Queue()
+    threads = spawn_threads(q)
+    for fam_name in families_list:
+        family = Family.load(fam_name)
+        families_excepts.setdefault(fam_name, {})
+        for site_code in family.languages_by_size:
+            site = Site(site_code, family)
+            if site.namespaces[8].case != 'first-letter':
+                raise ValueError('MW namespace case is not first-letter')
+            fam_code = (fam_name, site_code)
+            if fam_code in {
+                ('wikisource', 'www'),
+                ('wikisource', 'mul'),
+                ('wikiversity', 'test'),
+            }:
+                continue  # the API of these sites does not respond as expected
+            q.put(fam_code)
+    # block until all tasks are done
+    q.join()
+    stop_threads(q, threads)
+
+
+def save_json(obj, path):
+    """Save data to file."""
+    with open(path, 'w', encoding='utf8') as f:
+        dump(obj, f)
+
+
+def load_json(path):
"""Load data from file.""" + try: + with open(path, 'r', encoding='utf8') as f: + return load(f) + except OSError: + print('File not found:', path) # noqa: T001, T201 + return {} + + +if __name__ == '__main__': + chars, uppers, wikilinks = chars_uppers_wikilinks() + # save_json({'chars': chars, 'uppers': uppers, 'wikilinks': wikilinks}, + # 'user-temp-save.json') + # j = load_json('user-temp-save.json') + # chars, uppers, wikilinks = j['chars'], j['uppers'], j['wikilinks'] + # families_excepts = load_json(FILEPATH) + # main() + # save_json(families_excepts, FILEPATH) + print(process_site('wiktionary', 'fr')) # noqa: T001, T201