From e67d3aaaf8d994c2f3e91b0cd12cd0e7c16c0cfa Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Mon, 9 Dec 2024 20:30:15 +1100 Subject: [PATCH 1/3] add: test for >2 char iana tags, tag list update script, updated tags - test_language_warnings.py - add case for a valid >2 char language code - add case for invalid 3 char code - example of behaviour where if someone refers to ISO 639 and gets a valid tag but that's not compliant with the IANA RFC which uses the shortest available tag. - subtags_updater.py - add script to simplify/formalise how the tag lists are prepared. I don't know how/where the original list was obtained but this produces a purely additive changeset consistent with a published changelist https://www.iana.org/assignments/lang-subtags-templates/lang-subtags-templates.xhtml so it should be about right. - add IANA language codes added to registry since list was originally committed 6+ years ago (2018-04-01) --- .../iana_subtags_3_or_more_characters.txt | 125 ++++++++++++++++++ .../pyxform/iana_subtags/subtags_updater.py | 31 +++++ tests/test_language_warnings.py | 13 +- 3 files changed, 163 insertions(+), 6 deletions(-) create mode 100644 pyxform/validators/pyxform/iana_subtags/subtags_updater.py diff --git a/pyxform/validators/pyxform/iana_subtags/iana_subtags_3_or_more_characters.txt b/pyxform/validators/pyxform/iana_subtags/iana_subtags_3_or_more_characters.txt index 96a44749..6e6f967f 100644 --- a/pyxform/validators/pyxform/iana_subtags/iana_subtags_3_or_more_characters.txt +++ b/pyxform/validators/pyxform/iana_subtags/iana_subtags_3_or_more_characters.txt @@ -189,6 +189,7 @@ ajg aji ajn ajp +ajs ajt aju ajw @@ -342,6 +343,7 @@ aqa aqc aqd aqg +aqk aql aqm aqn @@ -892,7 +894,9 @@ boy boz bpa bpb +bpc bpd +bpe bpg bph bpi @@ -1246,6 +1250,7 @@ cek cel cen cet +cey cfa cfd cfg @@ -1304,6 +1309,7 @@ cka ckb ckh ckl +ckm ckn cko ckq @@ -1326,6 +1332,7 @@ clk cll clm clo +cls clt clu clw @@ -1352,6 +1359,8 @@ cni cnk cnl cno +cnp +cnq cnr cns cnt @@ 
-1436,6 +1445,7 @@ csl csm csn cso +csp csq csr css @@ -1443,6 +1453,7 @@ cst csu csv csw +csx csy csz cta @@ -1459,6 +1470,7 @@ ctp cts ctt ctu +cty ctz cua cub @@ -1489,6 +1501,7 @@ cwd cwe cwg cwt +cxh cya cyb cyo @@ -1644,6 +1657,7 @@ djr dju djw dka +dkg dkk dkl dkr @@ -1658,6 +1672,7 @@ dmb dmc dmd dme +dmf dmg dmk dml @@ -1679,6 +1694,7 @@ dni dnj dnk dnn +dno dnr dnt dnu @@ -1729,10 +1745,12 @@ dsb dse dsh dsi +dsk dsl dsn dso dsq +dsz dta dtb dtd @@ -1775,12 +1793,14 @@ duy duz dva dwa +dwk dwl dwr dws dwu dww dwy +dwz dya dyb dyd @@ -1789,6 +1809,7 @@ dyi dym dyn dyo +dyr dyu dyy dza @@ -1798,6 +1819,7 @@ dzg dzl dzn eaa +ebc ebg ebk ebo @@ -1812,9 +1834,11 @@ efe efi ega egl +egm ego egx egy +ehs ehu eip eit @@ -1851,11 +1875,13 @@ emm emn emo emp +emq ems emu emw emx emy +emz ena enb enc @@ -1909,6 +1935,7 @@ ett etu etx etz +eud euq eve evh @@ -1946,6 +1973,7 @@ ffm fgr fia fie +fif fil fip fir @@ -2091,6 +2119,7 @@ gea geb gec ged +gef geg geh gei @@ -2139,6 +2168,7 @@ gid gie gig gih +gii gil gim gin @@ -2166,6 +2196,7 @@ gkn gko gkp gku +glb glc gld glh @@ -2188,6 +2219,7 @@ gml gmm gmn gmq +gmr gmu gmv gmw @@ -2235,6 +2267,7 @@ gor gos got gou +gov gow gox goy @@ -2347,6 +2380,7 @@ gyn gyo gyr gyy +gyz gza gzi gzn @@ -2414,6 +2448,7 @@ hix hji hka hke +hkh hkk hkn hks @@ -2451,6 +2486,7 @@ hmz hna hnd hne +hng hnh hni hnj @@ -2599,6 +2635,7 @@ ijn ijo ijs ike +ikh iki ikk ikl @@ -2632,6 +2669,7 @@ imn imo imr ims +imt imy inb inc @@ -2679,6 +2717,7 @@ iso isr ist isu +isv itb itc itd @@ -2710,6 +2749,7 @@ iyo iyx izh izi +izm izr izz jaa @@ -2737,6 +2777,7 @@ jbe jbi jbj jbk +jbm jbn jbo jbr @@ -2788,6 +2829,7 @@ jkm jko jkp jkr +jks jku jle jls @@ -3608,8 +3650,10 @@ lgk lgl lgm lgn +lgo lgq lgr +lgs lgt lgu lgz @@ -3759,6 +3803,7 @@ lpe lpn lpo lpx +lqr lra lrc lre @@ -3774,6 +3819,8 @@ lrt lrv lrz lsa +lsb +lsc lsd lse lsg @@ -3781,11 +3828,14 @@ lsh lsi lsl lsm +lsn lso lsp lsr lss lst +lsv +lsw lsy ltc ltg @@ -3818,7 +3868,9 @@ luw luy luz lva 
+lvi lvk +lvl lvs lvu lwa @@ -3832,6 +3884,7 @@ lws lwt lwu lww +lxm lya lyg lyn @@ -4835,6 +4888,7 @@ nqm nqn nqo nqq +nqt nqy nra nrb @@ -4854,6 +4908,7 @@ nru nrx nrz nsa +nsb nsc nsd nse @@ -4931,6 +4986,7 @@ nwi nwm nwo nwr +nww nwx nwy nxa @@ -4977,6 +5033,7 @@ nzd nzi nzk nzm +nzr nzs nzu nzy @@ -4995,6 +5052,7 @@ obt obu oca och +ocm oco ocu oda @@ -5013,6 +5071,7 @@ ogu oht ohu oia +oie oin ojb ojc @@ -5023,6 +5082,7 @@ ojv ojw oka okb +okc okd oke okg @@ -5039,6 +5099,7 @@ oks oku okv okx +okz ola old ole @@ -5066,6 +5127,7 @@ omu omv omw omx +omy ona onb one @@ -5112,6 +5174,7 @@ orz osa osc osi +osn oso osp ost @@ -5256,6 +5319,7 @@ phd phg phh phi +phj phk phl phm @@ -5353,6 +5417,7 @@ pmz pna pnb pnc +pnd pne png pnh @@ -5506,6 +5571,8 @@ pys pyu pyx pyy +pze +pzh pzn qaa..qtz qua @@ -5620,6 +5687,7 @@ rgu rhg rhp ria +rib rie rif ril @@ -5664,6 +5732,7 @@ rmx rmy rmz rna +rnb rnd rng rnl @@ -5688,12 +5757,16 @@ row rpn rpt rri +rrm rro rrt rsb rsi +rsk rsl rsm +rsn +rsw rtc rth rtm @@ -5716,6 +5789,7 @@ ruy ruz rwa rwk +rwl rwm rwo rwr @@ -5808,6 +5882,7 @@ sdm sdn sdo sdp +sdq sdr sds sdt @@ -6082,6 +6157,7 @@ sqr sqs sqt squ +sqx sra srb src @@ -6163,6 +6239,7 @@ suj suk sul sum +suo suq sur sus @@ -6243,6 +6320,7 @@ szp szs szv szw +szy taa tab tac @@ -6438,10 +6516,12 @@ tiz tja tjg tji +tjj tjl tjm tjn tjo +tjp tjs tju tjw @@ -6548,6 +6628,7 @@ tog toh toi toj +tok tol tom too @@ -6696,6 +6777,7 @@ tuz tva tvd tve +tvi tvk tvl tvm @@ -6705,6 +6787,7 @@ tvs tvt tvu tvw +tvx tvy twa twb @@ -6758,6 +6841,7 @@ tyt tyu tyv tyx +tyy tyz tza tzh @@ -6789,6 +6873,7 @@ ufi uga ugb uge +ugh ugn ugo ugy @@ -6800,12 +6885,14 @@ uji uka ukg ukh +uki ukk ukl ukp ukq uks uku +ukv ukw uky ula @@ -6820,6 +6907,7 @@ ulm uln ulu ulw +uly uma umb umc @@ -6837,6 +6925,7 @@ una und une ung +uni unk unm unn @@ -6846,6 +6935,7 @@ unu unx unz uok +uon upi upv ura @@ -6876,9 +6966,11 @@ ush usi usk usp +uss usu uta ute +uth utp utr utu @@ -6930,16 +7022,19 @@ vin vis vit 
viv +vjk vka vki vkj vkk vkl vkm +vkn vko vkp vkt vku +vkz vlp vls vma @@ -6976,6 +7071,7 @@ vrs vrt vsi vsl +vsn vsv vto vum @@ -7031,6 +7127,7 @@ wdd wdg wdj wdk +wdt wdu wdy wea @@ -7085,6 +7182,7 @@ wka wkb wkd wkl +wkr wku wkw wky @@ -7092,6 +7190,7 @@ wla wlc wle wlg +wlh wli wlk wll @@ -7109,6 +7208,7 @@ wmb wmc wmd wme +wmg wmh wmi wmm @@ -7177,6 +7277,7 @@ wsr wss wsu wsv +wtb wtf wth wti @@ -7207,6 +7308,7 @@ wya wyb wyi wym +wyn wyr wyy xaa @@ -7267,6 +7369,7 @@ xdc xdk xdm xdo +xdq xdy xeb xed @@ -7295,6 +7398,7 @@ xha xhc xhd xhe +xhm xhr xht xhu @@ -7380,9 +7484,12 @@ xnd xng xnh xni +xnj xnk +xnm xnn xno +xnq xnr xns xnt @@ -7401,12 +7508,17 @@ xop xor xow xpa +xpb xpc +xpd xpe +xpf xpg +xph xpi xpj xpk +xpl xpm xpn xpo @@ -7416,7 +7528,11 @@ xpr xps xpt xpu +xpv +xpw +xpx xpy +xpz xqa xqt xra @@ -7563,6 +7679,7 @@ ych ycl ycn ycp +ycr yda ydd yde @@ -7617,6 +7734,7 @@ yiy yiz yka ykg +ykh yki ykk ykl @@ -7708,6 +7826,7 @@ ysc ysd ysg ysl +ysm ysn yso ysp @@ -7788,17 +7907,21 @@ zaw zax zay zaz +zba zbc zbe zbl zbt +zbu zbw zca +zcd zch zdj zea zeg zeh +zem zen zga zgb @@ -7835,12 +7958,14 @@ zkt zku zkv zkz +zla zle zlj zlm zln zlq zls +zlu zlw zma zmb diff --git a/pyxform/validators/pyxform/iana_subtags/subtags_updater.py b/pyxform/validators/pyxform/iana_subtags/subtags_updater.py new file mode 100644 index 00000000..838c22c2 --- /dev/null +++ b/pyxform/validators/pyxform/iana_subtags/subtags_updater.py @@ -0,0 +1,31 @@ +import re + +""" +The IANA tag registry is updated occasionally. Use this script to update pyxform's copy. + +Save (don't commit) a local .txt copy of the full tag registry and run this script. The +registry includes definitions for things other than languages, so the regex looks for only +primary language subtags. 
def update(registry_path="language-subtag-registry.txt", output_dir="."):
    """
    Regenerate pyxform's IANA language subtag lists from a local registry copy.

    Every registry record whose "Type: language" line is immediately followed by a
    "Subtag:" line contributes one subtag. Subtags are written in registry order,
    newline-separated with no trailing newline, to two files in `output_dir`:
    "iana_subtags_2_characters.txt" (length exactly 2) and
    "iana_subtags_3_or_more_characters.txt" (length greater than 2).

    :param registry_path: Path to the saved copy of the IANA language subtag registry.
      The default is the previously hard-coded file name, resolved against the current
      working directory.
    :param output_dir: Directory that receives the two subtag lists. The default is the
      current working directory, as before.
    :raises ValueError: If the registry text contains no language subtags. Raised
      before either output file is opened, so existing lists are not truncated.
    """
    # Function-scope import keeps this change self-contained within the block.
    from pathlib import Path

    with open(registry_path, encoding="utf-8") as registry_file:
        matches = re.findall(
            r"Type: language\nSubtag:\s(.*?)\n", registry_file.read()
        )

    # Opening the outputs in "w" mode truncates them, so refuse to continue when the
    # input yielded nothing (e.g. the wrong file was saved as the registry copy).
    if not matches:
        raise ValueError(
            f"No primary language subtags found in '{registry_path}'; "
            "the existing subtag lists were left unchanged."
        )

    target_dir = Path(output_dir)
    outputs = {
        "iana_subtags_2_characters.txt": [i for i in matches if len(i) == 2],
        "iana_subtags_3_or_more_characters.txt": [i for i in matches if len(i) > 2],
    }
    for file_name, subtags in outputs.items():
        # newline="\n" stops platform newline translation, so the files are written
        # with LF line endings on every OS.
        with open(
            target_dir / file_name, mode="w", encoding="utf-8", newline="\n"
        ) as out_file:
            out_file.write("\n".join(subtags))
Learn more: http://xlsform.org#multiple-language-support" + "codes: English (schm), Bosnian (bos). Learn more: http://xlsform.org#multiple-language-support" ], ) From 0fad57d2804c9e00fe96b4e84db088443a2c9789 Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Mon, 9 Dec 2024 20:30:43 +1100 Subject: [PATCH 2/3] fix: move comment to relevant lines --- pyxform/parsing/expression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py index 335864e6..2c80b74f 100644 --- a/pyxform/parsing/expression.py +++ b/pyxform/parsing/expression.py @@ -74,7 +74,6 @@ def tokenizer(scan, value) -> ExpLexerToken | str: return re.Scanner(lexicon) -# Scanner takes a few 100ms to compile so use this shared instance. class ExpLexerToken: __slots__ = ("name", "value", "start", "end") @@ -85,6 +84,7 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None: self.end: int = end +# Scanner takes a few 100ms to compile so use these shared instances. _EXPRESSION_LEXER = get_expression_lexer() _TOKEN_NAME_LEXER = get_expression_lexer(name_only=True) From 2c878a3035059eb4ce90e76ed279ac58c9d6d6f6 Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Mon, 9 Dec 2024 22:39:25 +1100 Subject: [PATCH 3/3] add: test results for performance test cases - generally quite a bit faster now but there seems to be something still potentially wrong when translations and/or lots of choices/itemsets are involved, e.g. 10K text = ~5 seconds, vs 10K select + 20K choices = ~80 seconds. - switched to using `convert()` directly since this now accepts markdown input, and it avoids potentially confounding effects of extra things done in `assertPyxformXform` (such as re-parsing XML). - this had a minor impact (approx 10%) on test execution time but considerable impact (approx 30%) on memory usage. 
--- tests/test_dynamic_default.py | 18 ++++++++++-------- tests/test_translations.py | 17 ++++++++++------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/tests/test_dynamic_default.py b/tests/test_dynamic_default.py index 595dd31e..05d48c1c 100644 --- a/tests/test_dynamic_default.py +++ b/tests/test_dynamic_default.py @@ -10,6 +10,7 @@ import psutil from pyxform import utils +from pyxform.xls2xform import convert from tests.pyxform_test_case import PyxformTestCase from tests.xpath_helpers.choices import xpc @@ -774,13 +775,14 @@ def test_dynamic_default_performance__time(self): """ Should find the dynamic default check costs little extra relative time large forms. - Results with Python 3.9.10 on VM with 4CPU 8GB RAM, x questions each, average of - 10 runs (seconds), with and without the check, per question: - | num | with | without | - | 500 | 0.4599 | 0.4535 | - | 1000 | 0.9234 | 0.9195 | - | 2000 | 2.1118 | 1.9917 | - | 5000 | 4.9563 | 4.8714 | + Results with Python 3.10.14 on VM with 2vCPU (i7-7700HQ) 1GB RAM, x questions + each, average of 10 runs (seconds), with and without the check, per question: + | num | with | without | peak RSS MB | + | 500 | 0.2415 | 0.2512 | 58 | + | 1000 | 0.4754 | 0.5199 | 63 | + | 2000 | 0.9866 | 1.2936 | 67 | + | 5000 | 3.1041 | 2.7132 | 96 | + | 10000 | 5.4795 | 5.3229 | 133 | """ survey_header = """ | survey | | | | | @@ -798,7 +800,7 @@ def run(name, case): results = [] while runs < 10: start = perf_counter() - self.assertPyxformXform(md=case) + convert(xlsform=case) results.append(perf_counter() - start) runs += 1 print(name, round(sum(results) / len(results), 4)) diff --git a/tests/test_translations.py b/tests/test_translations.py index b644dde6..2215c4d3 100644 --- a/tests/test_translations.py +++ b/tests/test_translations.py @@ -13,6 +13,7 @@ OR_OTHER_WARNING, format_missing_translations_msg, ) +from pyxform.xls2xform import convert from tests.pyxform_test_case import PyxformTestCase from 
tests.xpath_helpers.choices import xpc @@ -396,13 +397,15 @@ def test_missing_translations_check_performance(self): """ Should find the translations check costs a fraction of a second for large forms. - Results with Python 3.10.14 on VM with 2vCPU (i7-7700HQ) 4GB RAM, x questions + Results with Python 3.10.14 on VM with 2vCPU (i7-7700HQ) 1GB RAM, x questions with 2 choices each, average of 10 runs (seconds), with and without the check, per question: - | num | with | without | - | 500 | 3.0420 | 3.0427 | - | 1000 | 9.7641 | 9.6972 | - | 2000 | 30.645 | 28.869 | + | num | with | without | peak RSS MB | + | 500 | 1.0235 | 0.9831 | 74 | + | 1000 | 2.3025 | 2.6332 | 101 | + | 2000 | 5.6960 | 6.2805 | 157 | + | 5000 | 23.439 | 25.327 | 265 | + | 10000 | 80.396 | 75.165 | 480 | """ survey_header = """ | survey | | | | | @@ -429,10 +432,10 @@ def run(name, case): results = [] while runs < 10: start = perf_counter() - self.assertPyxformXform(md=case) + convert(xlsform=case) results.append(perf_counter() - start) runs += 1 - print(name, sum(results) / len(results)) + print(name, round(sum(results) / len(results), 4)) run(name=f"questions={count}, with check (seconds):", case=md)