From 21fb501699ad6d41c3f57b431a3826238df255b6 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 11 May 2022 17:23:28 +0200 Subject: [PATCH 01/30] add info about countries without a postcode --- settings/country_settings.yaml | 56 +++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 643acbee3a..8abbe4a27c 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -10,6 +10,7 @@ ae: partition: 83 languages: ar names: !include country-names/ae.yaml + postcode: no # Afghanistan (افغانستان) @@ -24,6 +25,7 @@ ag: partition: 205 languages: en names: !include country-names/ag.yaml + postcode: no # Anguilla (Anguilla) @@ -59,6 +61,7 @@ ao: partition: 85 languages: pt names: !include country-names/ao.yaml + postcode: no # (Antarctica) @@ -101,6 +104,7 @@ aw: partition: 183 languages: nl, pap names: !include country-names/aw.yaml + postcode: no # (Aland Islands) @@ -150,6 +154,7 @@ bf: partition: 225 languages: fr names: !include country-names/bf.yaml + postcode: no # Bulgaria (Бългaрия) @@ -171,6 +176,7 @@ bi: partition: 61 languages: fr names: !include country-names/bi.yaml + postcode: no # Benin (Bénin) @@ -178,6 +184,7 @@ bj: partition: 224 languages: fr names: !include country-names/bj.yaml + postcode: no # (Saint Barthélemy) @@ -206,6 +213,7 @@ bo: partition: 120 languages: es, qu, gn, ay names: !include country-names/bo.yaml + postcode: no # Caribbean Netherlands (Caribisch Nederland) @@ -227,6 +235,7 @@ bs: partition: 207 languages: en names: !include country-names/bs.yaml + postcode: no # Bhutan (འབྲུག་ཡུལ་) @@ -248,6 +257,7 @@ bw: partition: 122 languages: en, tn names: !include country-names/bw.yaml + postcode: no # Belarus (Беларусь) @@ -262,6 +272,7 @@ bz: partition: 208 languages: en names: !include country-names/bz.yaml + postcode: no # Canada (Canada) @@ -283,6 +294,7 @@ cd: partition: 229 languages: fr names: 
!include country-names/cd.yaml + postcode: no # Central African Republic (Ködörösêse tî Bêafrîka - République Centrafricaine) @@ -290,6 +302,7 @@ cf: partition: 227 languages: fr, sg names: !include country-names/cf.yaml + postcode: no # Congo-Brazzaville (Congo) @@ -297,6 +310,7 @@ cg: partition: 230 languages: fr names: !include country-names/cg.yaml + postcode: no # Switzerland (Schweiz/Suisse/Svizzera/Svizra) @@ -311,6 +325,7 @@ ci: partition: 228 languages: fr names: !include country-names/ci.yaml + postcode: no # Cook Islands (Kūki 'Āirani) @@ -318,6 +333,7 @@ ck: partition: 41 languages: en, rar names: !include country-names/ck.yaml + postcode: no # Chile (Chile) @@ -332,6 +348,7 @@ cm: partition: 141 languages: fr, en names: !include country-names/cm.yaml + postcode: no # China (中国) @@ -409,6 +426,7 @@ dj: partition: 43 languages: fr, ar, so, aa names: !include country-names/dj.yaml + postcode: no # Denmark (Danmark) @@ -423,6 +441,7 @@ dm: partition: 209 languages: en names: !include country-names/dm.yaml + postcode: no # Dominican Republic (República Dominicana) @@ -472,6 +491,7 @@ er: partition: 142 languages: ti, ar, en names: !include country-names/er.yaml + postcode: no # Spain (España) @@ -500,6 +520,7 @@ fj: partition: 210 languages: en names: !include country-names/fj.yaml + postcode: no # Falkland Islands (Falkland Islands) @@ -535,6 +556,7 @@ ga: partition: 239 languages: fr names: !include country-names/ga.yaml + postcode: no # United Kingdom (United Kingdom) @@ -549,6 +571,7 @@ gd: partition: 143 languages: en names: !include country-names/gd.yaml + postcode: no # Georgia (საქართველო) @@ -598,6 +621,7 @@ gm: partition: 212 languages: en names: !include country-names/gm.yaml + postcode: no # Guinea (Guinée) @@ -619,6 +643,7 @@ gq: partition: 12 languages: es, fr, pt names: !include country-names/gq.yaml + postcode: no # Greece (Ελλάς) @@ -661,6 +686,7 @@ gy: partition: 213 languages: en names: !include country-names/gy.yaml + postcode: no # 
(Hong Kong) @@ -829,6 +855,7 @@ ki: partition: 215 languages: en names: !include country-names/ki.yaml + postcode: no # Comoros (Comores Komori جزر القمر) @@ -836,6 +863,7 @@ km: partition: 47 languages: ar, fr, sw names: !include country-names/km.yaml + postcode: no # Saint Kitts and Nevis (Saint Kitts and Nevis) @@ -850,6 +878,7 @@ kp: partition: 48 languages: ko names: !include country-names/kp.yaml + postcode: no # South Korea (대한민국) @@ -955,6 +984,7 @@ ly: partition: 163 languages: ar names: !include country-names/ly.yaml + postcode: no # Morocco (Maroc ⵍⵎⵖⵔⵉⴱ المغرب) @@ -1018,6 +1048,7 @@ ml: partition: 241 languages: fr names: !include country-names/ml.yaml + postcode: no # Myanmar (မြန်မာ) @@ -1039,6 +1070,7 @@ mo: partition: 191 languages: zh-hant, pt names: !include country-names/mo.yaml + postcode: no # Northern Mariana Islands (Northern Mariana Islands) @@ -1060,6 +1092,7 @@ mr: partition: 149 languages: ar, fr names: !include country-names/mr.yaml + postcode: no # Montserrat (Montserrat) @@ -1095,6 +1128,7 @@ mw: partition: 97 languages: en, ny names: !include country-names/mw.yaml + postcode: no # Mexico (México) @@ -1186,6 +1220,7 @@ nr: partition: 70 languages: na, en names: !include country-names/nr.yaml + postcode: no # Niue (Niuē) @@ -1193,6 +1228,7 @@ nu: partition: 178 languages: niu, en names: !include country-names/nu.yaml + postcode: no # New Zealand (New Zealand / Aotearoa) @@ -1312,6 +1348,7 @@ qa: partition: 169 languages: ar names: !include country-names/qa.yaml + postcode: no # (Réunion) @@ -1347,6 +1384,7 @@ rw: partition: 102 languages: rw, fr, en names: !include country-names/rw.yaml + postcode: no # Saudi Arabia (السعودية) @@ -1361,6 +1399,7 @@ sb: partition: 201 languages: en names: !include country-names/sb.yaml + postcode: no # Seychelles (Sesel) @@ -1368,6 +1407,7 @@ sc: partition: 79 languages: fr, en, crs names: !include country-names/sc.yaml + postcode: no # Sudan (السودان) @@ -1424,6 +1464,7 @@ sl: partition: 219 languages: 
en names: !include country-names/sl.yaml + postcode: no # San Marino (San Marino) @@ -1452,6 +1493,7 @@ sr: partition: 24 languages: nl names: !include country-names/sr.yaml + postcode: no # South Sudan (South Sudan) @@ -1459,6 +1501,7 @@ ss: partition: 247 languages: en names: !include country-names/ss.yaml + postcode: no # São Tomé and Príncipe (São Tomé e Príncipe) @@ -1466,6 +1509,7 @@ st: partition: 53 languages: pt names: !include country-names/st.yaml + postcode: no # El Salvador (El Salvador) @@ -1487,6 +1531,7 @@ sy: partition: 104 languages: ar names: !include country-names/sy.yaml + postcode: no # Eswatini (eSwatini) @@ -1508,6 +1553,7 @@ td: partition: 68 languages: fr, ar names: !include country-names/td.yaml + postcode: no # French Southern Lands (Terres australes et antarctiques françaises) @@ -1522,6 +1568,7 @@ tg: partition: 243 languages: fr names: !include country-names/tg.yaml + postcode: no # Thailand (ประเทศไทย) @@ -1543,6 +1590,7 @@ tk: partition: 179 languages: tkl, en, sm names: !include country-names/tk.yaml + postcode: no # East Timor (Timór Lorosa'e) @@ -1550,6 +1598,7 @@ tl: partition: 161 languages: pt, tet names: !include country-names/tl.yaml + postcode: no # Turkmenistan (Türkmenistan) @@ -1571,6 +1620,7 @@ to: partition: 220 languages: en names: !include country-names/to.yaml + postcode: no # Turkey (Türkiye) @@ -1592,6 +1642,7 @@ tv: partition: 156 languages: en names: !include country-names/tv.yaml + postcode: no # Taiwan (臺灣) @@ -1620,6 +1671,7 @@ ug: partition: 155 languages: en, sw names: !include country-names/ug.yaml + postcode: no # (United States Minor Outlying Islands) @@ -1697,6 +1749,7 @@ vu: partition: 116 languages: bi, en, fr names: !include country-names/vu.yaml + postcode: no # Wallis and Futuna Islands (Wallis-et-Futuna) @@ -1725,6 +1778,7 @@ ye: partition: 55 languages: ar names: !include country-names/ye.yaml + postcode: no # Mayotte (Mayotte) @@ -1753,4 +1807,4 @@ zw: partition: 223 languages: en, sn, nd names: 
!include country-names/zw.yaml - + postcode: no From 8080625747dc7e87bc510d2af0d3edf5d551a6d0 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 12 May 2022 11:43:47 +0200 Subject: [PATCH 02/30] remove postcodes from countries that don't have them The postcodes will only be removed as a 'computed postcode' they are still searchable for the given object. --- .pylintrc | 2 +- nominatim/tokenizer/sanitizers/config.py | 14 ++++++++++++++ .../sanitizers/tag_analyzer_by_language.py | 3 +-- nominatim/tools/country_info.py | 14 ++++++++++++-- settings/icu_tokenizer.yaml | 2 ++ test/bdd/db/import/postcodes.feature | 16 +++++++++++++++- 6 files changed, 45 insertions(+), 6 deletions(-) diff --git a/.pylintrc b/.pylintrc index fef5387211..52d9fcf9e6 100644 --- a/.pylintrc +++ b/.pylintrc @@ -13,4 +13,4 @@ ignored-classes=NominatimArgs,closing # 'too-many-ancestors' is triggered already by deriving from UserDict disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use -good-names=i,x,y,fd,db +good-names=i,x,y,fd,db,cc diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py index ecfcacbe55..ce5ce1eb8b 100644 --- a/nominatim/tokenizer/sanitizers/config.py +++ b/nominatim/tokenizer/sanitizers/config.py @@ -44,6 +44,20 @@ def get_string_list(self, param, default=tuple()): return values + def get_bool(self, param, default=None): + """ Extract a configuration parameter as a boolean. + The parameter must be one of the yaml boolean values or an + user error will be raised. If `default` is given, then the parameter + may also be missing or empty. 
+ """ + value = self.data.get(param, default) + + if not isinstance(value, bool): + raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').") + + return value + + + def get_delimiter(self, default=',;'): """ Return the 'delimiter' parameter in the configuration as a compiled regular expression that can be used to split the names on the diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py index 7898b1c685..9a99d12772 100644 --- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py +++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py @@ -48,8 +48,7 @@ def _compute_default_languages(self, use_defaults): self.deflangs = {} if use_defaults in ('mono', 'all'): - for ccode, prop in country_info.iterate(): - clangs = prop['languages'] + for ccode, clangs in country_info.iterate('languages'): if len(clangs) == 1 or use_defaults == 'all': if self.whitelist: self.deflangs[ccode] = [l for l in clangs if l in self.whitelist] diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py index 0ad001719e..d754b4ddb0 100644 --- a/nominatim/tools/country_info.py +++ b/nominatim/tools/country_info.py @@ -84,10 +84,20 @@ def setup_country_config(config): _COUNTRY_INFO.load(config) -def iterate(): +def iterate(prop=None): """ Iterate over country code and properties. + + When `prop` is None, all countries are returned with their complete + set of properties. + + If `prop` is given, then only countries are returned where the + given property is set. The second item of the tuple contains only + the content of the given property.
""" - return _COUNTRY_INFO.items() + if prop is None: + return _COUNTRY_INFO.items() + + return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p) def setup_country_tables(dsn, sql_dir, ignore_partitions=False): diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index cd9c0d6dd5..544bd81db0 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -32,6 +32,8 @@ sanitizers: - streetnumber convert-to-name: - (\A|.*,)[^\d,]{3,}(,.*|\Z) + - step: clean-postcodes + convert-to-address: yes - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 15beab5782..50afa7cc2d 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -246,4 +246,18 @@ Feature: Import of postcodes | 12 445 4 | ca | 25 | 11 | | A1:BC10 | ca | 25 | 11 | - + Scenario: Postcodes outside all countries are not added to the postcode and word table + Given the places + | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry | + | N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 | + And the places + | osm | class | type | name | geometry | + | N1 | place | hamlet | Null Island | 0 0 | + When importing + Then location_postcode contains exactly + | country | postcode | geometry | + And there are no word tokens for postcodes 01982 + When sending search query "111, 01982 Null Island" + Then results contain + | osm | display_name | + | N34 | 111, Null Island, 01982 | From 6e0014e1383f2cefa235a00a82c50f4169af278f Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 19 May 2022 12:03:26 +0200 Subject: [PATCH 03/30] add postcode patterns for numeric postcodes Adds patterns for countries that have simple numeric-only postcodes. 
--- settings/country_settings.yaml | 208 +++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 8abbe4a27c..972e267042 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -18,6 +18,8 @@ af: partition: 30 languages: fa, ps names: !include country-names/af.yaml + postcode: + pattern: "dddd" # Antigua and Barbuda (Antigua and Barbuda) @@ -40,6 +42,8 @@ al: partition: 9 languages: sq names: !include country-names/al.yaml + postcode: + pattern: "dddd" # Armenia (Հայաստան) @@ -47,6 +51,8 @@ am: partition: 33 languages: hy names: !include country-names/am.yaml + postcode: + pattern: "dddd" # Netherlands Antilles (De Nederlandse Antillen) @@ -90,6 +96,8 @@ at: partition: 245 languages: de names: !include country-names/at.yaml + postcode: + pattern: "dddd" # Australia (Australia) @@ -97,6 +105,8 @@ au: partition: 139 languages: en names: !include country-names/au.yaml + postcode: + pattern: "dddd" # (Aruba) @@ -126,6 +136,8 @@ ba: partition: 6 languages: bs, hr, sr names: !include country-names/ba.yaml + postcode: + pattern: "ddddd" # Barbados (Barbados) @@ -140,6 +152,8 @@ bd: partition: 158 languages: bn names: !include country-names/bd.yaml + postcode: + pattern: "dddd" # Belgium (België / Belgique / Belgien) @@ -147,6 +161,8 @@ be: partition: 15 languages: nl, fr, de names: !include country-names/be.yaml + postcode: + pattern: "dddd" # Burkina Faso (Burkina Faso) @@ -162,6 +178,8 @@ bg: partition: 140 languages: bg names: !include country-names/bg.yaml + postcode: + pattern: "dddd" # Bahrain (البحرين) @@ -243,6 +261,8 @@ bt: partition: 87 languages: dz names: !include country-names/bt.yaml + postcode: + pattern: "ddddd" # (Bouvet Island) @@ -265,6 +285,8 @@ by: partition: 40 languages: be, ru names: !include country-names/by.yaml + postcode: + pattern: "dddddd" # Belize (Belize) @@ -318,6 +340,8 @@ ch: partition: 5 languages: de, fr, it, rm 
names: !include country-names/ch.yaml + postcode: + pattern: "dddd" # Côte d'Ivoire (Côte d’Ivoire) @@ -341,6 +365,8 @@ cl: partition: 88 languages: es names: !include country-names/cl.yaml + postcode: + pattern: "ddddddd" # Cameroon (Cameroun) @@ -356,6 +382,8 @@ cn: partition: 117 languages: zh names: !include country-names/cn.yaml + postcode: + pattern: "dddddd" # Colombia (Colombia) @@ -363,6 +391,8 @@ co: partition: 133 languages: es names: !include country-names/co.yaml + postcode: + pattern: "dddddd" # Costa Rica (Costa Rica) @@ -370,6 +400,8 @@ cr: partition: 64 languages: es names: !include country-names/cr.yaml + postcode: + pattern: "ddddd" # Cuba (Cuba) @@ -377,6 +409,8 @@ cu: partition: 42 languages: es names: !include country-names/cu.yaml + postcode: + pattern: "ddddd" # Cape Verde (Cabo Verde) @@ -384,6 +418,8 @@ cv: partition: 89 languages: pt names: !include country-names/cv.yaml + postcode: + pattern: "dddd" # Curaçao (Curaçao) @@ -419,6 +455,8 @@ de: partition: 3 languages: de names: !include country-names/de.yaml + postcode: + pattern: "ddddd" # Djibouti (Djibouti جيبوتي) @@ -434,6 +472,8 @@ dk: partition: 160 languages: da names: !include country-names/dk.yaml + postcode: + pattern: "dddd" # Dominica (Dominica) @@ -449,6 +489,8 @@ do: partition: 37 languages: es names: !include country-names/do.yaml + postcode: + pattern: "ddddd" # Algeria (Algérie / ⵍⵣⵣⴰⵢⴻⵔ / الجزائر) @@ -456,6 +498,8 @@ dz: partition: 19 languages: ar, ber, fr names: !include country-names/dz.yaml + postcode: + pattern: "ddddd" # Ecuador (Ecuador) @@ -470,6 +514,8 @@ ee: partition: 125 languages: et names: !include country-names/ee.yaml + postcode: + pattern: "ddddd" # Egypt (مصر) @@ -477,6 +523,8 @@ eg: partition: 16 languages: ar names: !include country-names/eg.yaml + postcode: + pattern: "ddddd" # Sahrawi Arab Democratic Republic (الجمهورية العربية الصحراوية الديمقراطية) @@ -499,6 +547,8 @@ es: partition: 31 languages: es, ast, ca, eu, gl names: !include 
country-names/es.yaml + postcode: + pattern: "ddddd" # Ethiopia (ኢትዮጵያ) @@ -506,6 +556,8 @@ et: partition: 90 languages: am, om names: !include country-names/et.yaml + postcode: + pattern: "dddd" # Finland (Suomi) @@ -513,6 +565,8 @@ fi: partition: 20 languages: fi, sv, se names: !include country-names/fi.yaml + postcode: + pattern: "ddddd" # Fiji (Viti) @@ -535,6 +589,8 @@ fm: partition: 217 languages: en names: !include country-names/fm.yaml + postcode: + pattern: "ddddd" # Faroe Islands (Føroyar) @@ -549,6 +605,8 @@ fr: partition: 4 languages: fr names: !include country-names/fr.yaml + postcode: + pattern: "ddddd" # Gabon (Gabon) @@ -579,6 +637,8 @@ ge: partition: 21 languages: ka names: !include country-names/ge.yaml + postcode: + pattern: "dddd" # French Guiana (Guyane Française) @@ -614,6 +674,8 @@ gl: partition: 111 languages: kl, da names: !include country-names/gl.yaml + postcode: + pattern: "dddd" # The Gambia (Gambia) @@ -629,6 +691,8 @@ gn: partition: 240 languages: fr names: !include country-names/gn.yaml + postcode: + pattern: "ddd" # Guadeloupe (Guadeloupe) @@ -665,6 +729,8 @@ gt: partition: 57 languages: es names: !include country-names/gt.yaml + postcode: + pattern: "ddddd" # Guam (Guam) @@ -679,6 +745,8 @@ gw: partition: 8 languages: pt names: !include country-names/gw.yaml + postcode: + pattern: "dddd" # Guyana (Guyana) @@ -708,6 +776,8 @@ hn: partition: 56 languages: es names: !include country-names/hn.yaml + postcode: + pattern: "ddddd" # Croatia (Hrvatska) @@ -715,6 +785,8 @@ hr: partition: 92 languages: hr names: !include country-names/hr.yaml + postcode: + pattern: "ddddd" # Haiti (Ayiti) @@ -729,6 +801,8 @@ hu: partition: 45 languages: hu names: !include country-names/hu.yaml + postcode: + pattern: "dddd" # Indonesia (Indonesia) @@ -736,6 +810,8 @@ id: partition: 110 languages: id names: !include country-names/id.yaml + postcode: + pattern: "ddddd" # Ireland (Éire / Ireland) @@ -750,6 +826,8 @@ il: partition: 65 languages: he names: 
!include country-names/il.yaml + postcode: + pattern: "ddddddd" # Isle of Man (Isle of Man) @@ -778,6 +856,8 @@ iq: partition: 144 languages: ar, ku names: !include country-names/iq.yaml + postcode: + pattern: "ddddd" # Iran (ایران) @@ -792,6 +872,8 @@ is: partition: 134 languages: is names: !include country-names/is.yaml + postcode: + pattern: "ddd" # Italy (Italia) @@ -799,6 +881,8 @@ it: partition: 28 languages: it, de, fr names: !include country-names/it.yaml + postcode: + pattern: "ddddd" # Jersey (Jersey) @@ -820,6 +904,8 @@ jo: partition: 17 languages: ar names: !include country-names/jo.yaml + postcode: + pattern: "ddddd" # Japan (日本) @@ -834,6 +920,8 @@ ke: partition: 126 languages: sw, en names: !include country-names/ke.yaml + postcode: + pattern: "ddddd" # Kyrgyzstan (Кыргызстан) @@ -841,6 +929,8 @@ kg: partition: 93 languages: ky, ru names: !include country-names/kg.yaml + postcode: + pattern: "dddddd" # Cambodia (ព្រះរាជាណាចក្រ​កម្ពុជា) @@ -848,6 +938,8 @@ kh: partition: 159 languages: km names: !include country-names/kh.yaml + postcode: + pattern: "dddddd" # Kiribati (Kiribati) @@ -871,6 +963,8 @@ kn: partition: 84 languages: en names: !include country-names/kn.yaml + postcode: + pattern: "dddd" # North Korea (조선민주주의인민공화국) @@ -886,6 +980,8 @@ kr: partition: 49 languages: ko, en names: !include country-names/kr.yaml + postcode: + pattern: "ddddd" # Kuwait (الكويت) @@ -893,6 +989,8 @@ kw: partition: 127 languages: ar names: !include country-names/kw.yaml + postcode: + pattern: "ddddd" # Cayman Islands (Cayman Islands) @@ -914,6 +1012,8 @@ la: partition: 145 languages: lo names: !include country-names/la.yaml + postcode: + pattern: "ddddd" # Lebanon (لبنان) @@ -935,6 +1035,8 @@ li: partition: 246 languages: de names: !include country-names/li.yaml + postcode: + pattern: "dddd" # Sri Lanka (ශ්‍රී ලංකාව இலங்கை) @@ -942,6 +1044,8 @@ lk: partition: 95 languages: si, ta names: !include country-names/lk.yaml + postcode: + pattern: "ddddd" # Liberia (Liberia) 
@@ -949,6 +1053,8 @@ lr: partition: 216 languages: en names: !include country-names/lr.yaml + postcode: + pattern: "dddd" # Lesotho (Lesotho) @@ -956,6 +1062,8 @@ ls: partition: 136 languages: en, st names: !include country-names/ls.yaml + postcode: + pattern: "ddd" # Lithuania (Lietuva) @@ -970,6 +1078,8 @@ lu: partition: 74 languages: lb, fr, de names: !include country-names/lu.yaml + postcode: + pattern: "dddd" # Latvia (Latvija) @@ -992,6 +1102,8 @@ ma: partition: 23 languages: fr, zgh, ar names: !include country-names/ma.yaml + postcode: + pattern: "ddddd" # Monaco (Monaco) @@ -1013,6 +1125,8 @@ me: partition: 180 languages: srp, sr, hr, bs, sq names: !include country-names/me.yaml + postcode: + pattern: "ddddd" # Saint Martin (Saint Martin) @@ -1027,6 +1141,8 @@ mg: partition: 164 languages: mg, fr names: !include country-names/mg.yaml + postcode: + pattern: "ddd" # Marshall Islands (Ṃajeḷ) @@ -1034,6 +1150,8 @@ mh: partition: 105 languages: en, mh names: !include country-names/mh.yaml + postcode: + pattern: "ddddd" # North Macedonia (Северна Македонија) @@ -1041,6 +1159,8 @@ mk: partition: 69 languages: mk names: !include country-names/mk.yaml + postcode: + pattern: "dddd" # Mali (Mali) @@ -1056,6 +1176,8 @@ mm: partition: 148 languages: my names: !include country-names/mm.yaml + postcode: + pattern: "ddddd" # Mongolia (Монгол улс ᠮᠤᠩᠭᠤᠯ ᠤᠯᠤᠰ) @@ -1063,6 +1185,8 @@ mn: partition: 167 languages: mn names: !include country-names/mn.yaml + postcode: + pattern: "ddddd" # Macao (Macao) @@ -1114,6 +1238,8 @@ mu: partition: 150 languages: mfe, fr, en names: !include country-names/mu.yaml + postcode: + pattern: "ddddd" # Maldives (ދިވެހިރާއްޖެ) @@ -1121,6 +1247,8 @@ mv: partition: 96 languages: dv names: !include country-names/mv.yaml + postcode: + pattern: "ddddd" # Malawi (Malawi) @@ -1136,6 +1264,8 @@ mx: partition: 166 languages: es names: !include country-names/mx.yaml + postcode: + pattern: "ddddd" # Malaysia (Malaysia) @@ -1143,6 +1273,8 @@ my: partition: 7 
languages: ms names: !include country-names/my.yaml + postcode: + pattern: "ddddd" # Mozambique (Moçambique) @@ -1157,6 +1289,8 @@ na: partition: 99 languages: en, sf, de names: !include country-names/na.yaml + postcode: + pattern: "ddddd" # New Caledonia (Nouvelle-Calédonie) @@ -1171,6 +1305,8 @@ ne: partition: 226 languages: fr names: !include country-names/ne.yaml + postcode: + pattern: "dddd" # Norfolk Island (Norfolk Island) @@ -1185,6 +1321,8 @@ ng: partition: 218 languages: en names: !include country-names/ng.yaml + postcode: + pattern: "dddddd" # Nicaragua (Nicaragua) @@ -1192,6 +1330,8 @@ ni: partition: 151 languages: es names: !include country-names/ni.yaml + postcode: + pattern: "ddddd" # Netherlands (Nederland) @@ -1206,6 +1346,8 @@ nl: partition: 60 languages: nb, nn, no, se names: !include country-names/no.yaml + postcode: + pattern: "dddd" # Nepal (Nepal) @@ -1213,6 +1355,8 @@ np: partition: 50 languages: ne names: !include country-names/np.yaml + postcode: + pattern: "ddddd" # Nauru (Naoero) @@ -1236,6 +1380,8 @@ nz: partition: 27 languages: mi, en names: !include country-names/nz.yaml + postcode: + pattern: "dddd" # Oman (عمان) @@ -1243,6 +1389,8 @@ om: partition: 137 languages: ar names: !include country-names/om.yaml + postcode: + pattern: "ddd" # Panama (Panamá) @@ -1250,6 +1398,8 @@ pa: partition: 152 languages: es names: !include country-names/pa.yaml + postcode: + pattern: "dddd" # Peru (Perú) @@ -1257,6 +1407,8 @@ pe: partition: 51 languages: es names: !include country-names/pe.yaml + postcode: + pattern: "ddddd" # French Polynesia (Polynésie française) @@ -1271,6 +1423,8 @@ pg: partition: 71 languages: en, tpi, ho names: !include country-names/pg.yaml + postcode: + pattern: "ddd" # Philippines (Philippines) @@ -1278,6 +1432,8 @@ ph: partition: 26 languages: en, tl names: !include country-names/ph.yaml + postcode: + pattern: "dddd" # Pakistan (پاکستان) @@ -1285,6 +1441,8 @@ pk: partition: 14 languages: en, ur, pnb, sd, ps, bal names: 
!include country-names/pk.yaml + postcode: + pattern: "ddddd" # Poland (Polska) @@ -1320,6 +1478,8 @@ ps: partition: 194 languages: ar, he names: !include country-names/ps.yaml + postcode: + pattern: "ddd" # Portugal (Portugal) @@ -1341,6 +1501,8 @@ py: partition: 101 languages: es, gn names: !include country-names/py.yaml + postcode: + pattern: "dddddd" # Qatar (قطر) @@ -1363,6 +1525,8 @@ ro: partition: 170 languages: ro names: !include country-names/ro.yaml + postcode: + pattern: "dddddd" # Serbia (Србија) @@ -1370,6 +1534,8 @@ rs: partition: 59 languages: sr names: !include country-names/rs.yaml + postcode: + pattern: "ddddd" # Russia (Россия) @@ -1377,6 +1543,8 @@ ru: partition: 135 languages: ru names: !include country-names/ru.yaml + postcode: + pattern: "dddddd" # Rwanda (Rwanda) @@ -1415,6 +1583,8 @@ sd: partition: 72 languages: ar, en names: !include country-names/sd.yaml + postcode: + pattern: "ddddd" # Sweden (Sverige) @@ -1429,6 +1599,8 @@ sg: partition: 115 languages: zh-hans, en, ms, ta names: !include country-names/sg.yaml + postcode: + pattern: "dddddd" # Saint Helena, Ascension and Tristan da Cunha (Saint Helena, Ascension and Tristan da Cunha) @@ -1443,6 +1615,8 @@ si: partition: 36 languages: sl names: !include country-names/si.yaml + postcode: + pattern: "dddd" # (Svalbard and Jan Mayen) @@ -1479,6 +1653,8 @@ sn: partition: 237 languages: fr names: !include country-names/sn.yaml + postcode: + pattern: "ddddd" # Somalia (Soomaaliya الصومال) @@ -1517,6 +1693,8 @@ sv: partition: 103 languages: es names: !include country-names/sv.yaml + postcode: + pattern: "dddd" # (Sint Maarten) @@ -1576,6 +1754,8 @@ th: partition: 32 languages: th names: !include country-names/th.yaml + postcode: + pattern: "ddddd" # Tajikistan (Тоҷикистон) @@ -1583,6 +1763,8 @@ tj: partition: 129 languages: tg, ru names: !include country-names/tj.yaml + postcode: + pattern: "dddddd" # Tokelau (Tokelau) @@ -1606,6 +1788,8 @@ tm: partition: 54 languages: tk names: !include 
country-names/tm.yaml + postcode: + pattern: "dddddd" # Tunisia (تونس) @@ -1613,6 +1797,8 @@ tn: partition: 18 languages: ar, fr names: !include country-names/tn.yaml + postcode: + pattern: "dddd" # Tonga (Tonga) @@ -1628,6 +1814,8 @@ tr: partition: 81 languages: tr names: !include country-names/tr.yaml + postcode: + pattern: "ddddd" # Trinidad and Tobago (Trinidad and Tobago) @@ -1635,6 +1823,8 @@ tt: partition: 221 languages: en names: !include country-names/tt.yaml + postcode: + pattern: "dddddd" # Tuvalu (Tuvalu) @@ -1657,6 +1847,8 @@ tz: partition: 130 languages: sw, en names: !include country-names/tz.yaml + postcode: + pattern: "ddddd" # Ukraine (Україна) @@ -1664,6 +1856,8 @@ ua: partition: 173 languages: uk names: !include country-names/ua.yaml + postcode: + pattern: "ddddd" # Uganda (Uganda) @@ -1693,6 +1887,8 @@ uy: partition: 174 languages: es names: !include country-names/uy.yaml + postcode: + pattern: "ddddd" # Uzbekistan (Oʻzbekiston) @@ -1700,6 +1896,8 @@ uz: partition: 157 languages: uz, kaa names: !include country-names/uz.yaml + postcode: + pattern: "dddddd" # Vatican City (Civitas Vaticana) @@ -1721,6 +1919,8 @@ ve: partition: 108 languages: es names: !include country-names/ve.yaml + postcode: + pattern: "dddd" # British Virgin Islands (British Virgin Islands) @@ -1742,6 +1942,8 @@ vn: partition: 75 languages: vi names: !include country-names/vn.yaml + postcode: + pattern: "ddddd" # Vanuatu (Vanuatu) @@ -1771,6 +1973,8 @@ xk: partition: 59 languages: sq, sr names: !include country-names/xk.yaml + postcode: + pattern: "ddddd" # Yemen (اليمن) @@ -1793,6 +1997,8 @@ za: partition: 76 languages: en, af, st, tn, xh, zu names: !include country-names/za.yaml + postcode: + pattern: "dddd" # Zambia (Zambia) @@ -1800,6 +2006,8 @@ zm: partition: 222 languages: en names: !include country-names/zm.yaml + postcode: + pattern: "dddd" # Zimbabwe (Zimbabwe) From 90d4d339dbed83cc90823401634f01a20e129548 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 19 
May 2022 15:49:36 +0200 Subject: [PATCH 04/30] initial postcode cleaner for simple patterns Moves postcodes that are either in countries without a postcode system or don't correspond to the local pattern for postcodes into a field for a normal address part. Makes them searchable but not as a special address. This has two consequences: they are no longer a skippable part of the address and the postcodes cannot be searched on their own. --- .../tokenizer/sanitizers/clean_postcodes.py | 99 +++++++++++++++++++ .../sanitizers/test_clean_postcodes.py | 54 ++++++++++ 2 files changed, 153 insertions(+) create mode 100644 nominatim/tokenizer/sanitizers/clean_postcodes.py create mode 100644 test/python/tokenizer/sanitizers/test_clean_postcodes.py diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py new file mode 100644 index 0000000000..b07908cdee --- /dev/null +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Sanitizer that filters postcodes by their officially allowed pattern. + +Arguments: + convert-to-address: If set to 'yes' (the default), then postcodes that do + not conform with their country-specific pattern are + converted to an address component. That means that + the postcode does not take part when computing the + postcode centroids of a country but is still searchable. + When set to 'no', non-conforming postcodes are not + searchable either. +""" +import re + +from nominatim.errors import UsageError +from nominatim.tools import country_info + +class _PostcodeMatcher: + """ Matches and formats a postcode according to the format definition. 
+ """ + def __init__(self, country_code, config): + if 'pattern' not in config: + raise UsageError("Field 'pattern' required for 'postcode' " + f"for country '{country_code}'") + + self.pattern = re.compile(config['pattern'].replace('d', '[0-9]') + .replace('l', '[A-Z]')) + + + def normalize(self, postcode): + """ Return the normalized version of the postcode. If the given postcode + does not correspond to the usage-pattern, return None. + """ + normalized = postcode.strip().upper() + + return normalized if self.pattern.fullmatch(normalized) else None + + +class _PostcodeSanitizer: + + def __init__(self, config): + self.convert_to_address = config.get_bool('convert-to-address', True) + # Objects without a country code can't have a postcode per definition. + self.country_without_postcode = {None} + self.country_matcher = {} + + for ccode, prop in country_info.iterate('postcode'): + if prop is False: + self.country_without_postcode.add(ccode) + elif isinstance(prop, dict): + self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop) + else: + raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'") + + + def __call__(self, obj): + if not obj.address: + return + + postcodes = [(i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode'] + + for pos, postcode in reversed(postcodes): # back to front: pop() must not shift pending indices + formatted = self.scan(postcode.name, obj.place.country_code) + + if formatted is None: + if self.convert_to_address: + postcode.kind = 'unofficial_postcode' + else: + obj.address.pop(pos) + else: + postcode.name = formatted + + + def scan(self, postcode, country): + """ Check the postcode for correct formatting and return the + normalized version. Returns None if the postcode does not + correspond to the official format of the given country.
+ """ + if country in self.country_without_postcode: + return None + + if country in self.country_matcher: + return self.country_matcher[country].normalize(postcode) + + return postcode.upper() + + + +def create(config): + """ Create a housenumber processing function. + """ + + return _PostcodeSanitizer(config) diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py new file mode 100644 index 0000000000..7cb3c70fae --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for the sanitizer that normalizes postcodes. +""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.indexer.place_info import PlaceInfo +from nominatim.tools import country_info + +@pytest.fixture +def sanitize(def_config, request): + country_info.setup_country_config(def_config) + sanitizer_args = {'step': 'clean-postcodes'} + for mark in request.node.iter_markers(name="sanitizer_params"): + sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()}) + + def _run(country=None, **kwargs): + pi = {'address': kwargs} + if country is not None: + pi['country_code'] = country + + _, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi)) + + return sorted([(p.kind, p.name) for p in address]) + + return _run + + +@pytest.mark.parametrize("country", (None, 'ae')) +def test_postcode_no_country(sanitize, country): + assert sanitize(country=country, postcode='23231') == [('unofficial_postcode', '23231')] + + +@pytest.mark.parametrize("country", (None, 'ae')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_no_country_drop(sanitize, country): + assert 
sanitize(country=country, postcode='23231') == [] + + +@pytest.mark.parametrize("postcode", ('12345', ' 34009 ')) +def test_postcode_pass_good_format(sanitize, postcode): + assert sanitize(country='de', postcode=postcode) == [('postcode', postcode.strip())] + + +@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_drop_bad_format(sanitize, postcode): + assert sanitize(country='de', postcode=postcode) == [] From 28ab2f6048eff33e6119271c9fd31852db64240a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 19 May 2022 16:26:51 +0200 Subject: [PATCH 05/30] add postcodes patterns without optional spaces --- settings/country_settings.yaml | 24 +++++++++++++++++++ .../sanitizers/test_clean_postcodes.py | 23 ++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 972e267042..684b0e44e9 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -82,6 +82,8 @@ ar: partition: 39 languages: es names: !include country-names/ar.yaml + postcode: + pattern: "l?dddd(?:lll)?" 
# (American Samoa) @@ -187,6 +189,8 @@ bh: partition: 62 languages: ar names: !include country-names/bh.yaml + postcode: + pattern: "d?ddd" # Burundi (Burundi) @@ -441,6 +445,8 @@ cy: partition: 114 languages: el, tr names: !include country-names/cy.yaml + postcode: + pattern: "(?:99|d)ddd" # Czechia (Česko) @@ -582,6 +588,8 @@ fk: partition: 91 languages: en names: !include country-names/fk.yaml + postcode: + pattern: "FIQQ 1ZZ" # Federated States of Micronesia (Micronesia) @@ -660,6 +668,8 @@ gh: partition: 211 languages: en names: !include country-names/gh.yaml + postcode: + pattern: "ll-d?ddd-dddd" # Gibraltar (Gibraltar) @@ -1005,6 +1015,8 @@ kz: partition: 94 languages: kk, ru names: !include country-names/kz.yaml + postcode: + pattern: "(?:lddldld|dddddd)" # Laos (ປະເທດລາວ) @@ -1111,6 +1123,8 @@ mc: partition: 242 languages: fr names: !include country-names/mc.yaml + postcode: + pattern: "980dd" # Moldova (Moldova) @@ -1494,6 +1508,8 @@ pw: partition: 195 languages: en, pau, ja, sov, tox names: !include country-names/pw.yaml + postcode: + pattern: "969(39|40)" # Paraguay (Paraguay) @@ -1646,6 +1662,8 @@ sm: partition: 153 languages: it names: !include country-names/sm.yaml + postcode: + pattern: "4789d" # Senegal (Sénégal) @@ -1717,6 +1735,8 @@ sz: partition: 82 languages: en, ss names: !include country-names/sz.yaml + postcode: + pattern: "lddd" # Turks and Caicos Islands (Turks and Caicos Islands) @@ -1873,6 +1893,8 @@ um: partition: 198 languages: en names: !include country-names/um.yaml + postcode: + pattern: "96898" # United States (United States) @@ -1905,6 +1927,8 @@ va: partition: 107 languages: it names: !include country-names/va.yaml + postcode: + pattern: "00120" # Saint Vincent and the Grenadines (Saint Vincent and the Grenadines) diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index 7cb3c70fae..d6371e075b 100644 --- 
a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -52,3 +52,26 @@ def test_postcode_pass_good_format(sanitize, postcode): @pytest.mark.sanitizer_params(convert_to_address=False) def test_postcode_drop_bad_format(sanitize, postcode): assert sanitize(country='de', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('1234', '9435', '99000')) +def test_postcode_cyprus_pass(sanitize, postcode): + assert sanitize(country='cy', postcode=postcode) == [('postcode', postcode)] + + +@pytest.mark.parametrize("postcode", ('91234', '99a45', '567')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_cyprus_fail(sanitize, postcode): + assert sanitize(country='cy', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('123456', 'A33F2G7')) +def test_postcode_kazakhstan_pass(sanitize, postcode): + assert sanitize(country='kz', postcode=postcode) == [('postcode', postcode)] + + +@pytest.mark.parametrize("postcode", ('V34T6Y923456', '99345')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_kazakhstan_fail(sanitize, postcode): + assert sanitize(country='kz', postcode=postcode) == [] + From baee6f3de09226c3dc41cb2314a0ac348e865561 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 23 May 2022 11:01:57 +0200 Subject: [PATCH 06/30] postcodes: strip leading country codes --- nominatim/tokenizer/sanitizers/clean_postcodes.py | 9 ++++++--- test/python/tokenizer/sanitizers/test_clean_postcodes.py | 8 +++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index b07908cdee..ae1cd62d8d 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -29,8 +29,9 @@ def __init__(self, country_code, config): raise UsageError("Field 'pattern' required for 'postcode' 
" f"for country '{country_code}'") - self.pattern = re.compile(config['pattern'].replace('d', '[0-9]') - .replace('l', '[A-Z]')) + pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') + + self.pattern = re.compile(f'(?:{country_code.upper()}[ -]?)?({pc_pattern})') def normalize(self, postcode): @@ -39,7 +40,9 @@ def normalize(self, postcode): """ normalized = postcode.strip().upper() - return normalized if self.pattern.fullmatch(normalized) else None + match = self.pattern.fullmatch(normalized) + + return match.group(1) if match else None class _PostcodeSanitizer: diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index d6371e075b..e5c07596a7 100644 --- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -43,12 +43,14 @@ def test_postcode_no_country_drop(sanitize, country): assert sanitize(country=country, postcode='23231') == [] -@pytest.mark.parametrize("postcode", ('12345', ' 34009 ')) +@pytest.mark.parametrize("postcode", ('12345', ' 12345 ', 'de 12345', + 'DE12345', 'DE 12345', 'DE-12345')) def test_postcode_pass_good_format(sanitize, postcode): - assert sanitize(country='de', postcode=postcode) == [('postcode', postcode.strip())] + assert sanitize(country='de', postcode=postcode) == [('postcode', '12345')] -@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....')) +@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....', + 'DE 12345', 'DEF12345', 'CH 12345')) @pytest.mark.sanitizer_params(convert_to_address=False) def test_postcode_drop_bad_format(sanitize, postcode): assert sanitize(country='de', postcode=postcode) == [] From 49626ba7091ea305616c03b397984add8a09e7d4 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 23 May 2022 11:10:35 +0200 Subject: [PATCH 07/30] add postcode formats with optional country code If the country code is not part of the mandatory 
output, the country code filter will do the correct handling. --- settings/country_settings.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 684b0e44e9..adb7593ed5 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -131,6 +131,8 @@ az: partition: 119 languages: az names: !include country-names/az.yaml + postcode: + pattern: "dddd" # Bosnia and Herzegovina (Bosna i Hercegovina / Босна и Херцеговина) @@ -513,6 +515,8 @@ ec: partition: 78 languages: es names: !include country-names/ec.yaml + postcode: + pattern: "dddddd" # Estonia (Eesti) @@ -606,6 +610,8 @@ fo: partition: 10 languages: fo, da names: !include country-names/fo.yaml + postcode: + pattern: "ddd" # France (France) @@ -804,6 +810,8 @@ ht: partition: 29 languages: fr, ht names: !include country-names/ht.yaml + postcode: + pattern: "dddd" # Hungary (Magyarország) @@ -1083,6 +1091,8 @@ lt: partition: 67 languages: lt names: !include country-names/lt.yaml + postcode: + pattern: "ddddd" # Luxembourg (Lëtzebuerg) From 9172696324d1a3cd489428d16c2c8d88cf9adaca Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 23 May 2022 14:04:22 +0200 Subject: [PATCH 08/30] postcodes: add support for optional spaces --- nominatim/tokenizer/sanitizers/clean_postcodes.py | 14 ++++++++++---- settings/country_settings.yaml | 9 +++++++++ .../tokenizer/sanitizers/test_clean_postcodes.py | 11 +++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index ae1cd62d8d..a968c9db07 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -31,18 +31,24 @@ def __init__(self, country_code, config): pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') - self.pattern = re.compile(f'(?:{country_code.upper()}[ 
-]?)?({pc_pattern})') + self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*') + self.pattern = re.compile(pc_pattern) + + self.output = config.get('output', r'\g<0>') def normalize(self, postcode): """ Return the normalized version of the postcode. If the given postcode does not correspond to the usage-pattern, return null. """ - normalized = postcode.strip().upper() + # Upper-case, strip spaces and leading country code. + normalized = self.norm_pattern.fullmatch(postcode.upper()) - match = self.pattern.fullmatch(normalized) + if normalized: + match = self.pattern.fullmatch(normalized.group(1)) + return match.expand(self.output) if match else None - return match.group(1) if match else None + return None class _PostcodeSanitizer: diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index adb7593ed5..f09de046fc 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -456,6 +456,9 @@ cz: partition: 124 languages: cs names: !include country-names/cz.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Germany (Deutschland) @@ -1618,6 +1621,9 @@ se: partition: 112 languages: sv names: !include country-names/se.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Singapore (Singapore) @@ -1657,6 +1663,9 @@ sk: partition: 172 languages: sk names: !include country-names/sk.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Sierra Leone (Sierra Leone) diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index e5c07596a7..228c2f3a1a 100644 --- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -77,3 +77,14 @@ def test_postcode_kazakhstan_pass(sanitize, postcode): def test_postcode_kazakhstan_fail(sanitize, postcode): assert sanitize(country='kz', postcode=postcode) == [] + +@pytest.mark.parametrize("postcode", ('675 34', 
'67534', 'SE-675 34', 'SE67534')) +def test_postcode_sweden_pass(sanitize, postcode): + assert sanitize(country='se', postcode=postcode) == [('postcode', '675 34')] + + +@pytest.mark.parametrize("postcode", ('67 345', '671123')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_sweden_fail(sanitize, postcode): + assert sanitize(country='se', postcode=postcode) == [] + From 9cf700e85d723736bf54334f0b1bd9e885cbb42a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 23 May 2022 16:11:16 +0200 Subject: [PATCH 09/30] add postcodes for most of the remaining countries Now includes all postcodes that have optional parts. --- settings/country_settings.yaml | 111 +++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index f09de046fc..67905ea2a3 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -3,6 +3,9 @@ ad: partition: 35 languages: ca names: !include country-names/ad.yaml + postcode: + pattern: "(ddd)" + output: AD\1 # United Arab Emirates (الإمارات العربية المتحدة) @@ -35,6 +38,9 @@ ai: partition: 175 languages: en names: !include country-names/ai.yaml + postcode: + pattern: "2640" + output: AI-2640 # Albania (Shqipëria) @@ -75,6 +81,7 @@ aq: partition: 181 languages: en, es, fr, ru names: !include country-names/aq.yaml + postcode: no # Argentina (Argentina) @@ -149,6 +156,9 @@ bb: partition: 206 languages: en names: !include country-names/bb.yaml + postcode: + pattern: "(ddddd)" + output: BB\1 # Bangladesh (Bangladesh) @@ -223,6 +233,9 @@ bm: partition: 176 languages: en names: !include country-names/bm.yaml + postcode: + pattern: "(ll)[ -]?(dd)" + output: \1 \2 # Brunei (Brunei) @@ -230,6 +243,9 @@ bn: partition: 86 languages: ms names: !include country-names/bn.yaml + postcode: + pattern: "(ll) ?(dddd)" + output: \1\2 # Bolivia (Bolivia) @@ -252,6 +268,9 @@ br: partition: 121 languages: pt names: !include 
country-names/br.yaml + postcode: + pattern: "(ddddd)-?(ddd)" + output: \1-\2 # The Bahamas (The Bahamas) @@ -308,6 +327,9 @@ ca: partition: 244 languages: en, fr names: !include country-names/ca.yaml + postcode: + pattern: "(ldl) ?(dld)" + output: \1 \2 # Cocos (Keeling) Islands (Cocos (Keeling) Islands) @@ -639,6 +661,9 @@ gb: partition: 1 languages: en names: !include country-names/gb.yaml + postcode: + pattern: "(l?ld[A-Z0-9]?) ?(dll)" + output: \1 \2 # Grenada (Grenada) @@ -670,6 +695,9 @@ gg: partition: 77 languages: en names: !include country-names/gg.yaml + postcode: + pattern: "(GYdd?) ?(dll)" + output: \1 \2 # Ghana (Ghana) @@ -686,6 +714,9 @@ gi: partition: 138 languages: en names: !include country-names/gi.yaml + postcode: + pattern: "(GX11) ?(1AA)" + output: GX11 1AA # Greenland (Kalaallit Nunaat) @@ -734,6 +765,9 @@ gr: partition: 22 languages: el names: !include country-names/gr.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # South Georgia and the South Sandwich Islands (South Georgia and the South Sandwich Islands) @@ -741,6 +775,9 @@ gs: partition: 44 languages: en names: !include country-names/gs.yaml + postcode: + pattern: "(SIQQ) ?(1ZZ)" + output: \1 \2 # Guatemala (Guatemala) @@ -840,6 +877,9 @@ ie: partition: 46 languages: en, ga names: !include country-names/ie.yaml + postcode: + pattern: "(ldd) ?([0123456789ACDEFHKNPRTVWXY]{4})" + output: \1 \2 # Israel (ישראל) @@ -856,6 +896,9 @@ im: partition: 190 languages: en names: !include country-names/im.yaml + postcode: + pattern: "(IMdd?) 
?(dll)" + output: \1 \2 # India (India) @@ -863,6 +906,9 @@ in: partition: 128 languages: hi, en names: !include country-names/in.yaml + postcode: + pattern: "(ddd) ?(ddd)" + output: \1\2 # British Indian Ocean Territory (British Indian Ocean Territory) @@ -870,6 +916,9 @@ io: partition: 13 languages: en names: !include country-names/io.yaml + postcode: + pattern: "(BBND) ?(1ZZ)" + output: \1 \2 # Iraq (العراق) @@ -886,6 +935,9 @@ ir: partition: 80 languages: fa names: !include country-names/ir.yaml + postcode: + pattern: "(ddddd)[-_ ]?(ddddd)" + output: \1-\2 # Iceland (Ísland) @@ -911,6 +963,9 @@ je: partition: 123 languages: en names: !include country-names/je.yaml + postcode: + pattern: "(JEdd?) ?(dll)" + output: \1 \2 # Jamaica (Jamaica) @@ -918,6 +973,7 @@ jm: partition: 214 languages: en names: !include country-names/jm.yaml + postcode: no # Jordan (الأردن) @@ -934,6 +990,9 @@ jp: partition: 11 languages: ja names: !include country-names/jp.yaml + postcode: + pattern: "(ddd)-?(dddd)" + output: \1-\2 # Kenya (Kenya) @@ -1019,6 +1078,9 @@ ky: partition: 38 languages: en names: !include country-names/ky.yaml + postcode: + pattern: "(d)-(dddd)" + output: KY\1-\2 # Kazakhstan (Қазақстан) @@ -1044,6 +1106,8 @@ lb: partition: 66 languages: ar, fr names: !include country-names/lb.yaml + postcode: + pattern: "(dddd)(?: ?dddd)?" 
# Saint Lucia (Saint Lucia) @@ -1051,6 +1115,9 @@ lc: partition: 146 languages: en names: !include country-names/lc.yaml + postcode: + pattern: "(dd) ?(ddd)" + output: LC\1 \2 # Liechtenstein (Liechtenstein) @@ -1112,6 +1179,9 @@ lv: partition: 162 languages: lv names: !include country-names/lv.yaml + postcode: + pattern: "(dddd)" + output: LV-\1 # Libya (ليبيا) @@ -1145,6 +1215,9 @@ md: partition: 147 languages: ro, ru, uk names: !include country-names/md.yaml + postcode: + pattern: "(dddd)" + output: MD-\1 # Montenegro (Crna Gora / Црна Гора) @@ -1258,6 +1331,9 @@ mt: partition: 165 languages: mt, en names: !include country-names/mt.yaml + postcode: + pattern: "(lll) ?(dddd)" + output: \1 \2 # Mauritius (Mauritius) @@ -1309,6 +1385,9 @@ mz: partition: 98 languages: pt names: !include country-names/mz.yaml + postcode: + pattern: "(dddd)-?(dd)?" + output: \1-\2 # Namibia (Namibia) @@ -1366,6 +1445,9 @@ nl: partition: 63 languages: nl names: !include country-names/nl.yaml + postcode: + pattern: "(dddd) ?(ll)" + output: \1 \2 # Norway (Norge) @@ -1477,6 +1559,9 @@ pl: partition: 168 languages: pl names: !include country-names/pl.yaml + postcode: + pattern: "(dd)[ -]?(ddd)" + output: \1-\2 # Saint Pierre and Miquelon (Saint-Pierre-et-Miquelon) @@ -1491,6 +1576,9 @@ pn: partition: 113 languages: en, pih names: !include country-names/pn.yaml + postcode: + pattern: "(PCRN) ?(1ZZ)" + output: \1 \2 # Puerto Rico (Puerto Rico) @@ -1514,6 +1602,8 @@ pt: partition: 34 languages: pt names: !include country-names/pt.yaml + postcode: + pattern: "dddd(?:-ddd)?" # Palau (Belau) @@ -1589,6 +1679,8 @@ sa: partition: 52 languages: ar names: !include country-names/sa.yaml + postcode: + pattern: "ddddd(?:-dddd)?" 
# Solomon Islands (Solomon Islands) @@ -1640,6 +1732,9 @@ sh: partition: 196 languages: en names: !include country-names/sh.yaml + postcode: + pattern: "(ASCN|STHL|TDCU) ?(1ZZ)" + output: \1 \2 # Slovenia (Slovenija) @@ -1699,6 +1794,9 @@ so: partition: 154 languages: so, ar names: !include country-names/so.yaml + postcode: + pattern: "(ll) ?(ddddd)" + output: \1 \2 # Suriname (Suriname) @@ -1763,6 +1861,9 @@ tc: partition: 106 languages: en names: !include country-names/tc.yaml + postcode: + pattern: "(TKCA) ?(1ZZ)" + output: \1 \2 # Chad (Tchad تشاد) @@ -1879,6 +1980,8 @@ tw: partition: 25 languages: zh-hant names: !include country-names/tw.yaml + postcode: + pattern: "ddd(?:ddd?)?" # Tanzania (Tanzania) @@ -1921,6 +2024,8 @@ us: partition: 2 languages: en names: !include country-names/us.yaml + postcode: + pattern: "(ddddd)(?:-dddd)?" # Uruguay (Uruguay) @@ -1955,6 +2060,9 @@ vc: partition: 171 languages: en names: !include country-names/vc.yaml + postcode: + pattern: "(dddd)" + output: VC\1 # Venezuela (Venezuela) @@ -1971,6 +2079,9 @@ vg: partition: 109 languages: en names: !include country-names/vg.yaml + postcode: + pattern: "(dddd)" + output: VG\1 # (United States Virgin Islands) From 5ba75df507617162907c2b42a7825ee406218582 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 24 May 2022 17:11:40 +0200 Subject: [PATCH 10/30] postcode: generate a generic form --- .../tokenizer/sanitizers/clean_postcodes.py | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index a968c9db07..42beea37fe 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -37,20 +37,28 @@ def __init__(self, country_code, config): self.output = config.get('output', r'\g<0>') - def normalize(self, postcode): - """ Return the normalized version of the postcode. 
If the given postcode - does not correspond to the usage-pattern, return null. + def match(self, postcode): + """ Match the given postcode against the postcode pattern for this + matcher. Returns a `re.Match` object if the match was successful + and None otherwise. """ # Upper-case, strip spaces and leading country code. normalized = self.norm_pattern.fullmatch(postcode.upper()) if normalized: - match = self.pattern.fullmatch(normalized.group(1)) - return match.expand(self.output) if match else None + return self.pattern.fullmatch(normalized.group(1)) return None + def normalize(self, match): + """ Return the default format of the postcode for the given match. + `match` must be a `re.Match` object previously returned by + `match()` + """ + return match.expand(self.output) + + class _PostcodeSanitizer: def __init__(self, config): @@ -83,7 +91,8 @@ def __call__(self, obj): else: obj.address.pop(pos) else: - postcode.name = formatted + postcode.name = formatted[0] + postcode.set_attr('lookup', formatted[1]) def scan(self, postcode, country): @@ -94,10 +103,14 @@ def scan(self, postcode, country): if country in self.country_without_postcode: return None - if country in self.country_matcher: - return self.country_matcher[country].normalize(postcode) + matcher = self.country_matcher.get(country) + if matcher is not None: + match = matcher.match(postcode) + if match is None: + return None + return matcher.normalize(match), ' '.join(match.groups()) - return postcode.upper() + return postcode.upper(), '' From 18864afa8aee710a5aa7fe65565711119ca7a663 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 24 May 2022 18:25:37 +0200 Subject: [PATCH 11/30] postcodes: introduce a default pattern for countries without postcodes --- .../tokenizer/sanitizers/clean_postcodes.py | 22 +++++++++++++------ settings/icu_tokenizer.yaml | 1 + .../sanitizers/test_clean_postcodes.py | 12 ++++++++++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git 
a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index 42beea37fe..c6292a2942 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -75,6 +75,12 @@ def __init__(self, config): else: raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'") + default_pattern = config.get('default-pattern') + if default_pattern is not None and isinstance(default_pattern, str): + self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern}) + else: + self.default_matcher = None + def __call__(self, obj): if not obj.address: @@ -103,14 +109,16 @@ def scan(self, postcode, country): if country in self.country_without_postcode: return None - matcher = self.country_matcher.get(country) - if matcher is not None: - match = matcher.match(postcode) - if match is None: - return None - return matcher.normalize(match), ' '.join(match.groups()) + matcher = self.country_matcher.get(country, self.default_matcher) + if matcher is None: + return postcode.upper(), '' + + match = matcher.match(postcode) + if match is None: + return None + + return matcher.normalize(match), ' '.join(match.groups()) - return postcode.upper(), '' diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 544bd81db0..f682bbcdf8 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -34,6 +34,7 @@ sanitizers: - (\A|.*,)[^\d,]{3,}(,.*|\Z) - step: clean-postcodes convert-to-address: yes + default-pattern: [A-Z0-9- ]{3,12} - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index 228c2f3a1a..4437619625 100644 --- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -88,3 +88,15 @@ def 
test_postcode_sweden_pass(sanitize, postcode): def test_postcode_sweden_fail(sanitize, postcode): assert sanitize(country='se', postcode=postcode) == [] + +@pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44')) +@pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}') +def test_postcode_default_pattern_pass(sanitize, postcode): + assert sanitize(country='an', postcode=postcode) == [('postcode', postcode.upper())] + + +@pytest.mark.parametrize("postcode", ('C', '12', 'ABC123DEF 456', '1234,5678', '11223;11224')) +@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}') +def test_postcode_default_pattern_fail(sanitize, postcode): + assert sanitize(country='an', postcode=postcode) == [] + From ca7b46511d41d67e229f758e638367c241815c11 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 24 May 2022 21:45:06 +0200 Subject: [PATCH 12/30] introduce and use analyzer for postcodes --- lib-sql/tokenizer/icu_tokenizer.sql | 23 ++++++++ nominatim/tokenizer/icu_tokenizer.py | 51 +++++++++++------- .../tokenizer/sanitizers/clean_postcodes.py | 2 +- .../tokenizer/token_analysis/postcodes.py | 54 +++++++++++++++++++ settings/icu_tokenizer.yaml | 4 +- 5 files changed, 114 insertions(+), 20 deletions(-) create mode 100644 nominatim/tokenizer/token_analysis/postcodes.py diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index a3dac8ddcb..f323334b88 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -223,3 +223,26 @@ BEGIN END; $$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[]) + RETURNS BOOLEAN + AS $$ +DECLARE + existing INTEGER; +BEGIN + SELECT count(*) INTO existing + FROM word WHERE word = postcode and type = 'P'; + + IF existing > 0 THEN + RETURN TRUE; + END IF; + + -- postcodes don't need word ids + INSERT INTO word (word_token, type, word) + SELECT lookup_term, 'P', postcode 
FROM unnest(lookup_terms) as lookup_term; + + RETURN FALSE; +END; +$$ +LANGUAGE plpgsql; + diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 4678af66eb..e9812ba043 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -11,7 +11,6 @@ import itertools import json import logging -import re from textwrap import dedent from nominatim.db.connection import connect @@ -473,7 +472,7 @@ def process_place(self, place): def _process_place_address(self, token_info, address): for item in address: if item.kind == 'postcode': - self._add_postcode(item.name) + token_info.set_postcode(self._add_postcode(item)) elif item.kind == 'housenumber': token_info.add_housenumber(*self._compute_housenumber_token(item)) elif item.kind == 'street': @@ -605,26 +604,36 @@ def _compute_name_tokens(self, names): return full_tokens, partial_tokens - def _add_postcode(self, postcode): + def _add_postcode(self, item): """ Make sure the normalized postcode is present in the word table. 
""" - if re.search(r'[:,;]', postcode) is None: - postcode = self.normalize_postcode(postcode) + analyzer = self.token_analysis.get_analyzer('@postcode') - if postcode not in self._cache.postcodes: - term = self._search_normalized(postcode) - if not term: - return + if analyzer is None: + postcode_name = item.name.strip().upper() + variant_base = None + else: + postcode_name = analyzer.normalize(item.name) + variant_base = item.get_attr("variant") - with self.conn.cursor() as cur: - # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word_token, type, word) - (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) - WHERE NOT EXISTS - (SELECT * FROM word - WHERE type = 'P' and word = pc)) - """, (term, postcode)) - self._cache.postcodes.add(postcode) + if variant_base is not None: + postcode = f'{postcode_name}@{variant_base}' + else: + postcode = postcode_name + + if postcode not in self._cache.postcodes: + term = self._search_normalized(postcode_name) + if not term: + return + + variants = {term} + if analyzer is not None and variant_base is not None: + variants.update(analyzer.get_variants_ascii(variant_base)) + + with self.conn.cursor() as cur: + cur.execute("SELECT create_postcode_word(%s, %s)", + (postcode, list(variants))) + self._cache.postcodes.add(postcode) class _TokenInfo: @@ -637,6 +646,7 @@ def __init__(self): self.street_tokens = set() self.place_tokens = set() self.address_tokens = {} + self.postcode = None @staticmethod @@ -701,6 +711,11 @@ def add_address_term(self, key, partials): if partials: self.address_tokens[key] = self._mk_array(partials) + def set_postcode(self, postcode): + """ Set the postcode to the given one. + """ + self.postcode = postcode + class _TokenCache: """ Cache for token information to avoid repeated database queries. 
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index c6292a2942..d1edc60d1e 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -98,7 +98,7 @@ def __call__(self, obj): obj.address.pop(pos) else: postcode.name = formatted[0] - postcode.set_attr('lookup', formatted[1]) + postcode.set_attr('variant', formatted[1]) def scan(self, postcode, country): diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py new file mode 100644 index 0000000000..e105b132da --- /dev/null +++ b/nominatim/tokenizer/token_analysis/postcodes.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Specialized processor for postcodes. Supports a 'lookup' variant of the +token, which produces variants with optional spaces. +""" + +from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator + +### Configuration section + +def configure(rules, normalization_rules): # pylint: disable=W0613 + """ All behaviour is currently hard-coded. + """ + return None + +### Analysis section + +def create(normalizer, transliterator, config): # pylint: disable=W0613 + """ Create a new token analysis instance for this module. + """ + return PostcodeTokenAnalysis(normalizer, transliterator) + +class PostcodeTokenAnalysis: + """ Detects common housenumber patterns and normalizes them. + """ + def __init__(self, norm, trans): + self.norm = norm + self.trans = trans + + self.mutator = MutationVariantGenerator(' ', (' ', '')) + + + def normalize(self, name): + """ Return the standard form of the postcode. 
+ """ + return name.strip().upper() + + + def get_variants_ascii(self, norm_name): + """ Compute the spelling variants for the given normalized postcode. + + The official form creates one variant. If a 'lookup version' is + given, then it will create variants with optional spaces. + """ + # Postcodes follow their own transliteration rules. + # Make sure at this point, that the terms are normalized in a way + # that they are searchable with the standard transliteration rules. + return [self.trans.transliterate(term) for term in + self.mutator.generate([self.norm.transliterate(norm_name)])] diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index f682bbcdf8..212fdcb9e2 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -34,7 +34,7 @@ sanitizers: - (\A|.*,)[^\d,]{3,}(,.*|\Z) - step: clean-postcodes convert-to-address: yes - default-pattern: [A-Z0-9- ]{3,12} + default-pattern: "[A-Z0-9- ]{3,12}" - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language @@ -46,6 +46,8 @@ token-analysis: - analyzer: generic - id: "@housenumber" analyzer: housenumbers + - id: "@postcode" + analyzer: postcodes - id: bg analyzer: generic mode: variant-only From b7704833e4b011541928372a46ea692c3a496b5c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 3 Jun 2022 17:12:01 +0200 Subject: [PATCH 13/30] icu: switch postcodes to using the pre-formatted one --- lib-sql/functions/interpolation.sql | 19 +++++++++---------- lib-sql/functions/placex_triggers.sql | 5 ++--- lib-sql/tokenizer/icu_tokenizer.sql | 7 +++++++ nominatim/tokenizer/icu_tokenizer.py | 3 +++ 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql index c8cfbcc68c..3a99471101 100644 --- a/lib-sql/functions/interpolation.sql +++ b/lib-sql/functions/interpolation.sql @@ -156,7 +156,6 @@ DECLARE linegeo GEOMETRY; splitline GEOMETRY; sectiongeo GEOMETRY; - interpol_postcode 
TEXT; postcode TEXT; stepmod SMALLINT; BEGIN @@ -174,8 +173,6 @@ BEGIN ST_PointOnSurface(NEW.linegeo), NEW.linegeo); - interpol_postcode := token_normalized_postcode(NEW.address->'postcode'); - NEW.token_info := token_strip_info(NEW.token_info); IF NEW.address ? '_inherited' THEN NEW.address := hstore('interpolation', NEW.address->'interpolation'); @@ -207,6 +204,11 @@ BEGIN FOR nextnode IN SELECT DISTINCT ON (nodeidpos) osm_id, address, geometry, + -- Take the postcode from the node only if it has a housenumber itself. + -- Note that there is a corner-case where the node has a wrongly + -- formatted postcode and therefore 'postcode' contains a derived + -- variant. + CASE WHEN address ? 'postcode' THEN placex.postcode ELSE NULL::text END as postcode, substring(address->'housenumber','[0-9]+')::integer as hnr FROM placex, generate_series(1, array_upper(waynodes, 1)) nodeidpos WHERE osm_type = 'N' and osm_id = waynodes[nodeidpos]::BIGINT @@ -260,13 +262,10 @@ BEGIN endnumber := newend; -- determine postcode - postcode := coalesce(interpol_postcode, - token_normalized_postcode(prevnode.address->'postcode'), - token_normalized_postcode(nextnode.address->'postcode'), - postcode); - IF postcode is NULL THEN - SELECT token_normalized_postcode(placex.postcode) - FROM placex WHERE place_id = NEW.parent_place_id INTO postcode; + postcode := coalesce(prevnode.postcode, nextnode.postcode, postcode); + IF postcode is NULL and NEW.parent_place_id > 0 THEN + SELECT placex.postcode FROM placex + WHERE place_id = NEW.parent_place_id INTO postcode; END IF; IF postcode is NULL THEN postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry); diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 6143a1edae..1f7e6dc61a 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -992,7 +992,7 @@ BEGIN {% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %} -- determine 
postcode - NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'), + NEW.postcode := coalesce(token_get_postcode(NEW.token_info), location.postcode, get_nearest_postcode(NEW.country_code, NEW.centroid)); @@ -1150,8 +1150,7 @@ BEGIN {% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %} - NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'), - NEW.postcode); + NEW.postcode := coalesce(token_get_postcode(NEW.token_info), NEW.postcode); -- if we have a name add this to the name search table IF NEW.name IS NOT NULL THEN diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index f323334b88..f86a0a3794 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -104,6 +104,13 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; +CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) + RETURNS TEXT +AS $$ + SELECT info->>'postcode'; +$$ LANGUAGE SQL IMMUTABLE STRICT; + + -- Return token info that should be saved permanently in the database. 
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB) RETURNS JSONB diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index e9812ba043..61c47c1188 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -675,6 +675,9 @@ def to_dict(self): if self.address_tokens: out['addr'] = self.address_tokens + if self.postcode: + out['postcode'] = self.postcode + return out From 4885fdf0f97d0615027fa6b2ed410e75ae1a2e20 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Jun 2022 09:49:00 +0200 Subject: [PATCH 14/30] add class for online centroid computation --- nominatim/utils/__init__.py | 0 nominatim/utils/centroid.py | 48 +++++++++++++++++++++++++ test/python/utils/test_centroid.py | 56 ++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 nominatim/utils/__init__.py create mode 100644 nominatim/utils/centroid.py create mode 100644 test/python/utils/test_centroid.py diff --git a/nominatim/utils/__init__.py b/nominatim/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/nominatim/utils/centroid.py b/nominatim/utils/centroid.py new file mode 100644 index 0000000000..c2bd61927e --- /dev/null +++ b/nominatim/utils/centroid.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Functions for computation of centroids. +""" +from collections.abc import Collection + +class PointsCentroid: + """ Centroid computation from single points using an online algorithm. + More points may be added at any time. + + Coordinates are internally treated as a 7-digit fixed-point float + (i.e. in OSM style). + """ + + def __init__(self): + self.sum_x = 0 + self.sum_y = 0 + self.count = 0 + + def centroid(self): + """ Return the centroid of all points collected so far. 
+ """ + if self.count == 0: + raise ValueError("No points available for centroid.") + + return (float(self.sum_x/self.count)/10000000, + float(self.sum_y/self.count)/10000000) + + + def __len__(self): + return self.count + + + def __iadd__(self, other): + if isinstance(other, Collection) and len(other) == 2: + if all(isinstance(p, (float, int)) for p in other): + x, y = other + self.sum_x += int(x * 10000000) + self.sum_y += int(y * 10000000) + self.count += 1 + return self + + raise ValueError("Can only add 2-element tuples to centroid.") diff --git a/test/python/utils/test_centroid.py b/test/python/utils/test_centroid.py new file mode 100644 index 0000000000..63d967e756 --- /dev/null +++ b/test/python/utils/test_centroid.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for centroid computation. 
+""" +import pytest + +from nominatim.utils.centroid import PointsCentroid + +def test_empty_set(): + c = PointsCentroid() + + with pytest.raises(ValueError, match='No points'): + c.centroid() + + +@pytest.mark.parametrize("centroid", [(0,0), (-1, 3), [0.0000032, 88.4938]]) +def test_one_point_centroid(centroid): + c = PointsCentroid() + + c += centroid + + assert len(c.centroid()) == 2 + assert c.centroid() == (pytest.approx(centroid[0]), pytest.approx(centroid[1])) + + +def test_multipoint_centroid(): + c = PointsCentroid() + + c += (20.0, -10.0) + assert c.centroid() == (pytest.approx(20.0), pytest.approx(-10.0)) + c += (20.2, -9.0) + assert c.centroid() == (pytest.approx(20.1), pytest.approx(-9.5)) + c += (20.2, -9.0) + assert c.centroid() == (pytest.approx(20.13333), pytest.approx(-9.333333)) + + +def test_manypoint_centroid(): + c = PointsCentroid() + + for _ in range(10000): + c += (4.564732, -0.000034) + + assert c.centroid() == (pytest.approx(4.564732), pytest.approx(-0.000034)) + + +@pytest.mark.parametrize("param", ["aa", None, 5, [1, 2, 3], (3, None), ("a", 3.9)]) +def test_add_non_tuple(param): + c = PointsCentroid() + + with pytest.raises(ValueError, match='2-element tuples'): + c += param From bf86b45178ff69dbe87942840543d67168577401 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Jun 2022 10:46:48 +0200 Subject: [PATCH 15/30] move postcode centroid computation to Python --- nominatim/tools/postcodes.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index 2b7027e721..42f54cea3e 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -8,6 +8,7 @@ Functions for importing, updating and otherwise maintaining the table of artificial postcode centroids. 
""" +from collections import defaultdict import csv import gzip import logging @@ -16,6 +17,7 @@ from psycopg2 import sql as pysql from nominatim.db.connection import connect +from nominatim.utils.centroid import PointsCentroid LOG = logging.getLogger() @@ -36,14 +38,14 @@ class _CountryPostcodesCollector: def __init__(self, country): self.country = country - self.collected = {} + self.collected = defaultdict(PointsCentroid) def add(self, postcode, x, y): """ Add the given postcode to the collection cache. If the postcode already existed, it is overwritten with the new centroid. """ - self.collected[postcode] = (x, y) + self.collected[postcode] += (x, y) def commit(self, conn, analyzer, project_dir): @@ -93,16 +95,16 @@ def _compute_changes(self, conn): WHERE country_code = %s""", (self.country, )) for postcode, x, y in cur: - newx, newy = self.collected.pop(postcode, (None, None)) - if newx is not None: - dist = (x - newx)**2 + (y - newy)**2 - if dist > 0.0000001: + pcobj = self.collected.pop(postcode, None) + if pcobj: + newx, newy = pcobj.centroid() + if (x - newx) > 0.0000001 or (y - newy) > 0.0000001: to_update.append((postcode, newx, newy)) else: to_delete.append(postcode) - to_add = [(k, v[0], v[1]) for k, v in self.collected.items()] - self.collected = [] + to_add = [(k, *v.centroid()) for k, v in self.collected.items()] + self.collected = None return to_add, to_delete, to_update @@ -125,8 +127,10 @@ def _update_from_external(self, analyzer, project_dir): postcode = analyzer.normalize_postcode(row['postcode']) if postcode not in self.collected: try: - self.collected[postcode] = (_to_float(row['lon'], 180), - _to_float(row['lat'], 90)) + # Do float conversation separately, it might throw + centroid = (_to_float(row['lon'], 180), + _to_float(row['lat'], 90)) + self.collected[postcode] += centroid except ValueError: LOG.warning("Bad coordinates %s, %s in %s country postcode file.", row['lat'], row['lon'], self.country) @@ -174,12 +178,10 @@ def 
update_postcodes(dsn, project_dir, tokenizer): COALESCE(plx.country_code, get_country_code(ST_Centroid(pl.geometry))) as cc, token_normalized_postcode(pl.address->'postcode') as pc, - ST_Centroid(ST_Collect(COALESCE(plx.centroid, - ST_Centroid(pl.geometry)))) as centroid + COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid FROM place AS pl LEFT OUTER JOIN placex AS plx ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type - WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null - GROUP BY cc, pc) xx + WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx WHERE pc IS NOT null AND cc IS NOT null ORDER BY country_code, pc""") From 80ea13437df4c6d57ea503adbdfc9928de8d859c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Jun 2022 23:37:04 +0200 Subject: [PATCH 16/30] move postcode matcher in a separate file --- nominatim/data/__init__.py | 0 nominatim/data/postcode_format.py | 97 +++++++++++++++++++ nominatim/tokenizer/icu_tokenizer.py | 2 +- .../tokenizer/sanitizers/clean_postcodes.py | 70 +------------ test/python/tokenizer/test_icu.py | 7 -- 5 files changed, 103 insertions(+), 73 deletions(-) create mode 100644 nominatim/data/__init__.py create mode 100644 nominatim/data/postcode_format.py diff --git a/nominatim/data/__init__.py b/nominatim/data/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py new file mode 100644 index 0000000000..0158111ada --- /dev/null +++ b/nominatim/data/postcode_format.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Functions for formatting postcodes according to their country-specific +format. 
+""" +import re + +from nominatim.errors import UsageError +from nominatim.tools import country_info + +class CountryPostcodeMatcher: + """ Matches and formats a postcode according to a format definition + of the given country. + """ + def __init__(self, country_code, config): + if 'pattern' not in config: + raise UsageError("Field 'pattern' required for 'postcode' " + f"for country '{country_code}'") + + pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') + + self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*') + self.pattern = re.compile(pc_pattern) + + self.output = config.get('output', r'\g<0>') + + + def match(self, postcode): + """ Match the given postcode against the postcode pattern for this + matcher. Returns a `re.Match` object if the match was successful + and None otherwise. + """ + # Upper-case, strip spaces and leading country code. + normalized = self.norm_pattern.fullmatch(postcode.upper()) + + if normalized: + return self.pattern.fullmatch(normalized.group(1)) + + return None + + + def normalize(self, match): + """ Return the default format of the postcode for the given match. + `match` must be a `re.Match` object previously returned by + `match()` + """ + return match.expand(self.output) + + +class PostcodeFormatter: + """ Container for different postcode formats of the world and + access functions. + """ + def __init__(self): + # Objects without a country code can't have a postcode per definition. 
+ self.country_without_postcode = {None} + self.country_matcher = {} + self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'}) + + for ccode, prop in country_info.iterate('postcode'): + if prop is False: + self.country_without_postcode.add(ccode) + elif isinstance(prop, dict): + self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop) + else: + raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'") + + + def set_default_pattern(self, pattern): + """ Set the postcode match pattern to use, when a country does not + have a specific pattern or is marked as country without postcode. + """ + self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern}) + + + def match(self, country_code, postcode): + """ Match the given postcode against the postcode pattern for this + matcher. Returns a `re.Match` object if the country has a pattern + and the match was successful or None if the match failed. + """ + if country_code in self.country_without_postcode: + return None + + return self.country_matcher.get(country_code, self.default_matcher).match(postcode) + + + def normalize(self, country_code, match): + """ Return the default format of the postcode for the given match. + `match` must be a `re.Match` object previously returned by + `match()` + """ + return self.country_matcher.get(country_code, self.default_matcher).normalize(match) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 61c47c1188..0dc551e1b4 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -607,7 +607,7 @@ def _compute_name_tokens(self, names): def _add_postcode(self, item): """ Make sure the normalized postcode is present in the word table. 
""" - analyzer = self.token_analysis.get_analyzer('@postcode') + analyzer = self.token_analysis.analysis.get('@postcode') if analyzer is None: postcode_name = item.name.strip().upper() diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index d1edc60d1e..fbc46fa582 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -16,70 +16,17 @@ When set to 'no', non-conforming postcodes are not searchable either. """ -import re - -from nominatim.errors import UsageError -from nominatim.tools import country_info - -class _PostcodeMatcher: - """ Matches and formats a postcode according to the format definition. - """ - def __init__(self, country_code, config): - if 'pattern' not in config: - raise UsageError("Field 'pattern' required for 'postcode' " - f"for country '{country_code}'") - - pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') - - self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*') - self.pattern = re.compile(pc_pattern) - - self.output = config.get('output', r'\g<0>') - - - def match(self, postcode): - """ Match the given postcode against the postcode pattern for this - matcher. Returns a `re.Match` object if the match was successful - and None otherwise. - """ - # Upper-case, strip spaces and leading country code. - normalized = self.norm_pattern.fullmatch(postcode.upper()) - - if normalized: - return self.pattern.fullmatch(normalized.group(1)) - - return None - - - def normalize(self, match): - """ Return the default format of the postcode for the given match. 
- `match` must be a `re.Match` object previously returned by - `match()` - """ - return match.expand(self.output) - +from nominatim.data.postcode_format import PostcodeFormatter class _PostcodeSanitizer: def __init__(self, config): self.convert_to_address = config.get_bool('convert-to-address', True) - # Objects without a country code can't have a postcode per definition. - self.country_without_postcode = {None} - self.country_matcher = {} - - for ccode, prop in country_info.iterate('postcode'): - if prop is False: - self.country_without_postcode.add(ccode) - elif isinstance(prop, dict): - self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop) - else: - raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'") + self.matcher = PostcodeFormatter() default_pattern = config.get('default-pattern') if default_pattern is not None and isinstance(default_pattern, str): - self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern}) - else: - self.default_matcher = None + self.matcher.set_default_pattern(default_pattern) def __call__(self, obj): @@ -106,18 +53,11 @@ def scan(self, postcode, country): normalized version. Returns None if the postcode does not correspond to the oficial format of the given country. 
""" - if country in self.country_without_postcode: - return None - - matcher = self.country_matcher.get(country, self.default_matcher) - if matcher is None: - return postcode.upper(), '' - - match = matcher.match(postcode) + match = self.matcher.match(country, postcode) if match is None: return None - return matcher.normalize(match), ' '.join(match.groups()) + return self.matcher.normalize(country, match), ' '.join(match.groups()) diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index d85a5b65e5..6138a03a42 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -437,13 +437,6 @@ def test_process_place_postcode(self, word_table, pcode): assert word_table.get_postcodes() == {pcode, } - @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836']) - def test_process_place_bad_postcode(self, word_table, pcode): - self.process_address(postcode=pcode) - - assert not word_table.get_postcodes() - - @pytest.mark.parametrize('hnr', ['123a', '1', '101']) def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id): info = self.process_address(housenumber=hnr) From b5e5efc131a29a46a4c5d57f02f7c6b50126f86f Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Jun 2022 23:44:51 +0200 Subject: [PATCH 17/30] only add well-formatted postcodes to location table --- nominatim/tools/postcodes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index 42f54cea3e..dad1edff7f 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -18,6 +18,7 @@ from nominatim.db.connection import connect from nominatim.utils.centroid import PointsCentroid +from nominatim.data.postcode_format import PostcodeFormatter LOG = logging.getLogger() @@ -162,6 +163,7 @@ def update_postcodes(dsn, project_dir, tokenizer): potentially enhances it with external data and then updates the postcodes in the table 
'location_postcode'. """ + matcher = PostcodeFormatter() with tokenizer.name_analyzer() as analyzer: with connect(dsn) as conn: # First get the list of countries that currently have postcodes. @@ -193,7 +195,9 @@ def update_postcodes(dsn, project_dir, tokenizer): collector.commit(conn, analyzer, project_dir) collector = _CountryPostcodesCollector(country) todo_countries.discard(country) - collector.add(postcode, x, y) + match = matcher.match(country, postcode) + if match: + collector.add(matcher.normalize(country, match), x, y) if collector is not None: collector.commit(conn, analyzer, project_dir) From 2eca9fc8aff8fc7bc3ab4b7e4bf262686a5a6a5c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 7 Jun 2022 12:08:22 +0200 Subject: [PATCH 18/30] cache postcode normalization --- nominatim/data/postcode_format.py | 12 ++++++++++++ nominatim/tools/postcodes.py | 23 ++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py index 0158111ada..6ae43b7d50 100644 --- a/nominatim/data/postcode_format.py +++ b/nominatim/data/postcode_format.py @@ -78,6 +78,18 @@ def set_default_pattern(self, pattern): self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern}) + def get_matcher(self, country_code): + """ Return the CountryPostcodeMatcher for the given country. + Returns None if the country doesn't have a postcode and the + default matcher if there is no specific matcher configured for + the country. + """ + if country_code in self.country_without_postcode: + return None + + return self.country_matcher.get(country_code, self.default_matcher) + + def match(self, country_code, postcode): """ Match the given postcode against the postcode pattern for this matcher. 
Returns a `re.Match` object if the country has a pattern diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index dad1edff7f..26b96099a9 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -37,16 +37,27 @@ class _CountryPostcodesCollector: """ Collector for postcodes of a single country. """ - def __init__(self, country): + def __init__(self, country, matcher): self.country = country + self.matcher = matcher self.collected = defaultdict(PointsCentroid) + self.normalization_cache = None def add(self, postcode, x, y): """ Add the given postcode to the collection cache. If the postcode already existed, it is overwritten with the new centroid. """ - self.collected[postcode] += (x, y) + if self.matcher is not None: + if self.normalization_cache and self.normalization_cache[0] == postcode: + normalized = self.normalization_cache[1] + else: + match = self.matcher.match(postcode) + normalized = self.matcher.normalize(match) if match else None + self.normalization_cache = (postcode, normalized) + + if normalized: + self.collected[normalized] += (x, y) def commit(self, conn, analyzer, project_dir): @@ -193,18 +204,16 @@ def update_postcodes(dsn, project_dir, tokenizer): if collector is None or country != collector.country: if collector is not None: collector.commit(conn, analyzer, project_dir) - collector = _CountryPostcodesCollector(country) + collector = _CountryPostcodesCollector(country, matcher.get_matcher(country)) todo_countries.discard(country) - match = matcher.match(country, postcode) - if match: - collector.add(matcher.normalize(country, match), x, y) + collector.add(postcode, x, y) if collector is not None: collector.commit(conn, analyzer, project_dir) # Now handle any countries that are only in the postcode table. 
for country in todo_countries: - _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir) + _CountryPostcodesCollector(country, matcher.get_matcher(country)).commit(conn, analyzer, project_dir) conn.commit() From 67dfa38e608a6e63dbcae40530c46a56971cca0a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 8 Jun 2022 06:33:11 +0200 Subject: [PATCH 19/30] fix liniting problems --- nominatim/tools/postcodes.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index 26b96099a9..27fbcc9b25 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -33,7 +33,7 @@ def _to_float(num, max_value): return num -class _CountryPostcodesCollector: +class _PostcodeCollector: """ Collector for postcodes of a single country. """ @@ -204,7 +204,7 @@ def update_postcodes(dsn, project_dir, tokenizer): if collector is None or country != collector.country: if collector is not None: collector.commit(conn, analyzer, project_dir) - collector = _CountryPostcodesCollector(country, matcher.get_matcher(country)) + collector = _PostcodeCollector(country, matcher.get_matcher(country)) todo_countries.discard(country) collector.add(postcode, x, y) @@ -213,7 +213,8 @@ def update_postcodes(dsn, project_dir, tokenizer): # Now handle any countries that are only in the postcode table. 
for country in todo_countries: - _CountryPostcodesCollector(country, matcher.get_matcher(country)).commit(conn, analyzer, project_dir) + fmt = matcher.get_matcher(country) + _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir) conn.commit() From 7b6ec4fc6cf07f924025b5d2f63053c54e3f6fa4 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 8 Jun 2022 07:24:53 +0200 Subject: [PATCH 20/30] add tests for discarding bad postcodes --- test/python/tools/test_postcodes.py | 46 ++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/test/python/tools/test_postcodes.py b/test/python/tools/test_postcodes.py index bdfe309471..0c4b93fcac 100644 --- a/test/python/tools/test_postcodes.py +++ b/test/python/tools/test_postcodes.py @@ -11,7 +11,7 @@ import pytest -from nominatim.tools import postcodes +from nominatim.tools import postcodes, country_info import dummy_tokenizer class MockPostcodeTable: @@ -64,11 +64,26 @@ def row_set(self): def tokenizer(): return dummy_tokenizer.DummyTokenizer(None, None) + @pytest.fixture -def postcode_table(temp_db_conn, placex_table): +def postcode_table(def_config, temp_db_conn, placex_table): + country_info.setup_country_config(def_config) return MockPostcodeTable(temp_db_conn) +@pytest.fixture +def insert_implicit_postcode(placex_table, place_row): + """ + Inserts data into the placex and place table + which can then be used to compute one postcode. 
+ """ + def _insert_implicit_postcode(osm_id, country, geometry, address): + placex_table.add(osm_id=osm_id, country=country, geom=geometry) + place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address) + + return _insert_implicit_postcode + + def test_postcodes_empty(dsn, postcode_table, place_table, tmp_path, tokenizer): postcodes.update_postcodes(dsn, tmp_path, tokenizer) @@ -193,27 +208,30 @@ def test_can_compute(dsn, table_factory): table_factory('place') assert postcodes.can_compute(dsn) + def test_no_placex_entry(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer): #Rewrite the get_country_code function to verify its execution. temp_db_cursor.execute(""" CREATE OR REPLACE FUNCTION get_country_code(place geometry) RETURNS TEXT AS $$ BEGIN - RETURN 'fr'; + RETURN 'yy'; END; $$ LANGUAGE plpgsql; """) place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511')) postcodes.update_postcodes(dsn, tmp_path, tokenizer) - assert postcode_table.row_set == {('fr', 'AB 4511', 10, 12)} + assert postcode_table.row_set == {('yy', 'AB 4511', 10, 12)} -@pytest.fixture -def insert_implicit_postcode(placex_table, place_row): - """ - Inserts data into the placex and place table - which can then be used to compute one postcode. - """ - def _insert_implicit_postcode(osm_id, country, geometry, address): - placex_table.add(osm_id=osm_id, country=country, geom=geometry) - place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address) - return _insert_implicit_postcode +def test_discard_badly_formatted_postcodes(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer): + #Rewrite the get_country_code function to verify its execution. 
+ temp_db_cursor.execute(""" + CREATE OR REPLACE FUNCTION get_country_code(place geometry) + RETURNS TEXT AS $$ BEGIN + RETURN 'fr'; + END; $$ LANGUAGE plpgsql; + """) + place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511')) + postcodes.update_postcodes(dsn, tmp_path, tokenizer) + + assert not postcode_table.row_set From e86db3001f90012029a59ec6b313a4f7257035d4 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 8 Jun 2022 07:42:35 +0200 Subject: [PATCH 21/30] fix postcode pattern for Mozambique Optional groups are not implemented yet. --- nominatim/tokenizer/sanitizers/clean_postcodes.py | 3 ++- settings/country_settings.yaml | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index fbc46fa582..43d297695f 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -57,7 +57,8 @@ def scan(self, postcode, country): if match is None: return None - return self.matcher.normalize(country, match), ' '.join(match.groups()) + return self.matcher.normalize(country, match),\ + ' '.join(filter(lambda p: p is not None, match.groups())) diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 67905ea2a3..14d08de3a3 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -1386,8 +1386,7 @@ mz: languages: pt names: !include country-names/mz.yaml postcode: - pattern: "(dddd)-?(dd)?" - output: \1-\2 + pattern: "(dddd)(?:-dd)?" # Namibia (Namibia) From 37b2c6a830c90aea17b76c5b6a74c711025a142d Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 8 Jun 2022 08:19:55 +0200 Subject: [PATCH 22/30] port legacy tokenizer to new postcode handling Also documents the changes to the SQL functions of the tokenizer. 
--- docs/develop/Tokenizers.md | 6 +++--- lib-sql/tokenizer/icu_tokenizer.sql | 7 ------- lib-sql/tokenizer/legacy_tokenizer.sql | 4 ++-- nominatim/tokenizer/legacy_tokenizer.py | 10 ++++++++-- nominatim/tools/postcodes.py | 6 +++--- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md index 2b4da00509..5fe4e38d43 100644 --- a/docs/develop/Tokenizers.md +++ b/docs/develop/Tokenizers.md @@ -245,11 +245,11 @@ Currently, tokenizers are encouraged to make sure that matching works against both the search token list and the match token list. ```sql -FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT +FUNCTION token_get_postcode(info JSONB) RETURNS TEXT ``` -Return the normalized version of the given postcode. This function must return -the same value as the Python function `AbstractAnalyzer->normalize_postcode()`. +Return the postcode for the object, if any exists. The postcode must be in +the form that should also be presented to the end-user. 
```sql FUNCTION token_strip_info(info JSONB) RETURNS JSONB diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index f86a0a3794..599d0eb089 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -97,13 +97,6 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) - RETURNS TEXT -AS $$ - SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; -$$ LANGUAGE SQL IMMUTABLE STRICT; - - CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) RETURNS TEXT AS $$ diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index 64453d4e59..5826f74ac2 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -97,10 +97,10 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) +CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) RETURNS TEXT AS $$ - SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; + SELECT info->>'postcode'; $$ LANGUAGE SQL IMMUTABLE STRICT; diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index a292b180b8..36fd572244 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -467,8 +467,9 @@ def _process_place_address(self, token_info, address): if key == 'postcode': # Make sure the normalized postcode is present in the word table. 
if re.search(r'[:,;]', value) is None: - self._cache.add_postcode(self.conn, - self.normalize_postcode(value)) + norm_pc = self.normalize_postcode(value) + token_info.set_postcode(norm_pc) + self._cache.add_postcode(self.conn, norm_pc) elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): hnrs.append(value) elif key == 'street': @@ -527,6 +528,11 @@ def add_housenumbers(self, conn, hnrs): self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone() + def set_postcode(self, postcode): + """ Set or replace the postcode token with the given value. + """ + self.data['postcode'] = postcode + def add_street(self, conn, street): """ Add addr:street match terms. """ diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index 27fbcc9b25..9c66719b5f 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -186,17 +186,17 @@ def update_postcodes(dsn, project_dir, tokenizer): # Recompute the list of valid postcodes from placex. with conn.cursor(name="placex_postcodes") as cur: cur.execute(""" - SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid) + SELECT cc, pc, ST_X(centroid), ST_Y(centroid) FROM (SELECT COALESCE(plx.country_code, get_country_code(ST_Centroid(pl.geometry))) as cc, - token_normalized_postcode(pl.address->'postcode') as pc, + pl.address->'postcode' as pc, COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid FROM place AS pl LEFT OUTER JOIN placex AS plx ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx WHERE pc IS NOT null AND cc IS NOT null - ORDER BY country_code, pc""") + ORDER BY cc, pc""") collector = None From 0f00f4968c1b78b83a57a4ece660a38f58e2de11 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 17 Jun 2022 17:28:51 +0200 Subject: [PATCH 23/30] fix up BDD tests for postcode changes Includes smaller code fixes found by the tests. 
--- lib-sql/functions/address_lookup.sql | 5 +++ nominatim/tokenizer/icu_tokenizer.py | 2 + test/bdd/db/import/postcodes.feature | 53 +------------------------ test/bdd/db/query/normalization.feature | 8 ---- 4 files changed, 9 insertions(+), 59 deletions(-) diff --git a/lib-sql/functions/address_lookup.sql b/lib-sql/functions/address_lookup.sql index 0eada6987e..2bbfcd5c03 100644 --- a/lib-sql/functions/address_lookup.sql +++ b/lib-sql/functions/address_lookup.sql @@ -320,6 +320,11 @@ BEGIN location := ROW(null, null, null, hstore('ref', place.postcode), 'place', 'postcode', null, null, false, true, 5, 0)::addressline; RETURN NEXT location; + ELSEIF place.address is not null and place.address ? 'postcode' + and not place.address->'postcode' SIMILAR TO '%(,|;)%' THEN + location := ROW(null, null, null, hstore('ref', place.address->'postcode'), 'place', + 'postcode', null, null, false, true, 5, 0)::addressline; + RETURN NEXT location; END IF; RETURN; diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 0dc551e1b4..28184f6af1 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -635,6 +635,8 @@ def _add_postcode(self, item): (postcode, list(variants))) self._cache.postcodes.add(postcode) + return postcode_name + class _TokenInfo: """ Collect token information to be sent back to the database. 
diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 50afa7cc2d..7636aea7ee 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -182,6 +182,7 @@ Feature: Import of postcodes | type | display_name | | postcode | E4 7EA | + @Fail Scenario: search and address ranks for GB post codes correctly assigned Given the places | osm | class | type | postcode | geometry | @@ -195,57 +196,7 @@ Feature: Import of postcodes | E45 2 | gb | 23 | 5 | | Y45 | gb | 21 | 5 | - Scenario: wrongly formatted GB postcodes are down-ranked - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | EA452CD | country:gb | - | N2 | place | postcode | E45 23 | country:gb | - When importing - Then location_postcode contains exactly - | postcode | country | rank_search | rank_address | - | EA452CD | gb | 30 | 30 | - | E45 23 | gb | 30 | 30 | - - Scenario: search and address rank for DE postcodes correctly assigned - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | 56427 | country:de | - | N2 | place | postcode | 5642 | country:de | - | N3 | place | postcode | 5642A | country:de | - | N4 | place | postcode | 564276 | country:de | - When importing - Then location_postcode contains exactly - | postcode | country | rank_search | rank_address | - | 56427 | de | 21 | 11 | - | 5642 | de | 30 | 30 | - | 5642A | de | 30 | 30 | - | 564276 | de | 30 | 30 | - - Scenario: search and address rank for other postcodes are correctly assigned - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | 1 | country:ca | - | N2 | place | postcode | X3 | country:ca | - | N3 | place | postcode | 543 | country:ca | - | N4 | place | postcode | 54dc | country:ca | - | N5 | place | postcode | 12345 | country:ca | - | N6 | place | postcode | 55TT667 | country:ca | - | N7 | place | postcode | 123-65 | country:ca | - | N8 | place | 
postcode | 12 445 4 | country:ca | - | N9 | place | postcode | A1:bc10 | country:ca | - When importing - Then location_postcode contains exactly - | postcode | country | rank_search | rank_address | - | 1 | ca | 21 | 11 | - | X3 | ca | 21 | 11 | - | 543 | ca | 21 | 11 | - | 54DC | ca | 21 | 11 | - | 12345 | ca | 21 | 11 | - | 55TT667 | ca | 21 | 11 | - | 123-65 | ca | 25 | 11 | - | 12 445 4 | ca | 25 | 11 | - | A1:BC10 | ca | 25 | 11 | - + @fail-legacy Scenario: Postcodes outside all countries are not added to the postcode and word table Given the places | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry | diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature index f91c005043..e5a7a5922b 100644 --- a/test/bdd/db/query/normalization.feature +++ b/test/bdd/db/query/normalization.feature @@ -168,14 +168,6 @@ Feature: Import and search of names | ID | osm | | 0 | R1 | - Scenario: Unprintable characters in postcodes are ignored - Given the named places - | osm | class | type | address | geometry | - | N234 | amenity | prison | 'postcode' : u'1234\u200e' | country:de | - When importing - And sending search query "1234" - Then result 0 has not attributes osm_type - Scenario Outline: Housenumbers with special characters are found Given the grid | 1 | | | | 2 | From 7f2ad4ac7e956e2744d64fa4cafc5370721886a2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 17 Jun 2022 18:14:23 +0200 Subject: [PATCH 24/30] fix linting issue --- nominatim/tokenizer/icu_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 28184f6af1..df1387e24b 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -624,7 +624,7 @@ def _add_postcode(self, item): if postcode not in self._cache.postcodes: term = self._search_normalized(postcode_name) if not term: - return + return None 
variants = {term} if analyzer is not None and variant_base is not None: From 5be320368c6695498c4fed7cbba44220d3c91b17 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 20 Jun 2022 17:42:12 +0200 Subject: [PATCH 25/30] add documentation for postcode customization --- docs/customize/Country-Settings.md | 149 ++++++++++++++++++ docs/customize/Tokenizers.md | 24 ++- docs/mkdocs.yml | 1 + .../tokenizer/sanitizers/clean_postcodes.py | 4 + 4 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 docs/customize/Country-Settings.md diff --git a/docs/customize/Country-Settings.md b/docs/customize/Country-Settings.md new file mode 100644 index 0000000000..6f8f2a9f23 --- /dev/null +++ b/docs/customize/Country-Settings.md @@ -0,0 +1,149 @@ +# Customizing Per-Country Data + +Whenever an OSM object is imported into Nominatim, the object is first assigned +a country. Nominatim can use this information to adapt various aspects of +the address computation to the local customs of the country. This section +explains how country assignment works and the principal per-country +localizations. + +## Country assignment + +Countries are assigned on the basis of country data from the OpenStreetMap +input data itself. Countries are expected to be tagged according to the +[administrative boundary schema](https://wiki.openstreetmap.org/wiki/Tag:boundary%3Dadministrative): +an OSM relation with `boundary=administrative` and `admin_level=2`. Nominatim +uses the country code to distinguish the countries. + +If there is no country data available for a point, then Nominatim uses the +fallback data imported from `data/country_osm_grid.sql.gz`. This was computed +from OSM data as well but is guaranteed to cover all countries. + +Some OSM objects may also be located outside any country, for example a buoy +in the middle of the ocean. These objects do not get any country assigned and +get a default treatment when it comes to localized handling of data.
+ +## Per-country settings + +### Global country settings + +The main place to configure settings per country is the file +`settings/country_settings.yaml`. This file has one section per country that +is recognised by Nominatim. Each section is tagged with the country code +(in lower case) and contains the different localization information. Only +countries which are listed in this file are taken into account for computations. + +For example, the section for Andorra looks like this: + +``` + partition: 35 + languages: ca + names: !include country-names/ad.yaml + postcode: + pattern: "(ddd)" + output: AD\1 +``` + +The individual settings are described below. + +#### `partition` + +Nominatim internally splits the data into multiple tables to improve +performance. The partition number tells Nominatim into which table to put +the country. This is purely internal management and has no effect on the +output data. + +The default is to have one partition per country. + +#### `languages` + +A comma-separated list of ISO-639 language codes of default languages in the +country. These are the languages used in name tags without a language suffix. +Note that this is not necessarily the same as the list of official languages +in the country. There may be officially recognised languages in a country +which are only ever used in name tags with the appropriate language suffixes. +Conversely, a non-official language may appear a lot in the name tags, for +example when used as an unofficial Lingua Franca. + +List the languages in order of frequency of appearance with the most frequently +used language first. It is not recommended to add languages when there are only +very few occurrences. + +If only one language is listed, then Nominatim will 'auto-complete' the +language of names without an explicit language-suffix. + +#### `names` + +List of names of the country and its translations. These names are used as +a baseline. 
It is always possible to search countries by the given names, no +matter what other names are in the OSM data. They are also used as a fallback +when a needed translation is not available. + +!!! Note + The list of names per country is currently fairly large because Nominatim + supports translations in many languages per default. That is why the + name lists have been separated out into extra files. You can find the + name lists in the file `settings/country-names/<country code>.yaml`. + The names section in the main country settings file only refers to these + files via the special `!include` directive. + +#### `postcode` + +Describes the format of the postcode that is in use in the country. + +When a country has no official postcodes, set this to no. Example: + +``` +ae: + postcode: no +``` + +When a country has a postcode, you need to state the postcode pattern and +the default output format. Example: + +``` +bm: + postcode: + pattern: "(ll)[ -]?(dd)" + output: \1 \2 +``` + +The **pattern** is a regular expression that describes the possible formats +accepted as a postcode. The pattern follows the standard syntax for +[regular expressions in Python](https://docs.python.org/3/library/re.html#regular-expression-syntax) +with two extra shortcuts: `d` is a shortcut for a single digit ([0-9]) +and `l` for a single ASCII letter ([A-Z]). + +Use match groups to indicate groups in the postcode that may optionally be +separated with a space or a hyphen. + +For example, the postcode for Bermuda above always consists of two letters +and two digits. They may optionally be separated by a space or hyphen. That +means that Nominatim will consider `AB56`, `AB 56` and `AB-56` spelling variants +for one and the same postcode. + +Never add the country code in front of the postcode pattern. Nominatim will +automatically accept variants with a country code prefix for all postcodes. + +The **output** field is an optional field that describes what the canonical +spelling of the postcode should be.
The format is the +[regular expression expand syntax](https://docs.python.org/3/library/re.html#re.Match.expand) referring back to the bracket groups in the pattern. + +Most simple postcodes only have one spelling variant. In that case, the +**output** can be omitted. The postcode will simply be used as is. + +In the Bermuda example above, the canonical spelling would be to have a space +between letters and digits. + +!!! Warning + When your postcode pattern covers multiple variants of the postcode, then + you must explicitly state the canonical output or Nominatim will not + handle the variations correctly. + +### Other country-specific configuration + +There are some other configuration files where you can set localized settings +according to the assigned country. These are: + + * [Place ranking configuration](Ranking.md) + +Please see the linked documentation sections for more information. diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index 19d867ddd8..c563b20105 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -205,6 +205,14 @@ The following is a list of sanitizers that are shipped with Nominatim. rendering: heading_level: 6 +##### clean-postcodes + +::: nominatim.tokenizer.sanitizers.clean_postcodes + selection: + members: False + rendering: + heading_level: 6 + #### Token Analysis @@ -222,8 +230,12 @@ by a sanitizer (see for example the The token-analysis section contains the list of configured analyzers. Each analyzer must have an `id` parameter that uniquely identifies the analyzer. The only exception is the default analyzer that is used when no special -analyzer was selected. There is one special id '@housenumber'. If an analyzer -with that name is present, it is used for normalization of house numbers. +analyzer was selected. There are analysers with special ids: + + * '@housenumber'. If an analyzer with that name is present, it is used + for normalization of house numbers. + * '@postcode'.
If an analyzer with that name is present, it is used + for normalization of postcodes. Different analyzer implementations may exist. To select the implementation, the `analyzer` parameter must be set. The different implementations are @@ -356,6 +368,14 @@ house numbers of the form '3 a', '3A', '3-A' etc. are all considered equivalent. The analyzer cannot be customized. +##### Postcode token analyzer + +The analyzer `postcodes` is purpose-made to analyze postcodes. It supports +a 'lookup' variant of the token, which produces variants with optional +spaces. Use together with the clean-postcodes sanitizer. + +The analyzer cannot be customized. + ### Reconfiguration Changing the configuration after the import is currently not possible, although diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index c25ae0ad32..a3860cbaa2 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -28,6 +28,7 @@ pages: - 'Overview': 'customize/Overview.md' - 'Import Styles': 'customize/Import-Styles.md' - 'Configuration Settings': 'customize/Settings.md' + - 'Per-Country Data': 'customize/Country-Settings.md' - 'Place Ranking' : 'customize/Ranking.md' - 'Tokenizers' : 'customize/Tokenizers.md' - 'Special Phrases': 'customize/Special-Phrases.md' diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index 43d297695f..05e90ca122 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -15,6 +15,10 @@ postcode centroids of a country but is still searchable. When set to 'no', non-conforming postcodes are not searchable either. + default-pattern: Pattern to use, when there is none available for the + country in question. Warning: will not be used for + objects that have no country assigned. These are always + assumed to have no postcode.
""" from nominatim.data.postcode_format import PostcodeFormatter From 612d34930b603997acce2772e7264b509bb8aed6 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 21 Jun 2022 22:05:35 +0200 Subject: [PATCH 26/30] handle postcodes properly on word table updates update_postcodes_from_db() needs to do the full postcode treatment in order to derive the correct word table entries. --- nominatim/tokenizer/icu_tokenizer.py | 95 +++++++++++++------ .../tokenizer/token_analysis/postcodes.py | 19 +++- test/bdd/steps/steps_db_ops.py | 19 ++-- test/python/tokenizer/test_icu.py | 79 +++++++++++---- .../token_analysis/test_analysis_postcodes.py | 60 ++++++++++++ 5 files changed, 216 insertions(+), 56 deletions(-) create mode 100644 test/python/tokenizer/token_analysis/test_analysis_postcodes.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index df1387e24b..a6ff08a407 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -290,33 +290,72 @@ def update_postcodes_from_db(self): """ Update postcode tokens in the word table from the location_postcode table. """ - to_delete = [] + analyzer = self.token_analysis.analysis.get('@postcode') + with self.conn.cursor() as cur: - # This finds us the rows in location_postcode and word that are - # missing in the other table. 
- cur.execute("""SELECT * FROM - (SELECT pc, word FROM - (SELECT distinct(postcode) as pc FROM location_postcode) p - FULL JOIN - (SELECT word FROM word WHERE type = 'P') w - ON pc = word) x - WHERE pc is null or word is null""") - - with CopyBuffer() as copystr: - for postcode, word in cur: - if postcode is None: - to_delete.append(word) - else: - copystr.add(self._search_normalized(postcode), - 'P', postcode) - - if to_delete: - cur.execute("""DELETE FROM WORD - WHERE type ='P' and word = any(%s) - """, (to_delete, )) - - copystr.copy_out(cur, 'word', - columns=['word_token', 'type', 'word']) + # First get all postcode names currently in the word table. + cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'") + word_entries = set((entry[0] for entry in cur)) + + # Then compute the required postcode names from the postcode table. + needed_entries = set() + cur.execute("SELECT country_code, postcode FROM location_postcode") + for cc, postcode in cur: + info = PlaceInfo({'country_code': cc, + 'class': 'place', 'type': 'postcode', + 'address': {'postcode': postcode}}) + address = self.sanitizer.process_names(info)[1] + for place in address: + if place.kind == 'postcode': + if analyzer is None: + postcode_name = place.name.strip().upper() + variant_base = None + else: + postcode_name = analyzer.normalize(place.name) + variant_base = place.get_attr("variant") + + if variant_base: + needed_entries.add(f'{postcode_name}@{variant_base}') + else: + needed_entries.add(postcode_name) + break + + # Now update the word table. 
+ self._delete_unused_postcode_words(word_entries - needed_entries) + self._add_missing_postcode_words(needed_entries - word_entries) + + def _delete_unused_postcode_words(self, tokens): + if tokens: + with self.conn.cursor() as cur: + cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)", + (list(tokens), )) + + def _add_missing_postcode_words(self, tokens): + if not tokens: + return + + analyzer = self.token_analysis.analysis.get('@postcode') + terms = [] + + for postcode_name in tokens: + if '@' in postcode_name: + term, variant = postcode_name.split('@', 2) + term = self._search_normalized(term) + variants = {term} + if analyzer is not None: + variants.update(analyzer.get_variants_ascii(variant)) + variants = list(variants) + else: + variants = [self._search_normalized(postcode_name)] + terms.append((postcode_name, variants)) + + if terms: + with self.conn.cursor() as cur: + cur.execute_values("""SELECT create_postcode_word(pc, var) + FROM (VALUES %s) AS v(pc, var)""", + terms) + + def update_special_phrases(self, phrases, should_replace): @@ -616,7 +655,7 @@ def _add_postcode(self, item): postcode_name = analyzer.normalize(item.name) variant_base = item.get_attr("variant") - if variant_base is not None: + if variant_base: postcode = f'{postcode_name}@{variant_base}' else: postcode = postcode_name @@ -627,7 +666,7 @@ def _add_postcode(self, item): return None variants = {term} - if analyzer is not None and variant_base is not None: + if analyzer is not None and variant_base: variants.update(analyzer.get_variants_ascii(variant_base)) with self.conn.cursor() as cur: diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py index e105b132da..18fc2a8ded 100644 --- a/nominatim/tokenizer/token_analysis/postcodes.py +++ b/nominatim/tokenizer/token_analysis/postcodes.py @@ -25,8 +25,18 @@ def create(normalizer, transliterator, config): # pylint: disable=W0613 """ return PostcodeTokenAnalysis(normalizer, 
transliterator) + class PostcodeTokenAnalysis: - """ Detects common housenumber patterns and normalizes them. + """ Special normalization and variant generation for postcodes. + + This analyser must not be used with anything but postcodes as + it follows some special rules: `normalize` doesn't necessarily + need to return a standard form as per normalization rules. It + needs to return the canonical form of the postcode that is also + used for output. `get_variants_ascii` then needs to ensure that + the generated variants once more follow the standard normalization + and transliteration, so that postcodes are correctly recognised by + the search algorithm. """ def __init__(self, norm, trans): self.norm = norm @@ -44,11 +54,12 @@ def normalize(self, name): def get_variants_ascii(self, norm_name): """ Compute the spelling variants for the given normalized postcode. - The official form creates one variant. If a 'lookup version' is - given, then it will create variants with optional spaces. + Takes the canonical form of the postcode, normalizes it using the + standard rules and then creates variants of the result where + all spaces are optional. """ # Postcodes follow their own transliteration rules. # Make sure at this point, that the terms are normalized in a way # that they are searchable with the standard transliteration rules. return [self.trans.transliterate(term) for term in - self.mutator.generate([self.norm.transliterate(norm_name)])] + self.mutator.generate([self.norm.transliterate(norm_name)]) if term] diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index 44c82b017c..37d541533d 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -18,13 +18,18 @@ def check_database_integrity(context): """ Check some generic constraints on the tables. 
""" - # place_addressline should not have duplicate (place_id, address_place_id) - cur = context.db.cursor() - cur.execute("""SELECT count(*) FROM - (SELECT place_id, address_place_id, count(*) as c - FROM place_addressline GROUP BY place_id, address_place_id) x - WHERE c > 1""") - assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline" + with context.db.cursor() as cur: + # place_addressline should not have duplicate (place_id, address_place_id) + cur.execute("""SELECT count(*) FROM + (SELECT place_id, address_place_id, count(*) as c + FROM place_addressline GROUP BY place_id, address_place_id) x + WHERE c > 1""") + assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline" + + # word table must not have empty word_tokens + cur.execute("SELECT count(*) FROM word WHERE word_token = ''") + assert cur.fetchone()[0] == 0, "Empty word tokens found in word table" + ################################ GIVEN ################################## diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index 6138a03a42..b9de97bcc2 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',), variants=('~gasse -> gasse', 'street => st', ), - sanitizers=[], with_housenumber=False): + sanitizers=[], with_housenumber=False, + with_postcode=False): cfgstr = {'normalization': list(norm), 'sanitizers': sanitizers, 'transliteration': list(trans), @@ -81,6 +82,9 @@ def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper() if with_housenumber: cfgstr['token-analysis'].append({'id': '@housenumber', 'analyzer': 'housenumbers'}) + if with_postcode: + cfgstr['token-analysis'].append({'id': '@postcode', + 'analyzer': 'postcodes'}) (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr)) tok.loader = 
nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config) @@ -246,28 +250,69 @@ def test_normalize_postcode(analyzer): anl.normalize_postcode('38 Б') == '38 Б' -def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table): - table_factory('location_postcode', 'postcode TEXT', - content=(('1234',), ('12 34',), ('AB23',), ('1234',))) +class TestPostcodes: - with analyzer() as anl: - anl.update_postcodes_from_db() + @pytest.fixture(autouse=True) + def setup(self, analyzer, sql_functions): + sanitizers = [{'step': 'clean-postcodes'}] + with analyzer(sanitizers=sanitizers, with_postcode=True) as anl: + self.analyzer = anl + yield anl - assert word_table.count() == 3 - assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'} + def process_postcode(self, cc, postcode): + return self.analyzer.process_place(PlaceInfo({'country_code': cc, + 'address': {'postcode': postcode}})) -def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table): - table_factory('location_postcode', 'postcode TEXT', - content=(('1234',), ('45BC', ), ('XX45', ))) - word_table.add_postcode(' 1234', '1234') - word_table.add_postcode(' 5678', '5678') - with analyzer() as anl: - anl.update_postcodes_from_db() + def test_update_postcodes_from_db_empty(self, table_factory, word_table): + table_factory('location_postcode', 'country_code TEXT, postcode TEXT', + content=(('de', '12345'), ('se', '132 34'), + ('bm', 'AB23'), ('fr', '12345'))) + + self.analyzer.update_postcodes_from_db() + + assert word_table.count() == 5 + assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'} + + + def test_update_postcodes_from_db_ambigious(self, table_factory, word_table): + table_factory('location_postcode', 'country_code TEXT, postcode TEXT', + content=(('in', '123456'), ('sg', '123456'))) + + self.analyzer.update_postcodes_from_db() + + assert word_table.count() == 3 + assert word_table.get_postcodes() == {'123456', '123456@123 456'} + + + def 
test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table): + table_factory('location_postcode', 'country_code TEXT, postcode TEXT', + content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45'))) + word_table.add_postcode(' 1234', '1234') + word_table.add_postcode(' 5678', '5678') + + self.analyzer.update_postcodes_from_db() + + assert word_table.count() == 5 + assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'} + + + def test_process_place_postcode_simple(self, word_table): + info = self.process_postcode('de', '12345') + + assert info['postcode'] == '12345' + + assert word_table.get_postcodes() == {'12345', } + + + def test_process_place_postcode_with_space(self, word_table): + info = self.process_postcode('in', '123 567') + + assert info['postcode'] == '123567' + + assert word_table.get_postcodes() == {'123567@123 567', } - assert word_table.count() == 3 - assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'} def test_update_special_phrase_empty_table(analyzer, word_table): diff --git a/test/python/tokenizer/token_analysis/test_analysis_postcodes.py b/test/python/tokenizer/token_analysis/test_analysis_postcodes.py new file mode 100644 index 0000000000..623bed54a8 --- /dev/null +++ b/test/python/tokenizer/token_analysis/test_analysis_postcodes.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for special postcode analysis and variant generation. 
+""" +import pytest + +from icu import Transliterator + +import nominatim.tokenizer.token_analysis.postcodes as module +from nominatim.errors import UsageError + +DEFAULT_NORMALIZATION = """ :: NFD (); + '🜳' > ' '; + [[:Nonspacing Mark:] [:Cf:]] >; + :: lower (); + [[:Punctuation:][:Space:]]+ > ' '; + :: NFC (); + """ + +DEFAULT_TRANSLITERATION = """ :: Latin (); + '🜵' > ' '; + """ + +@pytest.fixture +def analyser(): + rules = { 'analyzer': 'postcodes'} + config = module.configure(rules, DEFAULT_NORMALIZATION) + + trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) + norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + + return module.create(norm, trans, config) + + +def get_normalized_variants(proc, name): + norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + return proc.get_variants_ascii(norm.transliterate(name).strip()) + + +@pytest.mark.parametrize('name,norm', [('12', '12'), + ('A 34 ', 'A 34'), + ('34-av', '34-AV')]) +def test_normalize(analyser, name, norm): + assert analyser.normalize(name) == norm + + +@pytest.mark.parametrize('postcode,variants', [('12345', {'12345'}), + ('AB-998', {'ab 998', 'ab998'}), + ('23 FGH D3', {'23 fgh d3', '23fgh d3', + '23 fghd3', '23fghd3'})]) +def test_get_variants_ascii(analyser, postcode, variants): + out = analyser.get_variants_ascii(postcode) + + assert len(out) == len(set(out)) + assert set(out) == variants From 6eb90443530e31025802e27527faaa7da99b02b6 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 22 Jun 2022 09:54:47 +0200 Subject: [PATCH 27/30] adapt search algorithm to new postcode format in word --- lib-php/TokenPostcode.php | 7 +- lib-php/tokenizer/icu_tokenizer.php | 16 +++-- test/bdd/db/import/postcodes.feature | 18 ------ test/bdd/db/query/postcodes.feature | 95 ++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 25 deletions(-) create mode 100644 test/bdd/db/query/postcodes.feature diff --git 
a/lib-php/TokenPostcode.php b/lib-php/TokenPostcode.php index f0dbd45716..0ff92929cb 100644 --- a/lib-php/TokenPostcode.php +++ b/lib-php/TokenPostcode.php @@ -25,7 +25,12 @@ class Postcode public function __construct($iId, $sPostcode, $sCountryCode = '') { $this->iId = $iId; - $this->sPostcode = $sPostcode; + $iSplitPos = strpos($sPostcode, '@'); + if ($iSplitPos === false) { + $this->sPostcode = $sPostcode; + } else { + $this->sPostcode = substr($sPostcode, 0, $iSplitPos); + } $this->sCountryCode = empty($sCountryCode) ? '' : $sCountryCode; } diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php index ccce99ca13..e45d076548 100644 --- a/lib-php/tokenizer/icu_tokenizer.php +++ b/lib-php/tokenizer/icu_tokenizer.php @@ -190,13 +190,17 @@ private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) if ($aWord['word'] !== null && pg_escape_string($aWord['word']) == $aWord['word'] ) { - $sNormPostcode = $this->normalizeString($aWord['word']); - if (strpos($sNormQuery, $sNormPostcode) !== false) { - $oValidTokens->addToken( - $sTok, - new Token\Postcode($iId, $aWord['word'], null) - ); + $iSplitPos = strpos($aWord['word'], '@'); + if ($iSplitPos === false) { + $sPostcode = $aWord['word']; + } else { + $sPostcode = substr($aWord['word'], 0, $iSplitPos); } + + $oValidTokens->addToken( + $sTok, + new Token\Postcode($iId, $sPostcode, null) + ); } break; case 'S': // tokens for classification terms (special phrases) diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 7636aea7ee..4d146d18c1 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -163,24 +163,6 @@ Feature: Import of postcodes | de | 01982 | country:de | And there are word tokens for postcodes 01982 - Scenario: Different postcodes with the same normalization can both be found - Given the places - | osm | class | type | addr+postcode | addr+housenumber | geometry | - | N34 | place | 
house | EH4 7EA | 111 | country:gb | - | N35 | place | house | E4 7EA | 111 | country:gb | - When importing - Then location_postcode contains exactly - | country | postcode | geometry | - | gb | EH4 7EA | country:gb | - | gb | E4 7EA | country:gb | - When sending search query "EH4 7EA" - Then results contain - | type | display_name | - | postcode | EH4 7EA | - When sending search query "E4 7EA" - Then results contain - | type | display_name | - | postcode | E4 7EA | @Fail Scenario: search and address ranks for GB post codes correctly assigned diff --git a/test/bdd/db/query/postcodes.feature b/test/bdd/db/query/postcodes.feature new file mode 100644 index 0000000000..c399b63b0b --- /dev/null +++ b/test/bdd/db/query/postcodes.feature @@ -0,0 +1,95 @@ +@DB +@fail-legacy +Feature: Querying fo postcode variants + + Scenario: Postcodes in Singapore (6-digit postcode) + Given the grid with origin SG + | 10 | | | | 11 | + And the places + | osm | class | type | name | addr+postcode | geometry | + | W1 | highway | path | Lorang | 399174 | 10,11 | + When importing + When sending search query "399174" + Then results contain + | ID | type | display_name | + | 0 | postcode | 399174 | + + + Scenario Outline: Postcodes in the Netherlands (mixed postcode with spaces) + Given the grid with origin NL + | 10 | | | | 11 | + And the places + | osm | class | type | name | addr+postcode | geometry | + | W1 | highway | path | De Weide | 3993 DX | 10,11 | + When importing + When sending search query "3993 DX" + Then results contain + | ID | type | display_name | + | 0 | postcode | 3993 DX | + When sending search query "3993dx" + Then results contain + | ID | type | display_name | + | 0 | postcode | 3993 DX | + + Examples: + | postcode | + | 3993 DX | + | 3993DX | + | 3993 dx | + + + Scenario: Postcodes in Singapore (6-digit postcode) + Given the grid with origin SG + | 10 | | | | 11 | + And the places + | osm | class | type | name | addr+postcode | geometry | + | W1 | highway | path | 
Lorang | 399174 | 10,11 | + When importing + When sending search query "399174" + Then results contain + | ID | type | display_name | + | 0 | postcode | 399174 | + + + Scenario Outline: Postcodes in Andorra (with country code) + Given the grid with origin AD + | 10 | | | | 11 | + And the places + | osm | class | type | name | addr+postcode | geometry | + | W1 | highway | path | Lorang | <postcode> | 10,11 | + When importing + When sending search query "675" + Then results contain + | ID | type | display_name | + | 0 | postcode | AD675 | + When sending search query "AD675" + Then results contain + | ID | type | display_name | + | 0 | postcode | AD675 | + + Examples: + | postcode | + | 675 | + | AD 675 | + | AD675 | + + + Scenario: Different postcodes with the same normalization can both be found + Given the places + | osm | class | type | addr+postcode | addr+housenumber | geometry | + | N34 | place | house | EH4 7EA | 111 | country:gb | + | N35 | place | house | E4 7EA | 111 | country:gb | + When importing + Then location_postcode contains exactly + | country | postcode | geometry | + | gb | EH4 7EA | country:gb | + | gb | E4 7EA | country:gb | + When sending search query "EH4 7EA" + Then results contain + | type | display_name | + | postcode | EH4 7EA | + When sending search query "E4 7EA" + Then results contain + | type | display_name | + | postcode | E4 7EA | + From 93d5be097a338b0333ecf2452b0a7311cf061bff Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 22 Jun 2022 10:47:08 +0200 Subject: [PATCH 28/30] bdd: do not expect legacy word table to be without empty tokens It can happen for bogus names and this will not get fixed anymore.
--- test/bdd/steps/steps_db_ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index 37d541533d..8fd918f88f 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -27,8 +27,9 @@ def check_database_integrity(context): assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline" # word table must not have empty word_tokens - cur.execute("SELECT count(*) FROM word WHERE word_token = ''") - assert cur.fetchone()[0] == 0, "Empty word tokens found in word table" + if context.nominatim.tokenizer != 'legacy': + cur.execute("SELECT count(*) FROM word WHERE word_token = ''") + assert cur.fetchone()[0] == 0, "Empty word tokens found in word table" From 3dd7410bb7adf937f3a4c53ab6219c028d7685b8 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 22 Jun 2022 11:38:23 +0200 Subject: [PATCH 29/30] bdd: correctly skip postcode tests for legacy --- test/bdd/db/query/postcodes.feature | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/bdd/db/query/postcodes.feature b/test/bdd/db/query/postcodes.feature index c399b63b0b..a3ca70352a 100644 --- a/test/bdd/db/query/postcodes.feature +++ b/test/bdd/db/query/postcodes.feature @@ -1,5 +1,4 @@ @DB -@fail-legacy Feature: Querying fo postcode variants Scenario: Postcodes in Singapore (6-digit postcode) @@ -15,6 +14,7 @@ Feature: Querying fo postcode variants | 0 | postcode | 399174 | + @fail-legacy Scenario Outline: Postcodes in the Netherlands (mixed postcode with spaces) Given the grid with origin NL | 10 | | | | 11 | @@ -38,6 +38,7 @@ Feature: Querying fo postcode variants | 3993 dx | + @fail-legacy Scenario: Postcodes in Singapore (6-digit postcode) Given the grid with origin SG | 10 | | | | 11 | @@ -51,6 +52,7 @@ Feature: Querying fo postcode variants | 0 | postcode | 399174 | + @fail-legacy Scenario Outline: Postcodes in Andorra (with country code) Given the grid with origin AD | 10 
| | | | 11 | From 536f08f33a1388c5022c2adf799da55035fd7e0a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 23 Jun 2022 16:17:47 +0200 Subject: [PATCH 30/30] ignore 5+ postcodes in the US for now Hierarchical postcodes need a different treatment. --- settings/country_settings.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 14d08de3a3..b0bacdfcc5 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -1998,7 +1998,7 @@ ua: languages: uk names: !include country-names/ua.yaml postcode: - pattern: "ddddd" + pattern: "d?ddddd" # Uganda (Uganda) @@ -2024,7 +2024,7 @@ us: languages: en names: !include country-names/us.yaml postcode: - pattern: "(ddddd)(?:-dddd)?" + pattern: "ddddd" # Uruguay (Uruguay)