From 21fb501699ad6d41c3f57b431a3826238df255b6 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 11 May 2022 17:23:28 +0200
Subject: [PATCH 01/30] add info about countries without a postcode

---
 settings/country_settings.yaml | 56 +++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index 643acbee3a..8abbe4a27c 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -10,6 +10,7 @@ ae:
     partition: 83
     languages: ar
     names: !include country-names/ae.yaml
+    postcode: no
 
 
 # Afghanistan (افغانستان)
@@ -24,6 +25,7 @@ ag:
     partition: 205
     languages: en
     names: !include country-names/ag.yaml
+    postcode: no
 
 
 # Anguilla (Anguilla)
@@ -59,6 +61,7 @@ ao:
     partition: 85
     languages: pt
     names: !include country-names/ao.yaml
+    postcode: no
 
 
 #  (Antarctica)
@@ -101,6 +104,7 @@ aw:
     partition: 183
     languages: nl, pap
     names: !include country-names/aw.yaml
+    postcode: no
 
 
 #  (Aland Islands)
@@ -150,6 +154,7 @@ bf:
     partition: 225
     languages: fr
     names: !include country-names/bf.yaml
+    postcode: no
 
 
 # Bulgaria (Бългaрия)
@@ -171,6 +176,7 @@ bi:
     partition: 61
     languages: fr
     names: !include country-names/bi.yaml
+    postcode: no
 
 
 # Benin (Bénin)
@@ -178,6 +184,7 @@ bj:
     partition: 224
     languages: fr
     names: !include country-names/bj.yaml
+    postcode: no
 
 
 #  (Saint Barthélemy)
@@ -206,6 +213,7 @@ bo:
     partition: 120
     languages: es, qu, gn, ay
     names: !include country-names/bo.yaml
+    postcode: no
 
 
 # Caribbean Netherlands (Caribisch Nederland)
@@ -227,6 +235,7 @@ bs:
     partition: 207
     languages: en
     names: !include country-names/bs.yaml
+    postcode: no
 
 
 # Bhutan (འབྲུག་ཡུལ་)
@@ -248,6 +257,7 @@ bw:
     partition: 122
     languages: en, tn
     names: !include country-names/bw.yaml
+    postcode: no
 
 
 # Belarus (Беларусь)
@@ -262,6 +272,7 @@ bz:
     partition: 208
     languages: en
     names: !include country-names/bz.yaml
+    postcode: no
 
 
 # Canada (Canada)
@@ -283,6 +294,7 @@ cd:
     partition: 229
     languages: fr
     names: !include country-names/cd.yaml
+    postcode: no
 
 
 # Central African Republic (Ködörösêse tî Bêafrîka - République Centrafricaine)
@@ -290,6 +302,7 @@ cf:
     partition: 227
     languages: fr, sg
     names: !include country-names/cf.yaml
+    postcode: no
 
 
 # Congo-Brazzaville (Congo)
@@ -297,6 +310,7 @@ cg:
     partition: 230
     languages: fr
     names: !include country-names/cg.yaml
+    postcode: no
 
 
 # Switzerland (Schweiz/Suisse/Svizzera/Svizra)
@@ -311,6 +325,7 @@ ci:
     partition: 228
     languages: fr
     names: !include country-names/ci.yaml
+    postcode: no
 
 
 # Cook Islands (Kūki 'Āirani)
@@ -318,6 +333,7 @@ ck:
     partition: 41
     languages: en, rar
     names: !include country-names/ck.yaml
+    postcode: no
 
 
 # Chile (Chile)
@@ -332,6 +348,7 @@ cm:
     partition: 141
     languages: fr, en
     names: !include country-names/cm.yaml
+    postcode: no
 
 
 # China (中国)
@@ -409,6 +426,7 @@ dj:
     partition: 43
     languages: fr, ar, so, aa
     names: !include country-names/dj.yaml
+    postcode: no
 
 
 # Denmark (Danmark)
@@ -423,6 +441,7 @@ dm:
     partition: 209
     languages: en
     names: !include country-names/dm.yaml
+    postcode: no
 
 
 # Dominican Republic (República Dominicana)
@@ -472,6 +491,7 @@ er:
     partition: 142
     languages: ti, ar, en
     names: !include country-names/er.yaml
+    postcode: no
 
 
 # Spain (España)
@@ -500,6 +520,7 @@ fj:
     partition: 210
     languages: en
     names: !include country-names/fj.yaml
+    postcode: no
 
 
 # Falkland Islands (Falkland Islands)
@@ -535,6 +556,7 @@ ga:
     partition: 239
     languages: fr
     names: !include country-names/ga.yaml
+    postcode: no
 
 
 # United Kingdom (United Kingdom)
@@ -549,6 +571,7 @@ gd:
     partition: 143
     languages: en
     names: !include country-names/gd.yaml
+    postcode: no
 
 
 # Georgia (საქართველო)
@@ -598,6 +621,7 @@ gm:
     partition: 212
     languages: en
     names: !include country-names/gm.yaml
+    postcode: no
 
 
 # Guinea (Guinée)
@@ -619,6 +643,7 @@ gq:
     partition: 12
     languages: es, fr, pt
     names: !include country-names/gq.yaml
+    postcode: no
 
 
 # Greece (Ελλάς)
@@ -661,6 +686,7 @@ gy:
     partition: 213
     languages: en
     names: !include country-names/gy.yaml
+    postcode: no
 
 
 #  (Hong Kong)
@@ -829,6 +855,7 @@ ki:
     partition: 215
     languages: en
     names: !include country-names/ki.yaml
+    postcode: no
 
 
 # Comoros (Comores Komori جزر القمر)
@@ -836,6 +863,7 @@ km:
     partition: 47
     languages: ar, fr, sw
     names: !include country-names/km.yaml
+    postcode: no
 
 
 # Saint Kitts and Nevis (Saint Kitts and Nevis)
@@ -850,6 +878,7 @@ kp:
     partition: 48
     languages: ko
     names: !include country-names/kp.yaml
+    postcode: no
 
 
 # South Korea (대한민국)
@@ -955,6 +984,7 @@ ly:
     partition: 163
     languages: ar
     names: !include country-names/ly.yaml
+    postcode: no
 
 
 # Morocco (Maroc ⵍⵎⵖⵔⵉⴱ المغرب)
@@ -1018,6 +1048,7 @@ ml:
     partition: 241
     languages: fr
     names: !include country-names/ml.yaml
+    postcode: no
 
 
 # Myanmar (မြန်မာ)
@@ -1039,6 +1070,7 @@ mo:
     partition: 191
     languages: zh-hant, pt
     names: !include country-names/mo.yaml
+    postcode: no
 
 
 # Northern Mariana Islands (Northern Mariana Islands)
@@ -1060,6 +1092,7 @@ mr:
     partition: 149
     languages: ar, fr
     names: !include country-names/mr.yaml
+    postcode: no
 
 
 # Montserrat (Montserrat)
@@ -1095,6 +1128,7 @@ mw:
     partition: 97
     languages: en, ny
     names: !include country-names/mw.yaml
+    postcode: no
 
 
 # Mexico (México)
@@ -1186,6 +1220,7 @@ nr:
     partition: 70
     languages: na, en
     names: !include country-names/nr.yaml
+    postcode: no
 
 
 # Niue (Niuē)
@@ -1193,6 +1228,7 @@ nu:
     partition: 178
     languages: niu, en
     names: !include country-names/nu.yaml
+    postcode: no
 
 
 # New Zealand (New Zealand / Aotearoa)
@@ -1312,6 +1348,7 @@ qa:
     partition: 169
     languages: ar
     names: !include country-names/qa.yaml
+    postcode: no
 
 
 #  (Réunion)
@@ -1347,6 +1384,7 @@ rw:
     partition: 102
     languages: rw, fr, en
     names: !include country-names/rw.yaml
+    postcode: no
 
 
 # Saudi Arabia (السعودية)
@@ -1361,6 +1399,7 @@ sb:
     partition: 201
     languages: en
     names: !include country-names/sb.yaml
+    postcode: no
 
 
 # Seychelles (Sesel)
@@ -1368,6 +1407,7 @@ sc:
     partition: 79
     languages: fr, en, crs
     names: !include country-names/sc.yaml
+    postcode: no
 
 
 # Sudan (السودان)
@@ -1424,6 +1464,7 @@ sl:
     partition: 219
     languages: en
     names: !include country-names/sl.yaml
+    postcode: no
 
 
 # San Marino (San Marino)
@@ -1452,6 +1493,7 @@ sr:
     partition: 24
     languages: nl
     names: !include country-names/sr.yaml
+    postcode: no
 
 
 # South Sudan (South Sudan)
@@ -1459,6 +1501,7 @@ ss:
     partition: 247
     languages: en
     names: !include country-names/ss.yaml
+    postcode: no
 
 
 # São Tomé and Príncipe (São Tomé e Príncipe)
@@ -1466,6 +1509,7 @@ st:
     partition: 53
     languages: pt
     names: !include country-names/st.yaml
+    postcode: no
 
 
 # El Salvador (El Salvador)
@@ -1487,6 +1531,7 @@ sy:
     partition: 104
     languages: ar
     names: !include country-names/sy.yaml
+    postcode: no
 
 
 # Eswatini (eSwatini)
@@ -1508,6 +1553,7 @@ td:
     partition: 68
     languages: fr, ar
     names: !include country-names/td.yaml
+    postcode: no
 
 
 # French Southern Lands (Terres australes et antarctiques françaises)
@@ -1522,6 +1568,7 @@ tg:
     partition: 243
     languages: fr
     names: !include country-names/tg.yaml
+    postcode: no
 
 
 # Thailand (ประเทศไทย)
@@ -1543,6 +1590,7 @@ tk:
     partition: 179
     languages: tkl, en, sm
     names: !include country-names/tk.yaml
+    postcode: no
 
 
 # East Timor (Timór Lorosa'e)
@@ -1550,6 +1598,7 @@ tl:
     partition: 161
     languages: pt, tet
     names: !include country-names/tl.yaml
+    postcode: no
 
 
 # Turkmenistan (Türkmenistan)
@@ -1571,6 +1620,7 @@ to:
     partition: 220
     languages: en
     names: !include country-names/to.yaml
+    postcode: no
 
 
 # Turkey (Türkiye)
@@ -1592,6 +1642,7 @@ tv:
     partition: 156
     languages: en
     names: !include country-names/tv.yaml
+    postcode: no
 
 
 # Taiwan (臺灣)
@@ -1620,6 +1671,7 @@ ug:
     partition: 155
     languages: en, sw
     names: !include country-names/ug.yaml
+    postcode: no
 
 
 #  (United States Minor Outlying Islands)
@@ -1697,6 +1749,7 @@ vu:
     partition: 116
     languages: bi, en, fr
     names: !include country-names/vu.yaml
+    postcode: no
 
 
 # Wallis and Futuna Islands (Wallis-et-Futuna)
@@ -1725,6 +1778,7 @@ ye:
     partition: 55
     languages: ar
     names: !include country-names/ye.yaml
+    postcode: no
 
 
 # Mayotte (Mayotte)
@@ -1753,4 +1807,4 @@ zw:
     partition: 223
     languages: en, sn, nd
     names: !include country-names/zw.yaml
-
+    postcode: no

From 8080625747dc7e87bc510d2af0d3edf5d551a6d0 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Thu, 12 May 2022 11:43:47 +0200
Subject: [PATCH 02/30] remove postcodes from countries that don't have them

The postcodes will only be removed as a 'computed postcode'; they
are still searchable for the given object.
---
 .pylintrc                                        |  2 +-
 nominatim/tokenizer/sanitizers/config.py         | 14 ++++++++++++++
 .../sanitizers/tag_analyzer_by_language.py       |  3 +--
 nominatim/tools/country_info.py                  | 14 ++++++++++++--
 settings/icu_tokenizer.yaml                      |  2 ++
 test/bdd/db/import/postcodes.feature             | 16 +++++++++++++++-
 6 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index fef5387211..52d9fcf9e6 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -13,4 +13,4 @@ ignored-classes=NominatimArgs,closing
 # 'too-many-ancestors' is triggered already by deriving from UserDict
 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
 
-good-names=i,x,y,fd,db
+good-names=i,x,y,fd,db,cc
diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py
index ecfcacbe55..ce5ce1eb8b 100644
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -44,6 +44,20 @@ def get_string_list(self, param, default=tuple()):
         return values
 
 
+    def get_bool(self, param, default=None):
+        """ Extract a configuration parameter as a boolean.
+            The parameter must be one of the yaml boolean values or a
+            user error will be raised. If `default` is given, then it is
+            used when the parameter is missing from the configuration.
+        """
+        value = self.data.get(param, default)
+
+        if not isinstance(value, bool):
+            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
+
+        return value
+
+
     def get_delimiter(self, default=',;'):
         """ Return the 'delimiter' parameter in the configuration as a
             compiled regular expression that can be used to split the names on the
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
index 7898b1c685..9a99d12772 100644
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -48,8 +48,7 @@ def _compute_default_languages(self, use_defaults):
         self.deflangs = {}
 
         if use_defaults in ('mono', 'all'):
-            for ccode, prop in country_info.iterate():
-                clangs = prop['languages']
+            for ccode, clangs in country_info.iterate('languages'):
                 if len(clangs) == 1 or use_defaults == 'all':
                     if self.whitelist:
                         self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py
index 0ad001719e..d754b4ddb0 100644
--- a/nominatim/tools/country_info.py
+++ b/nominatim/tools/country_info.py
@@ -84,10 +84,20 @@ def setup_country_config(config):
     _COUNTRY_INFO.load(config)
 
 
-def iterate():
+def iterate(prop=None):
     """ Iterate over country code and properties.
+
+        When `prop` is None, all countries are returned with their complete
+        set of properties.
+
+        If `prop` is given, then only countries are returned where the
+        given property is set. The second item of the tuple contains only
+        the content of the given property.
     """
-    return _COUNTRY_INFO.items()
+    if prop is None:
+        return _COUNTRY_INFO.items()
+
+    return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
 
 
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index cd9c0d6dd5..544bd81db0 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -32,6 +32,8 @@ sanitizers:
         - streetnumber
       convert-to-name:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
+    - step: clean-postcodes
+      convert-to-address: yes
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature
index 15beab5782..50afa7cc2d 100644
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -246,4 +246,18 @@ Feature: Import of postcodes
          | 12 445 4 | ca      | 25          | 11 |
          | A1:BC10  | ca      | 25          | 11 |
 
-
+    Scenario: Postcodes outside all countries are not added to the postcode and word table
+        Given the places
+            | osm | class | type  | addr+postcode | addr+housenumber | addr+place  | geometry  |
+            | N34 | place | house | 01982         | 111              | Null Island | 0 0.00001 |
+        And the places
+            | osm | class | type   | name        | geometry |
+            | N1  | place | hamlet | Null Island | 0 0      |
+        When importing
+        Then location_postcode contains exactly
+            | country | postcode | geometry |
+        And there are no word tokens for postcodes 01982
+        When sending search query "111, 01982 Null Island"
+        Then results contain
+            | osm | display_name |
+            | N34 | 111, Null Island, 01982 |

From 6e0014e1383f2cefa235a00a82c50f4169af278f Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Thu, 19 May 2022 12:03:26 +0200
Subject: [PATCH 03/30] add postcode patterns for numeric postcodes

Adds patterns for countries that have simple numeric-only postcodes.
---
 settings/country_settings.yaml | 208 +++++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)

diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index 8abbe4a27c..972e267042 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -18,6 +18,8 @@ af:
     partition: 30
     languages: fa, ps
     names: !include country-names/af.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Antigua and Barbuda (Antigua and Barbuda)
@@ -40,6 +42,8 @@ al:
     partition: 9
     languages: sq
     names: !include country-names/al.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Armenia (Հայաստան)
@@ -47,6 +51,8 @@ am:
     partition: 33
     languages: hy
     names: !include country-names/am.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Netherlands Antilles (De Nederlandse Antillen)
@@ -90,6 +96,8 @@ at:
     partition: 245
     languages: de
     names: !include country-names/at.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Australia (Australia)
@@ -97,6 +105,8 @@ au:
     partition: 139
     languages: en
     names: !include country-names/au.yaml
+    postcode:
+      pattern: "dddd"
 
 
 #  (Aruba)
@@ -126,6 +136,8 @@ ba:
     partition: 6
     languages: bs, hr, sr
     names: !include country-names/ba.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Barbados (Barbados)
@@ -140,6 +152,8 @@ bd:
     partition: 158
     languages: bn
     names: !include country-names/bd.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Belgium (België / Belgique / Belgien)
@@ -147,6 +161,8 @@ be:
     partition: 15
     languages: nl, fr, de
     names: !include country-names/be.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Burkina Faso (Burkina Faso)
@@ -162,6 +178,8 @@ bg:
     partition: 140
     languages: bg
     names: !include country-names/bg.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Bahrain (البحرين)
@@ -243,6 +261,8 @@ bt:
     partition: 87
     languages: dz
     names: !include country-names/bt.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 #  (Bouvet Island)
@@ -265,6 +285,8 @@ by:
     partition: 40
     languages: be, ru
     names: !include country-names/by.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Belize (Belize)
@@ -318,6 +340,8 @@ ch:
     partition: 5
     languages: de, fr, it, rm
     names: !include country-names/ch.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Côte d'Ivoire (Côte d’Ivoire)
@@ -341,6 +365,8 @@ cl:
     partition: 88
     languages: es
     names: !include country-names/cl.yaml
+    postcode:
+      pattern: "ddddddd"
 
 
 # Cameroon (Cameroun)
@@ -356,6 +382,8 @@ cn:
     partition: 117
     languages: zh
     names: !include country-names/cn.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Colombia (Colombia)
@@ -363,6 +391,8 @@ co:
     partition: 133
     languages: es
     names: !include country-names/co.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Costa Rica (Costa Rica)
@@ -370,6 +400,8 @@ cr:
     partition: 64
     languages: es
     names: !include country-names/cr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Cuba (Cuba)
@@ -377,6 +409,8 @@ cu:
     partition: 42
     languages: es
     names: !include country-names/cu.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Cape Verde (Cabo Verde)
@@ -384,6 +418,8 @@ cv:
     partition: 89
     languages: pt
     names: !include country-names/cv.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Curaçao (Curaçao)
@@ -419,6 +455,8 @@ de:
     partition: 3
     languages: de
     names: !include country-names/de.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Djibouti (Djibouti جيبوتي)
@@ -434,6 +472,8 @@ dk:
     partition: 160
     languages: da
     names: !include country-names/dk.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Dominica (Dominica)
@@ -449,6 +489,8 @@ do:
     partition: 37
     languages: es
     names: !include country-names/do.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Algeria (Algérie / ⵍⵣⵣⴰⵢⴻⵔ / الجزائر)
@@ -456,6 +498,8 @@ dz:
     partition: 19
     languages: ar, ber, fr
     names: !include country-names/dz.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Ecuador (Ecuador)
@@ -470,6 +514,8 @@ ee:
     partition: 125
     languages: et
     names: !include country-names/ee.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Egypt (مصر)
@@ -477,6 +523,8 @@ eg:
     partition: 16
     languages: ar
     names: !include country-names/eg.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Sahrawi Arab Democratic Republic (الجمهورية العربية الصحراوية الديمقراطية)
@@ -499,6 +547,8 @@ es:
     partition: 31
     languages: es, ast, ca, eu, gl
     names: !include country-names/es.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Ethiopia (ኢትዮጵያ)
@@ -506,6 +556,8 @@ et:
     partition: 90
     languages: am, om
     names: !include country-names/et.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Finland (Suomi)
@@ -513,6 +565,8 @@ fi:
     partition: 20
     languages: fi, sv, se
     names: !include country-names/fi.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Fiji (Viti)
@@ -535,6 +589,8 @@ fm:
     partition: 217
     languages: en
     names: !include country-names/fm.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Faroe Islands (Føroyar)
@@ -549,6 +605,8 @@ fr:
     partition: 4
     languages: fr
     names: !include country-names/fr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Gabon (Gabon)
@@ -579,6 +637,8 @@ ge:
     partition: 21
     languages: ka
     names: !include country-names/ge.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # French Guiana (Guyane Française)
@@ -614,6 +674,8 @@ gl:
     partition: 111
     languages: kl, da
     names: !include country-names/gl.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # The Gambia (Gambia)
@@ -629,6 +691,8 @@ gn:
     partition: 240
     languages: fr
     names: !include country-names/gn.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Guadeloupe (Guadeloupe)
@@ -665,6 +729,8 @@ gt:
     partition: 57
     languages: es
     names: !include country-names/gt.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Guam (Guam)
@@ -679,6 +745,8 @@ gw:
     partition: 8
     languages: pt
     names: !include country-names/gw.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Guyana (Guyana)
@@ -708,6 +776,8 @@ hn:
     partition: 56
     languages: es
     names: !include country-names/hn.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Croatia (Hrvatska)
@@ -715,6 +785,8 @@ hr:
     partition: 92
     languages: hr
     names: !include country-names/hr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Haiti (Ayiti)
@@ -729,6 +801,8 @@ hu:
     partition: 45
     languages: hu
     names: !include country-names/hu.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Indonesia (Indonesia)
@@ -736,6 +810,8 @@ id:
     partition: 110
     languages: id
     names: !include country-names/id.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Ireland (Éire / Ireland)
@@ -750,6 +826,8 @@ il:
     partition: 65
     languages: he
     names: !include country-names/il.yaml
+    postcode:
+      pattern: "ddddddd"
 
 
 # Isle of Man (Isle of Man)
@@ -778,6 +856,8 @@ iq:
     partition: 144
     languages: ar, ku
     names: !include country-names/iq.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Iran (ایران)
@@ -792,6 +872,8 @@ is:
     partition: 134
     languages: is
     names: !include country-names/is.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Italy (Italia)
@@ -799,6 +881,8 @@ it:
     partition: 28
     languages: it, de, fr
     names: !include country-names/it.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Jersey (Jersey)
@@ -820,6 +904,8 @@ jo:
     partition: 17
     languages: ar
     names: !include country-names/jo.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Japan (日本)
@@ -834,6 +920,8 @@ ke:
     partition: 126
     languages: sw, en
     names: !include country-names/ke.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Kyrgyzstan (Кыргызстан)
@@ -841,6 +929,8 @@ kg:
     partition: 93
     languages: ky, ru
     names: !include country-names/kg.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Cambodia (ព្រះរាជាណាចក្រ​កម្ពុជា)
@@ -848,6 +938,8 @@ kh:
     partition: 159
     languages: km
     names: !include country-names/kh.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Kiribati (Kiribati)
@@ -871,6 +963,8 @@ kn:
     partition: 84
     languages: en
     names: !include country-names/kn.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # North Korea (조선민주주의인민공화국)
@@ -886,6 +980,8 @@ kr:
     partition: 49
     languages: ko, en
     names: !include country-names/kr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Kuwait (الكويت)
@@ -893,6 +989,8 @@ kw:
     partition: 127
     languages: ar
     names: !include country-names/kw.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Cayman Islands (Cayman Islands)
@@ -914,6 +1012,8 @@ la:
     partition: 145
     languages: lo
     names: !include country-names/la.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Lebanon (لبنان)
@@ -935,6 +1035,8 @@ li:
     partition: 246
     languages: de
     names: !include country-names/li.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Sri Lanka (ශ්‍රී ලංකාව இலங்கை)
@@ -942,6 +1044,8 @@ lk:
     partition: 95
     languages: si, ta
     names: !include country-names/lk.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Liberia (Liberia)
@@ -949,6 +1053,8 @@ lr:
     partition: 216
     languages: en
     names: !include country-names/lr.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Lesotho (Lesotho)
@@ -956,6 +1062,8 @@ ls:
     partition: 136
     languages: en, st
     names: !include country-names/ls.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Lithuania (Lietuva)
@@ -970,6 +1078,8 @@ lu:
     partition: 74
     languages: lb, fr, de
     names: !include country-names/lu.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Latvia (Latvija)
@@ -992,6 +1102,8 @@ ma:
     partition: 23
     languages: fr, zgh, ar
     names: !include country-names/ma.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Monaco (Monaco)
@@ -1013,6 +1125,8 @@ me:
     partition: 180
     languages: srp, sr, hr, bs, sq
     names: !include country-names/me.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Saint Martin (Saint Martin)
@@ -1027,6 +1141,8 @@ mg:
     partition: 164
     languages: mg, fr
     names: !include country-names/mg.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Marshall Islands (Ṃajeḷ)
@@ -1034,6 +1150,8 @@ mh:
     partition: 105
     languages: en, mh
     names: !include country-names/mh.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # North Macedonia (Северна Македонија)
@@ -1041,6 +1159,8 @@ mk:
     partition: 69
     languages: mk
     names: !include country-names/mk.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Mali (Mali)
@@ -1056,6 +1176,8 @@ mm:
     partition: 148
     languages: my
     names: !include country-names/mm.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Mongolia (Монгол улс ᠮᠤᠩᠭᠤᠯ ᠤᠯᠤᠰ)
@@ -1063,6 +1185,8 @@ mn:
     partition: 167
     languages: mn
     names: !include country-names/mn.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Macao (Macao)
@@ -1114,6 +1238,8 @@ mu:
     partition: 150
     languages: mfe, fr, en
     names: !include country-names/mu.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Maldives (ދިވެހިރާއްޖެ)
@@ -1121,6 +1247,8 @@ mv:
     partition: 96
     languages: dv
     names: !include country-names/mv.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Malawi (Malawi)
@@ -1136,6 +1264,8 @@ mx:
     partition: 166
     languages: es
     names: !include country-names/mx.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Malaysia (Malaysia)
@@ -1143,6 +1273,8 @@ my:
     partition: 7
     languages: ms
     names: !include country-names/my.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Mozambique (Moçambique)
@@ -1157,6 +1289,8 @@ na:
     partition: 99
     languages: en, sf, de
     names: !include country-names/na.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # New Caledonia (Nouvelle-Calédonie)
@@ -1171,6 +1305,8 @@ ne:
     partition: 226
     languages: fr
     names: !include country-names/ne.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Norfolk Island (Norfolk Island)
@@ -1185,6 +1321,8 @@ ng:
     partition: 218
     languages: en
     names: !include country-names/ng.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Nicaragua (Nicaragua)
@@ -1192,6 +1330,8 @@ ni:
     partition: 151
     languages: es
     names: !include country-names/ni.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Netherlands (Nederland)
@@ -1206,6 +1346,8 @@ nl:
     partition: 60
     languages: nb, nn, no, se
     names: !include country-names/no.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Nepal (Nepal)
@@ -1213,6 +1355,8 @@ np:
     partition: 50
     languages: ne
     names: !include country-names/np.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Nauru (Naoero)
@@ -1236,6 +1380,8 @@ nz:
     partition: 27
     languages: mi, en
     names: !include country-names/nz.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Oman (عمان)
@@ -1243,6 +1389,8 @@ om:
     partition: 137
     languages: ar
     names: !include country-names/om.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Panama (Panamá)
@@ -1250,6 +1398,8 @@ pa:
     partition: 152
     languages: es
     names: !include country-names/pa.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Peru (Perú)
@@ -1257,6 +1407,8 @@ pe:
     partition: 51
     languages: es
     names: !include country-names/pe.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # French Polynesia (Polynésie française)
@@ -1271,6 +1423,8 @@ pg:
     partition: 71
     languages: en, tpi, ho
     names: !include country-names/pg.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Philippines (Philippines)
@@ -1278,6 +1432,8 @@ ph:
     partition: 26
     languages: en, tl
     names: !include country-names/ph.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Pakistan (پاکستان)
@@ -1285,6 +1441,8 @@ pk:
     partition: 14
     languages: en, ur, pnb, sd, ps, bal
     names: !include country-names/pk.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Poland (Polska)
@@ -1320,6 +1478,8 @@ ps:
     partition: 194
     languages: ar, he
     names: !include country-names/ps.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # Portugal (Portugal)
@@ -1341,6 +1501,8 @@ py:
     partition: 101
     languages: es, gn
     names: !include country-names/py.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Qatar (قطر)
@@ -1363,6 +1525,8 @@ ro:
     partition: 170
     languages: ro
     names: !include country-names/ro.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Serbia (Србија)
@@ -1370,6 +1534,8 @@ rs:
     partition: 59
     languages: sr
     names: !include country-names/rs.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Russia (Россия)
@@ -1377,6 +1543,8 @@ ru:
     partition: 135
     languages: ru
     names: !include country-names/ru.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Rwanda (Rwanda)
@@ -1415,6 +1583,8 @@ sd:
     partition: 72
     languages: ar, en
     names: !include country-names/sd.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Sweden (Sverige)
@@ -1429,6 +1599,8 @@ sg:
     partition: 115
     languages: zh-hans, en, ms, ta
     names: !include country-names/sg.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Saint Helena, Ascension and Tristan da Cunha (Saint Helena, Ascension and Tristan da Cunha)
@@ -1443,6 +1615,8 @@ si:
     partition: 36
     languages: sl
     names: !include country-names/si.yaml
+    postcode:
+      pattern: "dddd"
 
 
 #  (Svalbard and Jan Mayen)
@@ -1479,6 +1653,8 @@ sn:
     partition: 237
     languages: fr
     names: !include country-names/sn.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Somalia (Soomaaliya الصومال)
@@ -1517,6 +1693,8 @@ sv:
     partition: 103
     languages: es
     names: !include country-names/sv.yaml
+    postcode:
+      pattern: "dddd"
 
 
 #  (Sint Maarten)
@@ -1576,6 +1754,8 @@ th:
     partition: 32
     languages: th
     names: !include country-names/th.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Tajikistan (Тоҷикистон)
@@ -1583,6 +1763,8 @@ tj:
     partition: 129
     languages: tg, ru
     names: !include country-names/tj.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Tokelau (Tokelau)
@@ -1606,6 +1788,8 @@ tm:
     partition: 54
     languages: tk
     names: !include country-names/tm.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Tunisia (تونس)
@@ -1613,6 +1797,8 @@ tn:
     partition: 18
     languages: ar, fr
     names: !include country-names/tn.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Tonga (Tonga)
@@ -1628,6 +1814,8 @@ tr:
     partition: 81
     languages: tr
     names: !include country-names/tr.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Trinidad and Tobago (Trinidad and Tobago)
@@ -1635,6 +1823,8 @@ tt:
     partition: 221
     languages: en
     names: !include country-names/tt.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Tuvalu (Tuvalu)
@@ -1657,6 +1847,8 @@ tz:
     partition: 130
     languages: sw, en
     names: !include country-names/tz.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Ukraine (Україна)
@@ -1664,6 +1856,8 @@ ua:
     partition: 173
     languages: uk
     names: !include country-names/ua.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Uganda (Uganda)
@@ -1693,6 +1887,8 @@ uy:
     partition: 174
     languages: es
     names: !include country-names/uy.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Uzbekistan (Oʻzbekiston)
@@ -1700,6 +1896,8 @@ uz:
     partition: 157
     languages: uz, kaa
     names: !include country-names/uz.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Vatican City (Civitas Vaticana)
@@ -1721,6 +1919,8 @@ ve:
     partition: 108
     languages: es
     names: !include country-names/ve.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # British Virgin Islands (British Virgin Islands)
@@ -1742,6 +1942,8 @@ vn:
     partition: 75
     languages: vi
     names: !include country-names/vn.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Vanuatu (Vanuatu)
@@ -1771,6 +1973,8 @@ xk:
     partition: 59
     languages: sq, sr
     names: !include country-names/xk.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Yemen (اليمن)
@@ -1793,6 +1997,8 @@ za:
     partition: 76
     languages: en, af, st, tn, xh, zu
     names: !include country-names/za.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Zambia (Zambia)
@@ -1800,6 +2006,8 @@ zm:
     partition: 222
     languages: en
     names: !include country-names/zm.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Zimbabwe (Zimbabwe)

From 90d4d339dbed83cc90823401634f01a20e129548 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Thu, 19 May 2022 15:49:36 +0200
Subject: [PATCH 04/30] initial postcode cleaner for simple patterns

Moves postcodes that are either in countries without a postcode
system or don't correspond to the local pattern for postcodes into
a field for a normal address part. Makes them searchable but not as
a special address. This has two consequences: they are no longer a
skippable part of the address and the postcodes cannot be searched
on their own.
---
 .../tokenizer/sanitizers/clean_postcodes.py   | 99 +++++++++++++++++++
 .../sanitizers/test_clean_postcodes.py        | 54 ++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 nominatim/tokenizer/sanitizers/clean_postcodes.py
 create mode 100644 test/python/tokenizer/sanitizers/test_clean_postcodes.py

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
new file mode 100644
index 0000000000..b07908cdee
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that filters postcodes by their officially allowed pattern.
+
+Arguments:
+    convert-to-address: If set to 'yes' (the default), then postcodes that do
+                        not conform with their country-specific pattern are
+                        converted to an address component. That means that
+                        the postcode does not take part when computing the
+                        postcode centroids of a country but is still searchable.
+                        When set to 'no', non-conforming postcodes are not
+                        searchable either.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class _PostcodeMatcher:
+    """ Matches and formats a postcode according to the format definition.
+    """
+    def __init__(self, country_code, config):
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        self.pattern = re.compile(config['pattern'].replace('d', '[0-9]')
+                                                   .replace('l', '[A-Z]'))
+
+
+    def normalize(self, postcode):
+        """ Return the normalized version of the postcode. If the given postcode
+            does not correspond to the usage-pattern, return null.
+        """
+        normalized = postcode.strip().upper()
+
+        return normalized if self.pattern.fullmatch(normalized) else None
+
+
+class _PostcodeSanitizer:
+
+    def __init__(self, config):
+        self.convert_to_address = config.get_bool('convert-to-address', True)
+        # Objects without a country code can't have a postcode per definition.
+        self.country_without_postcode = {None}
+        self.country_matcher = {}
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                self.country_without_postcode.add(ccode)
+            elif isinstance(prop, dict):
+                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
+            else:
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+    def __call__(self, obj):
+        if not obj.address:
+            return
+
+        postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
+
+        for pos, postcode in postcodes:
+            formatted = self.scan(postcode.name, obj.place.country_code)
+
+            if formatted is None:
+                if self.convert_to_address:
+                    postcode.kind = 'unofficial_postcode'
+                else:
+                    obj.address.pop(pos)
+            else:
+                postcode.name = formatted
+
+
+    def scan(self, postcode, country):
+        """ Check the postcode for correct formatting and return the
+            normalized version. Returns None if the postcode does not
+            correspond to the oficial format of the given country.
+        """
+        if country in self.country_without_postcode:
+            return None
+
+        if country in self.country_matcher:
+            return self.country_matcher[country].normalize(postcode)
+
+        return postcode.upper()
+
+
+
+def create(config):
+    """ Create a housenumber processing function.
+    """
+
+    return _PostcodeSanitizer(config)
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
new file mode 100644
index 0000000000..7cb3c70fae
--- /dev/null
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that normalizes postcodes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.tools import country_info
+
+@pytest.fixture
+def sanitize(def_config, request):
+    country_info.setup_country_config(def_config)
+    sanitizer_args = {'step': 'clean-postcodes'}
+    for mark in request.node.iter_markers(name="sanitizer_params"):
+        sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+
+    def _run(country=None, **kwargs):
+        pi = {'address': kwargs}
+        if country is not None:
+            pi['country_code'] = country
+
+        _, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi))
+
+        return sorted([(p.kind, p.name) for p in address])
+
+    return _run
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+def test_postcode_no_country(sanitize, country):
+    assert sanitize(country=country, postcode='23231') == [('unofficial_postcode', '23231')]
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_no_country_drop(sanitize, country):
+    assert sanitize(country=country, postcode='23231') == []
+
+
+@pytest.mark.parametrize("postcode", ('12345', '  34009  '))
+def test_postcode_pass_good_format(sanitize, postcode):
+    assert sanitize(country='de', postcode=postcode) == [('postcode', postcode.strip())]
+
+
+@pytest.mark.parametrize("postcode", ('123456', '', '   ', '.....'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_drop_bad_format(sanitize, postcode):
+    assert sanitize(country='de', postcode=postcode) == []

From 28ab2f6048eff33e6119271c9fd31852db64240a Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Thu, 19 May 2022 16:26:51 +0200
Subject: [PATCH 05/30] add postcode patterns without optional spaces

---
 settings/country_settings.yaml                | 24 +++++++++++++++++++
 .../sanitizers/test_clean_postcodes.py        | 23 ++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index 972e267042..684b0e44e9 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -82,6 +82,8 @@ ar:
     partition: 39
     languages: es
     names: !include country-names/ar.yaml
+    postcode:
+      pattern: "l?dddd(?:lll)?"
 
 
 #  (American Samoa)
@@ -187,6 +189,8 @@ bh:
     partition: 62
     languages: ar
     names: !include country-names/bh.yaml
+    postcode:
+      pattern: "d?ddd"
 
 
 # Burundi (Burundi)
@@ -441,6 +445,8 @@ cy:
     partition: 114
     languages: el, tr
     names: !include country-names/cy.yaml
+    postcode:
+      pattern: "(?:99|d)ddd"
 
 
 # Czechia (Česko)
@@ -582,6 +588,8 @@ fk:
     partition: 91
     languages: en
     names: !include country-names/fk.yaml
+    postcode:
+      pattern: "FIQQ 1ZZ"
 
 
 # Federated States of Micronesia (Micronesia)
@@ -660,6 +668,8 @@ gh:
     partition: 211
     languages: en
     names: !include country-names/gh.yaml
+    postcode:
+      pattern: "ll-d?ddd-dddd"
 
 
 # Gibraltar (Gibraltar)
@@ -1005,6 +1015,8 @@ kz:
     partition: 94
     languages: kk, ru
     names: !include country-names/kz.yaml
+    postcode:
+      pattern: "(?:lddldld|dddddd)"
 
 
 # Laos (ປະເທດລາວ)
@@ -1111,6 +1123,8 @@ mc:
     partition: 242
     languages: fr
     names: !include country-names/mc.yaml
+    postcode:
+      pattern: "980dd"
 
 
 # Moldova (Moldova)
@@ -1494,6 +1508,8 @@ pw:
     partition: 195
     languages: en, pau, ja, sov, tox
     names: !include country-names/pw.yaml
+    postcode:
+      pattern: "969(39|40)"
 
 
 # Paraguay (Paraguay)
@@ -1646,6 +1662,8 @@ sm:
     partition: 153
     languages: it
     names: !include country-names/sm.yaml
+    postcode:
+      pattern: "4789d"
 
 
 # Senegal (Sénégal)
@@ -1717,6 +1735,8 @@ sz:
     partition: 82
     languages: en, ss
     names: !include country-names/sz.yaml
+    postcode:
+      pattern: "lddd"
 
 
 # Turks and Caicos Islands (Turks and Caicos Islands)
@@ -1873,6 +1893,8 @@ um:
     partition: 198
     languages: en
     names: !include country-names/um.yaml
+    postcode:
+      pattern: "96898"
 
 
 # United States (United States)
@@ -1905,6 +1927,8 @@ va:
     partition: 107
     languages: it
     names: !include country-names/va.yaml
+    postcode:
+      pattern: "00120"
 
 
 # Saint Vincent and the Grenadines (Saint Vincent and the Grenadines)
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
index 7cb3c70fae..d6371e075b 100644
--- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -52,3 +52,26 @@ def test_postcode_pass_good_format(sanitize, postcode):
 @pytest.mark.sanitizer_params(convert_to_address=False)
 def test_postcode_drop_bad_format(sanitize, postcode):
     assert sanitize(country='de', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('1234', '9435', '99000'))
+def test_postcode_cyprus_pass(sanitize, postcode):
+    assert sanitize(country='cy', postcode=postcode) == [('postcode', postcode)]
+
+
+@pytest.mark.parametrize("postcode", ('91234', '99a45', '567'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_cyprus_fail(sanitize, postcode):
+    assert sanitize(country='cy', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('123456', 'A33F2G7'))
+def test_postcode_kazakhstan_pass(sanitize, postcode):
+    assert sanitize(country='kz', postcode=postcode) == [('postcode', postcode)]
+
+
+@pytest.mark.parametrize("postcode", ('V34T6Y923456', '99345'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_kazakhstan_fail(sanitize, postcode):
+    assert sanitize(country='kz', postcode=postcode) == []
+

From baee6f3de09226c3dc41cb2314a0ac348e865561 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 23 May 2022 11:01:57 +0200
Subject: [PATCH 06/30] postcodes: strip leading country codes

---
 nominatim/tokenizer/sanitizers/clean_postcodes.py        | 9 ++++++---
 test/python/tokenizer/sanitizers/test_clean_postcodes.py | 8 +++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index b07908cdee..ae1cd62d8d 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -29,8 +29,9 @@ def __init__(self, country_code, config):
             raise UsageError("Field 'pattern' required for 'postcode' "
                              f"for country '{country_code}'")
 
-        self.pattern = re.compile(config['pattern'].replace('d', '[0-9]')
-                                                   .replace('l', '[A-Z]'))
+        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
+
+        self.pattern = re.compile(f'(?:{country_code.upper()}[ -]?)?({pc_pattern})')
 
 
     def normalize(self, postcode):
@@ -39,7 +40,9 @@ def normalize(self, postcode):
         """
         normalized = postcode.strip().upper()
 
-        return normalized if self.pattern.fullmatch(normalized) else None
+        match = self.pattern.fullmatch(normalized)
+
+        return match.group(1) if match else None
 
 
 class _PostcodeSanitizer:
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
index d6371e075b..e5c07596a7 100644
--- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -43,12 +43,14 @@ def test_postcode_no_country_drop(sanitize, country):
     assert sanitize(country=country, postcode='23231') == []
 
 
-@pytest.mark.parametrize("postcode", ('12345', '  34009  '))
+@pytest.mark.parametrize("postcode", ('12345', '  12345  ', 'de 12345',
+                                      'DE12345', 'DE 12345', 'DE-12345'))
 def test_postcode_pass_good_format(sanitize, postcode):
-    assert sanitize(country='de', postcode=postcode) == [('postcode', postcode.strip())]
+    assert sanitize(country='de', postcode=postcode) == [('postcode', '12345')]
 
 
-@pytest.mark.parametrize("postcode", ('123456', '', '   ', '.....'))
+@pytest.mark.parametrize("postcode", ('123456', '', '   ', '.....',
+                                      'DE  12345', 'DEF12345', 'CH 12345'))
 @pytest.mark.sanitizer_params(convert_to_address=False)
 def test_postcode_drop_bad_format(sanitize, postcode):
     assert sanitize(country='de', postcode=postcode) == []

From 49626ba7091ea305616c03b397984add8a09e7d4 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 23 May 2022 11:10:35 +0200
Subject: [PATCH 07/30] add postcode formats with optional country code

If the country code is not part of the mandatory output, the
country code filter will do the correct handling.
---
 settings/country_settings.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index 684b0e44e9..adb7593ed5 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -131,6 +131,8 @@ az:
     partition: 119
     languages: az
     names: !include country-names/az.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Bosnia and Herzegovina (Bosna i Hercegovina / Босна и Херцеговина)
@@ -513,6 +515,8 @@ ec:
     partition: 78
     languages: es
     names: !include country-names/ec.yaml
+    postcode:
+      pattern: "dddddd"
 
 
 # Estonia (Eesti)
@@ -606,6 +610,8 @@ fo:
     partition: 10
     languages: fo, da
     names: !include country-names/fo.yaml
+    postcode:
+      pattern: "ddd"
 
 
 # France (France)
@@ -804,6 +810,8 @@ ht:
     partition: 29
     languages: fr, ht
     names: !include country-names/ht.yaml
+    postcode:
+      pattern: "dddd"
 
 
 # Hungary (Magyarország)
@@ -1083,6 +1091,8 @@ lt:
     partition: 67
     languages: lt
     names: !include country-names/lt.yaml
+    postcode:
+      pattern: "ddddd"
 
 
 # Luxembourg (Lëtzebuerg)

From 9172696324d1a3cd489428d16c2c8d88cf9adaca Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 23 May 2022 14:04:22 +0200
Subject: [PATCH 08/30] postcodes: add support for optional spaces

---
 nominatim/tokenizer/sanitizers/clean_postcodes.py  | 14 ++++++++++----
 settings/country_settings.yaml                     |  9 +++++++++
 .../tokenizer/sanitizers/test_clean_postcodes.py   | 11 +++++++++++
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index ae1cd62d8d..a968c9db07 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -31,18 +31,24 @@ def __init__(self, country_code, config):
 
         pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
 
-        self.pattern = re.compile(f'(?:{country_code.upper()}[ -]?)?({pc_pattern})')
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        self.output = config.get('output', r'\g<0>')
 
 
     def normalize(self, postcode):
         """ Return the normalized version of the postcode. If the given postcode
             does not correspond to the usage-pattern, return null.
         """
-        normalized = postcode.strip().upper()
+        # Upper-case, strip spaces and leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
 
-        match = self.pattern.fullmatch(normalized)
+        if normalized:
+            match = self.pattern.fullmatch(normalized.group(1))
+            return match.expand(self.output) if match else None
 
-        return match.group(1) if match else None
+        return None
 
 
 class _PostcodeSanitizer:
diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index adb7593ed5..f09de046fc 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -456,6 +456,9 @@ cz:
     partition: 124
     languages: cs
     names: !include country-names/cz.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Germany (Deutschland)
@@ -1618,6 +1621,9 @@ se:
     partition: 112
     languages: sv
     names: !include country-names/se.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Singapore (Singapore)
@@ -1657,6 +1663,9 @@ sk:
     partition: 172
     languages: sk
     names: !include country-names/sk.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Sierra Leone (Sierra Leone)
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
index e5c07596a7..228c2f3a1a 100644
--- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -77,3 +77,14 @@ def test_postcode_kazakhstan_pass(sanitize, postcode):
 def test_postcode_kazakhstan_fail(sanitize, postcode):
     assert sanitize(country='kz', postcode=postcode) == []
 
+
+@pytest.mark.parametrize("postcode", ('675 34', '67534', 'SE-675 34', 'SE67534'))
+def test_postcode_sweden_pass(sanitize, postcode):
+    assert sanitize(country='se', postcode=postcode) == [('postcode', '675 34')]
+
+
+@pytest.mark.parametrize("postcode", ('67 345', '671123'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_sweden_fail(sanitize, postcode):
+    assert sanitize(country='se', postcode=postcode) == []
+

From 9cf700e85d723736bf54334f0b1bd9e885cbb42a Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 23 May 2022 16:11:16 +0200
Subject: [PATCH 09/30] add postcodes for most of the remaining countries

Now includes all postcodes that have optional parts.
---
 settings/country_settings.yaml | 111 +++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index f09de046fc..67905ea2a3 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -3,6 +3,9 @@ ad:
     partition: 35
     languages: ca
     names: !include country-names/ad.yaml
+    postcode:
+      pattern: "(ddd)"
+      output: AD\1
 
 
 # United Arab Emirates (الإمارات العربية المتحدة)
@@ -35,6 +38,9 @@ ai:
     partition: 175
     languages: en
     names: !include country-names/ai.yaml
+    postcode:
+      pattern: "2640"
+      output: AI-2640
 
 
 # Albania (Shqipëria)
@@ -75,6 +81,7 @@ aq:
     partition: 181
     languages: en, es, fr, ru
     names: !include country-names/aq.yaml
+    postcode: no
 
 
 # Argentina (Argentina)
@@ -149,6 +156,9 @@ bb:
     partition: 206
     languages: en
     names: !include country-names/bb.yaml
+    postcode:
+      pattern: "(ddddd)"
+      output: BB\1
 
 
 # Bangladesh (Bangladesh)
@@ -223,6 +233,9 @@ bm:
     partition: 176
     languages: en
     names: !include country-names/bm.yaml
+    postcode:
+      pattern: "(ll)[ -]?(dd)"
+      output: \1 \2
 
 
 # Brunei (Brunei)
@@ -230,6 +243,9 @@ bn:
     partition: 86
     languages: ms
     names: !include country-names/bn.yaml
+    postcode:
+      pattern: "(ll) ?(dddd)"
+      output: \1\2
 
 
 # Bolivia (Bolivia)
@@ -252,6 +268,9 @@ br:
     partition: 121
     languages: pt
     names: !include country-names/br.yaml
+    postcode:
+      pattern: "(ddddd)-?(ddd)"
+      output: \1-\2
 
 
 # The Bahamas (The Bahamas)
@@ -308,6 +327,9 @@ ca:
     partition: 244
     languages: en, fr
     names: !include country-names/ca.yaml
+    postcode:
+      pattern: "(ldl) ?(dld)"
+      output: \1 \2
 
 
 # Cocos (Keeling) Islands (Cocos (Keeling) Islands)
@@ -639,6 +661,9 @@ gb:
     partition: 1
     languages: en
     names: !include country-names/gb.yaml
+    postcode:
+      pattern: "(l?ld[A-Z0-9]?) ?(dll)"
+      output: \1 \2
 
 
 # Grenada (Grenada)
@@ -670,6 +695,9 @@ gg:
     partition: 77
     languages: en
     names: !include country-names/gg.yaml
+    postcode:
+      pattern: "(GYdd?) ?(dll)"
+      output: \1 \2
 
 
 # Ghana (Ghana)
@@ -686,6 +714,9 @@ gi:
     partition: 138
     languages: en
     names: !include country-names/gi.yaml
+    postcode:
+      pattern: "(GX11) ?(1AA)"
+      output: GX11 1AA
 
 
 # Greenland (Kalaallit Nunaat)
@@ -734,6 +765,9 @@ gr:
     partition: 22
     languages: el
     names: !include country-names/gr.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # South Georgia and the South Sandwich Islands (South Georgia and the South Sandwich Islands)
@@ -741,6 +775,9 @@ gs:
     partition: 44
     languages: en
     names: !include country-names/gs.yaml
+    postcode:
+      pattern: "(SIQQ) ?(1ZZ)"
+      output: \1 \2
 
 
 # Guatemala (Guatemala)
@@ -840,6 +877,9 @@ ie:
     partition: 46
     languages: en, ga
     names: !include country-names/ie.yaml
+    postcode:
+      pattern: "(ldd) ?([0123456789ACDEFHKNPRTVWXY]{4})"
+      output: \1 \2
 
 
 # Israel (ישראל)
@@ -856,6 +896,9 @@ im:
     partition: 190
     languages: en
     names: !include country-names/im.yaml
+    postcode:
+      pattern: "(IMdd?) ?(dll)"
+      output: \1 \2
 
 
 # India (India)
@@ -863,6 +906,9 @@ in:
     partition: 128
     languages: hi, en
     names: !include country-names/in.yaml
+    postcode:
+      pattern: "(ddd) ?(ddd)"
+      output: \1\2
 
 
 # British Indian Ocean Territory (British Indian Ocean Territory)
@@ -870,6 +916,9 @@ io:
     partition: 13
     languages: en
     names: !include country-names/io.yaml
+    postcode:
+      pattern: "(BBND) ?(1ZZ)"
+      output: \1 \2
 
 
 # Iraq (العراق)
@@ -886,6 +935,9 @@ ir:
     partition: 80
     languages: fa
     names: !include country-names/ir.yaml
+    postcode:
+      pattern: "(ddddd)[-_ ]?(ddddd)"
+      output: \1-\2
 
 
 # Iceland (Ísland)
@@ -911,6 +963,9 @@ je:
     partition: 123
     languages: en
     names: !include country-names/je.yaml
+    postcode:
+      pattern: "(JEdd?) ?(dll)"
+      output: \1 \2
 
 
 # Jamaica (Jamaica)
@@ -918,6 +973,7 @@ jm:
     partition: 214
     languages: en
     names: !include country-names/jm.yaml
+    postcode: no
 
 
 # Jordan (الأردن)
@@ -934,6 +990,9 @@ jp:
     partition: 11
     languages: ja
     names: !include country-names/jp.yaml
+    postcode:
+      pattern: "(ddd)-?(dddd)"
+      output: \1-\2
 
 
 # Kenya (Kenya)
@@ -1019,6 +1078,9 @@ ky:
     partition: 38
     languages: en
     names: !include country-names/ky.yaml
+    postcode:
+      pattern: "(d)-(dddd)"
+      output: KY\1-\2
 
 
 # Kazakhstan (Қазақстан)
@@ -1044,6 +1106,8 @@ lb:
     partition: 66
     languages: ar, fr
     names: !include country-names/lb.yaml
+    postcode:
+      pattern: "(dddd)(?: ?dddd)?"
 
 
 # Saint Lucia (Saint Lucia)
@@ -1051,6 +1115,9 @@ lc:
     partition: 146
     languages: en
     names: !include country-names/lc.yaml
+    postcode:
+      pattern: "(dd) ?(ddd)"
+      output: LC\1 \2
 
 
 # Liechtenstein (Liechtenstein)
@@ -1112,6 +1179,9 @@ lv:
     partition: 162
     languages: lv
     names: !include country-names/lv.yaml
+    postcode:
+      pattern: "(dddd)"
+      output: LV-\1
 
 
 # Libya (ليبيا)
@@ -1145,6 +1215,9 @@ md:
     partition: 147
     languages: ro, ru, uk
     names: !include country-names/md.yaml
+    postcode:
+      pattern: "(dddd)"
+      output: MD-\1
 
 
 # Montenegro (Crna Gora / Црна Гора)
@@ -1258,6 +1331,9 @@ mt:
     partition: 165
     languages: mt, en
     names: !include country-names/mt.yaml
+    postcode:
+      pattern: "(lll) ?(dddd)"
+      output: \1 \2
 
 
 # Mauritius (Mauritius)
@@ -1309,6 +1385,9 @@ mz:
     partition: 98
     languages: pt
     names: !include country-names/mz.yaml
+    postcode:
+      pattern: "(dddd)-?(dd)?"
+      output: \1-\2
 
 
 # Namibia (Namibia)
@@ -1366,6 +1445,9 @@ nl:
     partition: 63
     languages: nl
     names: !include country-names/nl.yaml
+    postcode:
+      pattern: "(dddd) ?(ll)"
+      output: \1 \2
 
 
 # Norway (Norge)
@@ -1477,6 +1559,9 @@ pl:
     partition: 168
     languages: pl
     names: !include country-names/pl.yaml
+    postcode:
+      pattern: "(dd)[ -]?(ddd)"
+      output: \1-\2
 
 
 # Saint Pierre and Miquelon (Saint-Pierre-et-Miquelon)
@@ -1491,6 +1576,9 @@ pn:
     partition: 113
     languages: en, pih
     names: !include country-names/pn.yaml
+    postcode:
+      pattern: "(PCRN) ?(1ZZ)"
+      output: \1 \2
 
 
 # Puerto Rico (Puerto Rico)
@@ -1514,6 +1602,8 @@ pt:
     partition: 34
     languages: pt
     names: !include country-names/pt.yaml
+    postcode:
+      pattern: "dddd(?:-ddd)?"
 
 
 # Palau (Belau)
@@ -1589,6 +1679,8 @@ sa:
     partition: 52
     languages: ar
     names: !include country-names/sa.yaml
+    postcode:
+      pattern: "ddddd(?:-dddd)?"
 
 
 # Solomon Islands (Solomon Islands)
@@ -1640,6 +1732,9 @@ sh:
     partition: 196
     languages: en
     names: !include country-names/sh.yaml
+    postcode:
+      pattern: "(ASCN|STHL|TDCU) ?(1ZZ)"
+      output: \1 \2
 
 
 # Slovenia (Slovenija)
@@ -1699,6 +1794,9 @@ so:
     partition: 154
     languages: so, ar
     names: !include country-names/so.yaml
+    postcode:
+      pattern: "(ll) ?(ddddd)"
+      output: \1 \2
 
 
 # Suriname (Suriname)
@@ -1763,6 +1861,9 @@ tc:
     partition: 106
     languages: en
     names: !include country-names/tc.yaml
+    postcode:
+      pattern: "(TKCA) ?(1ZZ)"
+      output: \1 \2
 
 
 # Chad (Tchad تشاد)
@@ -1879,6 +1980,8 @@ tw:
     partition: 25
     languages: zh-hant
     names: !include country-names/tw.yaml
+    postcode:
+      pattern: "ddd(?:ddd?)?"
 
 
 # Tanzania (Tanzania)
@@ -1921,6 +2024,8 @@ us:
     partition: 2
     languages: en
     names: !include country-names/us.yaml
+    postcode:
+      pattern: "(ddddd)(?:-dddd)?"
 
 
 # Uruguay (Uruguay)
@@ -1955,6 +2060,9 @@ vc:
     partition: 171
     languages: en
     names: !include country-names/vc.yaml
+    postcode:
+      pattern: "(dddd)"
+      output: VC\1
 
 
 # Venezuela (Venezuela)
@@ -1971,6 +2079,9 @@ vg:
     partition: 109
     languages: en
     names: !include country-names/vg.yaml
+    postcode:
+      pattern: "(dddd)"
+      output: VG\1
 
 
 #  (United States Virgin Islands)

From 5ba75df507617162907c2b42a7825ee406218582 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 24 May 2022 17:11:40 +0200
Subject: [PATCH 10/30] postcode: generate a generic form

---
 .../tokenizer/sanitizers/clean_postcodes.py   | 31 +++++++++++++------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index a968c9db07..42beea37fe 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -37,20 +37,28 @@ def __init__(self, country_code, config):
         self.output = config.get('output', r'\g<0>')
 
 
-    def normalize(self, postcode):
-        """ Return the normalized version of the postcode. If the given postcode
-            does not correspond to the usage-pattern, return null.
+    def match(self, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the match was successful
+            and None otherwise.
         """
         # Upper-case, strip spaces and leading country code.
         normalized = self.norm_pattern.fullmatch(postcode.upper())
 
         if normalized:
-            match = self.pattern.fullmatch(normalized.group(1))
-            return match.expand(self.output) if match else None
+            return self.pattern.fullmatch(normalized.group(1))
 
         return None
 
 
+    def normalize(self, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return match.expand(self.output)
+
+
 class _PostcodeSanitizer:
 
     def __init__(self, config):
@@ -83,7 +91,8 @@ def __call__(self, obj):
                 else:
                     obj.address.pop(pos)
             else:
-                postcode.name = formatted
+                postcode.name = formatted[0]
+                postcode.set_attr('lookup', formatted[1])
 
 
     def scan(self, postcode, country):
@@ -94,10 +103,14 @@ def scan(self, postcode, country):
         if country in self.country_without_postcode:
             return None
 
-        if country in self.country_matcher:
-            return self.country_matcher[country].normalize(postcode)
+        matcher = self.country_matcher.get(country)
+        if matcher is not None:
+            match = matcher.match(postcode)
+            if match is None:
+                return None
+            return matcher.normalize(match), ' '.join(match.groups())
 
-        return postcode.upper()
+        return postcode.upper(), ''
 
 
 

From 18864afa8aee710a5aa7fe65565711119ca7a663 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 24 May 2022 18:25:37 +0200
Subject: [PATCH 11/30] postcodes: introduce a default pattern for countries
 without postcodes

---
 .../tokenizer/sanitizers/clean_postcodes.py   | 22 +++++++++++++------
 settings/icu_tokenizer.yaml                   |  1 +
 .../sanitizers/test_clean_postcodes.py        | 12 ++++++++++
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index 42beea37fe..c6292a2942 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -75,6 +75,12 @@ def __init__(self, config):
             else:
                 raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
 
+        default_pattern = config.get('default-pattern')
+        if default_pattern is not None and isinstance(default_pattern, str):
+            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
+        else:
+            self.default_matcher = None
+
 
     def __call__(self, obj):
         if not obj.address:
@@ -103,14 +109,16 @@ def scan(self, postcode, country):
         if country in self.country_without_postcode:
             return None
 
-        matcher = self.country_matcher.get(country)
-        if matcher is not None:
-            match = matcher.match(postcode)
-            if match is None:
-                return None
-            return matcher.normalize(match), ' '.join(match.groups())
+        matcher = self.country_matcher.get(country, self.default_matcher)
+        if matcher is None:
+            return postcode.upper(), ''
+
+        match = matcher.match(postcode)
+        if match is None:
+            return None
+
+        return matcher.normalize(match), ' '.join(match.groups())
 
-        return postcode.upper(), ''
 
 
 
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index 544bd81db0..f682bbcdf8 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -34,6 +34,7 @@ sanitizers:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
     - step: clean-postcodes
       convert-to-address: yes
+      default-pattern: [A-Z0-9- ]{3,12}
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
index 228c2f3a1a..4437619625 100644
--- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -88,3 +88,15 @@ def test_postcode_sweden_pass(sanitize, postcode):
 def test_postcode_sweden_fail(sanitize, postcode):
     assert sanitize(country='se', postcode=postcode) == []
 
+
+@pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44'))
+@pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}')
+def test_postcode_default_pattern_pass(sanitize, postcode):
+    assert sanitize(country='an', postcode=postcode) == [('postcode', postcode.upper())]
+
+
+@pytest.mark.parametrize("postcode", ('C', '12', 'ABC123DEF 456', '1234,5678', '11223;11224'))
+@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}')
+def test_postcode_default_pattern_fail(sanitize, postcode):
+    assert sanitize(country='an', postcode=postcode) == []
+

From ca7b46511d41d67e229f758e638367c241815c11 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 24 May 2022 21:45:06 +0200
Subject: [PATCH 12/30] introduce and use analyzer for postcodes

---
 lib-sql/tokenizer/icu_tokenizer.sql           | 23 ++++++++
 nominatim/tokenizer/icu_tokenizer.py          | 51 +++++++++++-------
 .../tokenizer/sanitizers/clean_postcodes.py   |  2 +-
 .../tokenizer/token_analysis/postcodes.py     | 54 +++++++++++++++++++
 settings/icu_tokenizer.yaml                   |  4 +-
 5 files changed, 114 insertions(+), 20 deletions(-)
 create mode 100644 nominatim/tokenizer/token_analysis/postcodes.py

diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index a3dac8ddcb..f323334b88 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -223,3 +223,26 @@ BEGIN
 END;
 $$
 LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
+  RETURNS BOOLEAN
+  AS $$
+DECLARE
+  existing INTEGER;
+BEGIN
+  SELECT count(*) INTO existing
+    FROM word WHERE word = postcode and type = 'P';
+
+  IF existing > 0 THEN
+    RETURN TRUE;
+  END IF;
+
+  -- postcodes don't need word ids
+  INSERT INTO word (word_token, type, word)
+    SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term;
+
+  RETURN FALSE;
+END;
+$$
+LANGUAGE plpgsql;
+
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 4678af66eb..e9812ba043 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -11,7 +11,6 @@
 import itertools
 import json
 import logging
-import re
 from textwrap import dedent
 
 from nominatim.db.connection import connect
@@ -473,7 +472,7 @@ def process_place(self, place):
     def _process_place_address(self, token_info, address):
         for item in address:
             if item.kind == 'postcode':
-                self._add_postcode(item.name)
+                token_info.set_postcode(self._add_postcode(item))
             elif item.kind == 'housenumber':
                 token_info.add_housenumber(*self._compute_housenumber_token(item))
             elif item.kind == 'street':
@@ -605,26 +604,36 @@ def _compute_name_tokens(self, names):
         return full_tokens, partial_tokens
 
 
-    def _add_postcode(self, postcode):
+    def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None:
-            postcode = self.normalize_postcode(postcode)
+        analyzer = self.token_analysis.get_analyzer('@postcode')
 
-            if postcode not in self._cache.postcodes:
-                term = self._search_normalized(postcode)
-                if not term:
-                    return
+        if analyzer is None:
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.normalize(item.name)
+            variant_base = item.get_attr("variant")
 
-                with self.conn.cursor() as cur:
-                    # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word_token, type, word)
-                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
-                                    WHERE NOT EXISTS
-                                     (SELECT * FROM word
-                                      WHERE type = 'P' and word = pc))
-                                """, (term, postcode))
-                self._cache.postcodes.add(postcode)
+        if variant_base is not None:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
+
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return
+
+            variants = {term}
+            if analyzer is not None and variant_base is not None:
+                variants.update(analyzer.get_variants_ascii(variant_base))
+
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)",
+                            (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
 
 
 class _TokenInfo:
@@ -637,6 +646,7 @@ def __init__(self):
         self.street_tokens = set()
         self.place_tokens = set()
         self.address_tokens = {}
+        self.postcode = None
 
 
     @staticmethod
@@ -701,6 +711,11 @@ def add_address_term(self, key, partials):
         if partials:
             self.address_tokens[key] = self._mk_array(partials)
 
+    def set_postcode(self, postcode):
+        """ Set the postcode to the given one.
+        """
+        self.postcode = postcode
+
 
 class _TokenCache:
     """ Cache for token information to avoid repeated database queries.
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index c6292a2942..d1edc60d1e 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -98,7 +98,7 @@ def __call__(self, obj):
                     obj.address.pop(pos)
             else:
                 postcode.name = formatted[0]
-                postcode.set_attr('lookup', formatted[1])
+                postcode.set_attr('variant', formatted[1])
 
 
     def scan(self, postcode, country):
diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py
new file mode 100644
index 0000000000..e105b132da
--- /dev/null
+++ b/nominatim/tokenizer/token_analysis/postcodes.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Specialized processor for postcodes. Supports a 'lookup' variant of the
+token, which produces variants with optional spaces.
+"""
+
+from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
+
+### Configuration section
+
+def configure(rules, normalization_rules): # pylint: disable=W0613
+    """ All behaviour is currently hard-coded.
+    """
+    return None
+
+### Analysis section
+
+def create(normalizer, transliterator, config): # pylint: disable=W0613
+    """ Create a new token analysis instance for this module.
+    """
+    return PostcodeTokenAnalysis(normalizer, transliterator)
+
+class PostcodeTokenAnalysis:
+    """ Normalizes postcodes and computes their spelling variants.
+    """
+    def __init__(self, norm, trans):
+        self.norm = norm
+        self.trans = trans
+
+        self.mutator = MutationVariantGenerator(' ', (' ', ''))
+
+
+    def normalize(self, name):
+        """ Return the standard form of the postcode.
+        """
+        return name.strip().upper()
+
+
+    def get_variants_ascii(self, norm_name):
+        """ Compute the spelling variants for the given normalized postcode.
+
+            The official form creates one variant. If a 'lookup version' is
+            given, then it will create variants with optional spaces.
+        """
+        # Postcodes follow their own transliteration rules.
+        # Make sure at this point, that the terms are normalized in a way
+        # that they are searchable with the standard transliteration rules.
+        return [self.trans.transliterate(term) for term in
+                self.mutator.generate([self.norm.transliterate(norm_name)])]
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index f682bbcdf8..212fdcb9e2 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -34,7 +34,7 @@ sanitizers:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
     - step: clean-postcodes
       convert-to-address: yes
-      default-pattern: [A-Z0-9- ]{3,12}
+      default-pattern: "[A-Z0-9- ]{3,12}"
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
@@ -46,6 +46,8 @@ token-analysis:
     - analyzer: generic
     - id: "@housenumber"
       analyzer: housenumbers
+    - id: "@postcode"
+      analyzer: postcodes
     - id: bg
       analyzer: generic
       mode: variant-only

From b7704833e4b011541928372a46ea692c3a496b5c Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Fri, 3 Jun 2022 17:12:01 +0200
Subject: [PATCH 13/30] icu: switch postcodes to using the pre-formatted one

---
 lib-sql/functions/interpolation.sql   | 19 +++++++++----------
 lib-sql/functions/placex_triggers.sql |  5 ++---
 lib-sql/tokenizer/icu_tokenizer.sql   |  7 +++++++
 nominatim/tokenizer/icu_tokenizer.py  |  3 +++
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql
index c8cfbcc68c..3a99471101 100644
--- a/lib-sql/functions/interpolation.sql
+++ b/lib-sql/functions/interpolation.sql
@@ -156,7 +156,6 @@ DECLARE
   linegeo GEOMETRY;
   splitline GEOMETRY;
   sectiongeo GEOMETRY;
-  interpol_postcode TEXT;
   postcode TEXT;
   stepmod SMALLINT;
 BEGIN
@@ -174,8 +173,6 @@ BEGIN
                                                  ST_PointOnSurface(NEW.linegeo),
                                                  NEW.linegeo);
 
-  interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
-
   NEW.token_info := token_strip_info(NEW.token_info);
   IF NEW.address ? '_inherited' THEN
     NEW.address := hstore('interpolation', NEW.address->'interpolation');
@@ -207,6 +204,11 @@ BEGIN
     FOR nextnode IN
       SELECT DISTINCT ON (nodeidpos)
           osm_id, address, geometry,
+          -- Take the postcode from the node only if it has a postcode tag itself.
+          -- Note that there is a corner-case where the node has a wrongly
+          -- formatted postcode and therefore 'postcode' contains a derived
+          -- variant.
+          CASE WHEN address ? 'postcode' THEN placex.postcode ELSE NULL::text END as postcode,
           substring(address->'housenumber','[0-9]+')::integer as hnr
         FROM placex, generate_series(1, array_upper(waynodes, 1)) nodeidpos
         WHERE osm_type = 'N' and osm_id = waynodes[nodeidpos]::BIGINT
@@ -260,13 +262,10 @@ BEGIN
         endnumber := newend;
 
         -- determine postcode
-        postcode := coalesce(interpol_postcode,
-                             token_normalized_postcode(prevnode.address->'postcode'),
-                             token_normalized_postcode(nextnode.address->'postcode'),
-                             postcode);
-        IF postcode is NULL THEN
-            SELECT token_normalized_postcode(placex.postcode)
-              FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
+        postcode := coalesce(prevnode.postcode, nextnode.postcode, postcode);
+        IF postcode is NULL and NEW.parent_place_id > 0 THEN
+            SELECT placex.postcode FROM placex
+              WHERE place_id = NEW.parent_place_id INTO postcode;
         END IF;
         IF postcode is NULL THEN
             postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);
diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql
index 6143a1edae..1f7e6dc61a 100644
--- a/lib-sql/functions/placex_triggers.sql
+++ b/lib-sql/functions/placex_triggers.sql
@@ -992,7 +992,7 @@ BEGIN
       {% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
 
       -- determine postcode
-      NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+      NEW.postcode := coalesce(token_get_postcode(NEW.token_info),
                                location.postcode,
                                get_nearest_postcode(NEW.country_code, NEW.centroid));
 
@@ -1150,8 +1150,7 @@ BEGIN
 
   {% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
 
-  NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
-                           NEW.postcode);
+  NEW.postcode := coalesce(token_get_postcode(NEW.token_info), NEW.postcode);
 
   -- if we have a name add this to the name search table
   IF NEW.name IS NOT NULL THEN
diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index f323334b88..f86a0a3794 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -104,6 +104,13 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
+CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
+  RETURNS TEXT
+AS $$
+  SELECT info->>'postcode';
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
 -- Return token info that should be saved permanently in the database.
 CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
   RETURNS JSONB
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index e9812ba043..61c47c1188 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -675,6 +675,9 @@ def to_dict(self):
         if self.address_tokens:
             out['addr'] = self.address_tokens
 
+        if self.postcode:
+            out['postcode'] = self.postcode
+
         return out
 
 

From 4885fdf0f97d0615027fa6b2ed410e75ae1a2e20 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 6 Jun 2022 09:49:00 +0200
Subject: [PATCH 14/30] add class for online centroid computation

---
 nominatim/utils/__init__.py        |  0
 nominatim/utils/centroid.py        | 48 +++++++++++++++++++++++++
 test/python/utils/test_centroid.py | 56 ++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+)
 create mode 100644 nominatim/utils/__init__.py
 create mode 100644 nominatim/utils/centroid.py
 create mode 100644 test/python/utils/test_centroid.py

diff --git a/nominatim/utils/__init__.py b/nominatim/utils/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/nominatim/utils/centroid.py b/nominatim/utils/centroid.py
new file mode 100644
index 0000000000..c2bd61927e
--- /dev/null
+++ b/nominatim/utils/centroid.py
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for computation of centroids.
+"""
+from collections.abc import Collection
+
+class PointsCentroid:
+    """ Centroid computation from single points using an online algorithm.
+        More points may be added at any time.
+
+        Coordinates are internally treated as a 7-digit fixed-point float
+        (i.e. in OSM style).
+    """
+
+    def __init__(self):
+        self.sum_x = 0
+        self.sum_y = 0
+        self.count = 0
+
+    def centroid(self):
+        """ Return the centroid of all points collected so far.
+        """
+        if self.count == 0:
+            raise ValueError("No points available for centroid.")
+
+        return (float(self.sum_x/self.count)/10000000,
+                float(self.sum_y/self.count)/10000000)
+
+
+    def __len__(self):
+        return self.count
+
+
+    def __iadd__(self, other):
+        if isinstance(other, Collection) and len(other) == 2:
+            if all(isinstance(p, (float, int)) for p in other):
+                x, y = other
+                self.sum_x += int(x * 10000000)
+                self.sum_y += int(y * 10000000)
+                self.count += 1
+                return self
+
+        raise ValueError("Can only add 2-element tuples to centroid.")
diff --git a/test/python/utils/test_centroid.py b/test/python/utils/test_centroid.py
new file mode 100644
index 0000000000..63d967e756
--- /dev/null
+++ b/test/python/utils/test_centroid.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for centroid computation.
+"""
+import pytest
+
+from nominatim.utils.centroid import PointsCentroid
+
+def test_empty_set():
+    c = PointsCentroid()
+
+    with pytest.raises(ValueError, match='No points'):
+        c.centroid()
+
+
+@pytest.mark.parametrize("centroid", [(0,0), (-1, 3), [0.0000032, 88.4938]])
+def test_one_point_centroid(centroid):
+    c = PointsCentroid()
+
+    c += centroid
+
+    assert len(c.centroid()) == 2
+    assert c.centroid() == (pytest.approx(centroid[0]), pytest.approx(centroid[1]))
+
+
+def test_multipoint_centroid():
+    c = PointsCentroid()
+
+    c += (20.0, -10.0)
+    assert c.centroid() == (pytest.approx(20.0), pytest.approx(-10.0))
+    c += (20.2, -9.0)
+    assert c.centroid() == (pytest.approx(20.1), pytest.approx(-9.5))
+    c += (20.2, -9.0)
+    assert c.centroid() == (pytest.approx(20.13333), pytest.approx(-9.333333))
+
+
+def test_manypoint_centroid():
+    c = PointsCentroid()
+
+    for _ in range(10000):
+        c += (4.564732, -0.000034)
+
+    assert c.centroid() == (pytest.approx(4.564732), pytest.approx(-0.000034))
+
+
+@pytest.mark.parametrize("param", ["aa", None, 5, [1, 2, 3], (3, None), ("a", 3.9)])
+def test_add_non_tuple(param):
+    c = PointsCentroid()
+
+    with pytest.raises(ValueError, match='2-element tuples'):
+        c += param

From bf86b45178ff69dbe87942840543d67168577401 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 6 Jun 2022 10:46:48 +0200
Subject: [PATCH 15/30] move postcode centroid computation to Python

---
 nominatim/tools/postcodes.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py
index 2b7027e721..42f54cea3e 100644
--- a/nominatim/tools/postcodes.py
+++ b/nominatim/tools/postcodes.py
@@ -8,6 +8,7 @@
 Functions for importing, updating and otherwise maintaining the table
 of artificial postcode centroids.
 """
+from collections import defaultdict
 import csv
 import gzip
 import logging
@@ -16,6 +17,7 @@
 from psycopg2 import sql as pysql
 
 from nominatim.db.connection import connect
+from nominatim.utils.centroid import PointsCentroid
 
 LOG = logging.getLogger()
 
@@ -36,14 +38,14 @@ class _CountryPostcodesCollector:
 
     def __init__(self, country):
         self.country = country
-        self.collected = {}
+        self.collected = defaultdict(PointsCentroid)
 
 
     def add(self, postcode, x, y):
         """ Add the given postcode to the collection cache. If the postcode
             already existed, it is overwritten with the new centroid.
         """
-        self.collected[postcode] = (x, y)
+        self.collected[postcode] += (x, y)
 
 
     def commit(self, conn, analyzer, project_dir):
@@ -93,16 +95,16 @@ def _compute_changes(self, conn):
                            WHERE country_code = %s""",
                         (self.country, ))
             for postcode, x, y in cur:
-                newx, newy = self.collected.pop(postcode, (None, None))
-                if newx is not None:
-                    dist = (x - newx)**2 + (y - newy)**2
-                    if dist > 0.0000001:
+                pcobj = self.collected.pop(postcode, None)
+                if pcobj:
+                    newx, newy = pcobj.centroid()
+                    if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
                         to_update.append((postcode, newx, newy))
                 else:
                     to_delete.append(postcode)
 
-        to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
-        self.collected = []
+        to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
+        self.collected = None
 
         return to_add, to_delete, to_update
 
@@ -125,8 +127,10 @@ def _update_from_external(self, analyzer, project_dir):
                 postcode = analyzer.normalize_postcode(row['postcode'])
                 if postcode not in self.collected:
                     try:
-                        self.collected[postcode] = (_to_float(row['lon'], 180),
-                                                    _to_float(row['lat'], 90))
+                        # Do float conversion separately, it might throw
+                        centroid = (_to_float(row['lon'], 180),
+                                    _to_float(row['lat'], 90))
+                        self.collected[postcode] += centroid
                     except ValueError:
                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
                                     row['lat'], row['lon'], self.country)
@@ -174,12 +178,10 @@ def update_postcodes(dsn, project_dir, tokenizer):
                         COALESCE(plx.country_code,
                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
                         token_normalized_postcode(pl.address->'postcode') as pc,
-                        ST_Centroid(ST_Collect(COALESCE(plx.centroid,
-                                                        ST_Centroid(pl.geometry)))) as centroid
+                        COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
                       FROM place AS pl LEFT OUTER JOIN placex AS plx
                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
-                    WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
-                    GROUP BY cc, pc) xx
+                    WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
                 WHERE pc IS NOT null AND cc IS NOT null
                 ORDER BY country_code, pc""")
 

From 80ea13437df4c6d57ea503adbdfc9928de8d859c Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 6 Jun 2022 23:37:04 +0200
Subject: [PATCH 16/30] move postcode matcher in a separate file

---
 nominatim/data/__init__.py                    |  0
 nominatim/data/postcode_format.py             | 97 +++++++++++++++++++
 nominatim/tokenizer/icu_tokenizer.py          |  2 +-
 .../tokenizer/sanitizers/clean_postcodes.py   | 70 +------------
 test/python/tokenizer/test_icu.py             |  7 --
 5 files changed, 103 insertions(+), 73 deletions(-)
 create mode 100644 nominatim/data/__init__.py
 create mode 100644 nominatim/data/postcode_format.py

diff --git a/nominatim/data/__init__.py b/nominatim/data/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py
new file mode 100644
index 0000000000..0158111ada
--- /dev/null
+++ b/nominatim/data/postcode_format.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for formatting postcodes according to their country-specific
+format.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class CountryPostcodeMatcher:
+    """ Matches and formats a postcode according to a format definition
+        of the given country.
+    """
+    def __init__(self, country_code, config):
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
+
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        self.output = config.get('output', r'\g<0>')
+
+
+    def match(self, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the match was successful
+            and None otherwise.
+        """
+        # Upper-case, strip spaces and leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
+
+        if normalized:
+            return self.pattern.fullmatch(normalized.group(1))
+
+        return None
+
+
+    def normalize(self, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return match.expand(self.output)
+
+
+class PostcodeFormatter:
+    """ Container for different postcode formats of the world and
+        access functions.
+    """
+    def __init__(self):
+        # Objects without a country code can't have a postcode per definition.
+        self.country_without_postcode = {None}
+        self.country_matcher = {}
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                self.country_without_postcode.add(ccode)
+            elif isinstance(prop, dict):
+                self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
+            else:
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+    def set_default_pattern(self, pattern):
+        """ Set the postcode match pattern to use when a country does not
+            have a specific pattern of its own.
+        """
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
+
+
+    def match(self, country_code, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the country has a pattern
+            and the match was successful or None if the match failed.
+        """
+        if country_code in self.country_without_postcode:
+            return None
+
+        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
+
+
+    def normalize(self, country_code, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 61c47c1188..0dc551e1b4 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -607,7 +607,7 @@ def _compute_name_tokens(self, names):
     def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        analyzer = self.token_analysis.get_analyzer('@postcode')
+        analyzer = self.token_analysis.analysis.get('@postcode')
 
         if analyzer is None:
             postcode_name = item.name.strip().upper()
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index d1edc60d1e..fbc46fa582 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -16,70 +16,17 @@
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
 """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
-
-        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
-        self.pattern = re.compile(pc_pattern)
-
-        self.output = config.get('output', r'\g<0>')
-
-
-    def match(self, postcode):
-        """ Match the given postcode against the postcode pattern for this
-            matcher. Returns a `re.Match` object if the match was successful
-            and None otherwise.
-        """
-        # Upper-case, strip spaces and leading country code.
-        normalized = self.norm_pattern.fullmatch(postcode.upper())
-
-        if normalized:
-            return self.pattern.fullmatch(normalized.group(1))
-
-        return None
-
-
-    def normalize(self, match):
-        """ Return the default format of the postcode for the given match.
-            `match` must be a `re.Match` object previously returned by
-            `match()`
-        """
-        return match.expand(self.output)
-
+from nominatim.data.postcode_format import PostcodeFormatter
 
 class _PostcodeSanitizer:
 
     def __init__(self, config):
         self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
 
         default_pattern = config.get('default-pattern')
         if default_pattern is not None and isinstance(default_pattern, str):
-            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
-        else:
-            self.default_matcher = None
+            self.matcher.set_default_pattern(default_pattern)
 
 
     def __call__(self, obj):
@@ -106,18 +53,11 @@ def scan(self, postcode, country):
             normalized version. Returns None if the postcode does not
             correspond to the oficial format of the given country.
         """
-        if country in self.country_without_postcode:
-            return None
-
-        matcher = self.country_matcher.get(country, self.default_matcher)
-        if matcher is None:
-            return postcode.upper(), ''
-
-        match = matcher.match(postcode)
+        match = self.matcher.match(country, postcode)
         if match is None:
             return None
 
-        return matcher.normalize(match), ' '.join(match.groups())
+        return self.matcher.normalize(country, match), ' '.join(match.groups())
 
 
 
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index d85a5b65e5..6138a03a42 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -437,13 +437,6 @@ def test_process_place_postcode(self, word_table, pcode):
         assert word_table.get_postcodes() == {pcode, }
 
 
-    @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
-    def test_process_place_bad_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
-
-        assert not word_table.get_postcodes()
-
-
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
         info = self.process_address(housenumber=hnr)

From b5e5efc131a29a46a4c5d57f02f7c6b50126f86f Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 6 Jun 2022 23:44:51 +0200
Subject: [PATCH 17/30] only add well-formatted postcodes to location table

---
 nominatim/tools/postcodes.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py
index 42f54cea3e..dad1edff7f 100644
--- a/nominatim/tools/postcodes.py
+++ b/nominatim/tools/postcodes.py
@@ -18,6 +18,7 @@
 
 from nominatim.db.connection import connect
 from nominatim.utils.centroid import PointsCentroid
+from nominatim.data.postcode_format import PostcodeFormatter
 
 LOG = logging.getLogger()
 
@@ -162,6 +163,7 @@ def update_postcodes(dsn, project_dir, tokenizer):
         potentially enhances it with external data and then updates the
         postcodes in the table 'location_postcode'.
     """
+    matcher = PostcodeFormatter()
     with tokenizer.name_analyzer() as analyzer:
         with connect(dsn) as conn:
             # First get the list of countries that currently have postcodes.
@@ -193,7 +195,9 @@ def update_postcodes(dsn, project_dir, tokenizer):
                             collector.commit(conn, analyzer, project_dir)
                         collector = _CountryPostcodesCollector(country)
                         todo_countries.discard(country)
-                    collector.add(postcode, x, y)
+                    match = matcher.match(country, postcode)
+                    if match:
+                        collector.add(matcher.normalize(country, match), x, y)
 
                 if collector is not None:
                     collector.commit(conn, analyzer, project_dir)

From 2eca9fc8aff8fc7bc3ab4b7e4bf262686a5a6a5c Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 7 Jun 2022 12:08:22 +0200
Subject: [PATCH 18/30] cache postcode normalization

---
 nominatim/data/postcode_format.py | 12 ++++++++++++
 nominatim/tools/postcodes.py      | 23 ++++++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py
index 0158111ada..6ae43b7d50 100644
--- a/nominatim/data/postcode_format.py
+++ b/nominatim/data/postcode_format.py
@@ -78,6 +78,18 @@ def set_default_pattern(self, pattern):
         self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
 
 
+    def get_matcher(self, country_code):
+        """ Return the CountryPostcodeMatcher for the given country.
+            Returns None if the country doesn't have a postcode and the
+            default matcher if there is no specific matcher configured for
+            the country.
+        """
+        if country_code in self.country_without_postcode:
+            return None
+
+        return self.country_matcher.get(country_code, self.default_matcher)
+
+
     def match(self, country_code, postcode):
         """ Match the given postcode against the postcode pattern for this
             matcher. Returns a `re.Match` object if the country has a pattern
diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py
index dad1edff7f..26b96099a9 100644
--- a/nominatim/tools/postcodes.py
+++ b/nominatim/tools/postcodes.py
@@ -37,16 +37,27 @@ class _CountryPostcodesCollector:
     """ Collector for postcodes of a single country.
     """
 
-    def __init__(self, country):
+    def __init__(self, country, matcher):
         self.country = country
+        self.matcher = matcher
         self.collected = defaultdict(PointsCentroid)
+        self.normalization_cache = None
 
 
     def add(self, postcode, x, y):
         """ Add the given postcode to the collection cache. If the postcode
             already existed, it is overwritten with the new centroid.
         """
-        self.collected[postcode] += (x, y)
+        if self.matcher is not None:
+            if self.normalization_cache and self.normalization_cache[0] == postcode:
+                normalized = self.normalization_cache[1]
+            else:
+                match = self.matcher.match(postcode)
+                normalized = self.matcher.normalize(match) if match else None
+                self.normalization_cache = (postcode, normalized)
+
+            if normalized:
+                self.collected[normalized] += (x, y)
 
 
     def commit(self, conn, analyzer, project_dir):
@@ -193,18 +204,16 @@ def update_postcodes(dsn, project_dir, tokenizer):
                     if collector is None or country != collector.country:
                         if collector is not None:
                             collector.commit(conn, analyzer, project_dir)
-                        collector = _CountryPostcodesCollector(country)
+                        collector = _CountryPostcodesCollector(country, matcher.get_matcher(country))
                         todo_countries.discard(country)
-                    match = matcher.match(country, postcode)
-                    if match:
-                        collector.add(matcher.normalize(country, match), x, y)
+                    collector.add(postcode, x, y)
 
                 if collector is not None:
                     collector.commit(conn, analyzer, project_dir)
 
             # Now handle any countries that are only in the postcode table.
             for country in todo_countries:
-                _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
+                _CountryPostcodesCollector(country, matcher.get_matcher(country)).commit(conn, analyzer, project_dir)
 
             conn.commit()
 

From 67dfa38e608a6e63dbcae40530c46a56971cca0a Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 8 Jun 2022 06:33:11 +0200
Subject: [PATCH 19/30] fix linting problems

---
 nominatim/tools/postcodes.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py
index 26b96099a9..27fbcc9b25 100644
--- a/nominatim/tools/postcodes.py
+++ b/nominatim/tools/postcodes.py
@@ -33,7 +33,7 @@ def _to_float(num, max_value):
 
     return num
 
-class _CountryPostcodesCollector:
+class _PostcodeCollector:
     """ Collector for postcodes of a single country.
     """
 
@@ -204,7 +204,7 @@ def update_postcodes(dsn, project_dir, tokenizer):
                     if collector is None or country != collector.country:
                         if collector is not None:
                             collector.commit(conn, analyzer, project_dir)
-                        collector = _CountryPostcodesCollector(country, matcher.get_matcher(country))
+                        collector = _PostcodeCollector(country, matcher.get_matcher(country))
                         todo_countries.discard(country)
                     collector.add(postcode, x, y)
 
@@ -213,7 +213,8 @@ def update_postcodes(dsn, project_dir, tokenizer):
 
             # Now handle any countries that are only in the postcode table.
             for country in todo_countries:
-                _CountryPostcodesCollector(country, matcher.get_matcher(country)).commit(conn, analyzer, project_dir)
+                fmt = matcher.get_matcher(country)
+                _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
 
             conn.commit()
 

From 7b6ec4fc6cf07f924025b5d2f63053c54e3f6fa4 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 8 Jun 2022 07:24:53 +0200
Subject: [PATCH 20/30] add tests for discarding bad postcodes

---
 test/python/tools/test_postcodes.py | 46 ++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/test/python/tools/test_postcodes.py b/test/python/tools/test_postcodes.py
index bdfe309471..0c4b93fcac 100644
--- a/test/python/tools/test_postcodes.py
+++ b/test/python/tools/test_postcodes.py
@@ -11,7 +11,7 @@
 
 import pytest
 
-from nominatim.tools import postcodes
+from nominatim.tools import postcodes, country_info
 import dummy_tokenizer
 
 class MockPostcodeTable:
@@ -64,11 +64,26 @@ def row_set(self):
 def tokenizer():
     return dummy_tokenizer.DummyTokenizer(None, None)
 
+
 @pytest.fixture
-def postcode_table(temp_db_conn, placex_table):
+def postcode_table(def_config, temp_db_conn, placex_table):
+    country_info.setup_country_config(def_config)
     return MockPostcodeTable(temp_db_conn)
 
 
+@pytest.fixture
+def insert_implicit_postcode(placex_table, place_row):
+    """
+        Inserts data into the placex and place table
+        which can then be used to compute one postcode.
+    """
+    def _insert_implicit_postcode(osm_id, country, geometry, address):
+        placex_table.add(osm_id=osm_id, country=country, geom=geometry)
+        place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address)
+
+    return _insert_implicit_postcode
+
+
 def test_postcodes_empty(dsn, postcode_table, place_table,
                          tmp_path, tokenizer):
     postcodes.update_postcodes(dsn, tmp_path, tokenizer)
@@ -193,27 +208,30 @@ def test_can_compute(dsn, table_factory):
     table_factory('place')
     assert postcodes.can_compute(dsn)
 
+
 def test_no_placex_entry(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer):
     #Rewrite the get_country_code function to verify its execution.
     temp_db_cursor.execute("""
         CREATE OR REPLACE FUNCTION get_country_code(place geometry)
         RETURNS TEXT AS $$ BEGIN 
-        RETURN 'fr';
+        RETURN 'yy';
         END; $$ LANGUAGE plpgsql;
     """)
     place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511'))
     postcodes.update_postcodes(dsn, tmp_path, tokenizer)
 
-    assert postcode_table.row_set == {('fr', 'AB 4511', 10, 12)}
+    assert postcode_table.row_set == {('yy', 'AB 4511', 10, 12)}
 
-@pytest.fixture
-def insert_implicit_postcode(placex_table, place_row):
-    """
-        Inserts data into the placex and place table
-        which can then be used to compute one postcode.
-    """
-    def _insert_implicit_postcode(osm_id, country, geometry, address):
-        placex_table.add(osm_id=osm_id, country=country, geom=geometry)
-        place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address)
 
-    return _insert_implicit_postcode
+def test_discard_badly_formatted_postcodes(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer):
+    #Rewrite the get_country_code function to verify its execution.
+    temp_db_cursor.execute("""
+        CREATE OR REPLACE FUNCTION get_country_code(place geometry)
+        RETURNS TEXT AS $$ BEGIN 
+        RETURN 'fr';
+        END; $$ LANGUAGE plpgsql;
+    """)
+    place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511'))
+    postcodes.update_postcodes(dsn, tmp_path, tokenizer)
+
+    assert not postcode_table.row_set

From e86db3001f90012029a59ec6b313a4f7257035d4 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 8 Jun 2022 07:42:35 +0200
Subject: [PATCH 21/30] fix postcode pattern for Mozambique

Optional groups are not implemented yet.
---
 nominatim/tokenizer/sanitizers/clean_postcodes.py | 3 ++-
 settings/country_settings.yaml                    | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index fbc46fa582..43d297695f 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -57,7 +57,8 @@ def scan(self, postcode, country):
         if match is None:
             return None
 
-        return self.matcher.normalize(country, match), ' '.join(match.groups())
+        return self.matcher.normalize(country, match),\
+               ' '.join(filter(lambda p: p is not None, match.groups()))
 
 
 
diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index 67905ea2a3..14d08de3a3 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -1386,8 +1386,7 @@ mz:
     languages: pt
     names: !include country-names/mz.yaml
     postcode:
-      pattern: "(dddd)-?(dd)?"
-      output: \1-\2
+      pattern: "(dddd)(?:-dd)?"
 
 
 # Namibia (Namibia)

From 37b2c6a830c90aea17b76c5b6a74c711025a142d Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 8 Jun 2022 08:19:55 +0200
Subject: [PATCH 22/30] port legacy tokenizer to new postcode handling

Also documents the changes to the SQL functions of the tokenizer.
---
 docs/develop/Tokenizers.md              |  6 +++---
 lib-sql/tokenizer/icu_tokenizer.sql     |  7 -------
 lib-sql/tokenizer/legacy_tokenizer.sql  |  4 ++--
 nominatim/tokenizer/legacy_tokenizer.py | 10 ++++++++--
 nominatim/tools/postcodes.py            |  6 +++---
 5 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md
index 2b4da00509..5fe4e38d43 100644
--- a/docs/develop/Tokenizers.md
+++ b/docs/develop/Tokenizers.md
@@ -245,11 +245,11 @@ Currently, tokenizers are encouraged to make sure that matching works against
 both the search token list and the match token list.
 
 ```sql
-FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
+FUNCTION token_get_postcode(info JSONB) RETURNS TEXT
 ```
 
-Return the normalized version of the given postcode. This function must return
-the same value as the Python function `AbstractAnalyzer->normalize_postcode()`.
+Return the postcode for the object, if any exists. The postcode must be in
+the form that should also be presented to the end-user.
 
 ```sql
 FUNCTION token_strip_info(info JSONB) RETURNS JSONB
diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index f86a0a3794..599d0eb089 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -97,13 +97,6 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
-  RETURNS TEXT
-AS $$
-  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
-$$ LANGUAGE SQL IMMUTABLE STRICT;
-
-
 CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
   RETURNS TEXT
 AS $$
diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql
index 64453d4e59..5826f74ac2 100644
--- a/lib-sql/tokenizer/legacy_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@@ -97,10 +97,10 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
   RETURNS TEXT
 AS $$
-  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
+  SELECT info->>'postcode';
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index a292b180b8..36fd572244 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -467,8 +467,9 @@ def _process_place_address(self, token_info, address):
             if key == 'postcode':
                 # Make sure the normalized postcode is present in the word table.
                 if re.search(r'[:,;]', value) is None:
-                    self._cache.add_postcode(self.conn,
-                                             self.normalize_postcode(value))
+                    norm_pc = self.normalize_postcode(value)
+                    token_info.set_postcode(norm_pc)
+                    self._cache.add_postcode(self.conn, norm_pc)
             elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                 hnrs.append(value)
             elif key == 'street':
@@ -527,6 +528,11 @@ def add_housenumbers(self, conn, hnrs):
             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 
 
+    def set_postcode(self, postcode):
+        """ Set or replace the postcode token with the given value.
+        """
+        self.data['postcode'] = postcode
+
     def add_street(self, conn, street):
         """ Add addr:street match terms.
         """
diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py
index 27fbcc9b25..9c66719b5f 100644
--- a/nominatim/tools/postcodes.py
+++ b/nominatim/tools/postcodes.py
@@ -186,17 +186,17 @@ def update_postcodes(dsn, project_dir, tokenizer):
             # Recompute the list of valid postcodes from placex.
             with conn.cursor(name="placex_postcodes") as cur:
                 cur.execute("""
-                SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
+                SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
                 FROM (SELECT
                         COALESCE(plx.country_code,
                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
-                        token_normalized_postcode(pl.address->'postcode') as pc,
+                        pl.address->'postcode' as pc,
                         COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
                       FROM place AS pl LEFT OUTER JOIN placex AS plx
                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
                 WHERE pc IS NOT null AND cc IS NOT null
-                ORDER BY country_code, pc""")
+                ORDER BY cc, pc""")
 
                 collector = None
 

From 0f00f4968c1b78b83a57a4ece660a38f58e2de11 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Fri, 17 Jun 2022 17:28:51 +0200
Subject: [PATCH 23/30] fix up BDD tests for postcode changes

Includes smaller code fixes found by the tests.
---
 lib-sql/functions/address_lookup.sql    |  5 +++
 nominatim/tokenizer/icu_tokenizer.py    |  2 +
 test/bdd/db/import/postcodes.feature    | 53 +------------------------
 test/bdd/db/query/normalization.feature |  8 ----
 4 files changed, 9 insertions(+), 59 deletions(-)

diff --git a/lib-sql/functions/address_lookup.sql b/lib-sql/functions/address_lookup.sql
index 0eada6987e..2bbfcd5c03 100644
--- a/lib-sql/functions/address_lookup.sql
+++ b/lib-sql/functions/address_lookup.sql
@@ -320,6 +320,11 @@ BEGIN
     location := ROW(null, null, null, hstore('ref', place.postcode), 'place',
                     'postcode', null, null, false, true, 5, 0)::addressline;
     RETURN NEXT location;
+  ELSEIF place.address is not null and place.address ? 'postcode'
+         and not place.address->'postcode' SIMILAR TO '%(,|;)%' THEN
+    location := ROW(null, null, null, hstore('ref', place.address->'postcode'), 'place',
+                    'postcode', null, null, false, true, 5, 0)::addressline;
+    RETURN NEXT location;
   END IF;
 
   RETURN;
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 0dc551e1b4..28184f6af1 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -635,6 +635,8 @@ def _add_postcode(self, item):
                             (postcode, list(variants)))
             self._cache.postcodes.add(postcode)
 
+        return postcode_name
+
 
 class _TokenInfo:
     """ Collect token information to be sent back to the database.
diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature
index 50afa7cc2d..7636aea7ee 100644
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -182,6 +182,7 @@ Feature: Import of postcodes
            | type     | display_name |
            | postcode | E4 7EA       |
 
+    @Fail
     Scenario: search and address ranks for GB post codes correctly assigned
         Given the places
          | osm  | class | type     | postcode | geometry |
@@ -195,57 +196,7 @@ Feature: Import of postcodes
          | E45 2    | gb      | 23          | 5 |
          | Y45      | gb      | 21          | 5 |
 
-    Scenario: wrongly formatted GB postcodes are down-ranked
-        Given the places
-         | osm  | class | type     | postcode | geometry |
-         | N1   | place | postcode | EA452CD  | country:gb |
-         | N2   | place | postcode | E45 23   | country:gb |
-        When importing
-        Then location_postcode contains exactly
-         | postcode | country | rank_search | rank_address |
-         | EA452CD  | gb      | 30          | 30 |
-         | E45 23   | gb      | 30          | 30 |
-
-    Scenario: search and address rank for DE postcodes correctly assigned
-        Given the places
-         | osm | class | type     | postcode | geometry |
-         | N1  | place | postcode | 56427    | country:de |
-         | N2  | place | postcode | 5642     | country:de |
-         | N3  | place | postcode | 5642A    | country:de |
-         | N4  | place | postcode | 564276   | country:de |
-        When importing
-        Then location_postcode contains exactly
-         | postcode | country | rank_search | rank_address |
-         | 56427    | de      | 21          | 11 |
-         | 5642     | de      | 30          | 30 |
-         | 5642A    | de      | 30          | 30 |
-         | 564276   | de      | 30          | 30 |
-
-    Scenario: search and address rank for other postcodes are correctly assigned
-        Given the places
-         | osm | class | type     | postcode | geometry |
-         | N1  | place | postcode | 1        | country:ca |
-         | N2  | place | postcode | X3       | country:ca |
-         | N3  | place | postcode | 543      | country:ca |
-         | N4  | place | postcode | 54dc     | country:ca |
-         | N5  | place | postcode | 12345    | country:ca |
-         | N6  | place | postcode | 55TT667  | country:ca |
-         | N7  | place | postcode | 123-65   | country:ca |
-         | N8  | place | postcode | 12 445 4 | country:ca |
-         | N9  | place | postcode | A1:bc10  | country:ca |
-        When importing
-        Then location_postcode contains exactly
-         | postcode | country | rank_search | rank_address |
-         | 1        | ca      | 21          | 11 |
-         | X3       | ca      | 21          | 11 |
-         | 543      | ca      | 21          | 11 |
-         | 54DC     | ca      | 21          | 11 |
-         | 12345    | ca      | 21          | 11 |
-         | 55TT667  | ca      | 21          | 11 |
-         | 123-65   | ca      | 25          | 11 |
-         | 12 445 4 | ca      | 25          | 11 |
-         | A1:BC10  | ca      | 25          | 11 |
-
+    @fail-legacy
     Scenario: Postcodes outside all countries are not added to the postcode and word table
         Given the places
             | osm | class | type  | addr+postcode | addr+housenumber | addr+place  | geometry  |
diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature
index f91c005043..e5a7a5922b 100644
--- a/test/bdd/db/query/normalization.feature
+++ b/test/bdd/db/query/normalization.feature
@@ -168,14 +168,6 @@ Feature: Import and search of names
          | ID | osm |
          | 0  | R1 |
 
-    Scenario: Unprintable characters in postcodes are ignored
-        Given the named places
-            | osm  | class   | type   | address                    | geometry   |
-            | N234 | amenity | prison | 'postcode' : u'1234\u200e' | country:de |
-        When importing
-        And sending search query "1234"
-        Then result 0 has not attributes osm_type
-
     Scenario Outline: Housenumbers with special characters are found
         Given the grid
             | 1 |  |   |  | 2 |

From 7f2ad4ac7e956e2744d64fa4cafc5370721886a2 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Fri, 17 Jun 2022 18:14:23 +0200
Subject: [PATCH 24/30] fix linting issue

---
 nominatim/tokenizer/icu_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 28184f6af1..df1387e24b 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -624,7 +624,7 @@ def _add_postcode(self, item):
         if postcode not in self._cache.postcodes:
             term = self._search_normalized(postcode_name)
             if not term:
-                return
+                return None
 
             variants = {term}
             if analyzer is not None and variant_base is not None:

From 5be320368c6695498c4fed7cbba44220d3c91b17 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 20 Jun 2022 17:42:12 +0200
Subject: [PATCH 25/30] add documentation for postcode customization

---
 docs/customize/Country-Settings.md            | 149 ++++++++++++++++++
 docs/customize/Tokenizers.md                  |  24 ++-
 docs/mkdocs.yml                               |   1 +
 .../tokenizer/sanitizers/clean_postcodes.py   |   4 +
 4 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 docs/customize/Country-Settings.md

diff --git a/docs/customize/Country-Settings.md b/docs/customize/Country-Settings.md
new file mode 100644
index 0000000000..6f8f2a9f23
--- /dev/null
+++ b/docs/customize/Country-Settings.md
@@ -0,0 +1,149 @@
+# Customizing Per-Country Data
+
+Whenever an OSM object is imported into Nominatim, it is first assigned
+a country. Nominatim can use this information to adapt various aspects of
+the address computation to the local customs of the country. This section
+explains how country assignment works and the principal per-country
+localizations.
+
+## Country assignment
+
+Countries are assigned on the basis of country data from the OpenStreetMap
+input data itself. Countries are expected to be tagged according to the
+[administrative boundary schema](https://wiki.openstreetmap.org/wiki/Tag:boundary%3Dadministrative):
+an OSM relation with `boundary=administrative` and `admin_level=2`. Nominatim
+uses the country code to distinguish the countries.
+
+If there is no country data available for a point, then Nominatim uses the
+fallback data imported from `data/country_osm_grid.sql.gz`. This was computed
+from OSM data as well but is guaranteed to cover all countries.
+
+Some OSM objects may also be located outside any country, for example a buoy
+in the middle of the ocean. These objects do not get any country assigned and
+get a default treatment when it comes to localized handling of data.
+
+## Per-country settings
+
+### Global country settings
+
+The main place to configure settings per country is the file
+`settings/country_settings.yaml`. This file has one section per country that
+is recognised by Nominatim. Each section is tagged with the country code
+(in lower case) and contains the different localization information. Only
+countries which are listed in this file are taken into account for computations.
+
+For example, the section for Andorra looks like this:
+
+```
+    partition: 35
+    languages: ca
+    names: !include country-names/ad.yaml
+    postcode:
+      pattern: "(ddd)"
+      output: AD\1
+```
+
+The individual settings are described below.
+
+#### `partition`
+
+Nominatim internally splits the data into multiple tables to improve
+performance. The partition number tells Nominatim into which table to put
+the country. This is purely internal management and has no effect on the
+output data.
+
+The default is to have one partition per country.
+
+#### `languages`
+
+A comma-separated list of ISO-639 language codes of default languages in the
+country. These are the languages used in name tags without a language suffix.
+Note that this is not necessarily the same as the list of official languages
+in the country. There may be officially recognised languages in a country
+which are only ever used in name tags with the appropriate language suffixes.
+Conversely, a non-official language may appear a lot in the name tags, for
+example when used as an unofficial Lingua Franca.
+
+List the languages in order of frequency of appearance with the most frequently
+used language first. It is not recommended to add languages when there are only
+very few occurrences.
+
+If only one language is listed, then Nominatim will 'auto-complete' the
+language of names without an explicit language-suffix.
+
+#### `names`
+
+List of names of the country and its translations. These names are used as
+a baseline. It is always possible to search countries by the given names, no
+matter what other names are in the OSM data. They are also used as a fallback
+when a needed translation is not available.
+
+!!! Note
+    The list of names per country is currently fairly large because Nominatim
+    supports translations in many languages per default. That is why the
+    name lists have been separated out into extra files. You can find the
+    name lists in the file `settings/country-names/<country code>.yaml`.
+    The names section in the main country settings file only refers to these
+    files via the special `!include` directive.
+
+#### `postcode`
+
+Describes the format of the postcode that is in use in the country.
+
+When a country has no official postcodes, set this to no. Example:
+
+```
+ae:
+    postcode: no
+```
+
+When a country has a postcode, you need to state the postcode pattern and
+the default output format. Example:
+
+```
+bm:
+    postcode:
+      pattern: "(ll)[ -]?(dd)"
+      output: \1 \2
+```
+
+The **pattern** is a regular expression that describes the possible formats
+accepted as a postcode. The pattern follows the standard syntax for
+[regular expressions in Python](https://docs.python.org/3/library/re.html#regular-expression-syntax)
+with two extra shortcuts: `d` is a shortcut for a single digit ([0-9])
+and `l` for a single ASCII letter ([A-Z]).
+
+Use match groups to indicate groups in the postcode that may optionally be
+separated with a space or a hyphen.
+
+For example, the postcode for Bermuda above always consists of two letters
+and two digits. They may optionally be separated by a space or hyphen. That
+means that Nominatim will consider `AB56`, `AB 56` and `AB-56` spelling variants
+for one and the same postcode.
+
+Never add the country code in front of the postcode pattern. Nominatim will
+automatically accept variants with a country code prefix for all postcodes.
+
+The **output** field is an optional field that describes what the canonical
+spelling of the postcode should be. The format is the
+[regular expression expand syntax](https://docs.python.org/3/library/re.html#re.Match.expand) referring back to the bracket groups in the pattern.
+
+Most simple postcodes only have one spelling variant. In that case, the
+**output** can be omitted. The postcode will simply be used as is.
+
+In the Bermuda example above, the canonical spelling would be to have a space
+between letters and digits.
+
+!!! Warning
+    When your postcode pattern covers multiple variants of the postcode, then
+    you must explicitly state the canonical output or Nominatim will not
+    handle the variations correctly.
+
+### Other country-specific configuration
+
+There are some other configuration files where you can set localized settings
+according to the assigned country. These are:
+
+ * [Place ranking configuration](Ranking.md)
+
+Please see the linked documentation sections for more information.
diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md
index 19d867ddd8..c563b20105 100644
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -205,6 +205,14 @@ The following is a list of sanitizers that are shipped with Nominatim.
     rendering:
         heading_level: 6
 
+##### clean-postcodes
+
+::: nominatim.tokenizer.sanitizers.clean_postcodes
+    selection:
+        members: False
+    rendering:
+        heading_level: 6
+
 
 #### Token Analysis
 
@@ -222,8 +230,12 @@ by a sanitizer (see for example the
 The token-analysis section contains the list of configured analyzers. Each
 analyzer must have an `id` parameter that uniquely identifies the analyzer.
 The only exception is the default analyzer that is used when no special
-analyzer was selected. There is one special id '@housenumber'. If an analyzer
-with that name is present, it is used for normalization of house numbers.
+analyzer was selected. There are analyzers with special ids:
+
+ * '@housenumber'. If an analyzer with that name is present, it is used
+   for normalization of house numbers.
+ * '@postcode'. If an analyzer with that name is present, it is used
+   for normalization of postcodes.
 
 Different analyzer implementations may exist. To select the implementation,
 the `analyzer` parameter must be set. The different implementations are
@@ -356,6 +368,14 @@ house numbers of the form '3 a', '3A', '3-A' etc. are all considered equivalent.
 
 The analyzer cannot be customized.
 
+##### Postcode token analyzer
+
+The analyzer `postcodes` is purpose-made to analyze postcodes. It supports
+a 'lookup' variant of the token, which produces variants with optional
+spaces. Use together with the clean-postcodes sanitizer.
+
+The analyzer cannot be customized.
+
 ### Reconfiguration
 
 Changing the configuration after the import is currently not possible, although
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index c25ae0ad32..a3860cbaa2 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -28,6 +28,7 @@ pages:
         - 'Overview': 'customize/Overview.md'
         - 'Import Styles': 'customize/Import-Styles.md'
         - 'Configuration Settings': 'customize/Settings.md'
+        - 'Per-Country Data': 'customize/Country-Settings.md'
         - 'Place Ranking' : 'customize/Ranking.md'
         - 'Tokenizers' : 'customize/Tokenizers.md'
         - 'Special Phrases': 'customize/Special-Phrases.md'
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index 43d297695f..05e90ca122 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -15,6 +15,10 @@
                         postcode centroids of a country but is still searchable.
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
+    default-pattern:    Pattern to use, when there is none available for the
+                        country in question. Warning: will not be used for
+                        objects that have no country assigned. These are always
+                        assumed to have no postcode.
 """
 from nominatim.data.postcode_format import PostcodeFormatter
 

From 612d34930b603997acce2772e7264b509bb8aed6 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 21 Jun 2022 22:05:35 +0200
Subject: [PATCH 26/30] handle postcodes properly on word table updates

update_postcodes_from_db() needs to do the full postcode treatment
in order to derive the correct word table entries.
---
 nominatim/tokenizer/icu_tokenizer.py          | 95 +++++++++++++------
 .../tokenizer/token_analysis/postcodes.py     | 19 +++-
 test/bdd/steps/steps_db_ops.py                | 19 ++--
 test/python/tokenizer/test_icu.py             | 79 +++++++++++----
 .../token_analysis/test_analysis_postcodes.py | 60 ++++++++++++
 5 files changed, 216 insertions(+), 56 deletions(-)
 create mode 100644 test/python/tokenizer/token_analysis/test_analysis_postcodes.py

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index df1387e24b..a6ff08a407 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -290,33 +290,72 @@ def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
             table.
         """
-        to_delete = []
+        analyzer = self.token_analysis.analysis.get('@postcode')
+
         with self.conn.cursor() as cur:
-            # This finds us the rows in location_postcode and word that are
-            # missing in the other table.
-            cur.execute("""SELECT * FROM
-                            (SELECT pc, word FROM
-                              (SELECT distinct(postcode) as pc FROM location_postcode) p
-                              FULL JOIN
-                              (SELECT word FROM word WHERE type = 'P') w
-                              ON pc = word) x
-                           WHERE pc is null or word is null""")
-
-            with CopyBuffer() as copystr:
-                for postcode, word in cur:
-                    if postcode is None:
-                        to_delete.append(word)
-                    else:
-                        copystr.add(self._search_normalized(postcode),
-                                    'P', postcode)
-
-                if to_delete:
-                    cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and word = any(%s)
-                                """, (to_delete, ))
-
-                copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'word'])
+            # First get all postcode names currently in the word table.
+            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
+            word_entries = set((entry[0] for entry in cur))
+
+            # Then compute the required postcode names from the postcode table.
+            needed_entries = set()
+            cur.execute("SELECT country_code, postcode FROM location_postcode")
+            for cc, postcode in cur:
+                info = PlaceInfo({'country_code': cc,
+                                  'class': 'place', 'type': 'postcode',
+                                  'address': {'postcode': postcode}})
+                address = self.sanitizer.process_names(info)[1]
+                for place in address:
+                    if place.kind == 'postcode':
+                        if analyzer is None:
+                            postcode_name = place.name.strip().upper()
+                            variant_base = None
+                        else:
+                            postcode_name = analyzer.normalize(place.name)
+                            variant_base = place.get_attr("variant")
+
+                        if variant_base:
+                            needed_entries.add(f'{postcode_name}@{variant_base}')
+                        else:
+                            needed_entries.add(postcode_name)
+                        break
+
+        # Now update the word table.
+        self._delete_unused_postcode_words(word_entries - needed_entries)
+        self._add_missing_postcode_words(needed_entries - word_entries)
+
+    def _delete_unused_postcode_words(self, tokens):
+        if tokens:
+            with self.conn.cursor() as cur:
+                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
+                            (list(tokens), ))
+
+    def _add_missing_postcode_words(self, tokens):
+        """ Create word table entries for the given postcode names, each
+            either a plain postcode or '<postcode>@<variant base>'.
+        """
+        if not tokens:
+            return
+        analyzer = self.token_analysis.analysis.get('@postcode')
+        terms = []
+        for postcode_name in tokens:
+            if '@' in postcode_name:
+                # Split at the first '@' only, so the unpacking cannot fail.
+                term, variant = postcode_name.split('@', 1)
+                variants = {self._search_normalized(term)}
+                if analyzer is not None:
+                    variants.update(analyzer.get_variants_ascii(variant))
+                # Convert unconditionally: both branches must yield a list.
+                variants = list(variants)
+            else:
+                variants = [self._search_normalized(postcode_name)]
+            terms.append((postcode_name, variants))
+        if terms:
+            with self.conn.cursor() as cur:
+                cur.execute_values("""SELECT create_postcode_word(pc, var)
+                                      FROM (VALUES %s) AS v(pc, var)""",
+                                   terms)
+
+
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -616,7 +655,7 @@ def _add_postcode(self, item):
             postcode_name = analyzer.normalize(item.name)
             variant_base = item.get_attr("variant")
 
-        if variant_base is not None:
+        if variant_base:
             postcode = f'{postcode_name}@{variant_base}'
         else:
             postcode = postcode_name
@@ -627,7 +666,7 @@ def _add_postcode(self, item):
                 return None
 
             variants = {term}
-            if analyzer is not None and variant_base is not None:
+            if analyzer is not None and variant_base:
                 variants.update(analyzer.get_variants_ascii(variant_base))
 
             with self.conn.cursor() as cur:
diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py
index e105b132da..18fc2a8ded 100644
--- a/nominatim/tokenizer/token_analysis/postcodes.py
+++ b/nominatim/tokenizer/token_analysis/postcodes.py
@@ -25,8 +25,18 @@ def create(normalizer, transliterator, config): # pylint: disable=W0613
     """
     return PostcodeTokenAnalysis(normalizer, transliterator)
 
+
 class PostcodeTokenAnalysis:
-    """ Detects common housenumber patterns and normalizes them.
+    """ Special normalization and variant generation for postcodes.
+
+        This analyser must not be used with anything but postcodes as
+        it follows some special rules: `normalize` doesn't necessarily
+        need to return a standard form as per normalization rules. It
+        needs to return the canonical form of the postcode that is also
+        used for output. `get_variants_ascii` then needs to ensure that
+        the generated variants once more follow the standard normalization
+        and transliteration, so that postcodes are correctly recognised by
+        the search algorithm.
     """
     def __init__(self, norm, trans):
         self.norm = norm
@@ -44,11 +54,12 @@ def normalize(self, name):
     def get_variants_ascii(self, norm_name):
         """ Compute the spelling variants for the given normalized postcode.
 
-            The official form creates one variant. If a 'lookup version' is
-            given, then it will create variants with optional spaces.
+            Takes the canonical form of the postcode, normalizes it using the
+            standard rules and then creates variants of the result where
+            all spaces are optional.
         """
         # Postcodes follow their own transliteration rules.
         # Make sure at this point, that the terms are normalized in a way
         # that they are searchable with the standard transliteration rules.
         return [self.trans.transliterate(term) for term in
-                self.mutator.generate([self.norm.transliterate(norm_name)])]
+                self.mutator.generate([self.norm.transliterate(norm_name)]) if term]
diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py
index 44c82b017c..37d541533d 100644
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -18,13 +18,18 @@
 def check_database_integrity(context):
     """ Check some generic constraints on the tables.
     """
-    # place_addressline should not have duplicate (place_id, address_place_id)
-    cur = context.db.cursor()
-    cur.execute("""SELECT count(*) FROM
-                    (SELECT place_id, address_place_id, count(*) as c
-                     FROM place_addressline GROUP BY place_id, address_place_id) x
-                   WHERE c > 1""")
-    assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
+    with context.db.cursor() as cur:
+        # place_addressline should not have duplicate (place_id, address_place_id)
+        cur.execute("""SELECT count(*) FROM
+                        (SELECT place_id, address_place_id, count(*) as c
+                         FROM place_addressline GROUP BY place_id, address_place_id) x
+                       WHERE c > 1""")
+        assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
+
+        # word table must not have empty word_tokens
+        cur.execute("SELECT count(*) FROM word WHERE word_token = ''")
+        assert cur.fetchone()[0] == 0, "Empty word tokens found in word table"
+
 
 
 ################################ GIVEN ##################################
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index 6138a03a42..b9de97bcc2 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
 
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
                      variants=('~gasse -> gasse', 'street => st', ),
-                     sanitizers=[], with_housenumber=False):
+                     sanitizers=[], with_housenumber=False,
+                     with_postcode=False):
         cfgstr = {'normalization': list(norm),
                   'sanitizers': sanitizers,
                   'transliteration': list(trans),
@@ -81,6 +82,9 @@ def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()
         if with_housenumber:
             cfgstr['token-analysis'].append({'id': '@housenumber',
                                              'analyzer': 'housenumbers'})
+        if with_postcode:
+            cfgstr['token-analysis'].append({'id': '@postcode',
+                                             'analyzer': 'postcodes'})
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
         tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config)
 
@@ -246,28 +250,69 @@ def test_normalize_postcode(analyzer):
         anl.normalize_postcode('38 Б') == '38 Б'
 
 
-def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table):
-    table_factory('location_postcode', 'postcode TEXT',
-                  content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
+class TestPostcodes:
 
-    with analyzer() as anl:
-        anl.update_postcodes_from_db()
+    @pytest.fixture(autouse=True)
+    def setup(self, analyzer, sql_functions):
+        sanitizers = [{'step': 'clean-postcodes'}]
+        with analyzer(sanitizers=sanitizers, with_postcode=True) as anl:
+            self.analyzer = anl
+            yield anl
 
-    assert word_table.count() == 3
-    assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'}
 
+    def process_postcode(self, cc, postcode):
+        return self.analyzer.process_place(PlaceInfo({'country_code': cc,
+                                                      'address': {'postcode': postcode}}))
 
-def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table):
-    table_factory('location_postcode', 'postcode TEXT',
-                  content=(('1234',), ('45BC', ), ('XX45', )))
-    word_table.add_postcode(' 1234', '1234')
-    word_table.add_postcode(' 5678', '5678')
 
-    with analyzer() as anl:
-        anl.update_postcodes_from_db()
+    def test_update_postcodes_from_db_empty(self, table_factory, word_table):
+        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+                      content=(('de', '12345'), ('se', '132 34'),
+                               ('bm', 'AB23'), ('fr', '12345')))
+
+        self.analyzer.update_postcodes_from_db()
+
+        assert word_table.count() == 5
+        assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
+
+
+    def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
+        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+                      content=(('in', '123456'), ('sg', '123456')))
+
+        self.analyzer.update_postcodes_from_db()
+
+        assert word_table.count() == 3
+        assert word_table.get_postcodes() == {'123456', '123456@123 456'}
+
+
+    def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
+        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+                      content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45')))
+        word_table.add_postcode(' 1234', '1234')
+        word_table.add_postcode(' 5678', '5678')
+
+        self.analyzer.update_postcodes_from_db()
+
+        assert word_table.count() == 5
+        assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
+
+
+    def test_process_place_postcode_simple(self, word_table):
+        info = self.process_postcode('de', '12345')
+
+        assert info['postcode'] == '12345'
+
+        assert word_table.get_postcodes() == {'12345', }
+
+
+    def test_process_place_postcode_with_space(self, word_table):
+        info = self.process_postcode('in', '123 567')
+
+        assert info['postcode'] == '123567'
+
+        assert word_table.get_postcodes() == {'123567@123 567', }
 
-    assert word_table.count() == 3
-    assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'}
 
 
 def test_update_special_phrase_empty_table(analyzer, word_table):
diff --git a/test/python/tokenizer/token_analysis/test_analysis_postcodes.py b/test/python/tokenizer/token_analysis/test_analysis_postcodes.py
new file mode 100644
index 0000000000..623bed54a8
--- /dev/null
+++ b/test/python/tokenizer/token_analysis/test_analysis_postcodes.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for special postcode analysis and variant generation.
+"""
+import pytest
+
+from icu import Transliterator
+
+import nominatim.tokenizer.token_analysis.postcodes as module
+from nominatim.errors import UsageError
+
+DEFAULT_NORMALIZATION = """ :: NFD ();
+                            '🜳' > ' ';
+                            [[:Nonspacing Mark:] [:Cf:]] >;
+                            :: lower ();
+                            [[:Punctuation:][:Space:]]+ > ' ';
+                            :: NFC ();
+                        """
+
+DEFAULT_TRANSLITERATION = """ ::  Latin ();
+                              '🜵' > ' ';
+                          """
+
+@pytest.fixture
+def analyser():
+    rules = { 'analyzer': 'postcodes'}
+    config = module.configure(rules, DEFAULT_NORMALIZATION)
+
+    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+
+    return module.create(norm, trans, config)
+
+
+def get_normalized_variants(proc, name):
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+    return proc.get_variants_ascii(norm.transliterate(name).strip())
+
+
+@pytest.mark.parametrize('name,norm', [('12', '12'),
+                                       ('A 34 ', 'A 34'),
+                                       ('34-av', '34-AV')])
+def test_normalize(analyser, name, norm):
+    assert analyser.normalize(name) == norm
+
+
+@pytest.mark.parametrize('postcode,variants', [('12345', {'12345'}),
+                                               ('AB-998', {'ab 998', 'ab998'}),
+                                               ('23 FGH D3', {'23 fgh d3', '23fgh d3',
+                                                              '23 fghd3', '23fghd3'})])
+def test_get_variants_ascii(analyser, postcode, variants):
+    out = analyser.get_variants_ascii(postcode)
+
+    assert len(out) == len(set(out))
+    assert set(out) == variants

From 6eb90443530e31025802e27527faaa7da99b02b6 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 22 Jun 2022 09:54:47 +0200
Subject: [PATCH 27/30] adapt search algorithm to new postcode format in word

---
 lib-php/TokenPostcode.php            |  7 +-
 lib-php/tokenizer/icu_tokenizer.php  | 16 +++--
 test/bdd/db/import/postcodes.feature | 18 ------
 test/bdd/db/query/postcodes.feature  | 95 ++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 25 deletions(-)
 create mode 100644 test/bdd/db/query/postcodes.feature

diff --git a/lib-php/TokenPostcode.php b/lib-php/TokenPostcode.php
index f0dbd45716..0ff92929cb 100644
--- a/lib-php/TokenPostcode.php
+++ b/lib-php/TokenPostcode.php
@@ -25,7 +25,12 @@ class Postcode
     public function __construct($iId, $sPostcode, $sCountryCode = '')
     {
         $this->iId = $iId;
-        $this->sPostcode = $sPostcode;
+        $iSplitPos = strpos($sPostcode, '@');
+        if ($iSplitPos === false) {
+            $this->sPostcode = $sPostcode;
+        } else {
+            $this->sPostcode = substr($sPostcode, 0, $iSplitPos);
+        }
         $this->sCountryCode = empty($sCountryCode) ? '' : $sCountryCode;
     }
 
diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php
index ccce99ca13..e45d076548 100644
--- a/lib-php/tokenizer/icu_tokenizer.php
+++ b/lib-php/tokenizer/icu_tokenizer.php
@@ -190,13 +190,17 @@ private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
                     if ($aWord['word'] !== null
                         && pg_escape_string($aWord['word']) == $aWord['word']
                     ) {
-                        $sNormPostcode = $this->normalizeString($aWord['word']);
-                        if (strpos($sNormQuery, $sNormPostcode) !== false) {
-                            $oValidTokens->addToken(
-                                $sTok,
-                                new Token\Postcode($iId, $aWord['word'], null)
-                            );
+                        $iSplitPos = strpos($aWord['word'], '@');
+                        if ($iSplitPos === false) {
+                            $sPostcode = $aWord['word'];
+                        } else {
+                            $sPostcode = substr($aWord['word'], 0, $iSplitPos);
                         }
+
+                        $oValidTokens->addToken(
+                            $sTok,
+                            new Token\Postcode($iId, $sPostcode, null)
+                        );
                     }
                     break;
                 case 'S':  // tokens for classification terms (special phrases)
diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature
index 7636aea7ee..4d146d18c1 100644
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -163,24 +163,6 @@ Feature: Import of postcodes
            | de      | 01982    | country:de |
         And there are word tokens for postcodes 01982
 
-    Scenario: Different postcodes with the same normalization can both be found
-        Given the places
-           | osm | class | type  | addr+postcode | addr+housenumber | geometry |
-           | N34 | place | house | EH4 7EA       | 111              | country:gb |
-           | N35 | place | house | E4 7EA        | 111              | country:gb |
-        When importing
-        Then location_postcode contains exactly
-           | country | postcode | geometry |
-           | gb      | EH4 7EA  | country:gb |
-           | gb      | E4 7EA   | country:gb |
-        When sending search query "EH4 7EA"
-        Then results contain
-           | type     | display_name |
-           | postcode | EH4 7EA      |
-        When sending search query "E4 7EA"
-        Then results contain
-           | type     | display_name |
-           | postcode | E4 7EA       |
 
     @Fail
     Scenario: search and address ranks for GB post codes correctly assigned
diff --git a/test/bdd/db/query/postcodes.feature b/test/bdd/db/query/postcodes.feature
new file mode 100644
index 0000000000..c399b63b0b
--- /dev/null
+++ b/test/bdd/db/query/postcodes.feature
@@ -0,0 +1,95 @@
+@DB
+@fail-legacy
+Feature: Querying fo postcode variants
+
+    Scenario: Postcodes in Singapore (6-digit postcode)
+        Given the grid with origin SG
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name   | addr+postcode | geometry |
+            | W1  | highway | path | Lorang | 399174        | 10,11    |
+        When importing
+        When sending search query "399174"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 399174       |
+
+
+    Scenario Outline: Postcodes in the Netherlands (mixed postcode with spaces)
+        Given the grid with origin NL
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name     | addr+postcode | geometry |
+            | W1  | highway | path | De Weide | <postcode>    | 10,11    |
+        When importing
+        When sending search query "3993 DX"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 3993 DX      |
+        When sending search query "3993dx"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 3993 DX      |
+        # Every imported spelling below is expected to be found as "3993 DX".
+        Examples:
+            | postcode |
+            | 3993 DX  |
+            | 3993DX   |
+            | 3993 dx  |
+
+
+    Scenario: Postcodes in Singapore (6-digit postcode)
+        Given the grid with origin SG
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name   | addr+postcode | geometry |
+            | W1  | highway | path | Lorang | 399174        | 10,11    |
+        When importing
+        When sending search query "399174"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 399174       |
+
+
+    Scenario Outline: Postcodes in Andorra (with country code)
+        Given the grid with origin AD
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name   | addr+postcode | geometry |
+            | W1  | highway | path | Lorang | <postcode>    | 10,11    |
+        When importing
+        When sending search query "675"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | AD675        |
+        When sending search query "AD675"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | AD675        |
+
+        Examples:
+            | postcode |
+            | 675      |
+            | AD 675   |
+            | AD675    |
+
+
+    Scenario: Different postcodes with the same normalization can both be found
+        Given the places
+           | osm | class | type  | addr+postcode | addr+housenumber | geometry |
+           | N34 | place | house | EH4 7EA       | 111              | country:gb |
+           | N35 | place | house | E4 7EA        | 111              | country:gb |
+        When importing
+        Then location_postcode contains exactly
+           | country | postcode | geometry |
+           | gb      | EH4 7EA  | country:gb |
+           | gb      | E4 7EA   | country:gb |
+        When sending search query "EH4 7EA"
+        Then results contain
+           | type     | display_name |
+           | postcode | EH4 7EA      |
+        When sending search query "E4 7EA"
+        Then results contain
+           | type     | display_name |
+           | postcode | E4 7EA       |
+

From 93d5be097a338b0333ecf2452b0a7311cf061bff Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 22 Jun 2022 10:47:08 +0200
Subject: [PATCH 28/30] bdd: do not expect legacy word table to be without
 empty tokens

Empty tokens can occur for bogus names, and this will not be fixed in the legacy tokenizer anymore.
---
 test/bdd/steps/steps_db_ops.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py
index 37d541533d..8fd918f88f 100644
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -27,8 +27,9 @@ def check_database_integrity(context):
         assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
 
         # word table must not have empty word_tokens
-        cur.execute("SELECT count(*) FROM word WHERE word_token = ''")
-        assert cur.fetchone()[0] == 0, "Empty word tokens found in word table"
+        if context.nominatim.tokenizer != 'legacy':
+            cur.execute("SELECT count(*) FROM word WHERE word_token = ''")
+            assert cur.fetchone()[0] == 0, "Empty word tokens found in word table"
 
 
 

From 3dd7410bb7adf937f3a4c53ab6219c028d7685b8 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 22 Jun 2022 11:38:23 +0200
Subject: [PATCH 29/30] bdd: correctly skip postcode tests for legacy

---
 test/bdd/db/query/postcodes.feature | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/bdd/db/query/postcodes.feature b/test/bdd/db/query/postcodes.feature
index c399b63b0b..a3ca70352a 100644
--- a/test/bdd/db/query/postcodes.feature
+++ b/test/bdd/db/query/postcodes.feature
@@ -1,5 +1,4 @@
 @DB
-@fail-legacy
 Feature: Querying fo postcode variants
 
     Scenario: Postcodes in Singapore (6-digit postcode)
@@ -15,6 +14,7 @@ Feature: Querying fo postcode variants
             | 0  | postcode | 399174       |
 
 
+    @fail-legacy
     Scenario Outline: Postcodes in the Netherlands (mixed postcode with spaces)
         Given the grid with origin NL
             | 10 |   |   |   | 11 |
@@ -38,6 +38,7 @@ Feature: Querying fo postcode variants
             | 3993 dx  |
 
 
+    @fail-legacy
     Scenario: Postcodes in Singapore (6-digit postcode)
         Given the grid with origin SG
             | 10 |   |   |   | 11 |
@@ -51,6 +52,7 @@ Feature: Querying fo postcode variants
             | 0  | postcode | 399174       |
 
 
+    @fail-legacy
     Scenario Outline: Postcodes in Andorra (with country code)
         Given the grid with origin AD
             | 10 |   |   |   | 11 |

From 536f08f33a1388c5022c2adf799da55035fd7e0a Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Thu, 23 Jun 2022 16:17:47 +0200
Subject: [PATCH 30/30] ignore 5+ postcodes in the US for now

Hierarchical postcodes need a different treatment.
---
 settings/country_settings.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index 14d08de3a3..b0bacdfcc5 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -1998,7 +1998,7 @@ ua:
     languages: uk
     names: !include country-names/ua.yaml
     postcode:
-      pattern: "ddddd"
+      pattern: "d?ddddd"
 
 
 # Uganda (Uganda)
@@ -2024,7 +2024,7 @@ us:
     languages: en
     names: !include country-names/us.yaml
     postcode:
-      pattern: "(ddddd)(?:-dddd)?"
+      pattern: "ddddd"
 
 
 # Uruguay (Uruguay)