Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds sanitizer for Japanese addresses to correspond to block address #3122

Merged
merged 7 commits into from
Aug 1, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/customize/Tokenizers.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,14 @@ The following is a list of sanitizers that are shipped with Nominatim.
rendering:
heading_level: 6

#### tag-japanese

::: nominatim.tokenizer.sanitizers.tag_japanese
selection:
members: False
rendering:
heading_level: 6

#### Token Analysis

Token analyzers take a full name and transform it into one or more normalized
Expand Down
146 changes: 146 additions & 0 deletions nominatim/tokenizer/sanitizers/tag_japanese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer maps OSM data to Japanese block addresses.
It combines block_number and housenumber into a single housenumber,
and quarter and neighbourhood into a single place.
"""


from typing import Callable
from typing import List, Optional

from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
from nominatim.data.place_name import PlaceName

# Translation table for str.translate(): each key is the code point of a
# Kanji digit (zero to nine), each value the corresponding Arabic digit.
KANJI_MAP = {
    ord(kanji): str(digit)
    for digit, kanji in enumerate('零一二三四五六七八九')
}

def convert_kanji_sequence_to_number(sequence: str) -> str:
    """Replace every Kanji digit in the string with its Arabic digit.

    The replacement is done character by character through KANJI_MAP;
    characters that have no entry in the table are kept as they are.
    """
    return sequence.translate(KANJI_MAP)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have a look at the Python function translate. It does a similar thing, as far as I can see. You just need to be careful that with translate, the mapping table needs unicode numbers as keys. So you need to define the table like this: KANJI_MAP = { ord('零'): '0', ....}.


def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """Create the sanitizer function.

    The configuration argument is not used; the module-level
    `tag_japanese` function is returned as is.
    """
    sanitizer = tag_japanese
    return sanitizer

def reconbine_housenumber(
    new_address: List[PlaceName],
    tmp_housenumber: Optional[str],
    tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
    """ Append one housenumber entry built from block and house number.

        Kanji digits in both parts are converted to Arabic digits. When
        both parts are given, the name becomes '<block>-<house>'; when
        only one is given, it is used on its own. Nothing is appended
        when both are missing or empty.

        The list is extended in place and also returned.
    """
    # Block number goes first, missing or empty parts are skipped.
    parts = [
        convert_kanji_sequence_to_number(part)
        for part in (tmp_blocknumber, tmp_housenumber)
        if part
    ]

    if parts:
        new_address.append(
            PlaceName(kind='housenumber', name='-'.join(parts), suffix='')
        )

    return new_address

def reconbine_place(
    new_address: List[PlaceName],
    tmp_neighbourhood: Optional[str],
    tmp_quarter: Optional[str]
) -> List[PlaceName]:
    """ Append one place entry built from quarter and neighbourhood.

        Kanji digits in both parts are converted to Arabic digits. When
        both parts are given, the name is the quarter directly followed
        by the neighbourhood (no separator); when only one is given, it
        is used on its own. Nothing is appended when both are missing
        or empty.

        The list is extended in place and also returned.
    """
    # Quarter goes first, missing or empty parts are skipped.
    parts = [
        convert_kanji_sequence_to_number(part)
        for part in (tmp_quarter, tmp_neighbourhood)
        if part
    ]

    if parts:
        new_address.append(
            PlaceName(kind='place', name=''.join(parts), suffix='')
        )

    return new_address
def tag_japanese(obj: ProcessInfo) -> None:
    """Rewrite the address parts of a Japanese place.

    Places whose country code is not 'jp' are left untouched. Otherwise
    the housenumber, block_number, neighbourhood and quarter parts are
    taken out of the address and replaced by at most one combined
    housenumber and one combined place entry, which are appended after
    the remaining parts. Parts without a name are dropped from the result.
    """
    if obj.place.country_code != 'jp':
        return

    # Name of each address kind that gets merged; when a kind appears
    # more than once, the last occurrence wins.
    collected = dict.fromkeys(
        ('housenumber', 'block_number', 'neighbourhood', 'quarter')
    )

    remaining: List[PlaceName] = []
    for entry in obj.address:
        if entry.kind in collected:
            collected[entry.kind] = entry.name
        else:
            remaining.append(entry)

    remaining = reconbine_housenumber(
        remaining, collected['housenumber'], collected['block_number']
    )
    remaining = reconbine_place(
        remaining, collected['neighbourhood'], collected['quarter']
    )

    obj.address = [entry for entry in remaining if entry.name is not None]
1 change: 1 addition & 0 deletions settings/icu_tokenizer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ sanitizers:
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
use-defaults: all
mode: append
- step: tag-japanese
token-analysis:
- analyzer: generic
- id: "@housenumber"
Expand Down
30 changes: 30 additions & 0 deletions test/bdd/db/query/japanese.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
@DB
Feature: Searches in Japan
Tests specifically for searches of Japanese addresses and searches in the Japanese language.
@fail-legacy
Scenario: A block house-number is parented to the neighbourhood
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that your test needs a sanitizer, it is natural that it cannot work with the legacy tokenizer system that does not know about sanitizers. Simply add a @fail-legacy above the scenario.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you so much. I understood.
I added @fail-legacy.

Given the grid with origin JP
| 1 | | | | 2 |
| | 3 | | | |
| | | 9 | | |
| | | | 6 | |
And the places
| osm | class | type | name | geometry |
| W1 | highway | residential | 雉子橋通り | 1,2 |
And the places
| osm | class | type | housenr | addr+block_number | addr+neighbourhood | geometry |
| N3 | amenity | restaurant | 2 | 6 | 2丁目 | 3 |
And the places
| osm | class | type | name | geometry |
| N9 | place | neighbourhood | 2丁目 | 9 |
And the places
| osm | class | type | name | geometry |
| N6 | place | quarter | 加瀬 | 6 |
When importing
Then placex contains
| object | parent_place_id |
| N3 | N9 |
When sending search query "2丁目 6-2"
Then results contain
| osm |
| N3 |
84 changes: 84 additions & 0 deletions test/python/tokenizer/sanitizers/test_tag_japanese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from nominatim.data.place_info import PlaceInfo
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from typing import Mapping, Optional, List
import pytest

class TestTagJapanese:
    """Tests for the 'tag-japanese' sanitizer step."""

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        """Keep the `def_config` fixture value for building the sanitizer."""
        self.config = def_config

    def run_sanitizer_on(self, type, **kwargs):
        """Run the sanitizer on a place in Japan with the given address tags.

        Returns the sorted list of (name, kind) tuples of the processed
        address. The positional `type` argument is accepted but not used.
        """
        place = PlaceInfo({'address': kwargs, 'country_code': 'jp'})
        sanitizer = PlaceSanitizer([{'step': 'tag-japanese'}], self.config)
        _, address = sanitizer.process_names(place)
        return sorted((part.name, part.kind) for part in address)

    def test_on_address(self):
        """Address kinds the sanitizer does not merge come back unchanged."""
        result = self.run_sanitizer_on(
            'address', name='foo', ref='bar', ref_abc='baz')
        assert result == [('bar', 'ref'), ('baz', 'ref_abc'), ('foo', 'name')]

    def test_housenumber(self):
        """A lone housenumber stays a housenumber."""
        result = self.run_sanitizer_on('address', housenumber='2')
        assert result == [('2', 'housenumber')]

    def test_blocknumber(self):
        """A lone block number is turned into a housenumber."""
        result = self.run_sanitizer_on('address', block_number='6')
        assert result == [('6', 'housenumber')]

    def test_neighbourhood(self):
        """A lone neighbourhood is turned into a place."""
        result = self.run_sanitizer_on('address', neighbourhood='8')
        assert result == [('8', 'place')]

    def test_quarter(self):
        """A lone quarter is turned into a place."""
        result = self.run_sanitizer_on('address', quarter='kase')
        assert result == [('kase', 'place')]

    def test_housenumber_blocknumber(self):
        """Block and house number are joined with a hyphen, block first."""
        result = self.run_sanitizer_on(
            'address', housenumber='2', block_number='6')
        assert result == [('6-2', 'housenumber')]

    def test_quarter_neighbourhood(self):
        """Quarter and neighbourhood are concatenated, quarter first."""
        result = self.run_sanitizer_on(
            'address', quarter='kase', neighbourhood='8')
        assert result == [('kase8', 'place')]
Comment on lines +46 to +48
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you so much for your comment.
I have added a new test for reconbine_place() after this line.


def test_blocknumber_housenumber_quarter(self):
    """Block number, house number and quarter."""
    result = self.run_sanitizer_on(
        'address', block_number='6', housenumber='2', quarter='kase')
    assert result == [('6-2', 'housenumber'), ('kase', 'place')]

def test_blocknumber_housenumber_quarter_neighbourhood(self):
    """Block number, house number and neighbourhood.

    Despite the method name, no quarter is passed in here.
    """
    result = self.run_sanitizer_on(
        'address', block_number='6', housenumber='2', neighbourhood='8')
    assert result == [('6-2', 'housenumber'), ('8', 'place')]

def test_blocknumber_quarter_neighbourhood(self):
    """Block number, quarter and neighbourhood."""
    result = self.run_sanitizer_on(
        'address', block_number='6', quarter='kase', neighbourhood='8')
    assert result == [('6', 'housenumber'), ('kase8', 'place')]

def test_blocknumber_quarter(self):
    """Block number and quarter."""
    result = self.run_sanitizer_on(
        'address', block_number='6', quarter='kase')
    assert result == [('6', 'housenumber'), ('kase', 'place')]

def test_blocknumber_neighbourhood(self):
    """Block number and neighbourhood."""
    result = self.run_sanitizer_on(
        'address', block_number='6', neighbourhood='8')
    assert result == [('6', 'housenumber'), ('8', 'place')]

def test_housenumber_quarter_neighbourhood(self):
    """House number, quarter and neighbourhood."""
    result = self.run_sanitizer_on(
        'address', housenumber='2', quarter='kase', neighbourhood='8')
    assert result == [('2', 'housenumber'), ('kase8', 'place')]

def test_housenumber_quarter(self):
    """House number and quarter."""
    result = self.run_sanitizer_on(
        'address', housenumber='2', quarter='kase')
    assert result == [('2', 'housenumber'), ('kase', 'place')]

def test_housenumber_blocknumber_neighbourhood_quarter(self):
    """All four merged address kinds together."""
    result = self.run_sanitizer_on(
        'address', block_number='6', housenumber='2',
        quarter='kase', neighbourhood='8')
    assert result == [('6-2', 'housenumber'), ('kase8', 'place')]

def test_KANJI_MAP(self):
    """Kanji digits are converted to Arabic digits before merging."""
    result = self.run_sanitizer_on(
        'address', block_number='六', housenumber='二',
        quarter='kase', neighbourhood='八')
    assert result == [('6-2', 'housenumber'), ('kase8', 'place')]