Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds sanitizer for Japanese addresses to correspond to block address #3122

Merged
merged 7 commits into from
Aug 1, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/customize/Tokenizers.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,14 @@ The following is a list of sanitizers that are shipped with Nominatim.
rendering:
heading_level: 6

#### tag-japanese

::: nominatim.tokenizer.sanitizers.tag_japanese
selection:
members: False
rendering:
heading_level: 6

#### Token Analysis

Token analyzers take a full name and transform it into one or more normalized
Expand Down
151 changes: 151 additions & 0 deletions nominatim/tokenizer/sanitizers/tag_japanese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer maps OSM data to Japanese block addresses.
It combines block_number and housenumber into a single housenumber,
and quarter and neighbourhood into a single place.
"""


from typing import Callable
from typing import List, Optional

from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
from nominatim.data.place_name import PlaceName

# Kanji digits and the Arabic digit each one is replaced with.
KANJI_MAP = {
    '零': '0',
    '一': '1',
    '二': '2',
    '三': '3',
    '四': '4',
    '五': '5',
    '六': '6',
    '七': '7',
    '八': '8',
    '九': '9',
}

# str.translate() looks characters up by code point; str.maketrans()
# converts the single-character keys above into that form once at import.
_KANJI_TRANSLATION = str.maketrans(KANJI_MAP)


def convert_kanji_sequence_to_number(sequence: str) -> str:
    """Converts Kanji numbers to Arabic numbers.

    Every character that is a key of KANJI_MAP is replaced one-to-one by
    its Arabic digit. All other characters are copied through unchanged,
    so the result always has the same length as the input.
    """
    return sequence.translate(_KANJI_TRANSLATION)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have a look at the Python function translate. It does a similar thing, as far as I can see. You just need to be careful that with translate, the mapping table needs unicode numbers as keys. So you need to define the table like this: KANJI_MAP = { ord('零'): '0', ....}.


def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
#def create(config: SanitizerConfig) -> Callable[[ProcessInfo],None]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove all commented-out code.

"""Set up the sanitizer
"""
return tag_japanese
#return tag_japanese(config)

def reconbine_housenumber(
    new_address: List[PlaceName],
    tmp_housenumber: Optional[str],
    tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
    """ Recombine the tag of housenumber by using housenumber and blocknumber

        When both values are present, a single 'housenumber' entry named
        '<blocknumber>-<housenumber>' is appended to `new_address`. When
        only one is present, that value is used as the name on its own.
        When neither is present (None or empty), nothing is appended.

        The list is modified in place and also returned.
    """
    # Block number comes first; missing or empty parts are dropped so that
    # no stray '-' ends up in the name.
    parts = [part for part in (tmp_blocknumber, tmp_housenumber) if part]
    if parts:
        new_address.append(
            PlaceName(kind='housenumber', name='-'.join(parts), suffix='')
        )
    return new_address

def reconbine_place(
new_address: List[PlaceName],
tmp_neighbourhood: Optional[str],
tmp_quarter: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of place by using neighbourhood and quarter
"""
if tmp_neighbourhood and tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=f'{tmp_quarter}{tmp_neighbourhood}',
suffix=''
)
)
elif tmp_neighbourhood:
new_address.append(
PlaceName(
kind='place',
name=f'{tmp_neighbourhood}',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The format string is unnecessary here. Same below.

suffix=''
)
)
elif tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=f'{tmp_quarter}',
suffix=''
)
)
return new_address
def tag_japanese(obj: ProcessInfo) -> None:
"""Recombine kind of address
"""
if obj.place.country_code != 'jp':
return
tmp_housenumber = None
tmp_blocknumber = None
tmp_neighbourhood = None
tmp_quarter = None

new_address = []
for item in obj.names:
item.name = convert_kanji_sequence_to_number(item.name)

for item in obj.address:
item.name = convert_kanji_sequence_to_number(item.name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've given that some more thought and doing it for all name and address items might cause trouble because there is no such conversion when searching. Better to do this only when creating the combined place name in line 100.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there are cases where all four tags used for combining will contain KANJI, so I adapted the function to all four, not just the 100th line. https://github.com/miku0/Nominatim/blob/4d61cc87cff9ecf5c90741a94c4b93da2f12c5ad/nominatim/tokenizer/sanitizers/tag_japanese.py#L90
Is this appropriate?

if item.kind == 'housenumber':
tmp_housenumber = item.name
elif item.kind == 'block_number':
tmp_blocknumber = item.name
elif item.kind == 'neighbourhood':
tmp_neighbourhood = item.name
elif item.kind == 'quarter':
tmp_quarter = item.name
else:
new_address.append(item)

new_address = reconbine_housenumber(new_address,tmp_housenumber,tmp_blocknumber)
new_address = reconbine_place(new_address,tmp_neighbourhood,tmp_quarter)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please leave spaces after commas.


obj.address = [item for item in new_address if item.name is not None]
1 change: 1 addition & 0 deletions settings/icu_tokenizer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ sanitizers:
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
use-defaults: all
mode: append
- step: tag-japanese
token-analysis:
- analyzer: generic
- id: "@housenumber"
Expand Down
30 changes: 30 additions & 0 deletions test/bdd/db/query/japanese.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
@DB
Feature: Searches in Japan
Test specifically for searches of Japanese addresses and in Japanese language.
@fail-legacy
Scenario: A block house-number is parented to the neighbourhood
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that your test needs a sanitizer, it is natural that it cannot work with the legacy tokenizer system that does not know about sanitizers. Simply add a @fail-legacy above the scenario.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you so much. I understood.
I added @fail-legacy.

Given the grid with origin JP
| 1 | | | | 2 |
| | 3 | | | |
| | | 9 | | |
| | | | 6 | |
And the places
| osm | class | type | name | geometry |
| W1 | highway | residential | 雉子橋通り | 1,2 |
And the places
| osm | class | type | housenr | addr+block_number | addr+neighbourhood | geometry |
| N3 | amenity | restaurant | 2 | 6 | 2丁目 | 3 |
And the places
| osm | class | type | name | geometry |
| N9 | place | neighbourhood | 2丁目 | 9 |
And the places
| osm | class | type | name | geometry |
| N6 | place | quarter | 加瀬 | 6 |
When importing
Then placex contains
| object | parent_place_id |
| N3 | N9 |
When sending search query "2丁目 6-2"
Then results contain
| osm |
| N3 |
65 changes: 65 additions & 0 deletions test/python/tokenizer/sanitizers/test_tag_japanese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from nominatim.data.place_info import PlaceInfo
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from typing import Mapping, Optional, List
import pytest

class TestTagJapanese:
    """Tests for the 'tag-japanese' sanitizer step.

    Every test feeds address tags of a place with country code 'jp'
    through the sanitizer and checks the resulting (name, kind) pairs.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        # Keep the configuration fixture around for building the sanitizer.
        self.config = def_config

    def run_sanitizer_on(self, type, **kwargs):
        """Run the sanitizer on the given address tags.

        `kwargs` become the 'address' dictionary of the place. The result
        is the sorted list of (name, kind) tuples of the sanitized address.
        `type` is not used by this helper; it is kept so that existing
        positional calls continue to work.
        """
        place = PlaceInfo({
            'address': kwargs,
            'country_code': 'jp'
        })
        sanitizer_args = {'step': 'tag-japanese'}
        _, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place)
        return sorted((p.name, p.kind) for p in address)

    def test_on_address(self):
        res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')
        assert res == [('bar', 'ref'), ('baz', 'ref_abc'), ('foo', 'name')]

    def test_housenumber(self):
        res = self.run_sanitizer_on('address', housenumber='2')
        assert res == [('2', 'housenumber')]

    def test_blocknumber(self):
        res = self.run_sanitizer_on('address', block_number='6')
        assert res == [('6', 'housenumber')]

    def test_neighbourhood(self):
        res = self.run_sanitizer_on('address', neighbourhood='8')
        assert res == [('8', 'place')]

    def test_quarter(self):
        res = self.run_sanitizer_on('address', quarter='kase')
        assert res == [('kase', 'place')]

    # The next three tests need distinct names: methods sharing one name
    # overwrite each other in the class body and only the last one is run.
    def test_housenumber_blocknumber(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6')
        assert res == [('6-2', 'housenumber')]

    def test_housenumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', neighbourhood='8')
        assert res == [('2', 'housenumber'), ('8', 'place')]

    def test_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8')
        assert res == [('6', 'housenumber'), ('8', 'place')]

    def test_housenumber_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
                                    neighbourhood='8')
        assert res == [('6-2', 'housenumber'), ('8', 'place')]

    def test_housenumber_blocknumber_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
                                    neighbourhood='8', quarter='kase')
        assert res == [('6-2', 'housenumber'), ('kase8', 'place')]

    def test_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', neighbourhood='8', quarter='kase')
        assert res == [('kase8', 'place')]