-
-
Notifications
You must be signed in to change notification settings - Fork 714
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds sanitizer for Japanese addresses to correspond to block address #3122
Changes from 1 commit
0722495
848e5ac
fac8c32
67706ce
2350018
4d61cc8
67e1c7d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
# SPDX-License-Identifier: GPL-3.0-or-later | ||
# | ||
# This file is part of Nominatim. (https://nominatim.org) | ||
# | ||
# Copyright (C) 2022 by the Nominatim developer community. | ||
# For a full list of authors see the git log. | ||
""" | ||
This sanitizer maps OSM data to Japanese block addresses. | ||
It replaces blocknumber and housenumber with housenumber, | ||
and quarter and neighbourhood with place. | ||
""" | ||
|
||
|
||
from typing import Callable
from typing import List
from typing import Optional

from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
from nominatim.data.place_name import PlaceName
|
||
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """Set up the sanitizer.

    The configuration argument is ignored: the sanitizer takes no
    options, so the module-level ``tag_japanese`` function is returned
    unchanged as the processing callable.
    """
    return tag_japanese
|
||
# Translation table from Kanji digits to Arabic digits. It is a module
# constant so that it is built only once. str.translate() needs Unicode
# code points as keys; str.maketrans() does that conversion.
KANJI_MAP = str.maketrans({
    '零': '0',
    '一': '1',
    '二': '2',
    '三': '3',
    '四': '4',
    '五': '5',
    '六': '6',
    '七': '7',
    '八': '8',
    '九': '9',
})


def convert_kanji_sequence_to_number(sequence: str) -> str:
    """Converts Kanji numbers to Arabic numbers

    Every Kanji digit (零 to 九) is replaced by its Arabic digit, one
    character at a time. All other characters are copied unchanged; this
    includes positional numerals such as 十, which are not in the table.
    """
    return sequence.translate(KANJI_MAP)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have a look at the Python function translate. It does a similar thing, as far as I can see. You just need to be careful that with translate, the mapping table needs unicode numbers as keys. So you need to define the table like this: |
||
|
||
# Optional[...] is used instead of `str | None` because the project
# still has to use the 'old' typing syntax.
def reconbine_housenumber(
    new_address: List[PlaceName],
    tmp_housenumber: Optional[str],
    tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
    """ Recombine the tag of housenumber by using housenumber and blocknumber

        The block number comes first: when both values are set the new
        name is '<blocknumber>-<housenumber>', otherwise the one value
        that is set is used on its own. When neither is set, nothing is
        added.

        The new 'housenumber' entry is appended to `new_address`, which
        is modified in place and also returned.
    """
    parts = [part for part in (tmp_blocknumber, tmp_housenumber) if part]
    if parts:
        new_address.append(
            PlaceName(kind='housenumber', name='-'.join(parts), suffix='')
        )
    return new_address
|
||
# Optional[...] is used instead of `str | None` because the project
# still has to use the 'old' typing syntax.
def reconbine_place(
    new_address: List[PlaceName],
    tmp_neighbourhood: Optional[str],
    tmp_quarter: Optional[str]
) -> List[PlaceName]:
    """ Recombine the tag of place by using neighbourhood and quarter

        The quarter comes first and the neighbourhood is appended
        directly, without a separator. When only one of the two is set,
        it is used on its own. When neither is set, nothing is added.

        The new 'place' entry is appended to `new_address`, which is
        modified in place and also returned.
    """
    name = (tmp_quarter or '') + (tmp_neighbourhood or '')
    if name:
        new_address.append(PlaceName(kind='place', name=name, suffix=''))
    return new_address
def tag_japanese(obj: ProcessInfo) -> None:
    """Recombine kind of address

    Only places with country code 'jp' are processed. In their address,
    'housenumber' and 'block_number' are merged into one 'housenumber'
    entry, and 'neighbourhood' and 'quarter' into one 'place' entry. All
    other address entries are kept as they are. The result replaces
    `obj.address`.
    """
    if obj.place.country_code != 'jp':
        return

    tmp_housenumber = None
    tmp_blocknumber = None
    tmp_neighbourhood = None
    tmp_quarter = None

    new_address = []
    for item in obj.address:
        # Kanji digits are converted only for the four tags that are
        # recombined below. Names and the remaining address parts are
        # left alone because no such conversion happens when searching.
        if item.kind == 'housenumber':
            tmp_housenumber = convert_kanji_sequence_to_number(item.name)
        elif item.kind == 'block_number':
            tmp_blocknumber = convert_kanji_sequence_to_number(item.name)
        elif item.kind == 'neighbourhood':
            tmp_neighbourhood = convert_kanji_sequence_to_number(item.name)
        elif item.kind == 'quarter':
            tmp_quarter = convert_kanji_sequence_to_number(item.name)
        else:
            new_address.append(item)

    new_address = reconbine_housenumber(new_address, tmp_housenumber, tmp_blocknumber)
    new_address = reconbine_place(new_address, tmp_neighbourhood, tmp_quarter)

    obj.address = [item for item in new_address if item.name is not None]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
@DB | ||
Feature: Searches in Japan | ||
Test specifically for searches of Japanese addresses and in Japanese language. | ||
Scenario: A block house-number is parented to the neighbourhood | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given that your test needs a sanitizer, it is natural that it cannot work with the legacy tokenizer system that does not know about sanitizers. Simply add a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you so much. I understood. |
||
Given the grid with origin JP | ||
| 1 | | | | 2 | | ||
| | 3 | | | | | ||
| | | 9 | | | | ||
| | | | 6 | | | ||
And the places | ||
| osm | class | type | name | geometry | | ||
| W1 | highway | residential | 雉子橋通り | 1,2 | | ||
And the places | ||
| osm | class | type | housenr | addr+block_number | addr+neighbourhood | geometry | | ||
| N3 | amenity | restaurant | 2 | 6 | 2丁目 | 3 | | ||
And the places | ||
| osm | class | type | name | geometry | | ||
| N9 | place | neighbourhood | 2丁目 | 9 | | ||
And the places | ||
| osm | class | type | name | geometry | | ||
| N6 | place | quarter | 加瀬 | 6 | | ||
When importing | ||
Then placex contains | ||
| object | parent_place_id | | ||
| N3 | N9 | | ||
When sending search query "2丁目 6-2" | ||
Then results contain | ||
| osm | | ||
| N3 | |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from nominatim.data.place_info import PlaceInfo | ||
from nominatim.data.place_name import PlaceName | ||
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer | ||
from typing import Mapping, Optional, List | ||
import pytest | ||
|
||
class TestTagJapanese:
    """Tests for the 'tag-japanese' sanitizer step.

    Every test runs the sanitizer on a place with country code 'jp' and
    compares the resulting address as a sorted list of (name, kind)
    tuples.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, type, **kwargs):
        # `type` is not used; it is kept so that the existing positional
        # call style `run_sanitizer_on('address', ...)` keeps working.
        place = PlaceInfo({
            'address': kwargs,
            'country_code': 'jp'
        })
        sanitizer_args = {'step': 'tag-japanese'}
        _, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place)
        return sorted((p.name, p.kind) for p in address)

    def test_on_address(self):
        res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')
        assert res == [('bar', 'ref'), ('baz', 'ref_abc'), ('foo', 'name')]

    def test_housenumber(self):
        res = self.run_sanitizer_on('address', housenumber='2')
        assert res == [('2', 'housenumber')]

    def test_blocknumber(self):
        res = self.run_sanitizer_on('address', block_number='6')
        assert res == [('6', 'housenumber')]

    def test_neighbourhood(self):
        res = self.run_sanitizer_on('address', neighbourhood='8')
        assert res == [('8', 'place')]

    def test_quarter(self):
        res = self.run_sanitizer_on('address', quarter='kase')
        assert res == [('kase', 'place')]

    def test_housenumber_blocknumber(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6')
        assert res == [('6-2', 'housenumber')]

    # The next two tests need names of their own. When they were also
    # called test_housenumber_blocknumber, only the last definition of
    # that name was kept in the class and collected.
    def test_housenumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', neighbourhood='8')
        assert res == [('2', 'housenumber'), ('8', 'place')]

    def test_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8')
        assert res == [('6', 'housenumber'), ('8', 'place')]

    def test_housenumber_blocknumber_neighbourhood(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
                                    neighbourhood='8')
        assert res == [('6-2', 'housenumber'), ('8', 'place')]

    def test_housenumber_blocknumber_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', housenumber='2', block_number='6',
                                    neighbourhood='8', quarter='kase')
        assert res == [('6-2', 'housenumber'), ('kase8', 'place')]

    def test_neighbourhood_quarter(self):
        res = self.run_sanitizer_on('address', neighbourhood='8', quarter='kase')
        assert res == [('kase8', 'place')]

    def test_kanji_housenumber_blocknumber(self):
        res = self.run_sanitizer_on('address', housenumber='二', block_number='六')
        assert res == [('6-2', 'housenumber')]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like a constant which could be reused?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nominatim/tokenizer/sanitizers/kanji_utils.py
I split it into new files.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this is what was meant here. Rather, move 'kanji_map' outside the function as a module-global variable. Then it is initialised only once. Please note that the variable name should then be in uppercase letters, since it is a constant.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm sorry. I was mistaken.
I moved and capitalized.