Introduce word usage statistics for addresses #3367

Merged: 4 commits, Mar 18, 2024
73 changes: 60 additions & 13 deletions nominatim/api/search/db_search_builder.py
@@ -226,27 +226,74 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
name_fulls = self.query.get_tokens(name, TokenType.WORD)
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
# At this point drop unindexed partials from the address.
# This might yield wrong results, nothing we can do about that.
if not partials_indexed:
addr_tokens = [t.token for t in addr_partials if t.is_indexed]
if len(name_partials) == 1:
penalty += min(0.5, max(0, (exp_count - 50 * fulls_count) / (2000 * fulls_count)))
if partials_indexed:
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
# Any of the full names applies with all of the partials from the address
yield penalty, fulls_count / (2**len(addr_tokens)),\
dbf.lookup_by_any_name([t.token for t in name_fulls],
addr_tokens,
fulls_count > 30000 / max(1, len(addr_tokens)))

yield penalty, fulls_count / (2**len(addr_tokens)), \
self.get_full_name_ranking(name_fulls, addr_partials,
fulls_count > 30000 / max(1, len(addr_tokens)))

# To catch remaining results, lookup by name and address
# We only do this if there is a reasonable number of results expected.
exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
if exp_count < 10000 and all(t.is_indexed for t in name_partials.values()):
lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
if addr_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
penalty += 0.35 * max(1 if name_fulls else 0.1,
5 - len(name_partials) - len(addr_tokens))
yield penalty, exp_count, lookup
yield penalty, exp_count,\
self.get_name_address_ranking(list(name_partials.keys()), addr_partials)


def get_name_address_ranking(self, name_tokens: List[int],
addr_partials: List[Token]) -> List[dbf.FieldLookup]:
""" Create a ranking expression looking up by name and address.
"""
lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]

addr_restrict_tokens = []
addr_lookup_tokens = []
for t in addr_partials:
if t.is_indexed:
if t.addr_count > 20000:
addr_restrict_tokens.append(t.token)
else:
addr_lookup_tokens.append(t.token)

if addr_restrict_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector',
addr_restrict_tokens, lookups.Restrict))
if addr_lookup_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector',
addr_lookup_tokens, lookups.LookupAll))

return lookup


def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
use_lookup: bool) -> List[dbf.FieldLookup]:
""" Create a ranking expression with full name terms and
additional address lookup. When 'use_lookup' is true,
address lookups will use the index, provided the occurrences
are not too high.
"""
# At this point drop unindexed partials from the address.
# This might yield wrong results, nothing we can do about that.
if use_lookup:
addr_restrict_tokens = []
addr_lookup_tokens = []
for t in addr_partials:
if t.is_indexed:
if t.addr_count > 20000:
addr_restrict_tokens.append(t.token)
else:
addr_lookup_tokens.append(t.token)
else:
addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
addr_lookup_tokens = []

return dbf.lookup_by_any_name([t.token for t in name_fulls],
addr_restrict_tokens, addr_lookup_tokens)


def get_name_ranking(self, trange: TokenRange,
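For illustration, a minimal, self-contained sketch of the address-token split that the new get_name_address_ranking()/get_full_name_ranking() helpers perform: indexed address partials that occur very often are only used to restrict candidate rows, while rarer ones are looked up via the nameaddress_vector index. The TokenStub class and the sample counts are invented for this example; the 20000 cut-off mirrors the diff above.

from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class TokenStub:
    token: int
    addr_count: int
    is_indexed: bool

def split_address_tokens(addr_partials: List[TokenStub],
                         threshold: int = 20000) -> Tuple[List[int], List[int]]:
    """Return (restrict_tokens, lookup_tokens) as in the new helpers."""
    restrict, lookup = [], []
    for t in addr_partials:
        if t.is_indexed:
            if t.addr_count > threshold:
                restrict.append(t.token)   # very frequent: only recheck candidates
            else:
                lookup.append(t.token)     # rare enough: look up via the index
    return restrict, lookup

# Token 7 appears in 50000 addresses, token 9 in 120; token 11 is unindexed.
print(split_address_tokens([TokenStub(7, 50000, True),
                            TokenStub(9, 120, True),
                            TokenStub(11, 5, False)]))
# -> ([7], [9]); the unindexed token is dropped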
11 changes: 6 additions & 5 deletions nominatim/api/search/db_search_fields.py
@@ -231,16 +231,17 @@ def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[Fiel
return lookup


def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int],
use_index_for_addr: bool) -> List[FieldLookup]:
def lookup_by_any_name(name_tokens: List[int], addr_restrict_tokens: List[int],
addr_lookup_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where name tokens are looked up via index
and only one of the name tokens must be present.
Potential address tokens are used to restrict the search further.
"""
lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAny)]
if addr_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_tokens,
lookups.LookupAll if use_index_for_addr else lookups.Restrict))
if addr_restrict_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_restrict_tokens, lookups.Restrict))
if addr_lookup_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_lookup_tokens, lookups.LookupAll))

return lookup

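As a usage illustration of the new signature (all token ids invented), the search builder passes frequent address tokens as restrict tokens and rare ones as lookup tokens:

from nominatim.api.search import db_search_fields as dbf

lookup = dbf.lookup_by_any_name(name_tokens=[101, 102],
                                addr_restrict_tokens=[7],     # addr_count above the threshold
                                addr_lookup_tokens=[9, 13])   # rarer address tokens
# Result: up to three FieldLookup entries -
#   ('name_vector', [101, 102], LookupAny)
#   ('nameaddress_vector', [7], Restrict)
#   ('nameaddress_vector', [9, 13], LookupAll)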
6 changes: 4 additions & 2 deletions nominatim/api/search/icu_tokenizer.py
@@ -97,6 +97,7 @@ def from_db_row(row: SaRow) -> 'ICUToken':
""" Create a ICUToken from the row of the word table.
"""
count = 1 if row.info is None else row.info.get('count', 1)
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)

penalty = 0.0
if row.type == 'w':
@@ -123,7 +124,8 @@ def from_db_row(row: SaRow) -> 'ICUToken':

return ICUToken(penalty=penalty, token=row.word_id, count=count,
lookup_word=lookup_word, is_indexed=True,
word_token=row.word_token, info=row.info)
word_token=row.word_token, info=row.info,
addr_count=addr_count)



@@ -257,7 +259,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
if len(part.token) <= 4 and part[0].isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
ICUToken(0.5, 0, 1, part.token, True, part.token, None))
ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))


def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
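A hedged sketch of the fallback behaviour added to ICUToken.from_db_row(): both frequencies default to 1 when the word row carries no statistics yet, so a freshly imported database keeps working before update_statistics() has run. The RowStub values are invented for this example.

class RowStub:
    info = {'count': 250}          # no 'addr_count' recorded yet

row = RowStub()
count = 1 if row.info is None else row.info.get('count', 1)
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
print(count, addr_count)           # -> 250 1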
3 changes: 2 additions & 1 deletion nominatim/api/search/legacy_tokenizer.py
@@ -210,6 +210,7 @@ def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:

return LegacyToken(penalty=penalty, token=row.word_id,
count=row.search_name_count or 1,
addr_count=1, # not supported
lookup_word=lookup_word,
word_token=row.word_token.strip(),
category=(rowclass, row.type) if rowclass is not None else None,
@@ -226,7 +227,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
if len(part) <= 4 and part.isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
LegacyToken(penalty=0.5, token=0, count=1,
LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
lookup_word=part, word_token=part,
category=None, country=None,
operator=None, is_indexed=True))
1 change: 1 addition & 0 deletions nominatim/api/search/query.py
@@ -99,6 +99,7 @@ class Token(ABC):
penalty: float
token: int
count: int
addr_count: int
lookup_word: str
is_indexed: bool

3 changes: 2 additions & 1 deletion nominatim/clicmd/refresh.py
@@ -110,7 +110,8 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, t

if args.word_counts:
LOG.warning('Recompute word statistics')
self._get_tokenizer(args.config).update_statistics(args.config)
self._get_tokenizer(args.config).update_statistics(args.config,
threads=args.threads or 1)

if args.address_levels:
LOG.warning('Updating address levels')
2 changes: 1 addition & 1 deletion nominatim/clicmd/setup.py
@@ -168,7 +168,7 @@ def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements
tokenizer.finalize_import(args.config)

LOG.warning('Recompute word counts')
tokenizer.update_statistics(args.config)
tokenizer.update_statistics(args.config, threads=num_threads)

webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir)
2 changes: 1 addition & 1 deletion nominatim/tokenizer/base.py
@@ -201,7 +201,7 @@ def check_database(self, config: Configuration) -> Optional[str]:


@abstractmethod
def update_statistics(self, config: Configuration) -> None:
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
101 changes: 84 additions & 17 deletions nominatim/tokenizer/icu_tokenizer.py
@@ -104,30 +104,97 @@ def check_database(self, config: Configuration) -> None:
self.init_from_project(config)


def update_statistics(self, config: Configuration) -> None:
def update_statistics(self, config: Configuration, threads: int = 2) -> None:
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return

with conn.cursor() as cur:
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON word_frequencies(id)')
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.count is null THEN info
ELSE info || jsonb_build_object('count', wf.count)
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id""")
cur.drop_table('word_frequencies')
cur.execute('ANALYSE search_name')
if threads > 1:
cur.execute('SET max_parallel_workers_per_gather TO %s',
(min(threads, 6),))

if conn.server_version_tuple() < (12, 0):
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON word_frequencies(id)')
cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON addressword_frequencies(id)')
cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
INOUT info JSONB)
AS $$
DECLARE rec RECORD;
BEGIN
IF info is null THEN
info = '{}'::jsonb;
END IF;
FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('count', rec.count);
END LOOP;
FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('addr_count', rec.count);
END LOOP;
IF info = '{}'::jsonb THEN
info = null;
END IF;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
""")
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
word_freq_update(word_id, info) as info
FROM word
""")
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
else:
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.execute("""
CREATE TEMP TABLE word_frequencies AS
WITH word_freq AS MATERIALIZED (
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id),
addr_freq AS MATERIALIZED (
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id)
SELECT coalesce(a.id, w.id) as id,
(CASE WHEN w.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('count', w.count) END
||
CASE WHEN a.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('addr_count', a.count) END) as info
FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
""")
cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
cur.execute('ANALYSE word_frequencies')
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.info is null THEN word.info
ELSE coalesce(word.info, '{}'::jsonb) || wf.info
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
""")
cur.drop_table('word_frequencies')

with conn.cursor() as cur:
cur.execute('SET max_parallel_workers_per_gather TO 0')

sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
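For illustration, a sketch (not part of the PR) of what the recomputed info column looks like after update_statistics(): each word row carries the token's frequency in name_vector ('count') and in nameaddress_vector ('addr_count'), and consumers fall back to 1 for missing keys. All values below are invented.

example_word_rows = [
    # (word_id, word_token, info)
    (1021, 'berlin',    {'count': 184000, 'addr_count': 920000}),
    (5533, 'dorfstr',   {'count': 3100,   'addr_count': 45000}),
    (9910, 'im winkel', {'count': 12}),   # never seen in an address
]

for word_id, token, info in example_word_rows:
    # mirrors the fallback in ICUToken.from_db_row() further up
    print(token, info.get('count', 1), info.get('addr_count', 1))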
2 changes: 1 addition & 1 deletion nominatim/tokenizer/legacy_tokenizer.py
@@ -210,7 +210,7 @@ def migrate_database(self, config: Configuration) -> None:
self._save_config(conn, config)


def update_statistics(self, _: Configuration) -> None:
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
3 changes: 2 additions & 1 deletion test/python/api/search/test_api_search_query.py
@@ -18,7 +18,8 @@ def get_category(self):


def mktoken(tid: int):
return MyToken(3.0, tid, 1, 'foo', True)
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
lookup_word='foo', is_indexed=True)


@pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
12 changes: 7 additions & 5 deletions test/python/api/search/test_db_search_builder.py
@@ -31,7 +31,9 @@ def make_query(*args):
for end, ttype, tinfo in tlist:
for tid, word in tinfo:
q.add_token(TokenRange(start, end), ttype,
MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0,
token=tid, count=1, addr_count=1,
lookup_word=word, is_indexed=True))


return q
@@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
q.add_node(BreakType.END, PhraseType.NONE)

q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
MyToken(0.5, 1, name_part, 'name_part', True))
MyToken(0.5, 1, name_part, 1, 'name_part', True))
q.add_token(TokenRange(0, 1), TokenType.WORD,
MyToken(0, 101, name_full, 'name_full', True))
MyToken(0, 101, name_full, 1, 'name_full', True))
for i in range(num_address_parts):
q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
MyToken(0.5, 2, address_part, 'address_part', True))
MyToken(0.5, 2, address_part, 1, 'address_part', True))
q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
MyToken(0, 102, address_full, 'address_full', True))
MyToken(0, 102, address_full, 1, 'address_full', True))

builder = SearchBuilder(q, SearchDetails())

3 changes: 2 additions & 1 deletion test/python/api/search/test_token_assignment.py
@@ -19,7 +19,8 @@ def get_category(self):

def make_query(*args):
q = QueryStruct([Phrase(args[0][1], '')])
dummy = MyToken(3.0, 45, 1, 'foo', True)
dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
lookup_word='foo', is_indexed=True)

for btype, ptype, _ in args[1:]:
q.add_node(btype, ptype)
8 changes: 4 additions & 4 deletions test/python/cli/conftest.py
@@ -32,16 +32,16 @@ def __init__(self, *args, **kwargs):
self.update_statistics_called = False
self.update_word_tokens_called = False

def update_sql_functions(self, *args):
def update_sql_functions(self, *args, **kwargs):
self.update_sql_functions_called = True

def finalize_import(self, *args):
def finalize_import(self, *args, **kwargs):
self.finalize_import_called = True

def update_statistics(self, *args):
def update_statistics(self, *args, **kwargs):
self.update_statistics_called = True

def update_word_tokens(self, *args):
def update_word_tokens(self, *args, **kwargs):
self.update_word_tokens_called = True

