From bb5de9b955e9ff676ea4d3c73cdfa94c60854857 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 13 Mar 2024 15:10:25 +0100
Subject: [PATCH 1/4] extend word statistics to address index

Word frequency in names is not sufficient to estimate word frequency
in the address, because names of towns, states, etc. appear much more
frequently in addresses than, say, street names.
---
 nominatim/api/search/query.py        |   1 +
 nominatim/clicmd/refresh.py          |   3 +-
 nominatim/clicmd/setup.py            |   2 +-
 nominatim/tokenizer/icu_tokenizer.py | 101 ++++++++++++++++++++++-----
 4 files changed, 88 insertions(+), 19 deletions(-)

diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py
index 333722fe4..bd91c2ece 100644
--- a/nominatim/api/search/query.py
+++ b/nominatim/api/search/query.py
@@ -102,6 +102,7 @@ class Token(ABC):
     lookup_word: str
     is_indexed: bool
 
+    addr_count: int = 1
 
     @abstractmethod
     def get_category(self) -> Tuple[str, str]:
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index afafe4a83..343fe48d2 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -110,7 +110,8 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, t
 
         if args.word_counts:
             LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics(args.config)
+            self._get_tokenizer(args.config).update_statistics(args.config,
+                                                               threads=args.threads or 1)
 
         if args.address_levels:
             LOG.warning('Updating address levels')
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index 2fd8b141a..ccd6bd788 100644
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -168,7 +168,7 @@ def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements
 
             tokenizer.finalize_import(args.config)
 
         LOG.warning('Recompute word counts')
-        tokenizer.update_statistics(args.config)
+        tokenizer.update_statistics(args.config, threads=num_threads)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index c1821d7ed..251f4da5d 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -104,7 +104,7 @@ def check_database(self, config: Configuration) -> None:
             self.init_from_project(config)
 
 
-    def update_statistics(self, config: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
         """ Recompute frequencies for all name words.
""" with connect(self.dsn) as conn: @@ -112,22 +112,89 @@ def update_statistics(self, config: Configuration) -> None: return with conn.cursor() as cur: - LOG.info('Computing word frequencies') - cur.drop_table('word_frequencies') - cur.execute("""CREATE TEMP TABLE word_frequencies AS - SELECT unnest(name_vector) as id, count(*) - FROM search_name GROUP BY id""") - cur.execute('CREATE INDEX ON word_frequencies(id)') - LOG.info('Update word table with recomputed frequencies') - cur.drop_table('tmp_word') - cur.execute("""CREATE TABLE tmp_word AS - SELECT word_id, word_token, type, word, - (CASE WHEN wf.count is null THEN info - ELSE info || jsonb_build_object('count', wf.count) - END) as info - FROM word LEFT JOIN word_frequencies wf - ON word.word_id = wf.id""") - cur.drop_table('word_frequencies') + cur.execute('ANALYSE search_name') + if threads > 1: + cur.execute('SET max_parallel_workers_per_gather TO %s', + (min(threads, 6),)) + + if conn.server_version_tuple() < (12, 0): + LOG.info('Computing word frequencies') + cur.drop_table('word_frequencies') + cur.drop_table('addressword_frequencies') + cur.execute("""CREATE TEMP TABLE word_frequencies AS + SELECT unnest(name_vector) as id, count(*) + FROM search_name GROUP BY id""") + cur.execute('CREATE INDEX ON word_frequencies(id)') + cur.execute("""CREATE TEMP TABLE addressword_frequencies AS + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id""") + cur.execute('CREATE INDEX ON addressword_frequencies(id)') + cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER, + INOUT info JSONB) + AS $$ + DECLARE rec RECORD; + BEGIN + IF info is null THEN + info = '{}'::jsonb; + END IF; + FOR rec IN SELECT count FROM word_frequencies WHERE id = wid + LOOP + info = info || jsonb_build_object('count', rec.count); + END LOOP; + FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid + LOOP + info = info || jsonb_build_object('addr_count', rec.count); + END LOOP; + IF info = '{}'::jsonb THEN + info = null; + END IF; + END; + $$ LANGUAGE plpgsql IMMUTABLE; + """) + LOG.info('Update word table with recomputed frequencies') + cur.drop_table('tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + word_freq_update(word_id, info) as info + FROM word + """) + cur.drop_table('word_frequencies') + cur.drop_table('addressword_frequencies') + else: + LOG.info('Computing word frequencies') + cur.drop_table('word_frequencies') + cur.execute(""" + CREATE TEMP TABLE word_frequencies AS + WITH word_freq AS MATERIALIZED ( + SELECT unnest(name_vector) as id, count(*) + FROM search_name GROUP BY id), + addr_freq AS MATERIALIZED ( + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id) + SELECT coalesce(a.id, w.id) as id, + (CASE WHEN w.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('count', w.count) END + || + CASE WHEN a.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('addr_count', a.count) END) as info + FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; + """) + cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') + cur.execute('ANALYSE word_frequencies') + LOG.info('Update word table with recomputed frequencies') + cur.drop_table('tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + (CASE WHEN wf.info is null THEN word.info + ELSE coalesce(word.info, '{}'::jsonb) || wf.info + END) as info + FROM word LEFT JOIN word_frequencies wf + ON word.word_id = wf.id + 
""") + cur.drop_table('word_frequencies') + + with conn.cursor() as cur: + cur.execute('SET max_parallel_workers_per_gather TO 0') sqlp = SQLPreprocessor(conn, config) sqlp.run_string(conn, From 07b7fd1dbbb1c2aab6f678a29ba0b5711ebff53e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 15 Mar 2024 10:54:13 +0100 Subject: [PATCH 2/4] add address counts to tokens --- nominatim/api/search/icu_tokenizer.py | 6 ++++-- nominatim/api/search/legacy_tokenizer.py | 3 ++- nominatim/api/search/query.py | 2 +- nominatim/tokenizer/base.py | 2 +- nominatim/tokenizer/legacy_tokenizer.py | 2 +- test/python/api/search/test_api_search_query.py | 3 ++- test/python/api/search/test_db_search_builder.py | 12 +++++++----- test/python/api/search/test_token_assignment.py | 3 ++- test/python/cli/conftest.py | 8 ++++---- test/python/tokenizer/test_icu.py | 12 ++++++++---- 10 files changed, 32 insertions(+), 21 deletions(-) diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index 1c2565d1a..05ec7690c 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -97,6 +97,7 @@ def from_db_row(row: SaRow) -> 'ICUToken': """ Create a ICUToken from the row of the word table. """ count = 1 if row.info is None else row.info.get('count', 1) + addr_count = 1 if row.info is None else row.info.get('addr_count', 1) penalty = 0.0 if row.type == 'w': @@ -123,7 +124,8 @@ def from_db_row(row: SaRow) -> 'ICUToken': return ICUToken(penalty=penalty, token=row.word_id, count=count, lookup_word=lookup_word, is_indexed=True, - word_token=row.word_token, info=row.info) + word_token=row.word_token, info=row.info, + addr_count=addr_count) @@ -257,7 +259,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: if len(part.token) <= 4 and part[0].isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, - ICUToken(0.5, 0, 1, part.token, True, part.token, None)) + ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None)) def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: diff --git a/nominatim/api/search/legacy_tokenizer.py b/nominatim/api/search/legacy_tokenizer.py index 86d42a543..bd17706e5 100644 --- a/nominatim/api/search/legacy_tokenizer.py +++ b/nominatim/api/search/legacy_tokenizer.py @@ -210,6 +210,7 @@ def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]: return LegacyToken(penalty=penalty, token=row.word_id, count=row.search_name_count or 1, + addr_count=1, # not supported lookup_word=lookup_word, word_token=row.word_token.strip(), category=(rowclass, row.type) if rowclass is not None else None, @@ -226,7 +227,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None: if len(part) <= 4 and part.isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, - LegacyToken(penalty=0.5, token=0, count=1, + LegacyToken(penalty=0.5, token=0, count=1, addr_count=1, lookup_word=part, word_token=part, category=None, country=None, operator=None, is_indexed=True)) diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py index bd91c2ece..a0d7add1b 100644 --- a/nominatim/api/search/query.py +++ b/nominatim/api/search/query.py @@ -99,10 +99,10 @@ class Token(ABC): penalty: float token: int count: int + addr_count: int lookup_word: str is_indexed: bool - addr_count: int = 1 @abstractmethod def 
get_category(self) -> Tuple[str, str]: diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index 29bcc8e19..12c826eb2 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -201,7 +201,7 @@ def check_database(self, config: Configuration) -> Optional[str]: @abstractmethod - def update_statistics(self, config: Configuration) -> None: + def update_statistics(self, config: Configuration, threads: int = 1) -> None: """ Recompute any tokenizer statistics necessary for efficient lookup. This function is meant to be called from time to time by the user to improve performance. However, the tokenizer must not depend on diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index f3a00839a..93808cc39 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -210,7 +210,7 @@ def migrate_database(self, config: Configuration) -> None: self._save_config(conn, config) - def update_statistics(self, _: Configuration) -> None: + def update_statistics(self, config: Configuration, threads: int = 1) -> None: """ Recompute the frequency of full words. """ with connect(self.dsn) as conn: diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py index fe850ce90..bfdceb416 100644 --- a/test/python/api/search/test_api_search_query.py +++ b/test/python/api/search/test_api_search_query.py @@ -18,7 +18,8 @@ def get_category(self): def mktoken(tid: int): - return MyToken(3.0, tid, 1, 'foo', True) + return MyToken(penalty=3.0, token=tid, count=1, addr_count=1, + lookup_word='foo', is_indexed=True) @pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'), diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py index d3aea9000..68f71298c 100644 --- a/test/python/api/search/test_db_search_builder.py +++ b/test/python/api/search/test_db_search_builder.py @@ -31,7 +31,9 @@ def make_query(*args): for end, ttype, tinfo in tlist: for tid, word in tinfo: q.add_token(TokenRange(start, end), ttype, - MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True)) + MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0, + token=tid, count=1, addr_count=1, + lookup_word=word, is_indexed=True)) return q @@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full, q.add_node(BreakType.END, PhraseType.NONE) q.add_token(TokenRange(0, 1), TokenType.PARTIAL, - MyToken(0.5, 1, name_part, 'name_part', True)) + MyToken(0.5, 1, name_part, 1, 'name_part', True)) q.add_token(TokenRange(0, 1), TokenType.WORD, - MyToken(0, 101, name_full, 'name_full', True)) + MyToken(0, 101, name_full, 1, 'name_full', True)) for i in range(num_address_parts): q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL, - MyToken(0.5, 2, address_part, 'address_part', True)) + MyToken(0.5, 2, address_part, 1, 'address_part', True)) q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD, - MyToken(0, 102, address_full, 'address_full', True)) + MyToken(0, 102, address_full, 1, 'address_full', True)) builder = SearchBuilder(q, SearchDetails()) diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py index 54e8af14c..cde8495d0 100644 --- a/test/python/api/search/test_token_assignment.py +++ b/test/python/api/search/test_token_assignment.py @@ -19,7 +19,8 @@ def get_category(self): def make_query(*args): q = QueryStruct([Phrase(args[0][1], 
'')]) - dummy = MyToken(3.0, 45, 1, 'foo', True) + dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1, + lookup_word='foo', is_indexed=True) for btype, ptype, _ in args[1:]: q.add_node(btype, ptype) diff --git a/test/python/cli/conftest.py b/test/python/cli/conftest.py index 1bb393fb2..28aba597e 100644 --- a/test/python/cli/conftest.py +++ b/test/python/cli/conftest.py @@ -32,16 +32,16 @@ def __init__(self, *args, **kwargs): self.update_statistics_called = False self.update_word_tokens_called = False - def update_sql_functions(self, *args): + def update_sql_functions(self, *args, **kwargs): self.update_sql_functions_called = True - def finalize_import(self, *args): + def finalize_import(self, *args, **kwargs): self.finalize_import_called = True - def update_statistics(self, *args): + def update_statistics(self, *args, **kwargs): self.update_statistics_called = True - def update_word_tokens(self, *args): + def update_word_tokens(self, *args, **kwargs): self.update_word_tokens_called = True diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index aa1afe160..9f6eae62e 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_conf def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config): word_table.add_full_word(1000, 'hello') + word_table.add_full_word(1001, 'bye') table_factory('search_name', - 'place_id BIGINT, name_vector INT[]', - [(12, [1000])]) + 'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]', + [(12, [1000], [1001])]) tok = tokenizer_factory() tok.update_statistics(test_config) assert temp_db_cursor.scalar("""SELECT count(*) FROM word - WHERE type = 'W' and - (info->>'count')::int > 0""") > 0 + WHERE type = 'W' and word_id = 1000 and + (info->>'count')::int > 0""") == 1 + assert temp_db_cursor.scalar("""SELECT count(*) FROM word + WHERE type = 'W' and word_id = 1001 and + (info->>'addr_count')::int > 0""") == 1 def test_normalize_postcode(analyzer): From ff3230a7f3fcb314c22ab977fa014465ff274ec8 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 16 Mar 2024 11:26:00 +0100 Subject: [PATCH 3/4] add penalty for single words that look like stop words --- nominatim/api/search/db_search_builder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py index ef7a66b85..f8eabad14 100644 --- a/nominatim/api/search/db_search_builder.py +++ b/nominatim/api/search/db_search_builder.py @@ -226,6 +226,8 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\ name_fulls = self.query.get_tokens(name, TokenType.WORD) if name_fulls: fulls_count = sum(t.count for t in name_fulls) + if len(name_partials) == 1: + penalty += min(1, max(0, (exp_count - 50 * fulls_count) / (1000 * fulls_count))) # At this point drop unindexed partials from the address. # This might yield wrong results, nothing we can do about that. 
         if not partials_indexed:

From ace84ed0e370c5f1530bf02983990c0ab45b220b Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sat, 16 Mar 2024 16:56:04 +0100
Subject: [PATCH 4/4] use address counts for improving index lookup

---
 nominatim/api/search/db_search_builder.py | 73 ++++++++++++++++++-----
 nominatim/api/search/db_search_fields.py  | 11 ++--
 2 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py
index f8eabad14..97e7ac028 100644
--- a/nominatim/api/search/db_search_builder.py
+++ b/nominatim/api/search/db_search_builder.py
@@ -227,28 +227,73 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
         if name_fulls:
             fulls_count = sum(t.count for t in name_fulls)
             if len(name_partials) == 1:
-                penalty += min(1, max(0, (exp_count - 50 * fulls_count) / (1000 * fulls_count)))
-            # At this point drop unindexed partials from the address.
-            # This might yield wrong results, nothing we can do about that.
-            if not partials_indexed:
-                addr_tokens = [t.token for t in addr_partials if t.is_indexed]
+                penalty += min(0.5, max(0, (exp_count - 50 * fulls_count) / (2000 * fulls_count)))
+            if partials_indexed:
                 penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
-            # Any of the full names applies with all of the partials from the address
-            yield penalty, fulls_count / (2**len(addr_tokens)),\
-                  dbf.lookup_by_any_name([t.token for t in name_fulls],
-                                         addr_tokens,
-                                         fulls_count > 30000 / max(1, len(addr_tokens)))
+
+            yield penalty, fulls_count / (2**len(addr_tokens)), \
+                self.get_full_name_ranking(name_fulls, addr_partials,
+                                           fulls_count > 30000 / max(1, len(addr_tokens)))
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
         exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
         if exp_count < 10000 and all(t.is_indexed for t in name_partials.values()):
-            lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
-            if addr_tokens:
-                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
             penalty += 0.35 * max(1 if name_fulls else 0.1,
                                   5 - len(name_partials) - len(addr_tokens))
-            yield penalty, exp_count, lookup
+            yield penalty, exp_count,\
+                  self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
+
+
+    def get_name_address_ranking(self, name_tokens: List[int],
+                                 addr_partials: List[Token]) -> List[dbf.FieldLookup]:
+        """ Create a ranking expression looking up by name and address.
+        """
+        lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
+
+        addr_restrict_tokens = []
+        addr_lookup_tokens = []
+        for t in addr_partials:
+            if t.is_indexed:
+                if t.addr_count > 20000:
+                    addr_restrict_tokens.append(t.token)
+                else:
+                    addr_lookup_tokens.append(t.token)
+
+        if addr_restrict_tokens:
+            lookup.append(dbf.FieldLookup('nameaddress_vector',
+                                          addr_restrict_tokens, lookups.Restrict))
+        if addr_lookup_tokens:
+            lookup.append(dbf.FieldLookup('nameaddress_vector',
+                                          addr_lookup_tokens, lookups.LookupAll))
+
+        return lookup
+
+
+    def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
+                              use_lookup: bool) -> List[dbf.FieldLookup]:
+        """ Create a ranking expression with full name terms and
+            additional address lookup. When 'use_lookup' is true,
+            address lookups will use the index as long as the
+            occurrences are not too numerous.
+ """ + # At this point drop unindexed partials from the address. + # This might yield wrong results, nothing we can do about that. + if use_lookup: + addr_restrict_tokens = [] + addr_lookup_tokens = [] + for t in addr_partials: + if t.is_indexed: + if t.addr_count > 20000: + addr_restrict_tokens.append(t.token) + else: + addr_lookup_tokens.append(t.token) + else: + addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed] + addr_lookup_tokens = [] + + return dbf.lookup_by_any_name([t.token for t in name_fulls], + addr_restrict_tokens, addr_lookup_tokens) def get_name_ranking(self, trange: TokenRange, diff --git a/nominatim/api/search/db_search_fields.py b/nominatim/api/search/db_search_fields.py index cd5717753..7f775277e 100644 --- a/nominatim/api/search/db_search_fields.py +++ b/nominatim/api/search/db_search_fields.py @@ -231,16 +231,17 @@ def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[Fiel return lookup -def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int], - use_index_for_addr: bool) -> List[FieldLookup]: +def lookup_by_any_name(name_tokens: List[int], addr_restrict_tokens: List[int], + addr_lookup_tokens: List[int]) -> List[FieldLookup]: """ Create a lookup list where name tokens are looked up via index and only one of the name tokens must be present. Potential address tokens are used to restrict the search further. """ lookup = [FieldLookup('name_vector', name_tokens, lookups.LookupAny)] - if addr_tokens: - lookup.append(FieldLookup('nameaddress_vector', addr_tokens, - lookups.LookupAll if use_index_for_addr else lookups.Restrict)) + if addr_restrict_tokens: + lookup.append(FieldLookup('nameaddress_vector', addr_restrict_tokens, lookups.Restrict)) + if addr_lookup_tokens: + lookup.append(FieldLookup('nameaddress_vector', addr_lookup_tokens, lookups.LookupAll)) return lookup
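
Note: the single-partial-word penalty introduced in patch 3 and retuned in
patch 4 is plain arithmetic and can be sanity-checked in isolation. The
sketch below is not part of the series; the helper name is invented here,
but the expression matches the one in yield_lookups with the patch 4
constants (cap 0.5, divisor 2000). exp_count is the expected number of rows
matching the single partial word, fulls_count the number of occurrences of
the full name:

    def single_partial_penalty(exp_count: float, fulls_count: int) -> float:
        # Penalize names consisting of a single partial word once the
        # partial is expected to match more than 50 rows per full-name
        # occurrence; the penalty is capped at 0.5.
        return min(0.5, max(0, (exp_count - 50 * fulls_count)
                            / (2000 * fulls_count)))

    # A stop-word-like partial vs. a distinctive one (100 full-name matches):
    print(single_partial_penalty(500000, 100))  # 0.5 (capped)
    print(single_partial_penalty(2000, 100))    # 0   (below the threshold)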
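Note: patches 1 and 4 work as a pair. The word table now carries an
'addr_count' per token, and get_name_address_ranking/get_full_name_ranking
use it to decide, per address token, whether scanning the inverted index on
nameaddress_vector is worthwhile or whether the token should only restrict
rows already found through the name index. A minimal standalone model of
that decision, assuming the 20000 cutoff from the patch (the Token class
below is a simplified stand-in for nominatim.api.search.query.Token, and
split_address_tokens is a name invented for this sketch):

    from dataclasses import dataclass
    from typing import List, Tuple

    @dataclass
    class Token:
        token: int        # word id from the word table
        addr_count: int   # occurrences in nameaddress_vector (patch 1)
        is_indexed: bool  # whether the tokenizer knows this partial

    def split_address_tokens(addr_partials: List[Token],
                             cutoff: int = 20000) -> Tuple[List[int], List[int]]:
        # Frequent tokens only restrict rows already found via the name
        # index; rare tokens are worth a lookup in the inverted index.
        restrict: List[int] = []
        lookup: List[int] = []
        for t in addr_partials:
            if t.is_indexed:
                if t.addr_count > cutoff:
                    restrict.append(t.token)
                else:
                    lookup.append(t.token)
        return restrict, lookup

    # A city name appears in very many addresses, a rare street name in
    # few; unindexed tokens are dropped entirely:
    restrict, lookup = split_address_tokens([Token(101, 250000, True),
                                             Token(102, 340, True),
                                             Token(103, 12, False)])
    assert restrict == [101] and lookup == [102]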