Merge pull request #3273 from lonvia/search-with-sqlite
Add forward search capability for SQLite databases
lonvia authored Dec 12, 2023
2 parents 8c7140d + ab45db5 commit af85ad3
Showing 38 changed files with 1,160 additions and 483 deletions.
55 changes: 55 additions & 0 deletions docs/customize/SQLite.md
@@ -0,0 +1,55 @@
A Nominatim database can be converted into an SQLite database and used as
a read-only source for geocoding queries. This section describes how to
create and use such a database.

!!! danger
    This feature is in an experimental state at the moment. Use at your own
    risk.

## Installing prerequisites

To use an SQLite database, you need to install:

* SQLite (>= 3.30)
* Spatialite (> 5.0.0)

On Ubuntu/Debian, you can run:

    sudo apt install sqlite3 libsqlite3-mod-spatialite libspatialite7

## Creating a new SQLite database

Nominatim cannot import directly into an SQLite database. Instead, you must
first create a geocoding database in PostgreSQL by running a
[regular Nominatim import](../admin/Import.md).

Once this is done, the database can be converted to SQLite with

    nominatim convert -o mydb.sqlite

This will create a database where all geocoding functions are available.
Depending on what functions you need, the database can be made smaller:

* `--without-reverse` omits indexes only needed for reverse geocoding
* `--without-search` omits tables and indexes used for forward search
* `--without-details` leaves out extra information only available in the
  details API
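
For example, a database intended purely for forward search could be created
by combining the options listed above:

    nominatim convert -o mydb.sqlite --without-reverse --without-details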

## Using an SQLite database

Once you have created the database, you can use it by simply pointing the
database DSN to the SQLite file:

    NOMINATIM_DATABASE_DSN=sqlite:dbname=mydb.sqlite

Please note that SQLite support is only available for the Python frontend. To
use the test server with an SQLite database, you therefore need to switch
the frontend engine:

    nominatim serve --engine falcon

You need to install falcon or starlette for this, depending on which engine
you choose.

The CLI query commands and the library interface already use the new Python
frontend and therefore work right out of the box.
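
As a quick illustration, a minimal library session against the converted
database might look like the sketch below. It assumes the `NominatimAPIAsync`
class described in the Library Guide; exact signatures may vary between
versions:

    import asyncio
    from pathlib import Path

    import nominatim.api as napi

    async def search_example() -> None:
        # Point the frontend at the SQLite file; setting
        # NOMINATIM_DATABASE_DSN in the environment works just as well.
        api = napi.NominatimAPIAsync(Path('.'),
                                     environ={'NOMINATIM_DATABASE_DSN':
                                              'sqlite:dbname=mydb.sqlite'})
        try:
            results = await api.search('Brandenburg Gate, Berlin')
            for place in results:
                print(place.place_id, place.names)
        finally:
            await api.close()

    asyncio.run(search_example())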
1 change: 1 addition & 0 deletions docs/mkdocs.yml
@@ -40,6 +40,7 @@ nav:
       - 'Special Phrases': 'customize/Special-Phrases.md'
       - 'External data: US housenumbers from TIGER': 'customize/Tiger.md'
       - 'External data: Postcodes': 'customize/Postcodes.md'
+      - 'Conversion to SQLite': 'customize/SQLite.md'
   - 'Library Guide':
       - 'Getting Started': 'library/Getting-Started.md'
       - 'Nominatim API class': 'library/NominatimAPI.md'
14 changes: 6 additions & 8 deletions lib-sql/functions/ranking.sql
@@ -287,21 +287,19 @@ LANGUAGE plpgsql IMMUTABLE;
 
 
 CREATE OR REPLACE FUNCTION weigh_search(search_vector INT[],
-                                        term_vectors TEXT[],
-                                        weight_vectors FLOAT[],
+                                        rankings TEXT,
                                         def_weight FLOAT)
   RETURNS FLOAT
   AS $$
 DECLARE
-  pos INT := 1;
-  terms TEXT;
+  rank JSON;
 BEGIN
-  FOREACH terms IN ARRAY term_vectors
+  FOR rank IN
+    SELECT * FROM json_array_elements(rankings::JSON)
   LOOP
-    IF search_vector @> terms::INTEGER[] THEN
-      RETURN weight_vectors[pos];
+    IF true = ALL(SELECT x::int = ANY(search_vector) FROM json_array_elements_text(rank->1) as x) THEN
+      RETURN (rank->>0)::float;
     END IF;
-    pos := pos + 1;
   END LOOP;
   RETURN def_weight;
 END;
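To make the new ranking format concrete: `rankings` is now a JSON array of
`[weight, [token, ...]]` pairs, and the first entry whose tokens are all
contained in the search vector wins. A rough Python equivalent of the
rewritten function, for illustration only:

    import json
    from typing import Sequence

    def weigh_search(search_vector: Sequence[int], rankings: str,
                     def_weight: float) -> float:
        # Each ranking entry is a pair [weight, [token, token, ...]].
        for weight, tokens in json.loads(rankings):
            # An entry applies when all its tokens occur in the search vector.
            if all(int(t) in search_vector for t in tokens):
                return float(weight)
        return def_weight

    assert weigh_search([1, 2, 3], '[[0.5, [2, 3]], [0.9, [4]]]', 1.0) == 0.5
    assert weigh_search([1], '[[0.5, [2]]]', 1.0) == 1.0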
50 changes: 32 additions & 18 deletions nominatim/api/core.py
@@ -19,6 +19,7 @@
 from nominatim.errors import UsageError
 from nominatim.db.sqlalchemy_schema import SearchTables
 from nominatim.db.async_core_library import PGCORE_LIB, PGCORE_ERROR
+import nominatim.db.sqlite_functions
 from nominatim.config import Configuration
 from nominatim.api.connection import SearchConnection
 from nominatim.api.status import get_status, StatusResult
@@ -84,6 +85,14 @@ async def setup_database(self) -> None:
         extra_args: Dict[str, Any] = {'future': True,
                                       'echo': self.config.get_bool('DEBUG_SQL')}
 
+        if self.config.get_int('API_POOL_SIZE') == 0:
+            extra_args['poolclass'] = sa.pool.NullPool
+        else:
+            extra_args['poolclass'] = sa.pool.QueuePool
+            extra_args['max_overflow'] = 0
+            extra_args['pool_size'] = self.config.get_int('API_POOL_SIZE')
+
+
         is_sqlite = self.config.DATABASE_DSN.startswith('sqlite:')
 
         if is_sqlite:
@@ -92,6 +101,10 @@ async def setup_database(self) -> None:
             dburl = sa.engine.URL.create('sqlite+aiosqlite',
                                          database=params.get('dbname'))
 
+            if not ('NOMINATIM_DATABASE_RW' in self.config.environ
+                    and self.config.get_bool('DATABASE_RW')) \
+               and not Path(params.get('dbname', '')).is_file():
+                raise UsageError(f"SQlite database '{params.get('dbname')}' does not exist.")
         else:
             dsn = self.config.get_database_params()
             query = {k: v for k, v in dsn.items()
@@ -105,39 +118,40 @@ async def setup_database(self) -> None:
                        host=dsn.get('host'),
                        port=int(dsn['port']) if 'port' in dsn else None,
                        query=query)
-        extra_args['max_overflow'] = 0
-        extra_args['pool_size'] = self.config.get_int('API_POOL_SIZE')
 
         engine = sa_asyncio.create_async_engine(dburl, **extra_args)
 
-        try:
-            async with engine.begin() as conn:
-                result = await conn.scalar(sa.text('SHOW server_version_num'))
-                server_version = int(result)
-        except (PGCORE_ERROR, sa.exc.OperationalError):
+        if is_sqlite:
             server_version = 0
 
-        if server_version >= 110000 and not is_sqlite:
-            @sa.event.listens_for(engine.sync_engine, "connect")
-            def _on_connect(dbapi_con: Any, _: Any) -> None:
-                cursor = dbapi_con.cursor()
-                cursor.execute("SET jit_above_cost TO '-1'")
-                cursor.execute("SET max_parallel_workers_per_gather TO '0'")
-            # Make sure that all connections get the new settings
-            await self.close()
-
-        if is_sqlite:
             @sa.event.listens_for(engine.sync_engine, "connect")
             def _on_sqlite_connect(dbapi_con: Any, _: Any) -> None:
                 dbapi_con.run_async(lambda conn: conn.enable_load_extension(True))
+                nominatim.db.sqlite_functions.install_custom_functions(dbapi_con)
                 cursor = dbapi_con.cursor()
                 cursor.execute("SELECT load_extension('mod_spatialite')")
+                cursor.execute('SELECT SetDecimalPrecision(7)')
+                dbapi_con.run_async(lambda conn: conn.enable_load_extension(False))
+        else:
+            try:
+                async with engine.begin() as conn:
+                    result = await conn.scalar(sa.text('SHOW server_version_num'))
+                    server_version = int(result)
+            except (PGCORE_ERROR, sa.exc.OperationalError):
+                server_version = 0
+
+            if server_version >= 110000:
+                @sa.event.listens_for(engine.sync_engine, "connect")
+                def _on_connect(dbapi_con: Any, _: Any) -> None:
+                    cursor = dbapi_con.cursor()
+                    cursor.execute("SET jit_above_cost TO '-1'")
+                    cursor.execute("SET max_parallel_workers_per_gather TO '0'")
+                # Make sure that all connections get the new settings
+                await engine.dispose()
 
         self._property_cache['DB:server_version'] = server_version
 
-        self._tables = SearchTables(sa.MetaData(), engine.name) # pylint: disable=no-member
+        self._tables = SearchTables(sa.MetaData()) # pylint: disable=no-member
         self._engine = engine


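For readers unfamiliar with SpatiaLite, the `_on_sqlite_connect` listener
above boils down to the following plain-`sqlite3` steps (a standalone sketch,
not Nominatim code; it assumes `mod_spatialite` is installed as described in
docs/customize/SQLite.md and that Python was built with extension loading
enabled):

    import sqlite3

    conn = sqlite3.connect('mydb.sqlite')
    # Extension loading is only switched on for the duration of the setup.
    conn.enable_load_extension(True)
    conn.execute("SELECT load_extension('mod_spatialite')")
    conn.execute('SELECT SetDecimalPrecision(7)')  # trim coordinate precision
    conn.enable_load_extension(False)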
44 changes: 30 additions & 14 deletions nominatim/api/logging.py
@@ -90,26 +90,42 @@ def format_sql(self, conn: AsyncConnection, statement: 'sa.Executable',
         params = dict(compiled.params)
         if isinstance(extra_params, Mapping):
             for k, v in extra_params.items():
-                params[k] = str(v)
+                if hasattr(v, 'to_wkt'):
+                    params[k] = v.to_wkt()
+                elif isinstance(v, (int, float)):
+                    params[k] = v
+                else:
+                    params[k] = str(v)
         elif isinstance(extra_params, Sequence) and extra_params:
             for k in extra_params[0]:
                 params[k] = f':{k}'
 
         sqlstr = str(compiled)
 
-        if sa.__version__.startswith('1'):
-            try:
-                sqlstr = re.sub(r'__\[POSTCOMPILE_[^]]*\]', '%s', sqlstr)
-                return sqlstr % tuple((repr(params.get(name, None))
-                                       for name in compiled.positiontup)) # type: ignore
-            except TypeError:
-                return sqlstr
-
-        # Fixes an odd issue with Python 3.7 where percentages are not
-        # quoted correctly.
-        sqlstr = re.sub(r'%(?!\()', '%%', sqlstr)
-        sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', r'%(\1)s', sqlstr)
-        return sqlstr % params
+        if conn.dialect.name == 'postgresql':
+            if sa.__version__.startswith('1'):
+                try:
+                    sqlstr = re.sub(r'__\[POSTCOMPILE_[^]]*\]', '%s', sqlstr)
+                    return sqlstr % tuple((repr(params.get(name, None))
+                                           for name in compiled.positiontup)) # type: ignore
+                except TypeError:
+                    return sqlstr
+
+            # Fixes an odd issue with Python 3.7 where percentages are not
+            # quoted correctly.
+            sqlstr = re.sub(r'%(?!\()', '%%', sqlstr)
+            sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', r'%(\1)s', sqlstr)
+            return sqlstr % params
+
+        assert conn.dialect.name == 'sqlite'
+
+        # params in positional order
+        pparams = (repr(params.get(name, None)) for name in compiled.positiontup) # type: ignore
+
+        sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', '?', sqlstr)
+        sqlstr = re.sub(r"\?", lambda m: next(pparams), sqlstr)
+
+        return sqlstr
 
 class HTMLLogger(BaseLogger):
     """ Logger that formats messages in HTML.
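The SQLite branch inlines the positional parameters one by one instead of
using `%`-style formatting. A tiny standalone illustration of the same
substitution trick, with hypothetical values rather than Nominatim code:

    import re

    sqlstr = "SELECT * FROM placex WHERE place_id = ? AND rank_address < ?"
    pparams = iter([repr(4711), repr(30)])  # params in positional order
    print(re.sub(r"\?", lambda m: next(pparams), sqlstr))
    # SELECT * FROM placex WHERE place_id = 4711 AND rank_address < 30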
8 changes: 4 additions & 4 deletions nominatim/api/reverse.py
@@ -180,7 +180,7 @@ async def _find_closest_street_or_poi(self, distance: float) -> Optional[SaRow]:
         diststr = sa.text(f"{distance}")
 
         sql: SaLambdaSelect = sa.lambda_stmt(lambda: _select_from_placex(t)
-                .where(t.c.geometry.ST_DWithin(WKT_PARAM, diststr))
+                .where(t.c.geometry.within_distance(WKT_PARAM, diststr))
                 .where(t.c.indexed_status == 0)
                 .where(t.c.linked_place_id == None)
                 .where(sa.or_(sa.not_(t.c.geometry.is_area()),
@@ -219,7 +219,7 @@ async def _find_housenumber_for_street(self, parent_place_id: int) -> Optional[S
         t = self.conn.t.placex
 
         sql: SaLambdaSelect = sa.lambda_stmt(lambda: _select_from_placex(t)
-                .where(t.c.geometry.ST_DWithin(WKT_PARAM, 0.001))
+                .where(t.c.geometry.within_distance(WKT_PARAM, 0.001))
                 .where(t.c.parent_place_id == parent_place_id)
                 .where(sa.func.IsAddressPoint(t))
                 .where(t.c.indexed_status == 0)
@@ -241,7 +241,7 @@ async def _find_interpolation_for_street(self, parent_place_id: Optional[int],
                 sa.select(t,
                           t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
                           _locate_interpolation(t))
-                .where(t.c.linegeo.ST_DWithin(WKT_PARAM, distance))
+                .where(t.c.linegeo.within_distance(WKT_PARAM, distance))
                 .where(t.c.startnumber != None)
                 .order_by('distance')
                 .limit(1))
@@ -275,7 +275,7 @@ def _base_query() -> SaSelect:
             inner = sa.select(t,
                               t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
                               _locate_interpolation(t))\
-                .where(t.c.linegeo.ST_DWithin(WKT_PARAM, 0.001))\
+                .where(t.c.linegeo.within_distance(WKT_PARAM, 0.001))\
                 .where(t.c.parent_place_id == parent_place_id)\
                 .order_by('distance')\
                 .limit(1)\
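The switch from `ST_DWithin` to `within_distance` trades the PostGIS-specific
function for a dialect-aware one that can also be compiled for SpatiaLite.
Nominatim wires this up as a method on its geometry columns; the sketch below
only illustrates the per-dialect compilation idea with SQLAlchemy's
`@compiles` hook and is not the project's actual implementation:

    from sqlalchemy.ext.compiler import compiles
    from sqlalchemy.sql import functions

    class within_distance(functions.GenericFunction):
        """ Dialect-neutral 'geometry A within distance d of B' check. """
        name = 'within_distance'
        inherit_cache = True

    @compiles(within_distance)
    def _default_within_distance(element, compiler, **kw):
        # Default (PostgreSQL/PostGIS) spelling.
        return 'ST_DWithin(%s)' % compiler.process(element.clauses, **kw)

    @compiles(within_distance, 'sqlite')
    def _sqlite_within_distance(element, compiler, **kw):
        # SpatiaLite spelling; PtDistWithin takes the same three arguments.
        return 'PtDistWithin(%s)' % compiler.process(element.clauses, **kw)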
19 changes: 10 additions & 9 deletions nominatim/api/search/db_search_builder.py
@@ -15,6 +15,7 @@
 from nominatim.api.search.token_assignment import TokenAssignment
 import nominatim.api.search.db_search_fields as dbf
 import nominatim.api.search.db_searches as dbs
+import nominatim.api.search.db_search_lookups as lookups
 
 
 def wrap_near_search(categories: List[Tuple[str, str]],
@@ -152,7 +153,7 @@ def build_special_search(self, sdata: dbf.SearchData,
             sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
                                              [t.token for r in address
                                               for t in self.query.get_partials_list(r)],
-                                             'restrict')]
+                                             lookups.Restrict)]
             penalty += 0.2
         yield dbs.PostcodeSearch(penalty, sdata)

@@ -162,24 +163,24 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
         """ Build a simple address search for special entries where the
             housenumber is the main name token.
         """
-        sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], 'lookup_any')]
+        sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
         expected_count = sum(t.count for t in hnrs)
 
         partials = [t for trange in address
                     for t in self.query.get_partials_list(trange)]
 
         if expected_count < 8000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 [t.token for t in partials], 'restrict'))
+                                                 [t.token for t in partials], lookups.Restrict))
         elif len(partials) != 1 or partials[0].count < 10000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 [t.token for t in partials], 'lookup_all'))
+                                                 [t.token for t in partials], lookups.LookupAll))
         else:
             sdata.lookups.append(
                 dbf.FieldLookup('nameaddress_vector',
                                 [t.token for t
                                  in self.query.get_tokens(address[0], TokenType.WORD)],
-                                'lookup_any'))
+                                lookups.LookupAny))
 
         sdata.housenumbers = dbf.WeightedStrings([], [])
         yield dbs.PlaceSearch(0.05, sdata, expected_count)
@@ -232,16 +233,16 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
             penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
             # Any of the full names applies with all of the partials from the address
             yield penalty, fulls_count / (2**len(addr_partials)),\
-                  dbf.lookup_by_any_name([t.token for t in name_fulls], addr_tokens,
-                                         'restrict' if fulls_count < 10000 else 'lookup_all')
+                  dbf.lookup_by_any_name([t.token for t in name_fulls],
+                                         addr_tokens, fulls_count > 10000)
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
         exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
         if exp_count < 10000 and all(t.is_indexed for t in name_partials):
-            lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
+            lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
             if addr_tokens:
-                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
+                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
             penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
             yield penalty, exp_count, lookup

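The string constants `'restrict'`, `'lookup_all'` and `'lookup_any'` give way
to classes from the new `db_search_lookups` module. Conceptually the three
lookup strategies behave as in this plain-Python sketch (the real classes
generate SQLAlchemy expressions instead of evaluating lists):

    from typing import List

    class LookupAll:
        """ All tokens must be contained in the index column. """
        @staticmethod
        def matches(column: List[int], tokens: List[int]) -> bool:
            return all(t in column for t in tokens)

    class LookupAny:
        """ At least one token must be contained in the index column. """
        @staticmethod
        def matches(column: List[int], tokens: List[int]) -> bool:
            return any(t in column for t in tokens)

    class Restrict(LookupAll):
        """ Same containment check as LookupAll, but only applied as an
            additional filter to rows found via another index. """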