Merge pull request #3273 from lonvia/search-with-sqlite
Add forward search capability for SQLite databases
lonvia authored Dec 12, 2023
2 parents 8c7140d + ab45db5 commit af85ad3
Showing 38 changed files with 1,160 additions and 483 deletions.
55 changes: 55 additions & 0 deletions docs/customize/SQLite.md
@@ -0,0 +1,55 @@
A Nominatim database can be converted into an SQLite database and used as
a read-only source for geocoding queries. This section describes how to
create and use such a database.

!!! danger
    This feature is in an experimental state at the moment. Use at your own
    risk.

## Installing prerequisites

To use an SQLite database, you need to install:

* SQLite (>= 3.30)
* Spatialite (> 5.0.0)

On Ubuntu/Debian, you can run:

    sudo apt install sqlite3 libsqlite3-mod-spatialite libspatialite7

## Creating a new SQLite database

Nominatim cannot import directly into an SQLite database. Instead, you must
first create a geocoding database in PostgreSQL by running a
[regular Nominatim import](../admin/Import.md).

Once this is done, the database can be converted to SQLite with

    nominatim convert -o mydb.sqlite

This will create a database where all geocoding functions are available.
Depending on what functions you need, the database can be made smaller:

* `--without-reverse` omits indexes only needed for reverse geocoding
* `--without-search` omits tables and indexes used for forward search
* `--without-details` leaves out extra information only available in the
  details API
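
For example, a database intended purely for forward search could be created
by combining the options listed above:

    nominatim convert -o mydb.sqlite --without-reverse --without-details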

## Using an SQLite database

Once you have created the database, you can use it by simply pointing the
database DSN to the SQLite file:

    NOMINATIM_DATABASE_DSN=sqlite:dbname=mydb.sqlite

Please note that SQLite support is only available for the Python frontend. To
use the test server with an SQLite database, you therefore need to switch
the frontend engine:

    nominatim serve --engine falcon

You need to install falcon or starlette for this, depending on which engine
you choose.

The CLI query commands and the library interface already use the new Python
frontend and therefore work right out of the box.
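
As a quick illustration, a minimal library session against the converted
database might look like the sketch below. It assumes the `NominatimAPIAsync`
class described in the Library Guide; exact signatures may vary between
versions:

    import asyncio
    from pathlib import Path

    import nominatim.api as napi

    async def search_example() -> None:
        # Point the frontend at the SQLite file; setting
        # NOMINATIM_DATABASE_DSN in the environment works just as well.
        api = napi.NominatimAPIAsync(Path('.'),
                                     environ={'NOMINATIM_DATABASE_DSN':
                                              'sqlite:dbname=mydb.sqlite'})
        try:
            results = await api.search('Brandenburg Gate, Berlin')
            for place in results:
                print(place.place_id, place.names)
        finally:
            await api.close()

    asyncio.run(search_example())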
1 change: 1 addition & 0 deletions docs/mkdocs.yml
@@ -40,6 +40,7 @@ nav:
       - 'Special Phrases': 'customize/Special-Phrases.md'
       - 'External data: US housenumbers from TIGER': 'customize/Tiger.md'
       - 'External data: Postcodes': 'customize/Postcodes.md'
+      - 'Conversion to SQLite': 'customize/SQLite.md'
   - 'Library Guide':
       - 'Getting Started': 'library/Getting-Started.md'
       - 'Nominatim API class': 'library/NominatimAPI.md'
14 changes: 6 additions & 8 deletions lib-sql/functions/ranking.sql
@@ -287,21 +287,19 @@ LANGUAGE plpgsql IMMUTABLE;
 
 
 CREATE OR REPLACE FUNCTION weigh_search(search_vector INT[],
-                                        term_vectors TEXT[],
-                                        weight_vectors FLOAT[],
+                                        rankings TEXT,
                                         def_weight FLOAT)
   RETURNS FLOAT
   AS $$
 DECLARE
-  pos INT := 1;
-  terms TEXT;
+  rank JSON;
 BEGIN
-  FOREACH terms IN ARRAY term_vectors
+  FOR rank IN
+    SELECT * FROM json_array_elements(rankings::JSON)
   LOOP
-    IF search_vector @> terms::INTEGER[] THEN
-      RETURN weight_vectors[pos];
+    IF true = ALL(SELECT x::int = ANY(search_vector) FROM json_array_elements_text(rank->1) as x) THEN
+      RETURN (rank->>0)::float;
     END IF;
-    pos := pos + 1;
   END LOOP;
   RETURN def_weight;
 END;
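To make the new ranking format concrete: `rankings` is now a JSON array of
`[weight, [token, ...]]` pairs, and the first entry whose tokens are all
contained in the search vector wins. A rough Python equivalent of the
rewritten function, for illustration only:

    import json
    from typing import Sequence

    def weigh_search(search_vector: Sequence[int], rankings: str,
                     def_weight: float) -> float:
        # Each ranking entry is a pair [weight, [token, token, ...]].
        for weight, tokens in json.loads(rankings):
            # An entry applies when all its tokens occur in the search vector.
            if all(int(t) in search_vector for t in tokens):
                return float(weight)
        return def_weight

    assert weigh_search([1, 2, 3], '[[0.5, [2, 3]], [0.9, [4]]]', 1.0) == 0.5
    assert weigh_search([1], '[[0.5, [2]]]', 1.0) == 1.0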
50 changes: 32 additions & 18 deletions nominatim/api/core.py
@@ -19,6 +19,7 @@
 from nominatim.errors import UsageError
 from nominatim.db.sqlalchemy_schema import SearchTables
 from nominatim.db.async_core_library import PGCORE_LIB, PGCORE_ERROR
+import nominatim.db.sqlite_functions
 from nominatim.config import Configuration
 from nominatim.api.connection import SearchConnection
 from nominatim.api.status import get_status, StatusResult
@@ -84,6 +85,14 @@ async def setup_database(self) -> None:
         extra_args: Dict[str, Any] = {'future': True,
                                       'echo': self.config.get_bool('DEBUG_SQL')}
 
+        if self.config.get_int('API_POOL_SIZE') == 0:
+            extra_args['poolclass'] = sa.pool.NullPool
+        else:
+            extra_args['poolclass'] = sa.pool.QueuePool
+            extra_args['max_overflow'] = 0
+            extra_args['pool_size'] = self.config.get_int('API_POOL_SIZE')
+
+
         is_sqlite = self.config.DATABASE_DSN.startswith('sqlite:')
 
         if is_sqlite:
@@ -92,6 +101,10 @@ async def setup_database(self) -> None:
             dburl = sa.engine.URL.create('sqlite+aiosqlite',
                                          database=params.get('dbname'))
 
+            if not ('NOMINATIM_DATABASE_RW' in self.config.environ
+                    and self.config.get_bool('DATABASE_RW')) \
+               and not Path(params.get('dbname', '')).is_file():
+                raise UsageError(f"SQlite database '{params.get('dbname')}' does not exist.")
         else:
             dsn = self.config.get_database_params()
             query = {k: v for k, v in dsn.items()
@@ -105,39 +118,40 @@ async def setup_database(self) -> None:
                        host=dsn.get('host'),
                        port=int(dsn['port']) if 'port' in dsn else None,
                        query=query)
-        extra_args['max_overflow'] = 0
-        extra_args['pool_size'] = self.config.get_int('API_POOL_SIZE')
 
         engine = sa_asyncio.create_async_engine(dburl, **extra_args)
 
-        try:
-            async with engine.begin() as conn:
-                result = await conn.scalar(sa.text('SHOW server_version_num'))
-                server_version = int(result)
-        except (PGCORE_ERROR, sa.exc.OperationalError):
+        if is_sqlite:
             server_version = 0
 
-        if server_version >= 110000 and not is_sqlite:
-            @sa.event.listens_for(engine.sync_engine, "connect")
-            def _on_connect(dbapi_con: Any, _: Any) -> None:
-                cursor = dbapi_con.cursor()
-                cursor.execute("SET jit_above_cost TO '-1'")
-                cursor.execute("SET max_parallel_workers_per_gather TO '0'")
-            # Make sure that all connections get the new settings
-            await self.close()
-
-        if is_sqlite:
             @sa.event.listens_for(engine.sync_engine, "connect")
             def _on_sqlite_connect(dbapi_con: Any, _: Any) -> None:
                 dbapi_con.run_async(lambda conn: conn.enable_load_extension(True))
+                nominatim.db.sqlite_functions.install_custom_functions(dbapi_con)
                 cursor = dbapi_con.cursor()
                 cursor.execute("SELECT load_extension('mod_spatialite')")
+                cursor.execute('SELECT SetDecimalPrecision(7)')
+                dbapi_con.run_async(lambda conn: conn.enable_load_extension(False))
+        else:
+            try:
+                async with engine.begin() as conn:
+                    result = await conn.scalar(sa.text('SHOW server_version_num'))
+                    server_version = int(result)
+            except (PGCORE_ERROR, sa.exc.OperationalError):
+                server_version = 0
+
+            if server_version >= 110000:
+                @sa.event.listens_for(engine.sync_engine, "connect")
+                def _on_connect(dbapi_con: Any, _: Any) -> None:
+                    cursor = dbapi_con.cursor()
+                    cursor.execute("SET jit_above_cost TO '-1'")
+                    cursor.execute("SET max_parallel_workers_per_gather TO '0'")
+                # Make sure that all connections get the new settings
+                await engine.dispose()
 
         self._property_cache['DB:server_version'] = server_version
 
-        self._tables = SearchTables(sa.MetaData(), engine.name) # pylint: disable=no-member
+        self._tables = SearchTables(sa.MetaData()) # pylint: disable=no-member
         self._engine = engine


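For readers unfamiliar with SpatiaLite, the `_on_sqlite_connect` listener
above boils down to the following plain-`sqlite3` steps (a standalone sketch,
not Nominatim code; it assumes `mod_spatialite` is installed as described in
docs/customize/SQLite.md and that Python was built with extension loading
enabled):

    import sqlite3

    conn = sqlite3.connect('mydb.sqlite')
    # Extension loading is only switched on for the duration of the setup.
    conn.enable_load_extension(True)
    conn.execute("SELECT load_extension('mod_spatialite')")
    conn.execute('SELECT SetDecimalPrecision(7)')  # trim coordinate precision
    conn.enable_load_extension(False)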
44 changes: 30 additions & 14 deletions nominatim/api/logging.py
@@ -90,26 +90,42 @@ def format_sql(self, conn: AsyncConnection, statement: 'sa.Executable',
         params = dict(compiled.params)
         if isinstance(extra_params, Mapping):
             for k, v in extra_params.items():
-                params[k] = str(v)
+                if hasattr(v, 'to_wkt'):
+                    params[k] = v.to_wkt()
+                elif isinstance(v, (int, float)):
+                    params[k] = v
+                else:
+                    params[k] = str(v)
         elif isinstance(extra_params, Sequence) and extra_params:
             for k in extra_params[0]:
                 params[k] = f':{k}'
 
         sqlstr = str(compiled)
 
-        if sa.__version__.startswith('1'):
-            try:
-                sqlstr = re.sub(r'__\[POSTCOMPILE_[^]]*\]', '%s', sqlstr)
-                return sqlstr % tuple((repr(params.get(name, None))
-                                       for name in compiled.positiontup)) # type: ignore
-            except TypeError:
-                return sqlstr
-
-        # Fixes an odd issue with Python 3.7 where percentages are not
-        # quoted correctly.
-        sqlstr = re.sub(r'%(?!\()', '%%', sqlstr)
-        sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', r'%(\1)s', sqlstr)
-        return sqlstr % params
+        if conn.dialect.name == 'postgresql':
+            if sa.__version__.startswith('1'):
+                try:
+                    sqlstr = re.sub(r'__\[POSTCOMPILE_[^]]*\]', '%s', sqlstr)
+                    return sqlstr % tuple((repr(params.get(name, None))
+                                           for name in compiled.positiontup)) # type: ignore
+                except TypeError:
+                    return sqlstr
+
+            # Fixes an odd issue with Python 3.7 where percentages are not
+            # quoted correctly.
+            sqlstr = re.sub(r'%(?!\()', '%%', sqlstr)
+            sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', r'%(\1)s', sqlstr)
+            return sqlstr % params
+
+        assert conn.dialect.name == 'sqlite'
+
+        # params in positional order
+        pparams = (repr(params.get(name, None)) for name in compiled.positiontup) # type: ignore
+
+        sqlstr = re.sub(r'__\[POSTCOMPILE_([^]]*)\]', '?', sqlstr)
+        sqlstr = re.sub(r"\?", lambda m: next(pparams), sqlstr)
+
+        return sqlstr
 
 class HTMLLogger(BaseLogger):
     """ Logger that formats messages in HTML.
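The SQLite branch inlines the positional parameters one by one instead of
using `%`-style formatting. A tiny standalone illustration of the same
substitution trick, with hypothetical values rather than Nominatim code:

    import re

    sqlstr = "SELECT * FROM placex WHERE place_id = ? AND rank_address < ?"
    pparams = iter([repr(4711), repr(30)])  # params in positional order
    print(re.sub(r"\?", lambda m: next(pparams), sqlstr))
    # SELECT * FROM placex WHERE place_id = 4711 AND rank_address < 30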
8 changes: 4 additions & 4 deletions nominatim/api/reverse.py
@@ -180,7 +180,7 @@ async def _find_closest_street_or_poi(self, distance: float) -> Optional[SaRow]:
         diststr = sa.text(f"{distance}")
 
         sql: SaLambdaSelect = sa.lambda_stmt(lambda: _select_from_placex(t)
-                .where(t.c.geometry.ST_DWithin(WKT_PARAM, diststr))
+                .where(t.c.geometry.within_distance(WKT_PARAM, diststr))
                 .where(t.c.indexed_status == 0)
                 .where(t.c.linked_place_id == None)
                 .where(sa.or_(sa.not_(t.c.geometry.is_area()),
@@ -219,7 +219,7 @@ async def _find_housenumber_for_street(self, parent_place_id: int) -> Optional[S
         t = self.conn.t.placex
 
         sql: SaLambdaSelect = sa.lambda_stmt(lambda: _select_from_placex(t)
-                .where(t.c.geometry.ST_DWithin(WKT_PARAM, 0.001))
+                .where(t.c.geometry.within_distance(WKT_PARAM, 0.001))
                 .where(t.c.parent_place_id == parent_place_id)
                 .where(sa.func.IsAddressPoint(t))
                 .where(t.c.indexed_status == 0)
@@ -241,7 +241,7 @@ async def _find_interpolation_for_street(self, parent_place_id: Optional[int],
                 sa.select(t,
                           t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
                           _locate_interpolation(t))
-                .where(t.c.linegeo.ST_DWithin(WKT_PARAM, distance))
+                .where(t.c.linegeo.within_distance(WKT_PARAM, distance))
                 .where(t.c.startnumber != None)
                 .order_by('distance')
                 .limit(1))
@@ -275,7 +275,7 @@ def _base_query() -> SaSelect:
             inner = sa.select(t,
                               t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
                               _locate_interpolation(t))\
-                .where(t.c.linegeo.ST_DWithin(WKT_PARAM, 0.001))\
+                .where(t.c.linegeo.within_distance(WKT_PARAM, 0.001))\
                 .where(t.c.parent_place_id == parent_place_id)\
                 .order_by('distance')\
                 .limit(1)\
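The switch from `ST_DWithin` to `within_distance` trades the PostGIS-specific
function for a dialect-aware one that can also be compiled for SpatiaLite.
Nominatim wires this up as a method on its geometry columns; the sketch below
only illustrates the per-dialect compilation idea with SQLAlchemy's
`@compiles` hook and is not the project's actual implementation:

    from sqlalchemy.ext.compiler import compiles
    from sqlalchemy.sql import functions

    class within_distance(functions.GenericFunction):
        """ Dialect-neutral 'geometry A within distance d of B' check. """
        name = 'within_distance'
        inherit_cache = True

    @compiles(within_distance)
    def _default_within_distance(element, compiler, **kw):
        # Default (PostgreSQL/PostGIS) spelling.
        return 'ST_DWithin(%s)' % compiler.process(element.clauses, **kw)

    @compiles(within_distance, 'sqlite')
    def _sqlite_within_distance(element, compiler, **kw):
        # SpatiaLite spelling; PtDistWithin takes the same three arguments.
        return 'PtDistWithin(%s)' % compiler.process(element.clauses, **kw)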
19 changes: 10 additions & 9 deletions nominatim/api/search/db_search_builder.py
@@ -15,6 +15,7 @@
 from nominatim.api.search.token_assignment import TokenAssignment
 import nominatim.api.search.db_search_fields as dbf
 import nominatim.api.search.db_searches as dbs
+import nominatim.api.search.db_search_lookups as lookups
 
 
 def wrap_near_search(categories: List[Tuple[str, str]],
@@ -152,7 +153,7 @@ def build_special_search(self, sdata: dbf.SearchData,
             sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
                                              [t.token for r in address
                                               for t in self.query.get_partials_list(r)],
-                                             'restrict')]
+                                             lookups.Restrict)]
             penalty += 0.2
         yield dbs.PostcodeSearch(penalty, sdata)

@@ -162,24 +163,24 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
         """ Build a simple address search for special entries where the
             housenumber is the main name token.
         """
-        sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], 'lookup_any')]
+        sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
         expected_count = sum(t.count for t in hnrs)
 
         partials = [t for trange in address
                     for t in self.query.get_partials_list(trange)]
 
         if expected_count < 8000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 [t.token for t in partials], 'restrict'))
+                                                 [t.token for t in partials], lookups.Restrict))
         elif len(partials) != 1 or partials[0].count < 10000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 [t.token for t in partials], 'lookup_all'))
+                                                 [t.token for t in partials], lookups.LookupAll))
         else:
             sdata.lookups.append(
                 dbf.FieldLookup('nameaddress_vector',
                                 [t.token for t
                                  in self.query.get_tokens(address[0], TokenType.WORD)],
-                                'lookup_any'))
+                                lookups.LookupAny))
 
         sdata.housenumbers = dbf.WeightedStrings([], [])
         yield dbs.PlaceSearch(0.05, sdata, expected_count)
@@ -232,16 +233,16 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
             penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
             # Any of the full names applies with all of the partials from the address
             yield penalty, fulls_count / (2**len(addr_partials)),\
-                  dbf.lookup_by_any_name([t.token for t in name_fulls], addr_tokens,
-                                         'restrict' if fulls_count < 10000 else 'lookup_all')
+                  dbf.lookup_by_any_name([t.token for t in name_fulls],
+                                         addr_tokens, fulls_count > 10000)
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
         exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
         if exp_count < 10000 and all(t.is_indexed for t in name_partials):
-            lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
+            lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
             if addr_tokens:
-                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
+                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
             penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
             yield penalty, exp_count, lookup

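The string constants `'restrict'`, `'lookup_all'` and `'lookup_any'` give way
to classes from the new `db_search_lookups` module. Conceptually the three
lookup strategies behave as in this plain-Python sketch (the real classes
generate SQLAlchemy expressions instead of evaluating lists):

    from typing import List

    class LookupAll:
        """ All tokens must be contained in the index column. """
        @staticmethod
        def matches(column: List[int], tokens: List[int]) -> bool:
            return all(t in column for t in tokens)

    class LookupAny:
        """ At least one token must be contained in the index column. """
        @staticmethod
        def matches(column: List[int], tokens: List[int]) -> bool:
            return any(t in column for t in tokens)

    class Restrict(LookupAll):
        """ Same containment check as LookupAll, but only applied as an
            additional filter to rows found via another index. """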