Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exploration of sqlite performance #845

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions sotodlib/core/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def update_context(self, new_stuff):
else:
self[k] = v

def reload(self, load_list='all'):
def reload(self, load_list='all', readwrite=False):
"""Load (or reload) certain databases associated with this dataset.
(Note we don't load any per-observation metadata here.)

Expand All @@ -139,7 +139,9 @@ def reload(self, load_list='all'):
db_file = os.path.abspath(db_file)
logger.info(f'Loading {key} from {self[key]} -> {db_file}.')
try:
db = cls.from_file(db_file, force_new_db=False)
db = cls.from_file(
db_file, force_new_db=False, readonly=(not readwrite)
)
except Exception as e:
logger.error(f'DB failure when loading {key} from {self[key]} -> {db_file}\n')
raise e
Expand Down
118 changes: 113 additions & 5 deletions sotodlib/core/metadata/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,114 @@
import os


def sqlite_connect(filename=None, mode="w"):
    """Utility function for connecting to a sqlite3 DB.

    This provides a single function for opening an sqlite connection
    consistently across the code base.  When connecting to a file on
    disk we try to use options which will be more performant in the
    situation where the DB is on a networked / shared filesystem and
    being accessed from multiple processes.

    Note that throughout the codebase we use the Connection.execute()
    shortcut which internally creates a Cursor object.  This technique
    is not portable between SQL backends, but is fine for sqlite use.

    Args:
        filename (str):  The path on disk or None if using an
            in-memory DB.  The path is percent-encoded before being
            placed in the sqlite URI, so characters such as ``#``,
            ``?`` and ``%`` in file names are handled correctly.
        mode (str):  Either "r" or "w".  "r" opens the file read-only;
            any other value opens it read-write, creating the file if
            it does not exist.

    Returns:
        (sqlite3.Connection):  The database connection.

    Raises:
        ValueError:  If an in-memory DB is requested in read-only mode.
        sqlite3.Error:  If applying the tuning pragmas fails; the
            connection is closed before the error propagates.

    """
    # Function-scope import so this helper does not depend on the
    # module's top-level import block.
    from urllib.parse import quote

    if filename is None:
        # Memory-backed DB
        if mode == "r":
            raise ValueError("Cannot open memory DB in read-only mode")
        return sqlite3.connect(":memory:")

    # This timeout is in seconds.  If multiple processes are writing, they
    # might be blocked for a while until they get their turn.  This prevents
    # them from giving up too soon if other processes have a write lock.
    # https://www.sqlite.org/pragma.html#pragma_busy_timeout
    busy_time = 1000

    # Journaling options

    # Persistent journaling mode.  File creation / deletion can be expensive
    # on some filesystems.  This just writes some zeros to the header and
    # leaves the file.  This journal is a "side car" file next to the original
    # DB file and is safe to delete manually if one is sure that no processes
    # are accessing the DB.
    # https://www.sqlite.org/pragma.html#pragma_journal_mode
    journal_mode = "persist"

    # Max size of the journal.  Although it is being overwritten repeatedly,
    # if it gets too large we purge it and recreate.  This should not happen
    # for most normal operations.
    # https://www.sqlite.org/pragma.html#pragma_journal_size_limit
    journal_size = f"{10 * 1024 * 1024}"

    # Disk synchronization options

    # Using "normal" instead of the default "full" can avoid potentially
    # expensive (on network filesystems) sync operations.
    # https://www.sqlite.org/pragma.html#pragma_synchronous
    sync_mode = "normal"

    # Memory caching

    # The default page size in modern sqlite is 4096 bytes, and should be fine.
    # We set this explicitly to allow easy changing in the future or keeping it
    # fixed if the default changes.
    # https://www.sqlite.org/pragma.html#pragma_page_size
    page_size = 4096

    # The number of pages to cache in memory.  Setting this to a few MB of RAM
    # can have substantial performance benefits.  Total will be number of pages
    # times page size.
    # https://www.sqlite.org/pragma.html#pragma_cache_size
    n_cache_pages = 4000

    # Build the connection URI.  The path must be percent-encoded: in a
    # sqlite URI an unescaped "?" starts the query string and "#" starts a
    # fragment, so a raw path containing either would silently address a
    # different file.  "/" and ":" are kept verbatim so ordinary paths
    # produce the same URI as before.
    readonly = mode == "r"
    access = "ro" if readonly else "rwc"
    uri_path = quote(str(filename), safe="/:")
    connstr = f"file:{uri_path}?mode={access}"

    # Open connection
    # NOTE(review): the two branches are not equivalent in transaction
    # behaviour (autocommit=True vs. implicit "BEGIN IMMEDIATE" before
    # DML) -- confirm this difference is intended.
    try:
        # Python >= 3.12
        conn = sqlite3.connect(
            connstr, uri=True, timeout=busy_time, autocommit=True
        )
    except TypeError:
        # Older Python: the "autocommit" keyword does not exist.
        conn = sqlite3.connect(
            connstr, uri=True, timeout=busy_time, isolation_level="IMMEDIATE"
        )

    try:
        # Set cache sizes
        conn.execute(f"pragma page_size={page_size}")
        conn.execute(f"pragma cache_size={n_cache_pages}")

        if readonly:
            # Read-only mode, all done.
            return conn

        # In write mode, set journaling / sync options
        conn.execute(f"pragma journal_mode={journal_mode}")
        conn.execute(f"pragma journal_size_limit={journal_size}")
        conn.execute(f"pragma synchronous={sync_mode}")

        # Other tuning options

        # Hold temporary tables in memory.
        # https://www.sqlite.org/pragma.html#pragma_temp_store
        conn.execute("pragma temp_store=memory")
    except sqlite3.Error:
        # Do not leak the open handle if the file turns out to be unusable.
        conn.close()
        raise

    return conn


def sqlite_to_file(db, filename, overwrite=True, fmt=None):
"""Write an sqlite db to file. Supports several output formats.

Expand All @@ -26,7 +134,7 @@ def sqlite_to_file(db, filename, overwrite=True, fmt=None):
if fmt == 'sqlite':
if os.path.exists(filename):
os.remove(filename)
new_db = sqlite3.connect(filename)
new_db = sqlite_connect(filename=filename, mode='w')
script = ' '.join(db.iterdump())
new_db.executescript(script)
new_db.commit()
Expand All @@ -41,7 +149,7 @@ def sqlite_to_file(db, filename, overwrite=True, fmt=None):
else:
raise RuntimeError(f'Unknown format "{fmt}" requested.')

def sqlite_from_file(filename, fmt=None, force_new_db=True):
def sqlite_from_file(filename, fmt=None, force_new_db=True, readonly=False):
"""Instantiate an sqlite3.Connection and return it, with the data
copied in from the specified file. The function can either map the database
file directly, or map a copy of the database in memory (see force_new_db
Expand All @@ -51,7 +159,7 @@ def sqlite_from_file(filename, fmt=None, force_new_db=True):
filename (str): path to the file.
fmt (str): format of the input; see to_file for details.
force_new_db (bool): Used if connecting to an sqlite database. If True the
databas is copied into memory and if False returns a connection to the
database is copied into memory and if False returns a connection to the
database without reading it into memory

"""
Expand All @@ -60,7 +168,7 @@ def sqlite_from_file(filename, fmt=None, force_new_db=True):
if filename.endswith('.gz'):
fmt = 'gz'
if fmt == 'sqlite':
db0 = sqlite3.connect(f'file:{filename}?mode=ro', uri=True)
db0 = sqlite_connect(filename=filename, mode=("r" if readonly else "w"))
if not force_new_db:
return db0
data = ' '.join(db0.iterdump())
Expand All @@ -72,7 +180,7 @@ def sqlite_from_file(filename, fmt=None, force_new_db=True):
data = fin.read().decode('utf-8')
else:
raise RuntimeError(f'Unknown format "{fmt}" requested.')
db = sqlite3.connect(':memory:')
db = sqlite_connect()
db.executescript(data)
return db

48 changes: 36 additions & 12 deletions sotodlib/core/metadata/detdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,32 @@ class DetDb(object):
"`time1` integer",
]

def __init__(self, map_file=None, init_db=True):
"""Instantiate a DetDb. If map_file is provided, the database will
def __init__(self, map_file=None, init_db=True, readonly=False):
"""Instantiate a DetDb.

If map_file is provided, the database will
be connected to the indicated sqlite file on disk, and any
changes made to this object be written back to the file.

Args:
map_file (string): sqlite database file to map. Defaults to
':memory:'.
init_db (bool): If True, attempt to create the database
tables.
readonly (bool): If True, the database file will be mapped
in read-only mode. Not valid on dbs held in :memory:.

"""
if init_db and readonly:
raise ValueError("Cannot initialize a read-only DB")
self._readonly = readonly
if isinstance(map_file, sqlite3.Connection):
self.conn = map_file
else:
if map_file is None:
map_file = ':memory:'
self.conn = sqlite3.connect(map_file)
self.conn = common.sqlite_connect(
filename=map_file,
mode=("r" if readonly else "w"),
)
self.conn.row_factory = sqlite3.Row # access columns by name

if init_db:
Expand Down Expand Up @@ -170,6 +184,8 @@ def create_table(self, table_name, column_defs, raw=False, commit=True):
]

"""
if self._readonly:
raise RuntimeError("Cannot use create_table() on a read-only DB")
c = self.conn.cursor()
pre_cols = self.TABLE_TEMPLATE
if raw:
Expand All @@ -195,7 +211,7 @@ def copy(self, map_file=None, overwrite=False):
else:
raise RuntimeError("Output file %s exists (overwrite=True "
"to overwrite)." % map_file)
new_db = DetDb(map_file=map_file, init_db=False)
new_db = DetDb(map_file=map_file, init_db=False, readonly=False)
script = ' '.join(self.conn.iterdump())
new_db.conn.executescript(script)
return new_db
Expand All @@ -220,7 +236,7 @@ def to_file(self, filename, overwrite=True, fmt=None):
raise RuntimeError(f'File {filename} exists; remove or pass '
'overwrite=True.')
if fmt == 'sqlite':
self.copy(map_file=filename, overwrite=overwrite)
_ = self.copy(map_file=filename, overwrite=overwrite)
elif fmt == 'dump':
with open(filename, 'w') as fout:
for line in self.conn.iterdump():
Expand All @@ -233,14 +249,15 @@ def to_file(self, filename, overwrite=True, fmt=None):
raise RuntimeError(f'Unknown format "{fmt}" requested.')

@classmethod
def from_file(cls, filename, fmt=None, force_new_db=True):
def from_file(cls, filename, fmt=None, force_new_db=True, readonly=False):
"""This method calls
:func:`sotodlib.core.metadata.common.sqlite_from_file`
"""
conn = common.sqlite_from_file(filename, fmt=fmt,
force_new_db=force_new_db)
conn = common.sqlite_from_file(
filename, fmt=fmt, force_new_db=force_new_db, readonly=readonly
)
return cls(conn, init_db=False)


def reduce(self, dets=None, time0=None, time1=None,
inplace=False):
Expand All @@ -262,7 +279,12 @@ def reduce(self, dets=None, time0=None, time1=None,
Returns the reduced data (which is self, if inplace is True).

"""
if not inplace:
if inplace:
if self._readonly:
raise RuntimeError(
"Cannot do inplace reduce of a read-only DB."
)
else:
return self.copy().reduce(dets, time0, time1, inplace=True)

time_clause = '0'
Expand Down Expand Up @@ -339,6 +361,8 @@ def add_props(self, table_, name_, time_range=None, commit=True, **kw):
into the property table.

"""
if self._readonly:
raise RuntimeError("Cannot add_props() on a read-only DB")
if time_range is None:
time_range = self.ALWAYS
row_id = self.get_id(name_, create=True, commit=False)
Expand Down
Loading