Move Metadata to --internal database #2343

Merged · 8 commits · Jun 11, 2024
210 changes: 137 additions & 73 deletions datasette/app.py
@@ -443,6 +443,37 @@ def __init__(
self._root_token = secrets.token_hex(32)
self.client = DatasetteClient(self)

async def apply_metadata_json(self):
# Apply any metadata entries from metadata.json to the internal tables
# step 1: top-level metadata
for key in self._metadata_local or {}:
Owner comment:
Maybe we should rename self._metadata_local to self._metadata_on_startup or self._metadata_from_file to help clarify that it's the metadata that was passed to Datasette's constructor on startup?

I don't want people to get confused and think that reading from self._metadata_local is a useful thing to do, when they should be calling the new async methods instead.

Probably overthinking that though, since those methods will be documented but self._metadata_local will not.

if key == "databases":
continue
await self.set_instance_metadata(key, self._metadata_local[key])

# step 2: database-level metadata
for dbname, db in self._metadata_local.get("databases", {}).items():
for key, value in db.items():
if key == "tables":
continue
await self.set_database_metadata(dbname, key, value)

# step 3: table-level metadata
for tablename, table in db.get("tables", {}).items():
for key, value in table.items():
if key == "columns":
continue
await self.set_resource_metadata(dbname, tablename, key, value)

# step 4: column-level metadata (only descriptions in metadata.json)
for columnname, column_description in table.get("columns", {}).items():
await self.set_column_metadata(
dbname, tablename, columnname, "description", column_description
)

# TODO(alex) if metadata.json was loaded in, and --internal is not memory, then log
# a warning to the user that they should delete their metadata.json file

Owner comment on lines +474 to +476:
I'm OK with us not doing this - software telling me to delete a file is a bit weird.
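
For illustration, here is a hypothetical metadata.json (shown as the Python dict Datasette parses it into) and the internal table each level is written to by apply_metadata_json() above; the database, table, and column names are invented for the example:

# Hypothetical metadata.json contents, as the parsed Python dict:
metadata = {
    # instance level -> set_instance_metadata() -> datasette_metadata_instance_entries
    "title": "My Datasette instance",
    "databases": {
        "content": {
            # database level -> set_database_metadata() -> datasette_metadata_database_entries
            "source": "Example source",
            "tables": {
                "articles": {
                    # table level -> set_resource_metadata() -> datasette_metadata_resource_entries
                    "description": "Articles scraped nightly",
                    # column level -> set_column_metadata(), "description" key only
                    # -> datasette_metadata_column_entries
                    "columns": {"title": "Article headline"},
                }
            },
        }
    },
}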

def get_jinja_environment(self, request: Request = None) -> Environment:
environment = self._jinja_env
if request:
@@ -476,6 +507,7 @@ async def _refresh_schemas(self):
internal_db = self.get_internal_database()
if not self.internal_db_created:
await init_internal_db(internal_db)
await self.apply_metadata_json()
self.internal_db_created = True
current_schema_versions = {
row["database_name"]: row["schema_version"]
@@ -646,57 +678,113 @@ def _metadata_recursive_update(self, orig, updated):
orig[key] = upd_value
return orig

def metadata(self, key=None, database=None, table=None, fallback=True):
Owner comment:
I'm so happy to be rid of the synchronous (but undocumented) .metadata() method.

"""
Looks up metadata, cascading backwards from specified level.
Returns None if metadata value is not found.
"""
assert not (
database is None and table is not None
), "Cannot call metadata() with table= specified but not database="
metadata = {}
async def get_instance_metadata(self):
rows = await self.get_internal_database().execute(
"""
SELECT
key,
value
FROM datasette_metadata_instance_entries
"""
)
return dict(rows)

async def get_database_metadata(self, database_name: str):
rows = await self.get_internal_database().execute(
"""
SELECT
key,
value
FROM datasette_metadata_database_entries
WHERE database_name = ?
""",
[database_name],
)
return dict(rows)

async def get_resource_metadata(self, database_name: str, resource_name: str):
rows = await self.get_internal_database().execute(
"""
SELECT
key,
value
FROM datasette_metadata_resource_entries
WHERE database_name = ?
AND resource_name = ?
""",
[database_name, resource_name],
)
return dict(rows)

for hook_dbs in pm.hook.get_metadata(
datasette=self, key=key, database=database, table=table
):
metadata = self._metadata_recursive_update(metadata, hook_dbs)

# security precaution!! don't allow anything in the local config
# to be overwritten. this is a temporary measure, not sure if this
# is a good idea long term or maybe if it should just be a concern
# of the plugin's implementation
metadata = self._metadata_recursive_update(metadata, self._metadata_local)

databases = metadata.get("databases") or {}

search_list = []
if database is not None:
search_list.append(databases.get(database) or {})
if table is not None:
table_metadata = ((databases.get(database) or {}).get("tables") or {}).get(
table
) or {}
search_list.insert(0, table_metadata)

search_list.append(metadata)
if not fallback:
# No fallback allowed, so just use the first one in the list
search_list = search_list[:1]
if key is not None:
for item in search_list:
if key in item:
return item[key]
return None
else:
# Return the merged list
m = {}
for item in search_list:
m.update(item)
return m
async def get_column_metadata(
self, database_name: str, resource_name: str, column_name: str
):
rows = await self.get_internal_database().execute(
"""
SELECT
key,
value
FROM datasette_metadata_column_entries
WHERE database_name = ?
AND resource_name = ?
AND column_name = ?
""",
[database_name, resource_name, column_name],
)
return dict(rows)

async def set_instance_metadata(self, key: str, value: str):
# TODO: upsert is only supported on SQLite 3.24.0+ (2018-06-04)
Owner comment:
This inspired me to survey people to find out what SQLite versions are out there in the wild:

https://twitter.com/simonw/status/1800198142002115029
https://fedi.simonwillison.net/@simon/112593180340389914

await self.get_internal_database().execute_write(
"""
INSERT INTO datasette_metadata_instance_entries(key, value)
VALUES(?, ?)
ON CONFLICT(key) DO UPDATE SET value = excluded.value;
""",
[key, value],
)
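
As a sketch of one way the TODO could be handled on SQLite older than 3.24.0 (an assumption, not something this PR implements): because every column in these tables is covered either by the unique(...) constraint or by the value being replaced, INSERT OR REPLACE produces the same end state as the upsert, as a drop-in for the execute_write call above:

# Hypothetical fallback for SQLite < 3.24.0, where ON CONFLICT ... DO UPDATE
# is unavailable. INSERT OR REPLACE deletes the row that violates unique(key)
# and inserts the new one, leaving the same key/value pair the upsert would.
await self.get_internal_database().execute_write(
    """
    INSERT OR REPLACE INTO datasette_metadata_instance_entries(key, value)
    VALUES(?, ?)
    """,
    [key, value],
)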

@property
def _metadata(self):
return self.metadata()
async def set_database_metadata(self, database_name: str, key: str, value: str):
# TODO: upsert is only supported on SQLite 3.24.0+ (2018-06-04)
await self.get_internal_database().execute_write(
"""
INSERT INTO datasette_metadata_database_entries(database_name, key, value)
VALUES(?, ?, ?)
ON CONFLICT(database_name, key) DO UPDATE SET value = excluded.value;
""",
[database_name, key, value],
)

async def set_resource_metadata(
self, database_name: str, resource_name: str, key: str, value: str
):
# TODO: upsert is only supported on SQLite 3.24.0+ (2018-06-04)
await self.get_internal_database().execute_write(
"""
INSERT INTO datasette_metadata_resource_entries(database_name, resource_name, key, value)
VALUES(?, ?, ?, ?)
ON CONFLICT(database_name, resource_name, key) DO UPDATE SET value = excluded.value;
""",
[database_name, resource_name, key, value],
)

async def set_column_metadata(
self,
database_name: str,
resource_name: str,
column_name: str,
key: str,
value: str,
):
# TODO: upsert is only supported on SQLite 3.24.0+ (2018-06-04)
await self.get_internal_database().execute_write(
"""
INSERT INTO datasette_metadata_column_entries(database_name, resource_name, column_name, key, value)
VALUES(?, ?, ?, ?, ?)
ON CONFLICT(database_name, resource_name, column_name, key) DO UPDATE SET value = excluded.value;
""",
[database_name, resource_name, column_name, key, value],
)
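
A minimal usage sketch of the new async API, for example from a plugin's startup hook (the database name and values here are hypothetical):

# Hypothetical plugin code exercising the new async methods in place of the
# removed synchronous .metadata() call:
async def startup(datasette):
    await datasette.set_database_metadata("content", "source", "Example source")
    db_meta = await datasette.get_database_metadata("content")
    assert db_meta["source"] == "Example source"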

def get_internal_database(self):
return self._internal_database
@@ -774,20 +862,6 @@ async def get_canned_query(self, database_name, query_name, actor):
if query:
return query

def update_with_inherited_metadata(self, metadata):
# Fills in source/license with defaults, if available
metadata.update(
{
"source": metadata.get("source") or self.metadata("source"),
"source_url": metadata.get("source_url") or self.metadata("source_url"),
"license": metadata.get("license") or self.metadata("license"),
"license_url": metadata.get("license_url")
or self.metadata("license_url"),
"about": metadata.get("about") or self.metadata("about"),
"about_url": metadata.get("about_url") or self.metadata("about_url"),
}
)

def _prepare_connection(self, conn, database):
conn.row_factory = sqlite3.Row
conn.text_factory = lambda x: str(x, "utf-8", "replace")
@@ -1079,11 +1153,6 @@ def absolute_url(self, request, path):
url = "https://" + url[len("http://") :]
return url

def _register_custom_units(self):
"""Register any custom units defined in the metadata.json with Pint"""
for unit in self.metadata("custom_units") or []:
ureg.define(unit)

def _connected_databases(self):
return [
{
@@ -1436,10 +1505,6 @@ def add_route(view, regex):
),
r"/:memory:(?P<rest>.*)$",
)
add_route(
JsonDataView.as_view(self, "metadata.json", lambda: self.metadata()),
r"/-/metadata(\.(?P<format>json))?$",
)
add_route(
JsonDataView.as_view(self, "versions.json", self._versions),
r"/-/versions(\.(?P<format>json))?$",
@@ -1585,7 +1650,6 @@ async def resolve_row(self, request):
def app(self):
"""Returns an ASGI app function that serves the whole of Datasette"""
routes = self._routes()
self._register_custom_units()

async def setup_db():
# First time server starts up, calculate table counts for immutable databases
4 changes: 0 additions & 4 deletions datasette/default_menu_links.py
@@ -17,10 +17,6 @@ async def inner():
"href": datasette.urls.path("/-/versions"),
"label": "Version info",
},
{
"href": datasette.urls.path("/-/metadata"),
"label": "Metadata",
},
{
"href": datasette.urls.path("/-/settings"),
"label": "Settings",
13 changes: 9 additions & 4 deletions datasette/facets.py
@@ -103,10 +103,15 @@ def get_facet_size(self):
max_returned_rows = self.ds.setting("max_returned_rows")
table_facet_size = None
if self.table:
tables_metadata = self.ds.metadata("tables", database=self.database) or {}
table_metadata = tables_metadata.get(self.table) or {}
if table_metadata:
table_facet_size = table_metadata.get("facet_size")
config_facet_size = (
self.ds.config.get("databases", {})
.get(self.database, {})
.get("tables", {})
.get(self.table, {})
.get("facet_size")
)
if config_facet_size:
table_facet_size = config_facet_size
custom_facet_size = self.request.args.get("_facet_size")
if custom_facet_size:
if custom_facet_size == "max":
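
For reference, a hedged sketch of the configuration shape that nested .get() chain reads, with invented database and table names:

# Hypothetical parsed config dict (the datasette.yaml equivalent):
config = {
    "databases": {
        "content": {
            "tables": {
                "articles": {"facet_size": 10},
            },
        },
    },
}
# config.get("databases", {}).get("content", {}).get("tables", {})
#       .get("articles", {}).get("facet_size")  -> 10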
5 changes: 0 additions & 5 deletions datasette/hookspecs.py
@@ -10,11 +10,6 @@ def startup(datasette):
"""Fires directly after Datasette first starts running"""


@hookspec
def get_metadata(datasette, key, database, table):
"""Return metadata to be merged into Datasette's metadata dictionary"""


@hookspec
def asgi_wrapper(datasette):
"""Returns an ASGI middleware callable to wrap our ASGI application with"""
1 change: 0 additions & 1 deletion datasette/renderer.py
@@ -56,7 +56,6 @@ def json_renderer(request, args, data, error, truncated=None):

if truncated is not None:
data["truncated"] = truncated

if shape == "arrayfirst":
if not data["rows"]:
data = []
37 changes: 37 additions & 0 deletions datasette/utils/internal_db.py
@@ -63,6 +63,43 @@ async def init_internal_db(db):
"""
).strip()
await db.execute_write_script(create_tables_sql)
await initialize_metadata_tables(db)


async def initialize_metadata_tables(db):
await db.execute_write_script(
"""
CREATE TABLE IF NOT EXISTS datasette_metadata_instance_entries(
key text,
value text,
unique(key)
);

CREATE TABLE IF NOT EXISTS datasette_metadata_database_entries(
database_name text,
key text,
value text,
unique(database_name, key)
);

CREATE TABLE IF NOT EXISTS datasette_metadata_resource_entries(
database_name text,
resource_name text,
key text,
value text,
unique(database_name, resource_name, key)
);

CREATE TABLE IF NOT EXISTS datasette_metadata_column_entries(
database_name text,
resource_name text,
column_name text,
key text,
value text,
unique(database_name, resource_name, column_name, key)
);
"""
)
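
A standalone sketch (plain sqlite3, not part of this PR) showing how the unique(...) constraints above make the upserts in app.py idempotent:

import sqlite3

# Requires SQLite 3.24.0+ at runtime for ON CONFLICT ... DO UPDATE
# (see the TODO comments in app.py).
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE datasette_metadata_instance_entries(key text, value text, unique(key))"
)
upsert = (
    "INSERT INTO datasette_metadata_instance_entries(key, value) VALUES(?, ?) "
    "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
)
conn.execute(upsert, ("title", "First title"))
conn.execute(upsert, ("title", "Second title"))  # updates in place, no duplicate row
assert conn.execute(
    "SELECT value FROM datasette_metadata_instance_entries WHERE key = 'title'"
).fetchall() == [("Second title",)]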


async def populate_schema_tables(internal_db, db):
6 changes: 1 addition & 5 deletions datasette/views/base.py
@@ -274,10 +274,6 @@ async def get(self, request):

end = time.perf_counter()
data["query_ms"] = (end - start) * 1000
for key in ("source", "source_url", "license", "license_url"):
value = self.ds.metadata(key)
if value:
data[key] = value

# Special case for .jsono extension - redirect to _shape=objects
if _format == "jsono":
@@ -385,7 +381,7 @@ async def get(self, request):
},
}
if "metadata" not in context:
context["metadata"] = self.ds.metadata()
context["metadata"] = await self.ds.get_instance_metadata()
r = await self.render(templates, request=request, context=context)
if status_code is not None:
r.status = status_code