Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce queries for published graphs when indexing #11513

Draft
wants to merge 1 commit into
base: dev/8.0.x
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 32 additions & 23 deletions arches/app/utils/index_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from datetime import datetime
from django.db import connection, connections
from django.db.models import prefetch_related_objects, Prefetch, Q, QuerySet
from django.utils.translation import get_language

from arches.app.models import models
from arches.app.models.resource import Resource
from arches.app.models.system_settings import settings
Expand All @@ -25,21 +27,6 @@


logger = logging.getLogger(__name__)
# Module-level cache of serialized graphs, keyed by graph.graphid, so the
# published graph is fetched at most once per graph.
serialized_graphs = {}


def get_serialized_graph(graph):
    """
    Return the serialized version of the graph from the database.

    Results are memoized in the module-level ``serialized_graphs`` dict,
    keyed by ``graph.graphid``, so ``graph.get_published_graph()`` is
    called at most once per graph that has a published graph.

    Returns None when ``graph`` is falsy, or when
    ``graph.get_published_graph()`` yields None (presumably: nothing is
    published for the active language -- verify against the model).
    """
    if not graph:
        return None

    try:
        return serialized_graphs[graph.graphid]
    except KeyError:
        pass

    published_graph = graph.get_published_graph()
    if published_graph is None:
        # Deliberately not cached, so a graph that is published later is
        # picked up on the next call instead of being stuck at None.
        return None

    serialized_graphs[graph.graphid] = published_graph.serialized_graph
    return serialized_graphs[graph.graphid]


def index_db(
Expand Down Expand Up @@ -201,15 +188,16 @@ def optimize_resource_iteration(resources: Iterable[Resource], chunk_size: int):
- select related graphs
- prefetch tiles (onto .prefetched_tiles)
- prefetch primary descriptors (onto graph.descriptor_function)
- prefetch published graphs (onto graph.publication.published_graph_active_lang)
- apply chunk_size to reduce memory footprint and spread the work
of prefetching tiles across multiple queries

The caller is responsible for moving the descriptor function
prefetch from the graph to the resource instance--a symptom of
this being more of a graph property--and for moving the prefetched
tiles to .tiles (because the Resource proxy model initializes
.tiles to an empty array and Django thinks that represents the
state in the db.)
and published graph prefetches from the graph to the resource instances
--a symptom of these being more like graph properties--
and for moving the prefetched tiles to .tiles (because the Resource
proxy model initializes .tiles to an empty array and Django thinks
that represents the state in the db.)
"""
tiles_prefetch = Prefetch("tilemodel_set", to_attr="prefetched_tiles")
# Same queryset as Resource.save_descriptors()
Expand All @@ -221,18 +209,30 @@ def optimize_resource_iteration(resources: Iterable[Resource], chunk_size: int):
queryset=descriptor_query,
to_attr="descriptor_function",
)
published_graph_query = models.PublishedGraph.objects.filter(
language=get_language()
)
published_graph_prefetch = Prefetch(
"graph__publication__publishedgraph_set",
queryset=published_graph_query,
to_attr="published_graph_active_lang",
)

if isinstance(resources, QuerySet):
return (
resources.select_related("graph")
.prefetch_related(tiles_prefetch, descriptor_prefetch)
.prefetch_related(
tiles_prefetch, descriptor_prefetch, published_graph_prefetch
)
.iterator(chunk_size=chunk_size)
)
else: # public API that arches itself does not currently use
for r in resources:
r.clean_fields() # ensure strings become UUIDs

prefetch_related_objects(resources, tiles_prefetch, descriptor_prefetch)
prefetch_related_objects(
resources, tiles_prefetch, descriptor_prefetch, published_graph_prefetch
)
return resources


Expand Down Expand Up @@ -263,10 +263,19 @@ def index_resources_using_singleprocessing(
for resource in optimize_resource_iteration(
resources, chunk_size=batch_size // 8
):
# Move prefetched relations to where the Proxy Model expects them.
resource.tiles = resource.prefetched_tiles
resource.descriptor_function = resource.graph.descriptor_function
try:
resource.serialized_graph = (
resource.graph.publication.published_graph_active_lang[
0
].serialized_graph
)
except IndexError:
resource.serialized_graph = None

resource.set_node_datatypes(node_datatypes)
jacobtylerwalls marked this conversation as resolved.
Show resolved Hide resolved
resource.set_serialized_graph(get_serialized_graph(resource.graph))
if recalculate_descriptors:
resource.save_descriptors()
if quiet is False and bar is not None:
Expand Down
16 changes: 10 additions & 6 deletions tests/models/resource_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,18 +422,15 @@ def test_provisional_user_can_delete_own_resource(self):
self.assertFalse(result)

def test_recalculate_descriptors_prefetch_related_objects(self):
other_graph = Graph.new(name="Other graph", is_resource=True)
other_graph.publish()
r1 = Resource(graph_id=self.search_model_graphid)
r2 = Resource(graph_id=self.search_model_graphid)
r2 = Resource(graph_id=other_graph.pk)
r1_tile = Tile(
data={self.search_model_creation_date_nodeid: "1941-01-01"},
nodegroup_id=self.search_model_creation_date_nodeid,
)
r1.tiles.append(r1_tile)
r2_tile = Tile(
data={self.search_model_creation_date_nodeid: "1941-01-01"},
nodegroup_id=self.search_model_creation_date_nodeid,
)
r2.tiles.append(r2_tile)
r1.save(index=False)
r2.save(index=False)

Expand Down Expand Up @@ -465,6 +462,13 @@ def test_recalculate_descriptors_prefetch_related_objects(self):
]
self.assertEqual(len(tile_selects), 1)

published_graph_selects = [
q
for q in queries
if q["sql"].startswith('SELECT "published_graphs"."id"')
]
self.assertEqual(len(published_graph_selects), 1)

def test_self_referring_resource_instance_descriptor(self):
# Create a nodegroup with a string node and a resource-instance node.
graph = Graph.new(name="Self-referring descriptor test", is_resource=True)
Expand Down
Loading