From b5f51b4cfaa408aca74f7987664ce85cfd87017a Mon Sep 17 00:00:00 2001
From: Richard Tibbles
Date: Wed, 6 Nov 2024 14:59:55 -0800
Subject: [PATCH] Ensure the specific node is returned, not just any with the
 matching content_id. Use node_id when available. Query for most recent
 session log to give parallel results for summary logs.

---
 kolibri/core/logger/csv_export.py | 101 +++++++++++++++++++++---------
 1 file changed, 70 insertions(+), 31 deletions(-)

diff --git a/kolibri/core/logger/csv_export.py b/kolibri/core/logger/csv_export.py
index 31a529142b0..6ef35cc24f7 100644
--- a/kolibri/core/logger/csv_export.py
+++ b/kolibri/core/logger/csv_export.py
@@ -7,6 +7,10 @@
 
 from dateutil import parser
 from django.core.cache import cache
+from django.db.models import F
+from django.db.models import Max
+from django.db.models import OuterRef
+from django.db.models import Subquery
 from django.utils.translation import gettext_lazy as _
 from django.utils.translation import pgettext_lazy
 from le_utils.constants import content_kinds
@@ -29,23 +33,47 @@
 CACHE_TIMEOUT = 60 * 10
 
 
-def add_content_to_cache(content_id, **kwargs):
-    title_key = "{content_id}_ContentNode_title".format(content_id=content_id)
-    ancestors_key = "{content_id}_ContentNode_ancestors".format(content_id=content_id)
+def _optional_node_id(item):
+    if (
+        "most_recent_session_log_extra_fields" in item
+        and item["most_recent_session_log_extra_fields"]
+        and "context" in item["most_recent_session_log_extra_fields"]
+    ):
+        return item["most_recent_session_log_extra_fields"]["context"].get("node_id")
 
-    cache.set(title_key, kwargs.get("title", ""), CACHE_TIMEOUT)
-    cache.set(ancestors_key, kwargs.get("ancestors", []), CACHE_TIMEOUT)
 
+def _key_gen(item, name):
+    node_id = _optional_node_id(item)
+    if node_id:
+        return f"{node_id}_ContentNode_{name}"
+    return f"{item['content_id']}_{item['channel_id']}_ContentNode_{name}"
 
-def get_cached_content_data(content_id):
-    title_key = f"{content_id}_ContentNode_title"
-    ancestors_key = f"{content_id}_ContentNode_ancestors"
 
-    title = cache.get(title_key)
-    ancestors = cache.get(ancestors_key)
+def _title_key(item):
+    return _key_gen(item, "title")
+
+
+def _ancestors_key(item):
+    return _key_gen(item, "ancestors")
+
+
+def add_content_to_cache(item, title, ancestors):
+    cache.set(_title_key(item), title, CACHE_TIMEOUT)
+    cache.set(_ancestors_key(item), ancestors, CACHE_TIMEOUT)
+
+
+def get_cached_content_data(item):
+    title = cache.get(_title_key(item))
+    ancestors = cache.get(_ancestors_key(item))
 
     if title is None or ancestors is None:
-        node = ContentNode.objects.filter(content_id=content_id).first()
+        node_id = _optional_node_id(item)
+        if node_id:
+            node = ContentNode.objects.filter(pk=node_id).first()
+        else:
+            node = ContentNode.objects.filter(
+                content_id=item["content_id"], channel_id=item["channel_id"]
+            ).first()
         if node:
             title = node.title
             ancestors = node.ancestors
@@ -53,7 +81,7 @@ def get_cached_content_data(content_id):
             title = ""
             ancestors = []
 
-        add_content_to_cache(content_id, title=title, ancestors=ancestors)
+        add_content_to_cache(item, title, ancestors)
 
     return title, ancestors
 
@@ -72,8 +100,7 @@ def get_cached_channel_name(obj):
 
 
 def get_cached_content_title(obj):
-    content_id = obj["content_id"]
-    title, _ = get_cached_content_data(content_id)
+    title, _ = get_cached_content_data(obj)
     return title
 
 
@@ -131,19 +158,17 @@ def get_cached_ancestors(content_id):
 )
 
 
-def get_max_ancestor_depth():
-    """Returns one less than the maximum depth of the ancestors of all content nodes"""
-    max_depth = 0
-    content_ids = ContentSummaryLog.objects.values_list("content_id", flat=True)
+def get_max_ancestor_depth(queryset):
+    """
+    Returns one less than the maximum depth of the ancestors of all content nodes.
+    Because we are maxing the level attribute, we don't need to subtract 1 from the result,
+    as it is zero indexed.
+    """
+    content_ids = queryset.values_list("content_id", flat=True)
     nodes = ContentNode.objects.filter(content_id__in=content_ids).only(
         "content_id", "title", "ancestors"
     )
-    for node in nodes:
-        ancestors = node.ancestors
-        # cache it here so the retireival while adding ancestors info into csv is faster
-        add_content_to_cache(node.content_id, title=node.title, ancestors=ancestors)
-        max_depth = max(max_depth, len(ancestors))
-    return max_depth - 1
+    return (nodes.aggregate(max_depth=Max("level"))["max_depth"] or 1) - 1
 
 
 def add_ancestors_info(row, ancestors, max_depth):
@@ -158,16 +183,18 @@ def add_ancestors_info(row, ancestors, max_depth):
     )
 
 
-def map_object(item):
+def map_object(item, topic_headers_length):
     mapped_item = output_mapper(item, labels=labels, output_mappings=mappings)
-    ancestors = get_cached_ancestors(item["content_id"])
-    add_ancestors_info(mapped_item, ancestors, get_max_ancestor_depth())
+    ancestors = get_cached_ancestors(item)
+    add_ancestors_info(mapped_item, ancestors, topic_headers_length)
     return mapped_item
 
 
 classes_info = {
     "session": {
-        "queryset": ContentSessionLog.objects.exclude(kind=content_kinds.QUIZ),
+        "queryset": ContentSessionLog.objects.exclude(kind=content_kinds.QUIZ).annotate(
+            most_recent_session_log_extra_fields=F("extra_fields")
+        ),
         "filename": CSV_EXPORT_FILENAMES["session"],
         "db_columns": (
             "user__username",
@@ -179,10 +206,21 @@
             "time_spent",
             "progress",
            "kind",
+            "most_recent_session_log_extra_fields",
         ),
     },
     "summary": {
-        "queryset": ContentSummaryLog.objects.exclude(kind=content_kinds.QUIZ),
+        "queryset": ContentSummaryLog.objects.exclude(kind=content_kinds.QUIZ).annotate(
+            most_recent_session_log_extra_fields=Subquery(
+                ContentSessionLog.objects.filter(
+                    user=OuterRef("user"),
+                    content_id=OuterRef("content_id"),
+                    channel_id=OuterRef("channel_id"),
+                )
+                .order_by("-end_timestamp")
+                .values("extra_fields")[:1]
+            )
+        ),
         "filename": CSV_EXPORT_FILENAMES["summary"],
         "db_columns": (
             "user__username",
@@ -195,6 +233,7 @@
             "time_spent",
             "progress",
             "kind",
+            "most_recent_session_log_extra_fields",
         ),
     },
 }
@@ -238,7 +277,7 @@ def csv_file_generator(
     # len of topic headers should be equal to the max depth of the content node
     topic_headers = [
         (f"Topic level {i+1}", _(f"Topic level {i+1}"))
-        for i in range(get_max_ancestor_depth())
+        for i in range(get_max_ancestor_depth(queryset))
     ]
    content_id_index = header_labels.index(labels["content_id"])
 
@@ -255,5 +294,5 @@
         for item in queryset.select_related("user", "user__facility").values(
             *log_info["db_columns"]
         ):
-            writer.writerow(map_object(item))
+            writer.writerow(map_object(item, len(topic_headers)))
             yield
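Note on the caching scheme (explanatory sketch, not part of the patch): the cache keys above are built from the annotated most_recent_session_log_extra_fields value when its "context" dict carries a node_id (the exact ContentNode the learner interacted with), and otherwise fall back to the content_id plus channel_id pair, so nodes that share a content_id across channels no longer collide in the cache. The standalone sketch below mirrors _optional_node_id and _key_gen in simplified form; the sample dicts are invented for illustration, and real rows come from the annotated log querysets.

# Simplified, standalone sketch of the cache-key scheme used in this patch.
# Illustration only: the sample dicts below are invented; real rows come from
# the annotated ContentSessionLog/ContentSummaryLog querysets.


def optional_node_id(item):
    # "most_recent_session_log_extra_fields" is annotated onto every exported
    # row; its "context" dict may record the node_id the learner actually used.
    extra_fields = item.get("most_recent_session_log_extra_fields") or {}
    context = extra_fields.get("context") or {}
    return context.get("node_id")


def key_gen(item, name):
    node_id = optional_node_id(item)
    if node_id:
        # Precise key: one cache entry per content node.
        return f"{node_id}_ContentNode_{name}"
    # Fallback: content_id alone is ambiguous across channels, so scope the
    # key by channel_id as well.
    return f"{item['content_id']}_{item['channel_id']}_ContentNode_{name}"


if __name__ == "__main__":
    with_node = {
        "content_id": "abc123",
        "channel_id": "chan456",
        "most_recent_session_log_extra_fields": {"context": {"node_id": "node789"}},
    }
    without_node = {
        "content_id": "abc123",
        "channel_id": "chan456",
        "most_recent_session_log_extra_fields": None,
    }
    print(key_gen(with_node, "title"))  # node789_ContentNode_title
    print(key_gen(without_node, "title"))  # abc123_chan456_ContentNode_title

Summary rows get the same keys because the Subquery annotation in the patch copies the extra_fields of the most recent matching session log (ordered by -end_timestamp) onto each summary row.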