From e835ce34e0e104290476e96b833f771d12565c3c Mon Sep 17 00:00:00 2001
From: Sujai Kumar Gupta
Date: Mon, 4 Nov 2024 12:59:19 +0530
Subject: [PATCH 1/6] add function to get the max_ancestor_depth

---
 kolibri/core/logger/csv_export.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kolibri/core/logger/csv_export.py b/kolibri/core/logger/csv_export.py
index d63f60acf42..208091e65d2 100644
--- a/kolibri/core/logger/csv_export.py
+++ b/kolibri/core/logger/csv_export.py
@@ -103,6 +103,16 @@ def cache_content_title(obj):
         )
     )
 
+
+def get_max_ancestor_depth():
+    max_depth = 0
+    for node in ContentNode.objects.filter(
+        content_id__in=ContentSummaryLog.objects.values_list("content_id", flat=True)
+    ):
+        max_depth = max(max_depth, len(node.ancestors))
+    return max_depth
+
+
 map_object = partial(output_mapper, labels=labels, output_mappings=mappings)

From c84ff836fe598f0d698103953e6e47c058c8d04b Mon Sep 17 00:00:00 2001
From: Sujai Kumar Gupta
Date: Mon, 4 Nov 2024 13:01:53 +0530
Subject: [PATCH 2/6] make header_labels mutable

---
 kolibri/core/logger/csv_export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kolibri/core/logger/csv_export.py b/kolibri/core/logger/csv_export.py
index 208091e65d2..0d909ea33c7 100644
--- a/kolibri/core/logger/csv_export.py
+++ b/kolibri/core/logger/csv_export.py
@@ -181,7 +181,7 @@ def csv_file_generator(
         queryset = queryset.filter(start_timestamp__lte=end)
 
     # Exclude completion timestamp for the sessionlog CSV
-    header_labels = tuple(
+    header_labels = list(
         label
         for label in labels.values()
         if log_type == "summary" or label != labels["completion_timestamp"]

From caf881128391a1470b2b1add8c5a9ea5c9892606 Mon Sep 17 00:00:00 2001
From: Sujai Kumar Gupta
Date: Mon, 4 Nov 2024 13:02:35 +0530
Subject: [PATCH 3/6] add topic headers to header_labels

---
 kolibri/core/logger/csv_export.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kolibri/core/logger/csv_export.py b/kolibri/core/logger/csv_export.py
index 0d909ea33c7..5098ae85f31 100644
--- a/kolibri/core/logger/csv_export.py
+++ b/kolibri/core/logger/csv_export.py
@@ -186,6 +186,13 @@ def csv_file_generator(
         for label in labels.values()
         if log_type == "summary" or label != labels["completion_timestamp"]
     )
+    # the number of topic headers should equal the max ancestor depth of the content nodes
+    topic_headers = [
+        (f"Topic level {i+1}", _(f"Topic level {i+1}"))
+        for i in range(get_max_ancestor_depth())
+    ]
+
+    header_labels += [label for _, label in topic_headers]
 
     csv_file = open_csv_for_writing(filepath)

From de9b0366fa3355ff5c8981784a956969cbfac7ec Mon Sep 17 00:00:00 2001
From: Sujai Kumar Gupta
Date: Mon, 4 Nov 2024 13:04:22 +0530
Subject: [PATCH 4/6] update map_object to add the ancestor info before
 writing to the csv

---
 kolibri/core/logger/csv_export.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/kolibri/core/logger/csv_export.py b/kolibri/core/logger/csv_export.py
index 5098ae85f31..a6275b63159 100644
--- a/kolibri/core/logger/csv_export.py
+++ b/kolibri/core/logger/csv_export.py
@@ -4,7 +4,6 @@
 import math
 import os
 from collections import OrderedDict
-from functools import partial
 
 from dateutil import parser
 from django.core.cache import cache
@@ -113,7 +112,25 @@ def get_max_ancestor_depth():
     return max_depth
 
 
-map_object = partial(output_mapper, labels=labels, output_mappings=mappings)
+def add_ancestors_info(row, ancestors, max_depth):
+    row.update(
+        {
+            f"Topic level {level + 1}": ancestors[level]["title"]
+            if level < len(ancestors)
+            else ""
+            for level in range(max_depth)
+        }
+    )
+
+
+def map_object(item):
+    mapped_item = output_mapper(item, labels=labels, output_mappings=mappings)
+    node = ContentNode.objects.filter(content_id=item["content_id"]).first()
+    if node and node.ancestors:
+        add_ancestors_info(mapped_item, node.ancestors, get_max_ancestor_depth())
+    else:
+        add_ancestors_info(mapped_item, [], get_max_ancestor_depth())
+    return mapped_item
 
 
 classes_info = {

From 03113e058b7db1cb5bfe3f135f2dfa72562f60e5 Mon Sep 17 00:00:00 2001
From: Sujai Kumar Gupta
Date: Tue, 5 Nov 2024 01:44:31 +0530
Subject: [PATCH 5/6] optimize retrieval of ancestors by caching

---
 kolibri/core/logger/csv_export.py | 76 +++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 23 deletions(-)

diff --git a/kolibri/core/logger/csv_export.py b/kolibri/core/logger/csv_export.py
index a6275b63159..afa7de1ef81 100644
--- a/kolibri/core/logger/csv_export.py
+++ b/kolibri/core/logger/csv_export.py
@@ -26,8 +26,39 @@
     "summary": "{}_{}_content_summary_logs_from_{}_to_{}.csv",
 }
 
+CACHE_TIMEOUT = 60 * 10
 
-def cache_channel_name(obj):
+
+def add_content_to_cache(content_id, **kwargs):
+    title_key = "{content_id}_ContentNode_title".format(content_id=content_id)
+    ancestors_key = "{content_id}_ContentNode_ancestors".format(content_id=content_id)
+
+    cache.set(title_key, kwargs.get("title", ""), CACHE_TIMEOUT)
+    cache.set(ancestors_key, kwargs.get("ancestors", []), CACHE_TIMEOUT)
+
+
+def get_cached_content_data(content_id):
+    title_key = f"{content_id}_ContentNode_title"
+    ancestors_key = f"{content_id}_ContentNode_ancestors"
+
+    title = cache.get(title_key)
+    ancestors = cache.get(ancestors_key)
+
+    if title is None or ancestors is None:
+        node = ContentNode.objects.filter(content_id=content_id).first()
+        if node:
+            title = node.title
+            ancestors = node.ancestors
+        else:
+            title = ""
+            ancestors = []
+
+        add_content_to_cache(content_id, title=title, ancestors=ancestors)
+
+    return title, ancestors
+
+
+def get_cached_channel_name(obj):
     channel_id = obj["channel_id"]
     key = "{id}_ChannelMetadata_name".format(id=channel_id)
     channel_name = cache.get(key)
@@ -36,27 +67,24 @@ def cache_channel_name(obj):
             channel_name = ChannelMetadata.objects.get(id=channel_id)
         except ChannelMetadata.DoesNotExist:
             channel_name = ""
-        cache.set(key, channel_name, 60 * 10)
+        cache.set(key, channel_name, CACHE_TIMEOUT)
     return channel_name
 
 
-def cache_content_title(obj):
+def get_cached_content_title(obj):
     content_id = obj["content_id"]
-    key = "{id}_ContentNode_title".format(id=content_id)
-    title = cache.get(key)
-    if title is None:
-        node = ContentNode.objects.filter(content_id=content_id).first()
-        if node:
-            title = node.title
-        else:
-            title = ""
-        cache.set(key, title, 60 * 10)
+    title, _ = get_cached_content_data(content_id)
     return title
 
 
+def get_cached_ancestors(content_id):
+    _, ancestors = get_cached_content_data(content_id)
+    return ancestors
+
+
 mappings = {
-    "channel_name": cache_channel_name,
-    "content_title": cache_content_title,
+    "channel_name": get_cached_channel_name,
+    "content_title": get_cached_content_title,
     "time_spent": lambda x: "{:.1f}".format(round(x["time_spent"], 1)),
     "progress": lambda x: "{:.4f}".format(math.floor(x["progress"] * 10000.0) / 10000),
 }
@@ -105,10 +133,15 @@ def cache_content_title(obj):
 
 def get_max_ancestor_depth():
     max_depth = 0
-    for node in ContentNode.objects.filter(
-        content_id__in=ContentSummaryLog.objects.values_list("content_id", flat=True)
-    ):
-        max_depth = max(max_depth, len(node.ancestors))
+    content_ids = ContentSummaryLog.objects.values_list("content_id", flat=True)
+    nodes = ContentNode.objects.filter(content_id__in=content_ids).only(
+        "content_id", "title", "ancestors"
+    )
+    for node in nodes:
+        ancestors = node.ancestors
+        # cache it here so the retrieval while adding the ancestor info to the csv is faster
+        add_content_to_cache(node.content_id, title=node.title, ancestors=ancestors)
+        max_depth = max(max_depth, len(ancestors))
     return max_depth
 
 
@@ -125,11 +158,8 @@ def add_ancestors_info(row, ancestors, max_depth):
 
 def map_object(item):
     mapped_item = output_mapper(item, labels=labels, output_mappings=mappings)
-    node = ContentNode.objects.filter(content_id=item["content_id"]).first()
-    if node and node.ancestors:
-        add_ancestors_info(mapped_item, node.ancestors, get_max_ancestor_depth())
-    else:
-        add_ancestors_info(mapped_item, [], get_max_ancestor_depth())
+    ancestors = get_cached_ancestors(item["content_id"])
+    add_ancestors_info(mapped_item, ancestors, get_max_ancestor_depth())
     return mapped_item

From 93af375f2dd4cfd215dde3d5a6233d468001f52f Mon Sep 17 00:00:00 2001
From: Sujai Kumar Gupta
Date: Wed, 6 Nov 2024 22:36:47 +0530
Subject: [PATCH 6/6] move the ancestor info into the middle of the csv and
 ignore the first ancestor

---
 kolibri/core/logger/csv_export.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kolibri/core/logger/csv_export.py b/kolibri/core/logger/csv_export.py
index afa7de1ef81..31a529142b0 100644
--- a/kolibri/core/logger/csv_export.py
+++ b/kolibri/core/logger/csv_export.py
@@ -132,6 +132,7 @@ def get_cached_ancestors(content_id):
 
 
 def get_max_ancestor_depth():
+    """Returns one less than the maximum depth of the ancestors of all content nodes"""
     max_depth = 0
     content_ids = ContentSummaryLog.objects.values_list("content_id", flat=True)
     nodes = ContentNode.objects.filter(content_id__in=content_ids).only(
@@ -142,10 +143,11 @@ def get_max_ancestor_depth():
         # cache it here so the retrieval while adding the ancestor info to the csv is faster
         add_content_to_cache(node.content_id, title=node.title, ancestors=ancestors)
         max_depth = max(max_depth, len(ancestors))
-    return max_depth
+    return max_depth - 1
 
 
 def add_ancestors_info(row, ancestors, max_depth):
+    ancestors = ancestors[1:]
     row.update(
         {
             f"Topic level {level + 1}": ancestors[level]["title"]
@@ -239,7 +241,10 @@ def csv_file_generator(
         for i in range(get_max_ancestor_depth())
     ]
 
-    header_labels += [label for _, label in topic_headers]
+    content_id_index = header_labels.index(labels["content_id"])
+    header_labels[content_id_index:content_id_index] = [
+        label for _, label in topic_headers
+    ]
 
     csv_file = open_csv_for_writing(filepath)
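
---

Below, a minimal standalone sketch of what the final add_ancestors_info helper does
to a CSV row after this series. The ancestors list here is hand-made sample data
standing in for what ContentNode.ancestors returns (dicts carrying a "title" key);
the row contents are hypothetical, not a real log entry:

    def add_ancestors_info(row, ancestors, max_depth):
        # as of PATCH 6/6: skip the first ancestor (the channel root)
        ancestors = ancestors[1:]
        row.update(
            {
                f"Topic level {level + 1}": ancestors[level]["title"]
                if level < len(ancestors)
                else ""
                for level in range(max_depth)
            }
        )

    row = {"content_title": "Intro to fractions"}  # hypothetical mapped row
    sample_ancestors = [
        {"id": "root", "title": "Sample Channel"},  # dropped by ancestors[1:]
        {"id": "t1", "title": "Math"},
        {"id": "t2", "title": "Arithmetic"},
    ]
    add_ancestors_info(row, sample_ancestors, max_depth=3)
    # row now also contains:
    #   "Topic level 1": "Math"
    #   "Topic level 2": "Arithmetic"
    #   "Topic level 3": ""   (padded so every row has the same columns)

Shallower nodes are padded with empty strings up to max_depth, which keeps the
column count identical across all rows of the export.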