Skip to content

Commit

Permalink
optimize retrieval of ancestors by caching
Browse files Browse the repository at this point in the history
  • Loading branch information
thesujai committed Nov 4, 2024
1 parent de9b036 commit fac9514
Showing 1 changed file with 45 additions and 23 deletions.
68 changes: 45 additions & 23 deletions kolibri/core/logger/csv_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,33 @@
"summary": "{}_{}_content_summary_logs_from_{}_to_{}.csv",
}

# Expiry passed as the timeout argument to the cache.set() calls below
# (60 * 10 = 600; presumably seconds, i.e. ten minutes -- confirm against the
# cache backend in use).
CACHE_TIMEOUT = 60 * 10
# Number of content ids sliced off per ContentNode query in
# get_max_ancestor_depth().
BATCH_SIZE = 500

def cache_channel_name(obj):

def cache_content_data(content_id):
    """
    Return the ``(title, ancestors)`` pair for ``content_id``, using the cache.

    Both values are read from the cache first. If either one is missing, the
    first ContentNode matching ``content_id`` is fetched and both cache entries
    are (re)written with ``CACHE_TIMEOUT``. When no node matches, an empty
    title and an empty ancestor list are stored and returned instead.
    """
    title_key = f"{content_id}_ContentNode_title"
    ancestors_key = f"{content_id}_ContentNode_ancestors"

    cached_title = cache.get(title_key)
    cached_ancestors = cache.get(ancestors_key)
    if cached_title is not None and cached_ancestors is not None:
        # Full cache hit: nothing to look up and nothing to store.
        return cached_title, cached_ancestors

    # At least one entry is missing, so refresh both from the database.
    node = ContentNode.objects.filter(content_id=content_id).first()
    title, ancestors = (node.title, node.ancestors) if node else ("", [])

    cache.set(title_key, title, CACHE_TIMEOUT)
    cache.set(ancestors_key, ancestors, CACHE_TIMEOUT)
    return title, ancestors


def get_cached_channel_name(obj):
channel_id = obj["channel_id"]
key = "{id}_ChannelMetadata_name".format(id=channel_id)
channel_name = cache.get(key)
Expand All @@ -36,27 +61,24 @@ def cache_channel_name(obj):
channel_name = ChannelMetadata.objects.get(id=channel_id)
except ChannelMetadata.DoesNotExist:
channel_name = ""
cache.set(key, channel_name, 60 * 10)
cache.set(key, channel_name, CACHE_TIMEOUT)
return channel_name


def get_cached_content_title(obj):
    """
    Return the title of the content node referenced by ``obj``.

    ``obj`` is indexed with the ``"content_id"`` key. The lookup is delegated
    to ``cache_content_data`` so that the title and the ancestors share one
    cache/database code path instead of this function keeping its own copy of
    the caching logic. An empty string is returned when no node matches.
    """
    title, _ = cache_content_data(obj["content_id"])
    return title


# Backward-compatible alias for the name this helper had before it was renamed.
cache_content_title = get_cached_content_title


def get_cached_ancestors(content_id):
    """Return only the ancestors half of ``cache_content_data(content_id)``."""
    return cache_content_data(content_id)[1]


# Per-column formatters handed to ``output_mapper`` as ``output_mappings``.
# Each value is called with the row dict and returns the cell to export.
# Every key appears exactly once: a dict literal silently keeps only the last
# duplicate, so stale entries pointing at the old helper names are dropped.
mappings = {
    "channel_name": get_cached_channel_name,
    "content_title": get_cached_content_title,
    # Rounded to one decimal place.
    "time_spent": lambda x: "{:.1f}".format(round(x["time_spent"], 1)),
    # Floored (not rounded) to four decimal places.
    "progress": lambda x: "{:.4f}".format(math.floor(x["progress"] * 10000.0) / 10000),
}
Expand Down Expand Up @@ -105,10 +127,13 @@ def cache_content_title(obj):

def get_max_ancestor_depth():
    """
    Return the largest ``len(node.ancestors)`` among logged content nodes.

    The content ids come from ContentSummaryLog and are looked up in
    ContentNode in slices of ``BATCH_SIZE`` so that no single ``IN (...)``
    clause grows with the number of logs. Returns 0 when there are no logs or
    no matching nodes.
    """
    # Materialise the distinct ids once; many summary logs share a content id.
    content_ids = list(
        ContentSummaryLog.objects.values_list("content_id", flat=True).distinct()
    )
    max_depth = 0
    for start in range(0, len(content_ids), BATCH_SIZE):
        batch_ids = content_ids[start : start + BATCH_SIZE]
        for node in ContentNode.objects.filter(content_id__in=batch_ids):
            # The node is already loaded, so read its ancestors directly
            # rather than doing another cache/database lookup per node.
            max_depth = max(max_depth, len(node.ancestors))
    return max_depth


Expand All @@ -125,11 +150,8 @@ def add_ancestors_info(row, ancestors, max_depth):

def map_object(item):
    """
    Map one log row to its export form and attach its ancestor information.

    ``item`` is passed through ``output_mapper`` with the module's ``labels``
    and ``mappings``. The ancestors for ``item["content_id"]`` are then fetched
    through the cache (an empty list when no node matches) and handed to
    ``add_ancestors_info`` exactly once. Returns the mapped row.
    """
    mapped_item = output_mapper(item, labels=labels, output_mappings=mappings)
    ancestors = get_cached_ancestors(item["content_id"])
    # NOTE(review): get_max_ancestor_depth() is re-evaluated for every row and
    # scans all logged content ids each time; consider computing it once per
    # export and passing it in.
    add_ancestors_info(mapped_item, ancestors, get_max_ancestor_depth())
    return mapped_item


Expand Down

0 comments on commit fac9514

Please sign in to comment.