Skip to content

Commit

Permalink
Merge pull request #3399 from vkWeb/optimize/search
Browse files Browse the repository at this point in the history
Import from other channels search optimized
  • Loading branch information
bjester authored Oct 7, 2022
2 parents 70f45cd + aae3be1 commit b7470b7
Show file tree
Hide file tree
Showing 21 changed files with 354 additions and 275 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ learningactivities:

set-tsvectors:
python contentcuration/manage.py set_channel_tsvectors
python contentcuration/manage.py set_contentnode_tsvectors
python contentcuration/manage.py set_contentnode_tsvectors --published

###############################################################
# END PRODUCTION COMMANDS #####################################
Expand Down
47 changes: 0 additions & 47 deletions contentcuration/contentcuration/debug/middleware.py

This file was deleted.

11 changes: 8 additions & 3 deletions contentcuration/contentcuration/debug_panel_settings.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from .dev_settings import * # noqa

# These endpoints will throw an error on the django debug panel
# These endpoints will throw an error on the django debug panel.
EXCLUDED_DEBUG_URLS = [
"/content/storage",

# Disabling sync API because as soon as the sync API gets polled
# the current request data gets overwritten.
# Can be removed after websockets deployment.
"/api/sync",
]

DEBUG_PANEL_ACTIVE = True
Expand All @@ -14,10 +19,10 @@ def custom_show_toolbar(request):
) # noqa F405


# if debug_panel exists, add it to our INSTALLED_APPS
# if debug_panel exists, add it to our INSTALLED_APPS.
INSTALLED_APPS += ("debug_panel", "debug_toolbar", "pympler") # noqa F405
MIDDLEWARE += ( # noqa F405
"contentcuration.debug.middleware.CustomDebugPanelMiddleware",
"debug_toolbar.middleware.DebugToolbarMiddleware",
)
DEBUG_TOOLBAR_CONFIG = {
"SHOW_TOOLBAR_CALLBACK": custom_show_toolbar,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@
if (this.isTopic) {
return `${baseUrl}#/${this.node.id}`;
}
return `${baseUrl}#/${this.node.parent}/${this.node.id}`;
return `${baseUrl}#/${this.node.parent_id}/${this.node.id}`;
},
resourcesMsg() {
let count;
Expand All @@ -160,13 +160,8 @@
}
return this.$tr('resourcesCount', { count });
},
numLocations() {
return this.node.location_ids.length;
},
goToLocationLabel() {
return this.numLocations > 1
? this.$tr('goToPluralLocationsAction', { count: this.numLocations })
: this.$tr('goToSingleLocationAction');
return this.$tr('goToSingleLocationAction');
},
isTopic() {
return this.node.kind === ContentKindsNames.TOPIC;
Expand All @@ -189,8 +184,6 @@
$trs: {
tagsList: 'Tags: {tags}',
goToSingleLocationAction: 'Go to location',
goToPluralLocationsAction:
'In {count, number} {count, plural, one {location} other {locations}}',
addToClipboardAction: 'Copy to clipboard',
resourcesCount: '{count, number} {count, plural, one {resource} other {resources}}',
coach: 'Resource for coaches',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@
[this.channelFilter]: true,
page: this.$route.query.page || 1,
exclude: this.currentChannelId,
published: true,
}).then(page => {
this.pageCount = page.total_pages;
this.channels = page.results;
Expand Down
8 changes: 8 additions & 0 deletions contentcuration/contentcuration/tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from importlib import import_module

import mock
from search.models import ContentNodeFullTextSearch

from contentcuration.models import ContentNode
from contentcuration.models import TaskResult


Expand Down Expand Up @@ -39,6 +41,12 @@ def mock_class_instance(target):
else:
target_cls = target

# ContentNode's node_fts field can be handled by Django when tests
# access the database but we mock it so that we don't need to query
# the database. By doing so we get faster test execution.
if type(target_cls) is ContentNode:
target_cls.node_fts = ContentNodeFullTextSearch()

class MockClass(target_cls):
def __new__(cls, *args, **kwargs):
return mock.Mock(spec_set=cls)
Expand Down
7 changes: 3 additions & 4 deletions contentcuration/contentcuration/tests/utils/test_cache.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import mock
from django.test import TestCase
from django.test import SimpleTestCase

from ..helpers import mock_class_instance
from contentcuration.models import ContentNode
from contentcuration.utils.cache import ResourceSizeCache


class ResourceSizeCacheTestCase(TestCase):
class ResourceSizeCacheTestCase(SimpleTestCase):
def setUp(self):
super(ResourceSizeCacheTestCase, self).setUp()
self.node = mock.Mock(spec_set=ContentNode())
self.node = mock_class_instance("contentcuration.models.ContentNode")
self.node.pk = "abcdefghijklmnopqrstuvwxyz"
self.redis_client = mock_class_instance("redis.client.StrictRedis")
self.cache_client = mock_class_instance("django_redis.client.DefaultClient")
Expand Down
8 changes: 4 additions & 4 deletions contentcuration/contentcuration/tests/utils/test_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from dateutil.parser import isoparse
from django.db.models import F
from django.db.models import Max
from django.test import TestCase
from django.test import SimpleTestCase

from ..base import StudioTestCase
from contentcuration.models import ContentNode
from contentcuration.tests.helpers import mock_class_instance
from contentcuration.utils.nodes import calculate_resource_size
from contentcuration.utils.nodes import ResourceSizeHelper
from contentcuration.utils.nodes import SlowCalculationError
Expand Down Expand Up @@ -42,10 +42,10 @@ def test_modified_since(self):

@mock.patch("contentcuration.utils.nodes.ResourceSizeHelper")
@mock.patch("contentcuration.utils.nodes.ResourceSizeCache")
class CalculateResourceSizeTestCase(TestCase):
class CalculateResourceSizeTestCase(SimpleTestCase):
def setUp(self):
super(CalculateResourceSizeTestCase, self).setUp()
self.node = mock.Mock(spec_set=ContentNode())
self.node = mock_class_instance("contentcuration.models.ContentNode")

def assertCalculation(self, cache, helper, force=False):
helper().get_size.return_value = 456
Expand Down
54 changes: 53 additions & 1 deletion contentcuration/contentcuration/utils/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
from django.core.files.storage import default_storage as storage
from django.core.management import call_command
from django.db.models import Count
from django.db.models import Exists
from django.db.models import Max
from django.db.models import OuterRef
from django.db.models import Q
from django.db.models import Subquery
from django.db.models import Sum
from django.db.utils import IntegrityError
from django.template.loader import render_to_string
Expand All @@ -37,6 +40,10 @@
from le_utils.constants import roles
from past.builtins import basestring
from past.utils import old_div
from search.models import ChannelFullTextSearch
from search.models import ContentNodeFullTextSearch
from search.utils import get_fts_annotated_channel_qs
from search.utils import get_fts_annotated_contentnode_qs

from contentcuration import models as ccmodels
from contentcuration.decorators import delay_user_storage_calculation
Expand Down Expand Up @@ -808,6 +815,50 @@ def fill_published_fields(channel, version_notes):
channel.save()


def sync_contentnode_and_channel_tsvectors(channel_id):
"""
Creates, deletes and updates tsvectors of the channel and all its content nodes
to reflect the current state of channel's main tree.
"""
# Update or create channel tsvector entry.
logging.info("Setting tsvector for channel with id {}.".format(channel_id))

channel = (get_fts_annotated_channel_qs()
.values("keywords_tsvector", "main_tree__tree_id")
.get(pk=channel_id))

obj, is_created = ChannelFullTextSearch.objects.update_or_create(channel_id=channel_id, defaults={"keywords_tsvector": channel["keywords_tsvector"]})
del obj

if is_created:
logging.info("Created 1 channel tsvector.")
else:
logging.info("Updated 1 channel tsvector.")

# Update or create contentnodes tsvector entry for channel_id.
logging.info("Setting tsvectors for all main tree contentnodes in channel {}.".format(channel_id))

if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists():
# First, delete nodes that are no longer in main_tree.
nodes_no_longer_in_main_tree = ~Exists(ccmodels.ContentNode.objects.filter(id=OuterRef("contentnode_id"), tree_id=channel["main_tree__tree_id"]))
ContentNodeFullTextSearch.objects.filter(nodes_no_longer_in_main_tree, channel_id=channel_id).delete()

# Now, all remaining nodes are in main_tree, so let's update them.
# Update only changed nodes.
node_tsv_subquery = get_fts_annotated_contentnode_qs(channel_id).filter(id=OuterRef("contentnode_id")).order_by()
ContentNodeFullTextSearch.objects.filter(channel_id=channel_id, contentnode__complete=True, contentnode__changed=True).update(
keywords_tsvector=Subquery(node_tsv_subquery.values("keywords_tsvector")[:1]),
author_tsvector=Subquery(node_tsv_subquery.values("author_tsvector")[:1])
)

# Insert newly created nodes.
# "set_contentnode_tsvectors" command is defined in "search/management/commands" directory.
call_command("set_contentnode_tsvectors",
"--channel-id={}".format(channel_id),
"--tree-id={}".format(channel["main_tree__tree_id"]),
"--complete")


@delay_user_storage_calculation
def publish_channel(
user_id,
Expand All @@ -829,8 +880,9 @@ def publish_channel(
set_channel_icon_encoding(channel)
kolibri_temp_db = create_content_database(channel, force, user_id, force_exercises, progress_tracker=progress_tracker)
increment_channel_version(channel)
mark_all_nodes_as_published(channel)
add_tokens_to_channel(channel)
sync_contentnode_and_channel_tsvectors(channel_id=channel.id)
mark_all_nodes_as_published(channel)
fill_published_fields(channel, version_notes)

# Attributes not getting set for some reason, so just save it here
Expand Down
29 changes: 12 additions & 17 deletions contentcuration/contentcuration/viewsets/channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
from rest_framework.serializers import CharField
from rest_framework.serializers import FloatField
from rest_framework.serializers import IntegerField
from search.models import ChannelFullTextSearch
from search.models import ContentNodeFullTextSearch
from search.utils import get_fts_search_query

from contentcuration.decorators import cache_no_user_data
from contentcuration.models import Change
Expand Down Expand Up @@ -119,23 +122,15 @@ def filter_deleted(self, queryset, name, value):
return queryset.filter(deleted=value)

def filter_keywords(self, queryset, name, value):
# TODO: Wait until we show more metadata on cards to add this back in
# keywords_query = self.main_tree_query.filter(
# Q(tags__tag_name__icontains=value)
# | Q(author__icontains=value)
# | Q(aggregator__icontains=value)
# | Q(provider__icontains=value)
# )
return queryset.annotate(
# keyword_match_count=SQCount(keywords_query, field="content_id"),
primary_token=primary_token_subquery,
).filter(
Q(name__icontains=value)
| Q(description__icontains=value)
| Q(pk__istartswith=value)
| Q(primary_token=value.replace("-", ""))
# | Q(keyword_match_count__gt=0)
)
search_query = get_fts_search_query(value)
dash_replaced_search_query = get_fts_search_query(value.replace("-", ""))

channel_keywords_query = (Exists(ChannelFullTextSearch.objects.filter(
Q(keywords_tsvector=search_query) | Q(keywords_tsvector=dash_replaced_search_query), channel_id=OuterRef("id"))))
contentnode_search_query = (Exists(ContentNodeFullTextSearch.objects.filter(
Q(keywords_tsvector=search_query) | Q(author_tsvector=search_query), channel_id=OuterRef("id"))))

return queryset.filter(Q(channel_keywords_query) | Q(contentnode_search_query))

def filter_languages(self, queryset, name, value):
languages = value.split(",")
Expand Down
1 change: 0 additions & 1 deletion contentcuration/search/admin.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@
from django.core.management.base import BaseCommand
from django.db.models import Exists
from django.db.models import OuterRef
from search.constants import CHANNEL_KEYWORDS_TSVECTOR
from search.models import ChannelFullTextSearch

from contentcuration.models import Channel
from contentcuration.viewsets.channel import primary_token_subquery
from search.utils import get_fts_annotated_channel_qs


logmodule.basicConfig(level=logmodule.INFO)
Expand All @@ -27,10 +24,8 @@ def handle(self, *args, **options):

channel_not_already_inserted_query = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id")))

channel_query = (Channel.objects.filter(channel_not_already_inserted_query,
deleted=False, main_tree__published=True)
.annotate(primary_channel_token=primary_token_subquery,
keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR)
channel_query = (get_fts_annotated_channel_qs().filter(channel_not_already_inserted_query,
deleted=False, main_tree__published=True)
.values("id", "keywords_tsvector"))

insertable_channels = list(channel_query[:CHUNKSIZE])
Expand Down
Loading

0 comments on commit b7470b7

Please sign in to comment.