From 357204c936e9d905c567465e2423ef8df86296a1 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Thu, 2 Jun 2022 16:42:36 +0530 Subject: [PATCH 01/25] Optimized import from other channels search --- .../contentcuration/dev_settings.py | 11 ++++ .../contentcuration/tests/testdata.py | 18 +++--- .../contentcuration/viewsets/contentnode.py | 3 +- contentcuration/search/tests.py | 55 +++++++++++++++++++ .../search/viewsets/contentnode.py | 2 +- 5 files changed, 79 insertions(+), 10 deletions(-) diff --git a/contentcuration/contentcuration/dev_settings.py b/contentcuration/contentcuration/dev_settings.py index d81d23a993..3dcbd5bf34 100644 --- a/contentcuration/contentcuration/dev_settings.py +++ b/contentcuration/contentcuration/dev_settings.py @@ -6,3 +6,14 @@ ROOT_URLCONF = "contentcuration.dev_urls" INSTALLED_APPS += ("drf_yasg",) + +REST_FRAMEWORK = { + 'DEFAULT_PERMISSION_CLASSES': ( + 'rest_framework.permissions.IsAuthenticated', + ), + 'DEFAULT_AUTHENTICATION_CLASSES': ( + 'rest_framework.authentication.SessionAuthentication', + 'rest_framework.authentication.BasicAuthentication', + 'rest_framework.authentication.TokenAuthentication', + ) +} diff --git a/contentcuration/contentcuration/tests/testdata.py b/contentcuration/contentcuration/tests/testdata.py index 50895337b1..5560e853a5 100644 --- a/contentcuration/contentcuration/tests/testdata.py +++ b/contentcuration/contentcuration/tests/testdata.py @@ -195,21 +195,23 @@ def node(data, parent=None): return new_node -def tree(parent=None): +def tree(parent=None, tree_data=None): # Read from json fixture - filepath = os.path.sep.join([os.path.dirname(__file__), "fixtures", "tree.json"]) - with open(filepath, "rb") as jsonfile: - data = json.load(jsonfile) + if tree_data is None: + filepath = os.path.sep.join([os.path.dirname(__file__), "fixtures", "tree.json"]) + with open(filepath, "rb") as jsonfile: + tree_data = json.load(jsonfile) - return node(data, parent) + return node(tree_data, parent) -def channel(name="testchannel"): +def channel(name="testchannel", create_main_tree=True, main_tree_data=None): channel = cc.Channel.objects.create(name=name) channel.save() - channel.main_tree = tree() - channel.save() + if create_main_tree: + channel.main_tree = tree(tree_data=main_tree_data) + channel.save() return channel diff --git a/contentcuration/contentcuration/viewsets/contentnode.py b/contentcuration/contentcuration/viewsets/contentnode.py index c7bb2e222a..44169a7532 100644 --- a/contentcuration/contentcuration/viewsets/contentnode.py +++ b/contentcuration/contentcuration/viewsets/contentnode.py @@ -610,7 +610,8 @@ def delete_from_changes(self, changes): def dict_if_none(obj, field_name=None): - return obj[field_name] if obj[field_name] else {} + value = obj.get(field_name) + return value if value else {} # Apply mixin first to override ValuesViewset diff --git a/contentcuration/search/tests.py b/contentcuration/search/tests.py index 2341fc9d1b..9d0aff4f4d 100644 --- a/contentcuration/search/tests.py +++ b/contentcuration/search/tests.py @@ -29,3 +29,58 @@ def test_filter_channels_by_edit(self): ) self.assertEqual(response.status_code, 200, response.content) self.assertNotEqual(response.data["results"], []) + + def test_search_result(self): + # Create two users + user_a = testdata.user(email="a@a.com") + user_b = testdata.user(email="b@b.com") + + # Create two channels with two editors + test_tree_data = { + "node_id": "00000000000000000000000000000000", + "title": "Root topic node", + "kind_id": "topic", + "children": [ + { + 
"node_id": "00000000000000000000000000000001", + "title": "Kolibri video", + "kind_id": "video", + }, + ] + } + + channel_a = testdata.channel(name="user_a_channel", main_tree_data=test_tree_data) + channel_a.editors.add(user_a) + + channel_b = testdata.channel(name="user_b_channel", create_main_tree=False) + channel_b.editors.add(user_b) + + # Publish channel_a + channel_a.main_tree.publishing = False + channel_a.main_tree.changed = False + channel_a.main_tree.published = True + channel_a.main_tree.save() + channel_a.public = True + channel_a.save() + + # Import resources from channel_a to channel_b + channel_a.main_tree.copy_to(channel_b.main_tree, batch_size=1) + channel_b.main_tree.refresh_from_db() + + # Send request from user_b to the search endpoint + self.client.force_authenticate(user=user_b) + response = self.client.get( + reverse("search-list"), + data={ + "channel_list": "public", + "keywords": "video" + }, + format="json", + ) + + # Assert whether the location_ids are of accessible nodes or not + kolibri_video_node = channel_b.main_tree.get_descendants().filter(title="Kolibri video").first() + + # The ids in location_ids should be of channel_b's ContentNode only + self.assertEqual(len(response.data["results"][0]["location_ids"]), 1) + self.assertEqual(response.data["results"][0]["location_ids"][0], kolibri_video_node.cloned_source_id) diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 54b5f437aa..7736b1f98a 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -162,7 +162,7 @@ def annotate_queryset(self, queryset): 2. Annotate lists of content node and channel pks """ # Get accessible content nodes that match the content id - content_id_query = ContentNode.filter_view_queryset(ContentNode.objects.all(), self.request.user).filter( + content_id_query = queryset.filter( content_id=OuterRef("content_id") ) From d24416cfbfd176d9dd0d3172b646a18629e8e449 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Wed, 15 Jun 2022 13:47:24 -0700 Subject: [PATCH 02/25] Add search test for channel filtering and location_ids handling --- .../contentcuration/tests/testdata.py | 2 + contentcuration/search/tests.py | 75 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/contentcuration/contentcuration/tests/testdata.py b/contentcuration/contentcuration/tests/testdata.py index 5560e853a5..b857dfb513 100644 --- a/contentcuration/contentcuration/tests/testdata.py +++ b/contentcuration/contentcuration/tests/testdata.py @@ -145,6 +145,7 @@ def node(data, parent=None): sort_order=data.get('sort_order', 1), complete=True, extra_fields=data.get('extra_fields'), + grade_levels="{}", ) new_node.save() video_file = fileobj_video(contents=b"Video File") @@ -171,6 +172,7 @@ def node(data, parent=None): content_id=data.get('content_id') or data['node_id'], sort_order=data.get('sort_order', 1), complete=True, + grade_levels="{}", ) new_node.save() diff --git a/contentcuration/search/tests.py b/contentcuration/search/tests.py index 9d0aff4f4d..bcb3b2aa99 100644 --- a/contentcuration/search/tests.py +++ b/contentcuration/search/tests.py @@ -2,6 +2,7 @@ from django.urls import reverse +from contentcuration.models import Channel from contentcuration.tests import testdata from contentcuration.tests.base import StudioAPITestCase @@ -84,3 +85,77 @@ def test_search_result(self): # The ids in location_ids should be of channel_b's ContentNode only 
self.assertEqual(len(response.data["results"][0]["location_ids"]), 1) self.assertEqual(response.data["results"][0]["location_ids"][0], kolibri_video_node.cloned_source_id) + + def test_channel_list_filter_and_location_ids(self): + users = [] + channels = [] + for i in range(4): + user = testdata.user(email="a{}@a.com".format(i)) + users.append(user) + channel = Channel.objects.create(name="user_a{}_channel".format(i)) + channel.save() + channels.append(channel) + channel.editors.add(user) + + public_channel, editable_channel, viewable_channel, inaccessible_channel = channels + + public_video_node = testdata.node({ + "title": "Kolibri video", + "kind_id": "video", + }, parent=public_channel.main_tree) + public_video_node.complete = True + public_video_node.published = True + public_video_node.changed = False + public_video_node.save() + + public_channel.main_tree.published = True + public_channel.main_tree.changed = False + public_channel.main_tree.save() + + public_channel.public = True + public_channel.save() + + user_b = users[1] + viewable_channel.viewers.add(user_b) + + public_video_node.refresh_from_db() + public_video_node.copy_to(target=editable_channel.main_tree) + public_video_node.copy_to(target=viewable_channel.main_tree) + public_video_node.copy_to(target=inaccessible_channel.main_tree) + + editable_channel.main_tree.refresh_from_db() + editable_video_node = editable_channel.main_tree.get_descendants().first() + viewable_channel.main_tree.refresh_from_db() + viewable_video_node = viewable_channel.main_tree.get_descendants().first() + inaccessible_channel.main_tree.refresh_from_db() + inaccessible_video_node = inaccessible_channel.main_tree.get_descendants().first() + + # Send request from user_b to the search endpoint + self.client.force_authenticate(user=user_b) + + for channel_list in ("public", "edit", "view"): + response = self.client.get( + reverse("search-list"), + data={ + "channel_list": channel_list, + "keywords": "video" + }, + format="json", + ) + + for result in response.data["results"]: + self.assertNotEqual(result["id"], inaccessible_video_node.id) + + if channel_list == "public": + self.assertEqual(result["id"], public_video_node.id) + elif channel_list == "edit": + self.assertEqual(result["id"], editable_video_node.id) + elif channel_list == "view": + self.assertEqual(result["id"], viewable_video_node.id) + + location_ids = result["location_ids"] + self.assertEqual(len(location_ids), 3) + self.assertIn(editable_video_node.id, location_ids) + self.assertIn(viewable_video_node.id, location_ids) + self.assertIn(public_video_node.id, location_ids) + self.assertNotIn(inaccessible_video_node.id, location_ids) From 676526e4f7de518066f3af8acc4aa9f539a66ba2 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Thu, 30 Jun 2022 13:18:26 +0530 Subject: [PATCH 03/25] Fix autodiscovery of search tests --- contentcuration/search/tests/__init__.py | 0 .../search/{tests.py => tests/test_search.py} | 0 contentcuration/search/viewsets/contentnode.py | 17 ++++++++++------- 3 files changed, 10 insertions(+), 7 deletions(-) create mode 100644 contentcuration/search/tests/__init__.py rename contentcuration/search/{tests.py => tests/test_search.py} (100%) diff --git a/contentcuration/search/tests/__init__.py b/contentcuration/search/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/contentcuration/search/tests.py b/contentcuration/search/tests/test_search.py similarity index 100% rename from contentcuration/search/tests.py rename to 
contentcuration/search/tests/test_search.py diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 7736b1f98a..9c61c6092a 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -13,18 +13,19 @@ from django_filters.rest_framework import CharFilter from le_utils.constants import content_kinds from le_utils.constants import roles +from rest_framework.permissions import IsAuthenticated from contentcuration.models import Channel from contentcuration.models import ContentNode from contentcuration.models import File from contentcuration.utils.pagination import CachedListPagination from contentcuration.viewsets.base import RequiredFilterSet +from contentcuration.viewsets.base import ValuesViewset from contentcuration.viewsets.common import NotNullMapArrayAgg from contentcuration.viewsets.common import SQArrayAgg from contentcuration.viewsets.common import SQCount from contentcuration.viewsets.common import UUIDFilter from contentcuration.viewsets.common import UUIDInFilter -from contentcuration.viewsets.contentnode import ContentNodeViewSet class ListPagination(CachedListPagination): @@ -65,9 +66,11 @@ def filter_channel_list(self, queryset, name, value): def filter_keywords(self, queryset, name, value): filter_query = Q(title__icontains=value) | Q(description__icontains=value) + tags_node_ids = ContentNode.tags.through.objects.filter( contenttag__tag_name__icontains=value - ).values_list("contentnode_id", flat=True)[:250] + ).values_list("contentnode_id", flat=True) + # Check if we have a Kolibri node id or ids and add them to the search if so. # Add to, rather than replace, the filters so that we never misinterpret a search term as a UUID. # node_ids = uuid_re.findall(value) + list(tags_node_ids) @@ -77,10 +80,8 @@ def filter_keywords(self, queryset, name, value): filter_query |= Q(node_id=node_id) filter_query |= Q(content_id=node_id) filter_query |= Q(id=node_id) - for node_id in tags_node_ids: - filter_query |= Q(id=node_id) - return queryset.filter(filter_query) + return queryset.filter(Q(id__in=list(tags_node_ids)) | filter_query) def filter_author(self, queryset, name, value): return queryset.filter( @@ -130,9 +131,11 @@ class Meta: ) -class SearchContentNodeViewSet(ContentNodeViewSet): +class SearchContentNodeViewSet(ValuesViewset): + queryset = ContentNode.objects.all() filterset_class = ContentNodeFilter pagination_class = ListPagination + permission_classes = [IsAuthenticated] values = ( "id", "content_id", @@ -162,7 +165,7 @@ def annotate_queryset(self, queryset): 2. 
Annotate lists of content node and channel pks """ # Get accessible content nodes that match the content id - content_id_query = queryset.filter( + content_id_query = ContentNode.filter_view_queryset(ContentNode.objects.all(), self.request.user).filter( content_id=OuterRef("content_id") ) From 6d40c5559556adb9894740270bc7e6bc3e67208d Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Sat, 9 Jul 2022 19:47:40 +0530 Subject: [PATCH 04/25] Remove location_ids, zero db queries on descendant resource count --- .../contentcuration/dev_settings.py | 4 ++ .../views/ImportFromChannels/BrowsingCard.vue | 9 +-- .../search/viewsets/contentnode.py | 72 ++++++------------- requirements-dev.in | 1 + requirements-dev.txt | 6 +- 5 files changed, 32 insertions(+), 60 deletions(-) diff --git a/contentcuration/contentcuration/dev_settings.py b/contentcuration/contentcuration/dev_settings.py index 3dcbd5bf34..9df785afd1 100644 --- a/contentcuration/contentcuration/dev_settings.py +++ b/contentcuration/contentcuration/dev_settings.py @@ -1,4 +1,6 @@ # flake8: noqa +from dotenv import load_dotenv + from .not_production_settings import * # noqa DEBUG = True @@ -17,3 +19,5 @@ 'rest_framework.authentication.TokenAuthentication', ) } + +load_dotenv(override=True) # take environment variables from .env. diff --git a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue index d8e3a30a30..f3451c96b8 100644 --- a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue +++ b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue @@ -155,13 +155,8 @@ } return this.$tr('resourcesCount', { count }); }, - numLocations() { - return this.node.location_ids.length; - }, goToLocationLabel() { - return this.numLocations > 1 - ? 
this.$tr('goToPluralLocationsAction', { count: this.numLocations }) - : this.$tr('goToSingleLocationAction'); + return this.$tr('goToSingleLocationAction'); }, isTopic() { return this.node.kind === ContentKindsNames.TOPIC; @@ -184,8 +179,6 @@ $trs: { tagsList: 'Tags: {tags}', goToSingleLocationAction: 'Go to location', - goToPluralLocationsAction: - 'In {count, number} {count, plural, one {location} other {locations}}', addToClipboardAction: 'Copy to clipboard', resourcesCount: '{count, number} {count, plural, one {resource} other {resources}}', coach: 'Resource for coaches', diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 9c61c6092a..8fab7f4bbe 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -1,14 +1,12 @@ import re -from django.db.models import Case +from django.db.models import ExpressionWrapper from django.db.models import F from django.db.models import IntegerField from django.db.models import OuterRef from django.db.models import Q from django.db.models import Subquery from django.db.models import Value -from django.db.models import When -from django.db.models.functions import Coalesce from django_filters.rest_framework import BooleanFilter from django_filters.rest_framework import CharFilter from le_utils.constants import content_kinds @@ -22,10 +20,9 @@ from contentcuration.viewsets.base import RequiredFilterSet from contentcuration.viewsets.base import ValuesViewset from contentcuration.viewsets.common import NotNullMapArrayAgg -from contentcuration.viewsets.common import SQArrayAgg -from contentcuration.viewsets.common import SQCount from contentcuration.viewsets.common import UUIDFilter from contentcuration.viewsets.common import UUIDInFilter +from contentcuration.viewsets.contentnode import get_title class ListPagination(CachedListPagination): @@ -136,6 +133,7 @@ class SearchContentNodeViewSet(ValuesViewset): filterset_class = ContentNodeFilter pagination_class = ListPagination permission_classes = [IsAuthenticated] + values = ( "id", "content_id", @@ -154,71 +152,43 @@ class SearchContentNodeViewSet(ValuesViewset): "modified", "parent_id", "changed", - "location_ids", "content_tags", "original_channel_name", ) + field_map = { + "title": get_title, + } + + def get_queryset(self): + queryset = super(SearchContentNodeViewSet, self).get_queryset() + return ContentNode._annotate_channel_id(queryset) + def annotate_queryset(self, queryset): """ 1. Do a distinct by 'content_id,' using the original node if possible 2. 
Annotate lists of content node and channel pks """ - # Get accessible content nodes that match the content id - content_id_query = ContentNode.filter_view_queryset(ContentNode.objects.all(), self.request.user).filter( - content_id=OuterRef("content_id") - ) - - # Combine by unique content id - deduped_content_query = ( - content_id_query.filter(content_id=OuterRef("content_id")) - .annotate( - is_original=Case( - When(original_source_node_id=F("node_id"), then=Value(1)), - default=Value(2), - output_field=IntegerField(), - ), - ) - .order_by("is_original", "created") - ) - queryset = queryset.filter( - pk__in=Subquery(deduped_content_query.values_list("id", flat=True)[:1]) - ).order_by() - thumbnails = File.objects.filter( contentnode=OuterRef("id"), preset__thumbnail=True ) - descendant_resources = ( - ContentNode.objects.filter( - tree_id=OuterRef("tree_id"), - lft__gt=OuterRef("lft"), - rght__lt=OuterRef("rght"), - ) - .exclude(kind_id=content_kinds.TOPIC) - .values("id", "role_visibility", "changed") - .order_by() - ) - original_channel_name = Coalesce( - Subquery( - Channel.objects.filter(pk=OuterRef("original_channel_id")).values( - "name" - )[:1] - ), - Subquery( - Channel.objects.filter(main_tree__tree_id=OuterRef("tree_id")).values( - "name" - )[:1] - ), + descendant_resources_count = ExpressionWrapper(((F("rght") - F("lft") - Value(1)) / Value(2)), output_field=IntegerField()) + + channel_name = Subquery( + Channel.objects.filter(pk=OuterRef("channel_id")).values( + "name" + )[:1] ) + queryset = queryset.annotate( - location_ids=SQArrayAgg(content_id_query, field="id"), - resource_count=SQCount(descendant_resources, field="id"), + resource_count=descendant_resources_count, thumbnail_checksum=Subquery(thumbnails.values("checksum")[:1]), thumbnail_extension=Subquery( thumbnails.values("file_format__extension")[:1] ), content_tags=NotNullMapArrayAgg("tags__tag_name"), - original_channel_name=original_channel_name, + original_channel_name=channel_name, ) + return queryset diff --git a/requirements-dev.in b/requirements-dev.in index 6d4a04ff49..ea6f900871 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -41,3 +41,4 @@ tabulate==0.8.2 fonttools flower==0.9.4 minio==7.1.1 +python-dotenv diff --git a/requirements-dev.txt b/requirements-dev.txt index 2961f78cec..0c21ce1c18 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile +# This file is autogenerated by pip-compile with python 3.8 # To update, run: # # pip-compile requirements-dev.in @@ -256,6 +256,8 @@ python-dateutil==2.8.1 # via # -c requirements.txt # faker +python-dotenv==0.20.0 + # via -r requirements-dev.in python-jsonrpc-server==0.4.0 # via python-language-server python-language-server==0.36.2 @@ -286,6 +288,8 @@ rope==0.19.0 # via -r requirements-dev.in ruamel-yaml==0.17.4 # via drf-yasg +ruamel-yaml-clib==0.2.6 + # via ruamel-yaml service-factory==0.1.6 # via -r requirements-dev.in six==1.16.0 From 094f05f7ef0e995ebccf2ba5da9aa41b8267a152 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Tue, 12 Jul 2022 14:19:28 +0530 Subject: [PATCH 05/25] Upgrade django debug toolbar and fix its settings --- .../contentcuration/debug/__init__.py | 0 .../contentcuration/debug/middleware.py | 47 ------------------- .../contentcuration/debug_panel_settings.py | 2 +- requirements-dev.in | 2 +- requirements-dev.txt | 2 +- 5 files changed, 3 insertions(+), 50 deletions(-) delete mode 100644 contentcuration/contentcuration/debug/__init__.py delete mode 100644 
contentcuration/contentcuration/debug/middleware.py diff --git a/contentcuration/contentcuration/debug/__init__.py b/contentcuration/contentcuration/debug/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/contentcuration/contentcuration/debug/middleware.py b/contentcuration/contentcuration/debug/middleware.py deleted file mode 100644 index 803a94f89b..0000000000 --- a/contentcuration/contentcuration/debug/middleware.py +++ /dev/null @@ -1,47 +0,0 @@ -import threading -import time - -import debug_panel.urls -from debug_panel.cache import cache -from debug_panel.middleware import DebugPanelMiddleware -from django.urls import reverse - - -class CustomDebugPanelMiddleware(DebugPanelMiddleware): - """ - Custom version to fix SQL escaping: - https://github.com/recamshak/django-debug-panel/issues/17#issuecomment-366268893 - """ - - def process_response(self, request, response): - """ - Store the DebugToolbarMiddleware rendered toolbar into a cache store. - The data stored in the cache are then reachable from an URL that is appened - to the HTTP response header under the 'X-debug-data-url' key. - """ - toolbar = self.__class__.debug_toolbars.get( - threading.current_thread().ident, None - ) - - response = super(DebugPanelMiddleware, self).process_response(request, response) - - if toolbar: - # for django-debug-toolbar >= 1.4 - for panel in reversed(toolbar.enabled_panels): - if ( - hasattr(panel, "generate_stats") and not panel.get_stats() - ): # PATCH HERE - panel.generate_stats(request, response) - - cache_key = "%f" % time.time() - cache.set(cache_key, toolbar.render_toolbar()) - - response["X-debug-data-url"] = request.build_absolute_uri( - reverse( - "debug_data", - urlconf=debug_panel.urls, - kwargs={"cache_key": cache_key}, - ) - ) - - return response diff --git a/contentcuration/contentcuration/debug_panel_settings.py b/contentcuration/contentcuration/debug_panel_settings.py index 79f9ddac6e..a61b573aab 100644 --- a/contentcuration/contentcuration/debug_panel_settings.py +++ b/contentcuration/contentcuration/debug_panel_settings.py @@ -17,7 +17,7 @@ def custom_show_toolbar(request): # if debug_panel exists, add it to our INSTALLED_APPS INSTALLED_APPS += ("debug_panel", "debug_toolbar", "pympler") # noqa F405 MIDDLEWARE += ( # noqa F405 - "contentcuration.debug.middleware.CustomDebugPanelMiddleware", + "debug_toolbar.middleware.DebugToolbarMiddleware", ) DEBUG_TOOLBAR_CONFIG = { "SHOW_TOOLBAR_CALLBACK": custom_show_toolbar, diff --git a/requirements-dev.in b/requirements-dev.in index ea6f900871..a398514373 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -2,7 +2,7 @@ python-language-server django-concurrent-test-helper==0.7.0 django-debug-panel==0.8.3 -django-debug-toolbar==1.9.1 +django-debug-toolbar==3.5.0 flake8==3.4.1 whitenoise Pympler diff --git a/requirements-dev.txt b/requirements-dev.txt index 0c21ce1c18..1b98ce6bc3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -81,7 +81,7 @@ django-concurrent-test-helper==0.7.0 # via -r requirements-dev.in django-debug-panel==0.8.3 # via -r requirements-dev.in -django-debug-toolbar==1.9.1 +django-debug-toolbar==3.5.0 # via # -r requirements-dev.in # django-debug-panel From 701ddc9a0176882c077839479e0bc0990367ddf1 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 22 Jul 2022 15:02:25 +0530 Subject: [PATCH 06/25] Remove unnecessary dev settings --- .../contentcuration/debug_panel_settings.py | 8 ++++++-- contentcuration/contentcuration/dev_settings.py | 15 --------------- 
requirements-dev.in | 1 - requirements-dev.txt | 2 -- 4 files changed, 6 insertions(+), 20 deletions(-) diff --git a/contentcuration/contentcuration/debug_panel_settings.py b/contentcuration/contentcuration/debug_panel_settings.py index a61b573aab..5dc3c6783f 100644 --- a/contentcuration/contentcuration/debug_panel_settings.py +++ b/contentcuration/contentcuration/debug_panel_settings.py @@ -1,8 +1,12 @@ from .dev_settings import * # noqa -# These endpoints will throw an error on the django debug panel +# These endpoints will throw an error on the django debug panel. EXCLUDED_DEBUG_URLS = [ "/content/storage", + + # Disabling task API because as soon as the task API gets polled + # the current request data gets overwritten. + "/api/task", ] DEBUG_PANEL_ACTIVE = True @@ -14,7 +18,7 @@ def custom_show_toolbar(request): ) # noqa F405 -# if debug_panel exists, add it to our INSTALLED_APPS +# if debug_panel exists, add it to our INSTALLED_APPS. INSTALLED_APPS += ("debug_panel", "debug_toolbar", "pympler") # noqa F405 MIDDLEWARE += ( # noqa F405 "debug_toolbar.middleware.DebugToolbarMiddleware", diff --git a/contentcuration/contentcuration/dev_settings.py b/contentcuration/contentcuration/dev_settings.py index 9df785afd1..d81d23a993 100644 --- a/contentcuration/contentcuration/dev_settings.py +++ b/contentcuration/contentcuration/dev_settings.py @@ -1,6 +1,4 @@ # flake8: noqa -from dotenv import load_dotenv - from .not_production_settings import * # noqa DEBUG = True @@ -8,16 +6,3 @@ ROOT_URLCONF = "contentcuration.dev_urls" INSTALLED_APPS += ("drf_yasg",) - -REST_FRAMEWORK = { - 'DEFAULT_PERMISSION_CLASSES': ( - 'rest_framework.permissions.IsAuthenticated', - ), - 'DEFAULT_AUTHENTICATION_CLASSES': ( - 'rest_framework.authentication.SessionAuthentication', - 'rest_framework.authentication.BasicAuthentication', - 'rest_framework.authentication.TokenAuthentication', - ) -} - -load_dotenv(override=True) # take environment variables from .env. 
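
As an aside, the configuration that PATCH 05 and PATCH 06 converge on is essentially the stock django-debug-toolbar setup. A minimal sketch of that settings wiring follows, assuming django-debug-toolbar 3.x; the custom_show_toolbar body is collapsed in the diff above, so the version here is one plausible implementation keyed off EXCLUDED_DEBUG_URLS, and the project's real settings module additionally keeps the debug_panel and pympler apps.

```python
# Sketch of the settings wiring after PATCH 05/06, assuming django-debug-toolbar 3.x.
# The real debug_panel_settings.py also keeps the debug_panel and pympler apps, and
# its custom_show_toolbar body is collapsed in the diff above; this version simply
# skips the toolbar for the excluded URLs.
from .dev_settings import *  # noqa

EXCLUDED_DEBUG_URLS = [
    "/content/storage",
    "/api/task",
]


def custom_show_toolbar(request):
    return not any(request.path.startswith(url) for url in EXCLUDED_DEBUG_URLS)


INSTALLED_APPS += ("debug_toolbar",)  # noqa: F405
MIDDLEWARE += ("debug_toolbar.middleware.DebugToolbarMiddleware",)  # noqa: F405
DEBUG_TOOLBAR_CONFIG = {"SHOW_TOOLBAR_CALLBACK": custom_show_toolbar}
```

Switching to the stock DebugToolbarMiddleware is what lets PATCH 05 delete the custom CustomDebugPanelMiddleware in contentcuration/debug/middleware.py entirely.
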
diff --git a/requirements-dev.in b/requirements-dev.in index a398514373..8a83c768c5 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -41,4 +41,3 @@ tabulate==0.8.2 fonttools flower==0.9.4 minio==7.1.1 -python-dotenv diff --git a/requirements-dev.txt b/requirements-dev.txt index 1b98ce6bc3..cf06183f09 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -256,8 +256,6 @@ python-dateutil==2.8.1 # via # -c requirements.txt # faker -python-dotenv==0.20.0 - # via -r requirements-dev.in python-jsonrpc-server==0.4.0 # via python-language-server python-language-server==0.36.2 From 1a14749e79047fcbe52670419650fd59c2cc3e40 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 22 Jul 2022 15:33:52 +0530 Subject: [PATCH 07/25] Add .envrc to .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 19bfec72a5..63b5c6d221 100644 --- a/.gitignore +++ b/.gitignore @@ -65,7 +65,8 @@ docs/_build/ # PyBuilder target/ -# virtualenv, pipenv +# virtualenv, pipenv, direnv +.envrc .env venv .venv From bff595cdb78b0c7089bd931b82461e831077df26 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 12 Aug 2022 21:57:44 +0530 Subject: [PATCH 08/25] Add vector search column & indexes, also GiST trigram index --- .../0138_contentnode_search_vector.py | 34 +++++++++++++++++++ contentcuration/contentcuration/models.py | 19 +++++++++++ contentcuration/contentcuration/settings.py | 1 + .../contentcuration/utils/publish.py | 7 +++- .../search/viewsets/contentnode.py | 21 +++--------- 5 files changed, 64 insertions(+), 18 deletions(-) create mode 100644 contentcuration/contentcuration/migrations/0138_contentnode_search_vector.py diff --git a/contentcuration/contentcuration/migrations/0138_contentnode_search_vector.py b/contentcuration/contentcuration/migrations/0138_contentnode_search_vector.py new file mode 100644 index 0000000000..e3b24b8905 --- /dev/null +++ b/contentcuration/contentcuration/migrations/0138_contentnode_search_vector.py @@ -0,0 +1,34 @@ +# Generated by Django 3.2.13 on 2022-08-10 19:20 +import django.contrib.postgres.indexes +import django.contrib.postgres.search +from django.contrib.postgres.operations import AddIndexConcurrently +from django.contrib.postgres.operations import TrigramExtension +from django.db import migrations + + +class Migration(migrations.Migration): + + atomic = False + + dependencies = [ + ('contentcuration', '0137_channelhistory'), + ] + + operations = [ + # Installs the pg_trgm module that comes pre-bundled with PostgreSQL 9.6. 
+ TrigramExtension(), + + migrations.AddField( + model_name='contentnode', + name='title_description_search_vector', + field=django.contrib.postgres.search.SearchVectorField(blank=True, null=True), + ), + AddIndexConcurrently( + model_name='contentnode', + index=django.contrib.postgres.indexes.GinIndex(fields=['title_description_search_vector'], name='node_search_vector_gin_idx'), + ), + AddIndexConcurrently( + model_name='contenttag', + index=django.contrib.postgres.indexes.GistIndex(fields=['tag_name'], name='contenttag_tag_name_gist_idx', opclasses=['gist_trgm_ops']), + ), + ] diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index f7fea1aee2..e72e05b012 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -13,6 +13,10 @@ from django.contrib.auth.base_user import AbstractBaseUser from django.contrib.auth.base_user import BaseUserManager from django.contrib.auth.models import PermissionsMixin +from django.contrib.postgres.indexes import GinIndex +from django.contrib.postgres.indexes import GistIndex +from django.contrib.postgres.search import SearchVector +from django.contrib.postgres.search import SearchVectorField from django.core.cache import cache from django.core.exceptions import MultipleObjectsReturned from django.core.exceptions import ObjectDoesNotExist @@ -1082,6 +1086,9 @@ def delete(self, *args, **kwargs): self.secret_token.delete() +CONTENT_TAG_NAME__INDEX_NAME = "contenttag_tag_name_gist_idx" + + class ContentTag(models.Model): id = UUIDField(primary_key=True, default=uuid.uuid4) tag_name = models.CharField(max_length=50) @@ -1093,6 +1100,7 @@ def __str__(self): class Meta: unique_together = ['tag_name', 'channel'] + indexes = [GistIndex(fields=["tag_name"], name=CONTENT_TAG_NAME__INDEX_NAME, opclasses=["gist_trgm_ops"])] def delegate_manager(method): @@ -1136,6 +1144,12 @@ def __str__(self): NODE_ID_INDEX_NAME = "node_id_idx" NODE_MODIFIED_INDEX_NAME = "node_modified_idx" NODE_MODIFIED_DESC_INDEX_NAME = "node_modified_desc_idx" +NODE_SEARCH_VECTOR_GIN_INDEX_NAME = "node_search_vector_gin_idx" + +# Ours postgres full text search configuration. +POSTGRES_FTS_CONFIG = "simple" +# Search vector to create tsvector of title and description concatenated. +POSTGRES_SEARCH_VECTOR = SearchVector("title", "description", config=POSTGRES_FTS_CONFIG) class ContentNode(MPTTModel, models.Model): @@ -1231,6 +1245,10 @@ class ContentNode(MPTTModel, models.Model): # this duration should be in seconds. suggested_duration = models.IntegerField(blank=True, null=True, help_text="Suggested duration for the content node (in seconds)") + # A field to store the ts_vector form of (title + ' ' + description). + # This significantly increases the search performance. 
+ title_description_search_vector = SearchVectorField(blank=True, null=True) + objects = CustomContentNodeTreeManager() # Track all updates and ignore a blacklist of attributes @@ -1830,6 +1848,7 @@ class Meta: indexes = [ models.Index(fields=["node_id"], name=NODE_ID_INDEX_NAME), models.Index(fields=["-modified"], name=NODE_MODIFIED_DESC_INDEX_NAME), + GinIndex(fields=["title_description_search_vector"], name=NODE_SEARCH_VECTOR_GIN_INDEX_NAME), ] diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py index ae637542ba..b79c3b641e 100644 --- a/contentcuration/contentcuration/settings.py +++ b/contentcuration/contentcuration/settings.py @@ -86,6 +86,7 @@ 'webpack_loader', 'django_filters', 'mathfilters', + 'django.contrib.postgres', ) SESSION_ENGINE = "django.contrib.sessions.backends.cached_db" diff --git a/contentcuration/contentcuration/utils/publish.py b/contentcuration/contentcuration/utils/publish.py index 8dfd5ef428..88b48be574 100644 --- a/contentcuration/contentcuration/utils/publish.py +++ b/contentcuration/contentcuration/utils/publish.py @@ -187,6 +187,10 @@ def queue_get_return_none_when_empty(): logging.debug("Mapping node with id {id}".format( id=node.pk)) + # Update tsvector for this node. + node.title_description_search_vector = ccmodels.POSTGRES_SEARCH_VECTOR + node.save(update_fields=["title_description_search_vector"]) + if node.get_descendants(include_self=True).exclude(kind_id=content_kinds.TOPIC).exists() and node.complete: children = (node.children.all()) node_queue.extend(children) @@ -428,7 +432,8 @@ def process_assessment_metadata(ccnode, kolibrinode): exercise_data_type = "" if exercise_data.get('mastery_model'): exercise_data_type = exercise_data.get('mastery_model') - if exercise_data.get('option') and exercise_data.get('option').get('completion_criteria') and exercise_data.get('option').get('completion_criteria').get('mastery_model'): + if exercise_data.get('option') and exercise_data.get('option').get('completion_criteria') \ + and exercise_data.get('option').get('completion_criteria').get('mastery_model'): exercise_data_type = exercise_data.get('option').get('completion_criteria').get('mastery_model') mastery_model = {'type': exercise_data_type or exercises.M_OF_N} diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 8fab7f4bbe..864897db25 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -1,5 +1,6 @@ import re +from django.contrib.postgres.search import SearchQuery from django.db.models import ExpressionWrapper from django.db.models import F from django.db.models import IntegerField @@ -16,6 +17,7 @@ from contentcuration.models import Channel from contentcuration.models import ContentNode from contentcuration.models import File +from contentcuration.models import POSTGRES_FTS_CONFIG from contentcuration.utils.pagination import CachedListPagination from contentcuration.viewsets.base import RequiredFilterSet from contentcuration.viewsets.base import ValuesViewset @@ -62,23 +64,8 @@ def filter_channel_list(self, queryset, name, value): return queryset.filter(channel_id__in=list(channel_ids)) def filter_keywords(self, queryset, name, value): - filter_query = Q(title__icontains=value) | Q(description__icontains=value) - - tags_node_ids = ContentNode.tags.through.objects.filter( - contenttag__tag_name__icontains=value - ).values_list("contentnode_id", flat=True) - - # Check if we have a 
Kolibri node id or ids and add them to the search if so. - # Add to, rather than replace, the filters so that we never misinterpret a search term as a UUID. - # node_ids = uuid_re.findall(value) + list(tags_node_ids) - node_ids = uuid_re.findall(value) - for node_id in node_ids: - # check for the major ID types - filter_query |= Q(node_id=node_id) - filter_query |= Q(content_id=node_id) - filter_query |= Q(id=node_id) - - return queryset.filter(Q(id__in=list(tags_node_ids)) | filter_query) + search_tsquery = SearchQuery(value=value, config=POSTGRES_FTS_CONFIG, search_type="plain") + return queryset.filter(title_description_search_vector=search_tsquery) def filter_author(self, queryset, name, value): return queryset.filter( From c0e55ee704ecf7d7ba997c2b6108233097489653 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 12 Aug 2022 22:21:46 +0530 Subject: [PATCH 09/25] Remove cyclic migration conflicts --- ...de_search_vector.py => 0141_contentnode_search_vector.py} | 2 +- contentcuration/search/viewsets/contentnode.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) rename contentcuration/contentcuration/migrations/{0138_contentnode_search_vector.py => 0141_contentnode_search_vector.py} (95%) diff --git a/contentcuration/contentcuration/migrations/0138_contentnode_search_vector.py b/contentcuration/contentcuration/migrations/0141_contentnode_search_vector.py similarity index 95% rename from contentcuration/contentcuration/migrations/0138_contentnode_search_vector.py rename to contentcuration/contentcuration/migrations/0141_contentnode_search_vector.py index e3b24b8905..551f42d5b7 100644 --- a/contentcuration/contentcuration/migrations/0138_contentnode_search_vector.py +++ b/contentcuration/contentcuration/migrations/0141_contentnode_search_vector.py @@ -11,7 +11,7 @@ class Migration(migrations.Migration): atomic = False dependencies = [ - ('contentcuration', '0137_channelhistory'), + ('contentcuration', '0140_delete_task'), ] operations = [ diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 864897db25..e192a3ccb7 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -24,7 +24,6 @@ from contentcuration.viewsets.common import NotNullMapArrayAgg from contentcuration.viewsets.common import UUIDFilter from contentcuration.viewsets.common import UUIDInFilter -from contentcuration.viewsets.contentnode import get_title class ListPagination(CachedListPagination): @@ -143,10 +142,6 @@ class SearchContentNodeViewSet(ValuesViewset): "original_channel_name", ) - field_map = { - "title": get_title, - } - def get_queryset(self): queryset = super(SearchContentNodeViewSet, self).get_queryset() return ContentNode._annotate_channel_id(queryset) From 34e8436886516e92478dce5f364a2774ce63f113 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 12 Aug 2022 22:25:57 +0530 Subject: [PATCH 10/25] Fix wrong indentation happened due to merge conflict --- .../contentcuration/utils/publish.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/contentcuration/contentcuration/utils/publish.py b/contentcuration/contentcuration/utils/publish.py index 213682b539..732d92d125 100644 --- a/contentcuration/contentcuration/utils/publish.py +++ b/contentcuration/contentcuration/utils/publish.py @@ -198,17 +198,17 @@ def queue_get_return_none_when_empty(): children = (node.children.all()) node_queue.extend(children) - kolibrinode = 
create_bare_contentnode(node, default_language, channel_id, channel_name) - - if node.kind.kind == content_kinds.EXERCISE: - exercise_data = process_assessment_metadata(node, kolibrinode) - if force_exercises or node.changed or not \ - node.files.filter(preset_id=format_presets.EXERCISE).exists(): - create_perseus_exercise(node, kolibrinode, exercise_data, user_id=user_id) - elif node.kind.kind == content_kinds.SLIDESHOW: - create_slideshow_manifest(node, kolibrinode, user_id=user_id) - create_associated_file_objects(kolibrinode, node) - map_tags_to_node(kolibrinode, node) + kolibrinode = create_bare_contentnode(node, default_language, channel_id, channel_name) + + if node.kind.kind == content_kinds.EXERCISE: + exercise_data = process_assessment_metadata(node, kolibrinode) + if force_exercises or node.changed or not \ + node.files.filter(preset_id=format_presets.EXERCISE).exists(): + create_perseus_exercise(node, kolibrinode, exercise_data, user_id=user_id) + elif node.kind.kind == content_kinds.SLIDESHOW: + create_slideshow_manifest(node, kolibrinode, user_id=user_id) + create_associated_file_objects(kolibrinode, node) + map_tags_to_node(kolibrinode, node) if progress_tracker: progress_tracker.increment(increment=percent_per_node) From e00c512f40718858c154e6e3b0584287a836cef8 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Sun, 14 Aug 2022 19:07:20 +0530 Subject: [PATCH 11/25] Add a command for setting tsvectors and fix tests --- .../management/commands/set_tsvectors.py | 71 ++++++++++++++++++ contentcuration/contentcuration/models.py | 12 +++ contentcuration/contentcuration/settings.py | 3 +- contentcuration/search/tests/test_search.py | 73 ++----------------- .../search/viewsets/contentnode.py | 15 +--- 5 files changed, 96 insertions(+), 78 deletions(-) create mode 100644 contentcuration/contentcuration/management/commands/set_tsvectors.py diff --git a/contentcuration/contentcuration/management/commands/set_tsvectors.py b/contentcuration/contentcuration/management/commands/set_tsvectors.py new file mode 100644 index 0000000000..6ed9819ff5 --- /dev/null +++ b/contentcuration/contentcuration/management/commands/set_tsvectors.py @@ -0,0 +1,71 @@ +""" +This command sets tsvector in title_description_search_vector field in batches. +The batches are created on the basis of channel_id. This enables resumption. Also helps +in cases of failure or memory overflow. +""" +import logging as logmodule + +from django.core.cache import cache +from django.core.management.base import BaseCommand + +from contentcuration.models import Channel +from contentcuration.models import ContentNode +from contentcuration.models import POSTGRES_SEARCH_VECTOR + + +logmodule.basicConfig(level=logmodule.INFO) +logging = logmodule.getLogger(__name__) + + +UPDATED_TS_VECTORS_CACHE_KEY = "tsvectors_updated_for_channel_ids" +UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY = "tsvectors_updated_for_null_channels" + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument( + "--public", + action="store_true", + help="Set tsvector for only the public channel nodes instead of all nodes.", + ) + parser.add_argument( + "--no-cache", + action="store_true", + help="Disables the cache. 
This updates all previously updated nodes.", + ) + + def handle(self, *args, **options): + if options["no_cache"]: + updated_channel_ids = [] + do_update_nodes_with_null_channel_id = True + else: + updated_channel_ids = [] if cache.get(UPDATED_TS_VECTORS_CACHE_KEY) is None else cache.get(UPDATED_TS_VECTORS_CACHE_KEY) + do_update_nodes_with_null_channel_id = not cache.get(UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY) + + if options["public"]: + to_update_channel_ids = list(Channel.get_public_channels().exclude(id__in=updated_channel_ids).values_list("id", flat=True)) + do_update_nodes_with_null_channel_id = False + logging.info("Started setting tsvector for public channel nodes.") + else: + to_update_channel_ids = list(Channel.objects.exclude(id__in=updated_channel_ids).values_list("id", flat=True)) + logging.info("Started setting tsvector for all nodes.") + + annotated_contentnode_qs = ContentNode._annotate_channel_id(ContentNode.objects) + + for channel_id in to_update_channel_ids: + logging.info("Setting tsvector for nodes of channel {}.".format(channel_id)) + annotated_contentnode_qs.filter(channel_id=channel_id).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR) + updated_channel_ids.append(channel_id) + cache.set(UPDATED_TS_VECTORS_CACHE_KEY, updated_channel_ids, None) + logging.info("Finished setting tsvector for nodes of channel {}.".format(channel_id)) + + if do_update_nodes_with_null_channel_id: + logging.info("Setting tsvector for nodes with NULL channel_id.") + annotated_contentnode_qs.filter(channel_id__isnull=True).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR) + cache.set(UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY, True, None) + logging.info("Finished setting tsvector for nodes with NULL channel_id.") + + if options["public"]: + logging.info("Finished setting tsvector for public channel nodes.") + else: + logging.info("Finished setting tsvector for all nodes.") diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index cebbfea959..8b1c331097 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -14,6 +14,7 @@ from django.contrib.auth.models import PermissionsMixin from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.indexes import GistIndex +from django.contrib.postgres.search import SearchQuery from django.contrib.postgres.search import SearchVector from django.contrib.postgres.search import SearchVectorField from django.contrib.sessions.models import Session @@ -1356,6 +1357,12 @@ def filter_view_queryset(cls, queryset, user): | Q(public=True) ) + @classmethod + def search(self, queryset, search_term): + search_query = Q(title_description_search_vector=SearchQuery(value=search_term, config=POSTGRES_FTS_CONFIG, search_type="plain")) + tags_query = Q(tags__tag_name__icontains=search_term) + return queryset.filter(search_query | tags_query) + @raise_if_unsaved def get_root(self): # Only topics can be root nodes @@ -1839,8 +1846,10 @@ def set_default_learning_activity(self): def save(self, skip_lock=False, *args, **kwargs): if self._state.adding: + is_create = True self.on_create() else: + is_create = False self.on_update() # Logic borrowed from mptt - do a simple check to see if we have changed @@ -1884,6 +1893,9 @@ def save(self, skip_lock=False, *args, **kwargs): if changed_ids: ContentNode.objects.filter(id__in=changed_ids).update(changed=True) + if is_create: + 
ContentNode.filter_by_pk(pk=self.id).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR) + # Copied from MPTT save.alters_data = True diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py index 92ed582a57..1e3cdb1e2c 100644 --- a/contentcuration/contentcuration/settings.py +++ b/contentcuration/contentcuration/settings.py @@ -221,7 +221,8 @@ } IS_CONTENTNODE_TABLE_PARTITIONED = os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False - +TSVECTOR_SET_FOR_ALL_PUBLIC_CHANNEL_NODES = os.getenv("TSVECTOR_SET_FOR_ALL_PUBLIC_CHANNEL_NODES") or False +TSVECTOR_SET_FOR_ALL_NODES = os.getenv("TSVECTOR_SET_FOR_ALL_NODES") or False DATABASE_ROUTERS = [ "kolibri_content.router.ContentDBRouter", diff --git a/contentcuration/search/tests/test_search.py b/contentcuration/search/tests/test_search.py index bcb3b2aa99..6b3d34cc41 100644 --- a/contentcuration/search/tests/test_search.py +++ b/contentcuration/search/tests/test_search.py @@ -31,67 +31,14 @@ def test_filter_channels_by_edit(self): self.assertEqual(response.status_code, 200, response.content) self.assertNotEqual(response.data["results"], []) - def test_search_result(self): - # Create two users - user_a = testdata.user(email="a@a.com") - user_b = testdata.user(email="b@b.com") - - # Create two channels with two editors - test_tree_data = { - "node_id": "00000000000000000000000000000000", - "title": "Root topic node", - "kind_id": "topic", - "children": [ - { - "node_id": "00000000000000000000000000000001", - "title": "Kolibri video", - "kind_id": "video", - }, - ] - } - - channel_a = testdata.channel(name="user_a_channel", main_tree_data=test_tree_data) - channel_a.editors.add(user_a) - - channel_b = testdata.channel(name="user_b_channel", create_main_tree=False) - channel_b.editors.add(user_b) - - # Publish channel_a - channel_a.main_tree.publishing = False - channel_a.main_tree.changed = False - channel_a.main_tree.published = True - channel_a.main_tree.save() - channel_a.public = True - channel_a.save() - - # Import resources from channel_a to channel_b - channel_a.main_tree.copy_to(channel_b.main_tree, batch_size=1) - channel_b.main_tree.refresh_from_db() - - # Send request from user_b to the search endpoint - self.client.force_authenticate(user=user_b) - response = self.client.get( - reverse("search-list"), - data={ - "channel_list": "public", - "keywords": "video" - }, - format="json", - ) - - # Assert whether the location_ids are of accessible nodes or not - kolibri_video_node = channel_b.main_tree.get_descendants().filter(title="Kolibri video").first() - - # The ids in location_ids should be of channel_b's ContentNode only - self.assertEqual(len(response.data["results"][0]["location_ids"]), 1) - self.assertEqual(response.data["results"][0]["location_ids"][0], kolibri_video_node.cloned_source_id) - - def test_channel_list_filter_and_location_ids(self): + def test_search(self): users = [] channels = [] + for i in range(4): user = testdata.user(email="a{}@a.com".format(i)) users.append(user) + channel = Channel.objects.create(name="user_a{}_channel".format(i)) channel.save() channels.append(channel) @@ -99,6 +46,7 @@ def test_channel_list_filter_and_location_ids(self): public_channel, editable_channel, viewable_channel, inaccessible_channel = channels + # Create public video node and publish it. 
public_video_node = testdata.node({ "title": "Kolibri video", "kind_id": "video", @@ -108,13 +56,14 @@ def test_channel_list_filter_and_location_ids(self): public_video_node.changed = False public_video_node.save() + # Publish the public_channel. public_channel.main_tree.published = True public_channel.main_tree.changed = False public_channel.main_tree.save() - public_channel.public = True public_channel.save() + # Set user_b viewable channel. user_b = users[1] viewable_channel.viewers.add(user_b) @@ -123,6 +72,7 @@ def test_channel_list_filter_and_location_ids(self): public_video_node.copy_to(target=viewable_channel.main_tree) public_video_node.copy_to(target=inaccessible_channel.main_tree) + # Get different nodes based on access. editable_channel.main_tree.refresh_from_db() editable_video_node = editable_channel.main_tree.get_descendants().first() viewable_channel.main_tree.refresh_from_db() @@ -130,7 +80,7 @@ def test_channel_list_filter_and_location_ids(self): inaccessible_channel.main_tree.refresh_from_db() inaccessible_video_node = inaccessible_channel.main_tree.get_descendants().first() - # Send request from user_b to the search endpoint + # Send request from user_b to the search endpoint. self.client.force_authenticate(user=user_b) for channel_list in ("public", "edit", "view"): @@ -152,10 +102,3 @@ def test_channel_list_filter_and_location_ids(self): self.assertEqual(result["id"], editable_video_node.id) elif channel_list == "view": self.assertEqual(result["id"], viewable_video_node.id) - - location_ids = result["location_ids"] - self.assertEqual(len(location_ids), 3) - self.assertIn(editable_video_node.id, location_ids) - self.assertIn(viewable_video_node.id, location_ids) - self.assertIn(public_video_node.id, location_ids) - self.assertNotIn(inaccessible_video_node.id, location_ids) diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index e192a3ccb7..df5cf831cb 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -1,6 +1,5 @@ import re -from django.contrib.postgres.search import SearchQuery from django.db.models import ExpressionWrapper from django.db.models import F from django.db.models import IntegerField @@ -17,7 +16,6 @@ from contentcuration.models import Channel from contentcuration.models import ContentNode from contentcuration.models import File -from contentcuration.models import POSTGRES_FTS_CONFIG from contentcuration.utils.pagination import CachedListPagination from contentcuration.viewsets.base import RequiredFilterSet from contentcuration.viewsets.base import ValuesViewset @@ -32,9 +30,6 @@ class ListPagination(CachedListPagination): max_page_size = 100 -uuid_re = re.compile("([a-f0-9]{32})") - - class ContentNodeFilter(RequiredFilterSet): keywords = CharFilter(method="filter_keywords") languages = CharFilter(method="filter_languages") @@ -63,8 +58,7 @@ def filter_channel_list(self, queryset, name, value): return queryset.filter(channel_id__in=list(channel_ids)) def filter_keywords(self, queryset, name, value): - search_tsquery = SearchQuery(value=value, config=POSTGRES_FTS_CONFIG, search_type="plain") - return queryset.filter(title_description_search_vector=search_tsquery) + return ContentNode.search(queryset=queryset, search_term=value) def filter_author(self, queryset, name, value): return queryset.filter( @@ -115,7 +109,6 @@ class Meta: class SearchContentNodeViewSet(ValuesViewset): - queryset = ContentNode.objects.all() filterset_class = 
ContentNodeFilter pagination_class = ListPagination permission_classes = [IsAuthenticated] @@ -143,13 +136,11 @@ class SearchContentNodeViewSet(ValuesViewset): ) def get_queryset(self): - queryset = super(SearchContentNodeViewSet, self).get_queryset() - return ContentNode._annotate_channel_id(queryset) + return ContentNode._annotate_channel_id(ContentNode.objects) def annotate_queryset(self, queryset): """ - 1. Do a distinct by 'content_id,' using the original node if possible - 2. Annotate lists of content node and channel pks + Annotates thumbnails, resources count and channel name. """ thumbnails = File.objects.filter( contentnode=OuterRef("id"), preset__thumbnail=True From f3280d94432d8d54fdb1070b286b6b35ad849cf5 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Sun, 14 Aug 2022 20:43:13 +0530 Subject: [PATCH 12/25] Remove grade_level default to pass failing tests --- contentcuration/contentcuration/settings.py | 2 -- contentcuration/contentcuration/tests/testdata.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py index 1e3cdb1e2c..a8e5e8678b 100644 --- a/contentcuration/contentcuration/settings.py +++ b/contentcuration/contentcuration/settings.py @@ -221,8 +221,6 @@ } IS_CONTENTNODE_TABLE_PARTITIONED = os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False -TSVECTOR_SET_FOR_ALL_PUBLIC_CHANNEL_NODES = os.getenv("TSVECTOR_SET_FOR_ALL_PUBLIC_CHANNEL_NODES") or False -TSVECTOR_SET_FOR_ALL_NODES = os.getenv("TSVECTOR_SET_FOR_ALL_NODES") or False DATABASE_ROUTERS = [ "kolibri_content.router.ContentDBRouter", diff --git a/contentcuration/contentcuration/tests/testdata.py b/contentcuration/contentcuration/tests/testdata.py index b857dfb513..5560e853a5 100644 --- a/contentcuration/contentcuration/tests/testdata.py +++ b/contentcuration/contentcuration/tests/testdata.py @@ -145,7 +145,6 @@ def node(data, parent=None): sort_order=data.get('sort_order', 1), complete=True, extra_fields=data.get('extra_fields'), - grade_levels="{}", ) new_node.save() video_file = fileobj_video(contents=b"Video File") @@ -172,7 +171,6 @@ def node(data, parent=None): content_id=data.get('content_id') or data['node_id'], sort_order=data.get('sort_order', 1), complete=True, - grade_levels="{}", ) new_node.save() From 3ff0edd040d1ba5a2ab0f958b4e173152821c85a Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Thu, 8 Sep 2022 13:02:32 +0530 Subject: [PATCH 13/25] Full text search models and data migrations --- .../contentcuration/debug_panel_settings.py | 5 +- .../management/commands/set_tsvectors.py | 71 ------------------- .../0141_contentnode_search_vector.py | 34 --------- contentcuration/contentcuration/models.py | 27 ------- .../contentcuration/viewsets/contentnode.py | 3 +- contentcuration/search/admin.py | 1 - contentcuration/search/management/__init__.py | 0 .../search/management/commands/__init__.py | 0 .../commands/set_channel_tsvectors.py | 57 +++++++++++++++ .../commands/set_contentnode_tsvectors.py | 60 ++++++++++++++++ .../search/migrations/0003_fulltextsearch.py | 54 ++++++++++++++ contentcuration/search/models.py | 53 ++++++++++++++ 12 files changed, 228 insertions(+), 137 deletions(-) delete mode 100644 contentcuration/contentcuration/management/commands/set_tsvectors.py delete mode 100644 contentcuration/contentcuration/migrations/0141_contentnode_search_vector.py delete mode 100644 contentcuration/search/admin.py create mode 100644 contentcuration/search/management/__init__.py create mode 100644 
contentcuration/search/management/commands/__init__.py create mode 100644 contentcuration/search/management/commands/set_channel_tsvectors.py create mode 100644 contentcuration/search/management/commands/set_contentnode_tsvectors.py create mode 100644 contentcuration/search/migrations/0003_fulltextsearch.py diff --git a/contentcuration/contentcuration/debug_panel_settings.py b/contentcuration/contentcuration/debug_panel_settings.py index 5dc3c6783f..c097acbbc6 100644 --- a/contentcuration/contentcuration/debug_panel_settings.py +++ b/contentcuration/contentcuration/debug_panel_settings.py @@ -4,9 +4,10 @@ EXCLUDED_DEBUG_URLS = [ "/content/storage", - # Disabling task API because as soon as the task API gets polled + # Disabling sync API because as soon as the sync API gets polled # the current request data gets overwritten. - "/api/task", + # Can be removed after websockets deployment. + "/api/sync", ] DEBUG_PANEL_ACTIVE = True diff --git a/contentcuration/contentcuration/management/commands/set_tsvectors.py b/contentcuration/contentcuration/management/commands/set_tsvectors.py deleted file mode 100644 index 6ed9819ff5..0000000000 --- a/contentcuration/contentcuration/management/commands/set_tsvectors.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -This command sets tsvector in title_description_search_vector field in batches. -The batches are created on the basis of channel_id. This enables resumption. Also helps -in cases of failure or memory overflow. -""" -import logging as logmodule - -from django.core.cache import cache -from django.core.management.base import BaseCommand - -from contentcuration.models import Channel -from contentcuration.models import ContentNode -from contentcuration.models import POSTGRES_SEARCH_VECTOR - - -logmodule.basicConfig(level=logmodule.INFO) -logging = logmodule.getLogger(__name__) - - -UPDATED_TS_VECTORS_CACHE_KEY = "tsvectors_updated_for_channel_ids" -UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY = "tsvectors_updated_for_null_channels" - - -class Command(BaseCommand): - def add_arguments(self, parser): - parser.add_argument( - "--public", - action="store_true", - help="Set tsvector for only the public channel nodes instead of all nodes.", - ) - parser.add_argument( - "--no-cache", - action="store_true", - help="Disables the cache. 
This updates all previously updated nodes.", - ) - - def handle(self, *args, **options): - if options["no_cache"]: - updated_channel_ids = [] - do_update_nodes_with_null_channel_id = True - else: - updated_channel_ids = [] if cache.get(UPDATED_TS_VECTORS_CACHE_KEY) is None else cache.get(UPDATED_TS_VECTORS_CACHE_KEY) - do_update_nodes_with_null_channel_id = not cache.get(UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY) - - if options["public"]: - to_update_channel_ids = list(Channel.get_public_channels().exclude(id__in=updated_channel_ids).values_list("id", flat=True)) - do_update_nodes_with_null_channel_id = False - logging.info("Started setting tsvector for public channel nodes.") - else: - to_update_channel_ids = list(Channel.objects.exclude(id__in=updated_channel_ids).values_list("id", flat=True)) - logging.info("Started setting tsvector for all nodes.") - - annotated_contentnode_qs = ContentNode._annotate_channel_id(ContentNode.objects) - - for channel_id in to_update_channel_ids: - logging.info("Setting tsvector for nodes of channel {}.".format(channel_id)) - annotated_contentnode_qs.filter(channel_id=channel_id).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR) - updated_channel_ids.append(channel_id) - cache.set(UPDATED_TS_VECTORS_CACHE_KEY, updated_channel_ids, None) - logging.info("Finished setting tsvector for nodes of channel {}.".format(channel_id)) - - if do_update_nodes_with_null_channel_id: - logging.info("Setting tsvector for nodes with NULL channel_id.") - annotated_contentnode_qs.filter(channel_id__isnull=True).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR) - cache.set(UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY, True, None) - logging.info("Finished setting tsvector for nodes with NULL channel_id.") - - if options["public"]: - logging.info("Finished setting tsvector for public channel nodes.") - else: - logging.info("Finished setting tsvector for all nodes.") diff --git a/contentcuration/contentcuration/migrations/0141_contentnode_search_vector.py b/contentcuration/contentcuration/migrations/0141_contentnode_search_vector.py deleted file mode 100644 index 551f42d5b7..0000000000 --- a/contentcuration/contentcuration/migrations/0141_contentnode_search_vector.py +++ /dev/null @@ -1,34 +0,0 @@ -# Generated by Django 3.2.13 on 2022-08-10 19:20 -import django.contrib.postgres.indexes -import django.contrib.postgres.search -from django.contrib.postgres.operations import AddIndexConcurrently -from django.contrib.postgres.operations import TrigramExtension -from django.db import migrations - - -class Migration(migrations.Migration): - - atomic = False - - dependencies = [ - ('contentcuration', '0140_delete_task'), - ] - - operations = [ - # Installs the pg_trgm module that comes pre-bundled with PostgreSQL 9.6. 
- TrigramExtension(), - - migrations.AddField( - model_name='contentnode', - name='title_description_search_vector', - field=django.contrib.postgres.search.SearchVectorField(blank=True, null=True), - ), - AddIndexConcurrently( - model_name='contentnode', - index=django.contrib.postgres.indexes.GinIndex(fields=['title_description_search_vector'], name='node_search_vector_gin_idx'), - ), - AddIndexConcurrently( - model_name='contenttag', - index=django.contrib.postgres.indexes.GistIndex(fields=['tag_name'], name='contenttag_tag_name_gist_idx', opclasses=['gist_trgm_ops']), - ), - ] diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index 8b1c331097..a2a4af786d 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -12,11 +12,6 @@ from django.contrib.auth.base_user import AbstractBaseUser from django.contrib.auth.base_user import BaseUserManager from django.contrib.auth.models import PermissionsMixin -from django.contrib.postgres.indexes import GinIndex -from django.contrib.postgres.indexes import GistIndex -from django.contrib.postgres.search import SearchQuery -from django.contrib.postgres.search import SearchVector -from django.contrib.postgres.search import SearchVectorField from django.contrib.sessions.models import Session from django.core.cache import cache from django.core.exceptions import MultipleObjectsReturned @@ -1111,7 +1106,6 @@ def __str__(self): class Meta: unique_together = ['tag_name', 'channel'] - indexes = [GistIndex(fields=["tag_name"], name=CONTENT_TAG_NAME__INDEX_NAME, opclasses=["gist_trgm_ops"])] def delegate_manager(method): @@ -1156,11 +1150,6 @@ def __str__(self): NODE_MODIFIED_INDEX_NAME = "node_modified_idx" NODE_MODIFIED_DESC_INDEX_NAME = "node_modified_desc_idx" NODE_SEARCH_VECTOR_GIN_INDEX_NAME = "node_search_vector_gin_idx" - -# Ours postgres full text search configuration. -POSTGRES_FTS_CONFIG = "simple" -# Search vector to create tsvector of title and description concatenated. -POSTGRES_SEARCH_VECTOR = SearchVector("title", "description", config=POSTGRES_FTS_CONFIG) CONTENTNODE_TREE_ID_CACHE_KEY = "contentnode_{pk}__tree_id" @@ -1257,10 +1246,6 @@ class ContentNode(MPTTModel, models.Model): # this duration should be in seconds. suggested_duration = models.IntegerField(blank=True, null=True, help_text="Suggested duration for the content node (in seconds)") - # A field to store the ts_vector form of (title + ' ' + description). - # This significantly increases the search performance. 
- title_description_search_vector = SearchVectorField(blank=True, null=True) - objects = CustomContentNodeTreeManager() # Track all updates and ignore a blacklist of attributes @@ -1357,12 +1342,6 @@ def filter_view_queryset(cls, queryset, user): | Q(public=True) ) - @classmethod - def search(self, queryset, search_term): - search_query = Q(title_description_search_vector=SearchQuery(value=search_term, config=POSTGRES_FTS_CONFIG, search_type="plain")) - tags_query = Q(tags__tag_name__icontains=search_term) - return queryset.filter(search_query | tags_query) - @raise_if_unsaved def get_root(self): # Only topics can be root nodes @@ -1846,10 +1825,8 @@ def set_default_learning_activity(self): def save(self, skip_lock=False, *args, **kwargs): if self._state.adding: - is_create = True self.on_create() else: - is_create = False self.on_update() # Logic borrowed from mptt - do a simple check to see if we have changed @@ -1893,9 +1870,6 @@ def save(self, skip_lock=False, *args, **kwargs): if changed_ids: ContentNode.objects.filter(id__in=changed_ids).update(changed=True) - if is_create: - ContentNode.filter_by_pk(pk=self.id).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR) - # Copied from MPTT save.alters_data = True @@ -1938,7 +1912,6 @@ class Meta: indexes = [ models.Index(fields=["node_id"], name=NODE_ID_INDEX_NAME), models.Index(fields=["-modified"], name=NODE_MODIFIED_DESC_INDEX_NAME), - GinIndex(fields=["title_description_search_vector"], name=NODE_SEARCH_VECTOR_GIN_INDEX_NAME), ] diff --git a/contentcuration/contentcuration/viewsets/contentnode.py b/contentcuration/contentcuration/viewsets/contentnode.py index dd942b161e..cc6ad0727f 100644 --- a/contentcuration/contentcuration/viewsets/contentnode.py +++ b/contentcuration/contentcuration/viewsets/contentnode.py @@ -633,8 +633,7 @@ def delete_from_changes(self, changes): def dict_if_none(obj, field_name=None): - value = obj.get(field_name) - return value if value else {} + return obj[field_name] if obj[field_name] else {} # Apply mixin first to override ValuesViewset diff --git a/contentcuration/search/admin.py b/contentcuration/search/admin.py deleted file mode 100644 index 846f6b4061..0000000000 --- a/contentcuration/search/admin.py +++ /dev/null @@ -1 +0,0 @@ -# Register your models here. diff --git a/contentcuration/search/management/__init__.py b/contentcuration/search/management/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/contentcuration/search/management/commands/__init__.py b/contentcuration/search/management/commands/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/contentcuration/search/management/commands/set_channel_tsvectors.py b/contentcuration/search/management/commands/set_channel_tsvectors.py new file mode 100644 index 0000000000..4d27927f53 --- /dev/null +++ b/contentcuration/search/management/commands/set_channel_tsvectors.py @@ -0,0 +1,57 @@ +""" +This command inserts in bulk channel tsvectors to the ChannelFullTextSearch table. 
+""" +import logging as logmodule +import time + +from django.core.management.base import BaseCommand +from django.db.models import Exists +from django.db.models import OuterRef +from search.models import CHANNEL_KEYWORDS_TSVECTOR +from search.models import ChannelFullTextSearch + +from contentcuration.models import Channel +from contentcuration.viewsets.channel import primary_token_subquery + + +logmodule.basicConfig(level=logmodule.INFO) +logging = logmodule.getLogger("command") + +CHUNKSIZE = 5000 + + +class Command(BaseCommand): + + def handle(self, *args, **options): + start = time.time() + + channel_not_already_inserted_query = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id"))) + + channel_query = (Channel.objects.select_related("main_tree") + .filter(channel_not_already_inserted_query, deleted=False, main_tree__published=True) + .annotate(primary_channel_token=primary_token_subquery, + keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) + .values("id", "keywords_tsvector")) + + insertable_channels = list(channel_query[:CHUNKSIZE]) + total_channel_tsvectors_inserted = 0 + + while insertable_channels: + logging.info("Inserting channel tsvectors.") + + insert_objs = list() + for channel in insertable_channels: + obj = ChannelFullTextSearch(channel_id=channel["id"], keywords_tsvector=channel["keywords_tsvector"]) + insert_objs.append(obj) + + inserted_objs_list = ChannelFullTextSearch.objects.bulk_create(insert_objs) + + current_inserts_count = len(inserted_objs_list) + total_channel_tsvectors_inserted = total_channel_tsvectors_inserted + current_inserts_count + + logging.info("Inserted {} channel tsvectors.".format(current_inserts_count)) + + insertable_channels = list(channel_query[:CHUNKSIZE]) + + logging.info("Completed! successfully inserted total of {} channel tsvectors in {} seconds.".format( + total_channel_tsvectors_inserted, time.time() - start)) diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py new file mode 100644 index 0000000000..58cf9350e8 --- /dev/null +++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py @@ -0,0 +1,60 @@ +""" +This command inserts in bulk contentnode tsvectors to the ContentNodeFullTextSearch table. 
+""" +import logging as logmodule +import time + +from django.contrib.postgres.aggregates import StringAgg +from django.core.management.base import BaseCommand +from django.db.models import Exists +from django.db.models import OuterRef +from search.models import CONTENTNODE_AUTHOR_TSVECTOR +from search.models import CONTENTNODE_KEYWORDS_TSVECTOR +from search.models import ContentNodeFullTextSearch + +from contentcuration.models import ContentNode + + +logmodule.basicConfig(level=logmodule.INFO) +logging = logmodule.getLogger("command") + +CHUNKSIZE = 10000 + + +class Command(BaseCommand): + + def handle(self, *args, **options): + start = time.time() + + tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) + + tsvector_node_query = (ContentNode._annotate_channel_id(ContentNode.objects) + .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), + keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, + author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) + .filter(tsvector_not_already_inserted_query, published=True) + .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()) + + insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) + total_tsvectors_inserted = 0 + + while insertable_nodes_tsvector: + logging.info("Inserting contentnode tsvectors.") + + insert_objs = list() + for node in insertable_nodes_tsvector: + if node["channel_id"]: + obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], + keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) + insert_objs.append(obj) + + inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) + + current_inserts_count = len(inserted_objs_list) + total_tsvectors_inserted = total_tsvectors_inserted + current_inserts_count + + logging.info("Inserted {} contentnode tsvectors.".format(current_inserts_count)) + + insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) + + logging.info("Completed! 
Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(total_tsvectors_inserted, time.time() - start)) diff --git a/contentcuration/search/migrations/0003_fulltextsearch.py b/contentcuration/search/migrations/0003_fulltextsearch.py new file mode 100644 index 0000000000..2885a4655f --- /dev/null +++ b/contentcuration/search/migrations/0003_fulltextsearch.py @@ -0,0 +1,54 @@ +# Generated by Django 3.2.14 on 2022-09-08 07:19 +import uuid + +import django.contrib.postgres.indexes +import django.contrib.postgres.search +import django.db.models.deletion +from django.contrib.postgres.operations import AddIndexConcurrently +from django.db import migrations +from django.db import models + +import contentcuration.models + + +class Migration(migrations.Migration): + + atomic = False + + dependencies = [ + ('contentcuration', '0140_delete_task'), + ('search', '0002_auto_20201215_2110'), + ] + + operations = [ + migrations.CreateModel( + name='ContentNodeFullTextSearch', + fields=[ + ('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)), + ('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)), + ('author_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)), + ('channel', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='channel_nodes_fts', to='contentcuration.channel')), + ('contentnode', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='node_fts', to='contentcuration.contentnode')), + ], + ), + migrations.CreateModel( + name='ChannelFullTextSearch', + fields=[ + ('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)), + ('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)), + ('channel', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='channel_fts', to='contentcuration.channel')), + ], + ), + AddIndexConcurrently( + model_name='contentnodefulltextsearch', + index=django.contrib.postgres.indexes.GinIndex(fields=['keywords_tsvector'], name='node_keywords_tsv__gin_idx'), + ), + AddIndexConcurrently( + model_name='contentnodefulltextsearch', + index=django.contrib.postgres.indexes.GinIndex(fields=['author_tsvector'], name='node_author_tsv__gin_idx'), + ), + AddIndexConcurrently( + model_name='channelfulltextsearch', + index=django.contrib.postgres.indexes.GinIndex(fields=['keywords_tsvector'], name='channel_keywords_tsv__gin_idx'), + ), + ] diff --git a/contentcuration/search/models.py b/contentcuration/search/models.py index e1e550576b..64b0472c8a 100644 --- a/contentcuration/search/models.py +++ b/contentcuration/search/models.py @@ -1,8 +1,15 @@ import uuid from django.conf import settings +from django.contrib.postgres.indexes import GinIndex +from django.contrib.postgres.search import SearchVector +from django.contrib.postgres.search import SearchVectorField from django.db import models +from contentcuration.models import Channel +from contentcuration.models import ContentNode +from contentcuration.models import UUIDField as StudioUUIDField + class SavedSearch(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) @@ -13,3 +20,49 @@ class SavedSearch(models.Model): saved_by = models.ForeignKey( settings.AUTH_USER_MODEL, related_name="searches", on_delete=models.CASCADE ) + + +POSTGRES_FTS_CONFIG = "simple" + 
+CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS = ("id", "channel_id", "node_id", "content_id", "tree_id", "title", "description", "contentnode_tags") +CONTENTNODE_KEYWORDS_TSVECTOR = SearchVector(*CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) + +CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = ("author", "aggregator", "provider") +CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) + +CHANNEL_KEYWORDS_TSVECTOR_FIELDS = ("id", "main_tree__tree_id", "name", "description", "tagline", "primary_channel_token") +CHANNEL_KEYWORDS_TSVECTOR = SearchVector(*CHANNEL_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) + + +class ContentNodeFullTextSearch(models.Model): + id = StudioUUIDField(primary_key=True, default=uuid.uuid4) + + # The contentnode that this record points to. + contentnode = models.ForeignKey(ContentNode, on_delete=models.CASCADE, related_name="node_fts") + + # The channel to which the contentnode belongs. Channel cannot be NULL because we only allow + # searches to be made inside channels. + channel = models.ForeignKey(Channel, on_delete=models.CASCADE, related_name="channel_nodes_fts") + + # This stores the keywords as tsvector. + keywords_tsvector = SearchVectorField(null=True, blank=True) + + # This stores the author as tsvector. + author_tsvector = SearchVectorField(null=True, blank=True) + + class Meta: + indexes = [GinIndex(fields=["keywords_tsvector"], name="node_keywords_tsv__gin_idx"), + GinIndex(fields=["author_tsvector"], name="node_author_tsv__gin_idx")] + + +class ChannelFullTextSearch(models.Model): + id = StudioUUIDField(primary_key=True, default=uuid.uuid4) + + # The channel to which this record points. + channel = models.ForeignKey(Channel, on_delete=models.CASCADE, related_name="channel_fts") + + # This stores the channel keywords as tsvector for super fast searches. + keywords_tsvector = SearchVectorField(null=True, blank=True) + + class Meta: + indexes = [GinIndex(fields=["keywords_tsvector"], name="channel_keywords_tsv__gin_idx")] From 974b69e8edae9d039edce8b603ea8f5001ce70e0 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Thu, 8 Sep 2022 13:18:32 +0530 Subject: [PATCH 14/25] Resolve conflicts and remove old index refs --- contentcuration/contentcuration/models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index a2a4af786d..33ef23355a 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -1092,9 +1092,6 @@ def delete(self, *args, **kwargs): self.secret_token.delete() -CONTENT_TAG_NAME__INDEX_NAME = "contenttag_tag_name_gist_idx" - - class ContentTag(models.Model): id = UUIDField(primary_key=True, default=uuid.uuid4) tag_name = models.CharField(max_length=50) @@ -1149,7 +1146,6 @@ def __str__(self): NODE_ID_INDEX_NAME = "node_id_idx" NODE_MODIFIED_INDEX_NAME = "node_modified_idx" NODE_MODIFIED_DESC_INDEX_NAME = "node_modified_desc_idx" -NODE_SEARCH_VECTOR_GIN_INDEX_NAME = "node_search_vector_gin_idx" CONTENTNODE_TREE_ID_CACHE_KEY = "contentnode_{pk}__tree_id" From e15b015f17f11b703f98fcf667b2464e1aaf0576 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Sat, 10 Sep 2022 15:25:19 +0530 Subject: [PATCH 15/25] feat: full text search! 
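
Full text search is backed by two new tables, ChannelFullTextSearch and
ContentNodeFullTextSearch, whose tsvector columns are matched against a
SearchQuery built with the language-agnostic "simple" config. Roughly, a node
keyword lookup is expected to resolve as in the sketch below (the
search_nodes() helper is illustrative only and is not part of this patch):

    from django.contrib.postgres.search import SearchQuery

    from search.models import ContentNodeFullTextSearch

    POSTGRES_FTS_CONFIG = "simple"  # language-agnostic FTS config used by this patch

    def search_nodes(keywords, channel_ids):
        # Comparing a SearchVectorField to a SearchQuery compiles to tsvector @@ tsquery.
        query = SearchQuery(value=keywords, config=POSTGRES_FTS_CONFIG)
        return (ContentNodeFullTextSearch.objects
                .filter(channel_id__in=channel_ids)
                .filter(keywords_tsvector=query)
                .select_related("contentnode"))

Channel keyword filtering follows the same pattern against
ChannelFullTextSearch.keywords_tsvector.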
--- .../contentcuration/utils/publish.py | 75 +++++++++++ .../contentcuration/viewsets/channel.py | 26 ++-- contentcuration/search/constants.py | 23 ++++ .../commands/set_channel_tsvectors.py | 2 +- .../commands/set_contentnode_tsvectors.py | 4 +- contentcuration/search/models.py | 13 -- contentcuration/search/serializers.py | 18 --- contentcuration/search/utils.py | 9 ++ .../search/viewsets/contentnode.py | 123 ++++++++++-------- 9 files changed, 185 insertions(+), 108 deletions(-) create mode 100644 contentcuration/search/constants.py delete mode 100644 contentcuration/search/serializers.py create mode 100644 contentcuration/search/utils.py diff --git a/contentcuration/contentcuration/utils/publish.py b/contentcuration/contentcuration/utils/publish.py index 66e8b37e49..afe4abdef4 100644 --- a/contentcuration/contentcuration/utils/publish.py +++ b/contentcuration/contentcuration/utils/publish.py @@ -14,13 +14,17 @@ from itertools import chain from django.conf import settings +from django.contrib.postgres.aggregates import StringAgg from django.core.files import File from django.core.files.storage import default_storage as storage from django.core.management import call_command from django.db.models import Count +from django.db.models import Exists from django.db.models import Max +from django.db.models import OuterRef from django.db.models import Q from django.db.models import Sum +from django.db.models import Value from django.db.utils import IntegrityError from django.template.loader import render_to_string from django.utils import timezone @@ -37,6 +41,13 @@ from le_utils.constants import roles from past.builtins import basestring from past.utils import old_div +from search.constants import CHANNEL_KEYWORDS_TSVECTOR +from search.constants import CONTENTNODE_AUTHOR_TSVECTOR +from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR +from search.constants import CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR +from search.constants import CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR +from search.models import ChannelFullTextSearch +from search.models import ContentNodeFullTextSearch from contentcuration import models as ccmodels from contentcuration.decorators import delay_user_storage_calculation @@ -808,6 +819,64 @@ def fill_published_fields(channel, version_notes): channel.save() +def create_or_update_tsvectors(channel_id): + """ + Create or update tsvectors for the channel and all its content nodes. + """ + # Update or create channel tsvector entry. + logging.info("Starting to set tsvectors for channel with id {}.".format(channel_id)) + + from contentcuration.viewsets.channel import primary_token_subquery + + channel = (ccmodels.Channel.objects + .annotate(primary_channel_token=primary_token_subquery, + keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) + .get(pk=channel_id)) + + if ChannelFullTextSearch.objects.filter(channel_id=channel_id).exists(): + update_count = ChannelFullTextSearch.objects.filter(channel_id=channel_id).update(keywords_tsvector=channel.keywords_tsvector) + logging.info("Updated {} channel tsvector.".format(update_count)) + else: + obj = ChannelFullTextSearch(channel_id=channel_id, keywords_tsvector=channel.keywords_tsvector) + obj.save() + logging.info("Created 1 channel tsvector.") + + # Update or create contentnodes tsvector entry for channel_id. + logging.info("Setting tsvectors for all contentnodes in channel {}.".format(channel_id)) + + if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists(): + + # First, delete nodes that are no longer in main_tree. 
+ nodes_no_longer_in_main_tree = ~Exists(channel.main_tree.get_family().filter(id=OuterRef("contentnode_id"))) + ContentNodeFullTextSearch.objects.filter(nodes_no_longer_in_main_tree, channel_id=channel_id).delete() + + # Now, all remaining nodes are in main_tree, so let's update them. + update_count = (ContentNodeFullTextSearch.objects.filter(channel_id=channel_id) + .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" ")) + .update(keywords_tsvector=CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR, author_tsvector=CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR)) + + # Insert newly created nodes. + nodes_not_having_tsvector_record = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) + nodes_to_insert = (channel.main_tree.get_family() + .filter(nodes_not_having_tsvector_record) + .annotate(channel_id=Value(channel_id), + contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), + keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, + author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) + .values("id", "channel_id", "keywords_tsvector", "author_tsvector")) + + insert_objs = list() + + for node in nodes_to_insert: + obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], + keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) + insert_objs.append(obj) + + inserted_nodes_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) + + logging.info("Successfully inserted {} and updated {} contentnode tsvectors.".format(len(inserted_nodes_list), update_count)) + + @delay_user_storage_calculation def publish_channel( user_id, @@ -818,6 +887,8 @@ def publish_channel( send_email=False, progress_tracker=None, language=settings.LANGUAGE_CODE, + + ): """ :type progress_tracker: contentcuration.utils.celery.ProgressTracker|None @@ -843,6 +914,10 @@ def publish_channel( if channel.public: delete_public_channel_cache_keys() + # Enqueue tsvector task to update or create channel tsvectors and all its + # contentnodes tsvector entries. 
+ create_or_update_tsvectors(channel_id=channel_id) + if send_email: with override(language): send_emails(channel, user_id, version_notes=version_notes) diff --git a/contentcuration/contentcuration/viewsets/channel.py b/contentcuration/contentcuration/viewsets/channel.py index 517e66f0fe..ca5c487e0a 100644 --- a/contentcuration/contentcuration/viewsets/channel.py +++ b/contentcuration/contentcuration/viewsets/channel.py @@ -24,6 +24,9 @@ from rest_framework.serializers import CharField from rest_framework.serializers import FloatField from rest_framework.serializers import IntegerField +from search.models import ChannelFullTextSearch +from search.models import ContentNodeFullTextSearch +from search.utils import get_fts_search_query from contentcuration.decorators import cache_no_user_data from contentcuration.models import Change @@ -119,23 +122,12 @@ def filter_deleted(self, queryset, name, value): return queryset.filter(deleted=value) def filter_keywords(self, queryset, name, value): - # TODO: Wait until we show more metadata on cards to add this back in - # keywords_query = self.main_tree_query.filter( - # Q(tags__tag_name__icontains=value) - # | Q(author__icontains=value) - # | Q(aggregator__icontains=value) - # | Q(provider__icontains=value) - # ) - return queryset.annotate( - # keyword_match_count=SQCount(keywords_query, field="content_id"), - primary_token=primary_token_subquery, - ).filter( - Q(name__icontains=value) - | Q(description__icontains=value) - | Q(pk__istartswith=value) - | Q(primary_token=value.replace("-", "")) - # | Q(keyword_match_count__gt=0) - ) + channel_keywords_query = (Exists(ChannelFullTextSearch.objects.filter( + keywords_tsvector=get_fts_search_query(value.replace("-", "")), channel_id=OuterRef("id")))) + contentnode_search_query = (Exists(ContentNodeFullTextSearch.objects.filter( + Q(keywords_tsvector=get_fts_search_query(value)) | Q(author_tsvector=get_fts_search_query(value)), channel_id=OuterRef("id")))) + + return queryset.filter(Q(channel_keywords_query) | Q(contentnode_search_query)) def filter_languages(self, queryset, name, value): languages = value.split(",") diff --git a/contentcuration/search/constants.py b/contentcuration/search/constants.py new file mode 100644 index 0000000000..9b4a29d246 --- /dev/null +++ b/contentcuration/search/constants.py @@ -0,0 +1,23 @@ +from django.contrib.postgres.search import SearchVector + +# Postgres full text search configuration. We use "simple" to make search +# language agnostic. +POSTGRES_FTS_CONFIG = "simple" + +# ContentNode vectors and search fields. 
+CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS = ("id", "channel_id", "node_id", "content_id", "tree_id", "title", "description", "contentnode_tags") +CONTENTNODE_KEYWORDS_TSVECTOR = SearchVector(*CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) + +CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = ("author", "aggregator", "provider") +CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) + +CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR_FIELDS = ("contentnode__id", "channel_id", "contentnode__node_id", "contentnode__content_id", + "contentnode__tree_id", "contentnode__title", "contentnode__description", "contentnode_tags") +CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR = SearchVector(*CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) + +CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR_FIELDS = ("contentnode__author", "contentnode__aggregator", "contentnode__provider") +CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) + +# Channel vector and search fields. +CHANNEL_KEYWORDS_TSVECTOR_FIELDS = ("id", "main_tree__tree_id", "name", "description", "tagline", "primary_channel_token") +CHANNEL_KEYWORDS_TSVECTOR = SearchVector(*CHANNEL_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) diff --git a/contentcuration/search/management/commands/set_channel_tsvectors.py b/contentcuration/search/management/commands/set_channel_tsvectors.py index 4d27927f53..305e9a8adb 100644 --- a/contentcuration/search/management/commands/set_channel_tsvectors.py +++ b/contentcuration/search/management/commands/set_channel_tsvectors.py @@ -7,7 +7,7 @@ from django.core.management.base import BaseCommand from django.db.models import Exists from django.db.models import OuterRef -from search.models import CHANNEL_KEYWORDS_TSVECTOR +from search.constants import CHANNEL_KEYWORDS_TSVECTOR from search.models import ChannelFullTextSearch from contentcuration.models import Channel diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py index 58cf9350e8..fbc862fae6 100644 --- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py +++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py @@ -8,8 +8,8 @@ from django.core.management.base import BaseCommand from django.db.models import Exists from django.db.models import OuterRef -from search.models import CONTENTNODE_AUTHOR_TSVECTOR -from search.models import CONTENTNODE_KEYWORDS_TSVECTOR +from search.constants import CONTENTNODE_AUTHOR_TSVECTOR +from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR from search.models import ContentNodeFullTextSearch from contentcuration.models import ContentNode diff --git a/contentcuration/search/models.py b/contentcuration/search/models.py index 64b0472c8a..31793d4de7 100644 --- a/contentcuration/search/models.py +++ b/contentcuration/search/models.py @@ -2,7 +2,6 @@ from django.conf import settings from django.contrib.postgres.indexes import GinIndex -from django.contrib.postgres.search import SearchVector from django.contrib.postgres.search import SearchVectorField from django.db import models @@ -22,18 +21,6 @@ class SavedSearch(models.Model): ) -POSTGRES_FTS_CONFIG = "simple" - -CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS = ("id", "channel_id", "node_id", "content_id", "tree_id", "title", "description", "contentnode_tags") -CONTENTNODE_KEYWORDS_TSVECTOR = 
SearchVector(*CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) - -CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = ("author", "aggregator", "provider") -CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) - -CHANNEL_KEYWORDS_TSVECTOR_FIELDS = ("id", "main_tree__tree_id", "name", "description", "tagline", "primary_channel_token") -CHANNEL_KEYWORDS_TSVECTOR = SearchVector(*CHANNEL_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) - - class ContentNodeFullTextSearch(models.Model): id = StudioUUIDField(primary_key=True, default=uuid.uuid4) diff --git a/contentcuration/search/serializers.py b/contentcuration/search/serializers.py deleted file mode 100644 index 4137c0e47f..0000000000 --- a/contentcuration/search/serializers.py +++ /dev/null @@ -1,18 +0,0 @@ -from contentcuration import models as cc_models -from rest_framework import serializers - - -class ContentSearchResultSerializer(serializers.ModelSerializer): - - class Meta: - model = cc_models.ContentNode - fields = ( - 'id', - 'original_channel_id', - 'source_channel_id', - 'title', - 'kind', - 'tags', - 'children', - 'tree_id' - ) diff --git a/contentcuration/search/utils.py b/contentcuration/search/utils.py new file mode 100644 index 0000000000..9cb205bb47 --- /dev/null +++ b/contentcuration/search/utils.py @@ -0,0 +1,9 @@ +from django.contrib.postgres.search import SearchQuery +from search.constants import POSTGRES_FTS_CONFIG + + +def get_fts_search_query(value): + """ + Returns a `SearchQuery` with our postgres full text search config set on it. + """ + return SearchQuery(value=value, config=POSTGRES_FTS_CONFIG) diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index df5cf831cb..72c50aea3c 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -7,18 +7,20 @@ from django.db.models import Q from django.db.models import Subquery from django.db.models import Value +from django.db.models.functions import Coalesce from django_filters.rest_framework import BooleanFilter from django_filters.rest_framework import CharFilter from le_utils.constants import content_kinds from le_utils.constants import roles from rest_framework.permissions import IsAuthenticated +from search.models import ContentNodeFullTextSearch +from search.utils import get_fts_search_query from contentcuration.models import Channel -from contentcuration.models import ContentNode from contentcuration.models import File from contentcuration.utils.pagination import CachedListPagination +from contentcuration.viewsets.base import ReadOnlyValuesViewset from contentcuration.viewsets.base import RequiredFilterSet -from contentcuration.viewsets.base import ValuesViewset from contentcuration.viewsets.common import NotNullMapArrayAgg from contentcuration.viewsets.common import UUIDFilter from contentcuration.viewsets.common import UUIDInFilter @@ -48,110 +50,117 @@ def filter_channel_list(self, queryset, name, value): user = not self.request.user.is_anonymous and self.request.user channel_ids = [] if value == "public": - channel_ids = Channel.objects.filter(public=True, deleted=False).values_list("id", flat=True) + channel_ids = Channel.objects.filter(public=True, deleted=False, main_tree__published=True).values_list("id", flat=True) elif value == "edit" and user: - channel_ids = user.editable_channels.values_list("id", flat=True) + channel_ids = 
user.editable_channels.filter(main_tree__published=True).values_list("id", flat=True) elif value == "bookmark" and user: - channel_ids = user.bookmarked_channels.values_list("id", flat=True) + channel_ids = user.bookmarked_channels.filter(main_tree__published=True).values_list("id", flat=True) elif value == "view" and user: - channel_ids = user.view_only_channels.values_list("id", flat=True) + channel_ids = user.view_only_channels.filter(main_tree__published=True).values_list("id", flat=True) return queryset.filter(channel_id__in=list(channel_ids)) def filter_keywords(self, queryset, name, value): - return ContentNode.search(queryset=queryset, search_term=value) + return queryset.filter(Q(keywords_tsvector=get_fts_search_query(value)) + | Q(author_tsvector=get_fts_search_query(value))) def filter_author(self, queryset, name, value): - return queryset.filter( - Q(author__icontains=value) - | Q(aggregator__icontains=value) - | Q(provider__icontains=value) - ) + return queryset.filter(author_tsvector=get_fts_search_query(value)) def filter_languages(self, queryset, name, value): - return queryset.filter(language__lang_code__in=value.split(",")) + return queryset.filter(contentnode__language__lang_code__in=value.split(",")) def filter_licenses(self, queryset, name, value): licenses = [int(li) for li in value.split(",")] - return queryset.filter(license__in=licenses) + return queryset.filter(contentnode__license__in=licenses) def filter_kinds(self, queryset, name, value): - return queryset.filter(kind_id__in=value.split(",")) + return queryset.filter(contentnode__kind_id__in=value.split(",")) def filter_coach(self, queryset, name, value): - return queryset.filter(role_visibility=roles.COACH) + return queryset.filter(contentnode__role_visibility=roles.COACH) def filter_resources(self, queryset, name, value): - return queryset.exclude(kind_id=content_kinds.TOPIC) + return queryset.exclude(contentnode__kind_id=content_kinds.TOPIC) def filter_assessments(self, queryset, name, value): - return queryset.filter(kind_id=content_kinds.EXERCISE) + return queryset.filter(contentnode__kind_id=content_kinds.EXERCISE) def filter_created_after(self, queryset, name, value): date = re.search(r"(\d{4})-0?(\d+)-(\d+)", value) return queryset.filter( - created__year__gte=date.group(1), - created__month__gte=date.group(2), - created__day__gte=date.group(3), - ) - - class Meta: - model = ContentNode - fields = ( - "keywords", - "languages", - "licenses", - "kinds", - "coach", - "author", - "resources", - "assessments", + contentnode__created__year__gte=date.group(1), + contentnode__created__month__gte=date.group(2), + contentnode__created__day__gte=date.group(3), ) -class SearchContentNodeViewSet(ValuesViewset): +class SearchContentNodeViewSet(ReadOnlyValuesViewset): filterset_class = ContentNodeFilter pagination_class = ListPagination permission_classes = [IsAuthenticated] + field_map = { + "id": "contentnode__id", + "content_id": "contentnode__content_id", + "node_id": "contentnode__node_id", + "title": "contentnode__title", + "description": "contentnode__description", + "author": "contentnode__author", + "provider": "contentnode__provider", + "kind__kind": "contentnode__kind__kind", + "thumbnail_encoding": "contentnode__thumbnail_encoding", + "published": "contentnode__published", + "modified": "contentnode__modified", + "parent_id": "contentnode__parent_id", + "changed": "contentnode__changed", + } + values = ( - "id", - "content_id", - "node_id", - "title", - "description", - "author", - "provider", - 
"kind__kind", + "contentnode__id", + "contentnode__content_id", + "contentnode__node_id", + "contentnode__title", + "contentnode__description", + "contentnode__author", + "contentnode__provider", + "contentnode__kind__kind", + "contentnode__thumbnail_encoding", + "contentnode__published", + "contentnode__modified", + "contentnode__parent_id", + "contentnode__changed", "channel_id", "resource_count", "thumbnail_checksum", "thumbnail_extension", - "thumbnail_encoding", - "published", - "modified", - "parent_id", - "changed", "content_tags", "original_channel_name", ) def get_queryset(self): - return ContentNode._annotate_channel_id(ContentNode.objects) + return ContentNodeFullTextSearch.objects.select_related("contentnode") def annotate_queryset(self, queryset): """ Annotates thumbnails, resources count and channel name. """ thumbnails = File.objects.filter( - contentnode=OuterRef("id"), preset__thumbnail=True + contentnode=OuterRef("contentnode__id"), preset__thumbnail=True ) - descendant_resources_count = ExpressionWrapper(((F("rght") - F("lft") - Value(1)) / Value(2)), output_field=IntegerField()) + descendant_resources_count = ExpressionWrapper(((F("contentnode__rght") - F("contentnode__lft") - Value(1)) / Value(2)), output_field=IntegerField()) - channel_name = Subquery( - Channel.objects.filter(pk=OuterRef("channel_id")).values( - "name" - )[:1] + original_channel_name = Coalesce( + Subquery( + Channel.objects.filter(pk=OuterRef("contentnode__original_channel_id")).values( + "name" + )[:1] + ), + Subquery( + Channel.objects.filter(main_tree__tree_id=OuterRef("contentnode__tree_id")).values( + "name" + )[:1] + ), ) queryset = queryset.annotate( @@ -160,8 +169,8 @@ def annotate_queryset(self, queryset): thumbnail_extension=Subquery( thumbnails.values("file_format__extension")[:1] ), - content_tags=NotNullMapArrayAgg("tags__tag_name"), - original_channel_name=channel_name, + content_tags=NotNullMapArrayAgg("contentnode__tags__tag_name"), + original_channel_name=original_channel_name, ) return queryset From aa62dc2017a334d7c32710e99834f57cf1754fff Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Wed, 14 Sep 2022 18:07:29 +0530 Subject: [PATCH 16/25] Sync tsvectors on publish! 
--- .../views/ImportFromChannels/ChannelList.vue | 1 + .../contentcuration/utils/publish.py | 65 ++++++++++--------- .../contentcuration/viewsets/channel.py | 7 +- contentcuration/search/constants.py | 7 -- .../search/viewsets/contentnode.py | 21 +++--- 5 files changed, 52 insertions(+), 49 deletions(-) diff --git a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/ChannelList.vue b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/ChannelList.vue index 76dc9e1f8c..43de8e8ce4 100644 --- a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/ChannelList.vue +++ b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/ChannelList.vue @@ -112,6 +112,7 @@ [this.channelFilter]: true, page: this.$route.query.page || 1, exclude: this.currentChannelId, + published: true, }).then(page => { this.pageCount = page.total_pages; this.channels = page.results; diff --git a/contentcuration/contentcuration/utils/publish.py b/contentcuration/contentcuration/utils/publish.py index afe4abdef4..2b49e13b24 100644 --- a/contentcuration/contentcuration/utils/publish.py +++ b/contentcuration/contentcuration/utils/publish.py @@ -44,8 +44,6 @@ from search.constants import CHANNEL_KEYWORDS_TSVECTOR from search.constants import CONTENTNODE_AUTHOR_TSVECTOR from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR -from search.constants import CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR -from search.constants import CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR from search.models import ChannelFullTextSearch from search.models import ContentNodeFullTextSearch @@ -819,9 +817,10 @@ def fill_published_fields(channel, version_notes): channel.save() -def create_or_update_tsvectors(channel_id): +def sync_contentnode_and_channel_tsvectors(channel_id): """ - Create or update tsvectors for the channel and all its content nodes. + Creates, deletes and updates tsvectors of the channel and all its content nodes + to reflect the current state of channel's main tree. """ # Update or create channel tsvector entry. logging.info("Starting to set tsvectors for channel with id {}.".format(channel_id)) @@ -829,52 +828,63 @@ def create_or_update_tsvectors(channel_id): from contentcuration.viewsets.channel import primary_token_subquery channel = (ccmodels.Channel.objects + .filter(pk=channel_id) .annotate(primary_channel_token=primary_token_subquery, keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) - .get(pk=channel_id)) + .values("keywords_tsvector", "main_tree__tree_id") + .get()) if ChannelFullTextSearch.objects.filter(channel_id=channel_id).exists(): - update_count = ChannelFullTextSearch.objects.filter(channel_id=channel_id).update(keywords_tsvector=channel.keywords_tsvector) + update_count = ChannelFullTextSearch.objects.filter(channel_id=channel_id).update(keywords_tsvector=channel["keywords_tsvector"]) logging.info("Updated {} channel tsvector.".format(update_count)) else: - obj = ChannelFullTextSearch(channel_id=channel_id, keywords_tsvector=channel.keywords_tsvector) + obj = ChannelFullTextSearch(channel_id=channel_id, keywords_tsvector=channel["keywords_tsvector"]) obj.save() logging.info("Created 1 channel tsvector.") # Update or create contentnodes tsvector entry for channel_id. 
- logging.info("Setting tsvectors for all contentnodes in channel {}.".format(channel_id)) + logging.info("Starting to set tsvectors for all contentnodes in channel {}.".format(channel_id)) - if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists(): + nodes_tsvector_query = (ccmodels.ContentNode.objects + .filter(tree_id=channel["main_tree__tree_id"]) + .annotate(channel_id=Value(channel_id), + contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), + keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, + author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) + .order_by()) + if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists(): # First, delete nodes that are no longer in main_tree. - nodes_no_longer_in_main_tree = ~Exists(channel.main_tree.get_family().filter(id=OuterRef("contentnode_id"))) + nodes_no_longer_in_main_tree = ~Exists(ccmodels.ContentNode.objects.filter(id=OuterRef("contentnode_id"), tree_id=channel["main_tree__tree_id"])) ContentNodeFullTextSearch.objects.filter(nodes_no_longer_in_main_tree, channel_id=channel_id).delete() # Now, all remaining nodes are in main_tree, so let's update them. - update_count = (ContentNodeFullTextSearch.objects.filter(channel_id=channel_id) - .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" ")) - .update(keywords_tsvector=CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR, author_tsvector=CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR)) + # Update only changed nodes. + nodes_to_update = ContentNodeFullTextSearch.objects.filter(channel_id=channel_id, contentnode__changed=True) + + update_objs = list() + for node in nodes_to_update: + corresponding_contentnode = nodes_tsvector_query.filter(pk=node.contentnode_id).values("keywords_tsvector", "author_tsvector").first() + if corresponding_contentnode: + node.keywords_tsvector = corresponding_contentnode["keywords_tsvector"] + node.author_tsvector = corresponding_contentnode["author_tsvector"] + update_objs.append(node) + ContentNodeFullTextSearch.objects.bulk_update(update_objs, ["keywords_tsvector", "author_tsvector"]) + del update_objs # Insert newly created nodes. 
- nodes_not_having_tsvector_record = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) - nodes_to_insert = (channel.main_tree.get_family() + nodes_not_having_tsvector_record = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"), channel_id=channel_id)) + nodes_to_insert = (nodes_tsvector_query .filter(nodes_not_having_tsvector_record) - .annotate(channel_id=Value(channel_id), - contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), - keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, - author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) .values("id", "channel_id", "keywords_tsvector", "author_tsvector")) insert_objs = list() - for node in nodes_to_insert: obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) insert_objs.append(obj) - inserted_nodes_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) - - logging.info("Successfully inserted {} and updated {} contentnode tsvectors.".format(len(inserted_nodes_list), update_count)) + logging.info("Successfully inserted {} contentnode tsvectors.".format(len(inserted_nodes_list))) @delay_user_storage_calculation @@ -887,8 +897,6 @@ def publish_channel( send_email=False, progress_tracker=None, language=settings.LANGUAGE_CODE, - - ): """ :type progress_tracker: contentcuration.utils.celery.ProgressTracker|None @@ -900,8 +908,9 @@ def publish_channel( set_channel_icon_encoding(channel) kolibri_temp_db = create_content_database(channel, force, user_id, force_exercises, progress_tracker=progress_tracker) increment_channel_version(channel) - mark_all_nodes_as_published(channel) add_tokens_to_channel(channel) + sync_contentnode_and_channel_tsvectors(channel_id=channel.id) + mark_all_nodes_as_published(channel) fill_published_fields(channel, version_notes) # Attributes not getting set for some reason, so just save it here @@ -914,10 +923,6 @@ def publish_channel( if channel.public: delete_public_channel_cache_keys() - # Enqueue tsvector task to update or create channel tsvectors and all its - # contentnodes tsvector entries. 
- create_or_update_tsvectors(channel_id=channel_id) - if send_email: with override(language): send_emails(channel, user_id, version_notes=version_notes) diff --git a/contentcuration/contentcuration/viewsets/channel.py b/contentcuration/contentcuration/viewsets/channel.py index ca5c487e0a..90b9a8f84d 100644 --- a/contentcuration/contentcuration/viewsets/channel.py +++ b/contentcuration/contentcuration/viewsets/channel.py @@ -122,10 +122,13 @@ def filter_deleted(self, queryset, name, value): return queryset.filter(deleted=value) def filter_keywords(self, queryset, name, value): + search_query = get_fts_search_query(value) + dash_replaced_search_query = get_fts_search_query(value.replace("-", "")) + channel_keywords_query = (Exists(ChannelFullTextSearch.objects.filter( - keywords_tsvector=get_fts_search_query(value.replace("-", "")), channel_id=OuterRef("id")))) + Q(keywords_tsvector=search_query) | Q(keywords_tsvector=dash_replaced_search_query), channel_id=OuterRef("id")))) contentnode_search_query = (Exists(ContentNodeFullTextSearch.objects.filter( - Q(keywords_tsvector=get_fts_search_query(value)) | Q(author_tsvector=get_fts_search_query(value)), channel_id=OuterRef("id")))) + Q(keywords_tsvector=search_query) | Q(author_tsvector=search_query), channel_id=OuterRef("id")))) return queryset.filter(Q(channel_keywords_query) | Q(contentnode_search_query)) diff --git a/contentcuration/search/constants.py b/contentcuration/search/constants.py index 9b4a29d246..1ac316c3ae 100644 --- a/contentcuration/search/constants.py +++ b/contentcuration/search/constants.py @@ -11,13 +11,6 @@ CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = ("author", "aggregator", "provider") CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) -CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR_FIELDS = ("contentnode__id", "channel_id", "contentnode__node_id", "contentnode__content_id", - "contentnode__tree_id", "contentnode__title", "contentnode__description", "contentnode_tags") -CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR = SearchVector(*CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) - -CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR_FIELDS = ("contentnode__author", "contentnode__aggregator", "contentnode__provider") -CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) - # Channel vector and search fields. 
CHANNEL_KEYWORDS_TSVECTOR_FIELDS = ("id", "main_tree__tree_id", "name", "description", "tagline", "primary_channel_token") CHANNEL_KEYWORDS_TSVECTOR = SearchVector(*CHANNEL_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG) diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 72c50aea3c..691dfbe70e 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -49,19 +49,22 @@ class ContentNodeFilter(RequiredFilterSet): def filter_channel_list(self, queryset, name, value): user = not self.request.user.is_anonymous and self.request.user channel_ids = [] + if value == "public": - channel_ids = Channel.objects.filter(public=True, deleted=False, main_tree__published=True).values_list("id", flat=True) + channel_ids = Channel.get_public_channels().values_list("id", flat=True) elif value == "edit" and user: - channel_ids = user.editable_channels.filter(main_tree__published=True).values_list("id", flat=True) + channel_ids = user.editable_channels.values_list("id", flat=True) elif value == "bookmark" and user: - channel_ids = user.bookmarked_channels.filter(main_tree__published=True).values_list("id", flat=True) + channel_ids = user.bookmarked_channels.values_list("id", flat=True) elif value == "view" and user: - channel_ids = user.view_only_channels.filter(main_tree__published=True).values_list("id", flat=True) + channel_ids = user.view_only_channels.values_list("id", flat=True) + return queryset.filter(channel_id__in=list(channel_ids)) def filter_keywords(self, queryset, name, value): - return queryset.filter(Q(keywords_tsvector=get_fts_search_query(value)) - | Q(author_tsvector=get_fts_search_query(value))) + search_query = get_fts_search_query(value) + return queryset.filter(Q(keywords_tsvector=search_query) + | Q(author_tsvector=search_query)) def filter_author(self, queryset, name, value): return queryset.filter(author_tsvector=get_fts_search_query(value)) @@ -98,6 +101,7 @@ class SearchContentNodeViewSet(ReadOnlyValuesViewset): filterset_class = ContentNodeFilter pagination_class = ListPagination permission_classes = [IsAuthenticated] + queryset = ContentNodeFullTextSearch.objects.all() field_map = { "id": "contentnode__id", @@ -137,12 +141,9 @@ class SearchContentNodeViewSet(ReadOnlyValuesViewset): "original_channel_name", ) - def get_queryset(self): - return ContentNodeFullTextSearch.objects.select_related("contentnode") - def annotate_queryset(self, queryset): """ - Annotates thumbnails, resources count and channel name. + Annotates thumbnails, resources count and original channel name. """ thumbnails = File.objects.filter( contentnode=OuterRef("contentnode__id"), preset__thumbnail=True From 2672512de766ba0970a4be9bdc622b656521961a Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Wed, 14 Sep 2022 20:19:40 +0530 Subject: [PATCH 17/25] fix: tests and ready for merge! 
<3 --- .../contentcuration/tests/testdata.py | 18 +++--- contentcuration/search/tests/test_search.py | 62 +++++++++++-------- 2 files changed, 43 insertions(+), 37 deletions(-) diff --git a/contentcuration/contentcuration/tests/testdata.py b/contentcuration/contentcuration/tests/testdata.py index 5560e853a5..50895337b1 100644 --- a/contentcuration/contentcuration/tests/testdata.py +++ b/contentcuration/contentcuration/tests/testdata.py @@ -195,23 +195,21 @@ def node(data, parent=None): return new_node -def tree(parent=None, tree_data=None): +def tree(parent=None): # Read from json fixture - if tree_data is None: - filepath = os.path.sep.join([os.path.dirname(__file__), "fixtures", "tree.json"]) - with open(filepath, "rb") as jsonfile: - tree_data = json.load(jsonfile) + filepath = os.path.sep.join([os.path.dirname(__file__), "fixtures", "tree.json"]) + with open(filepath, "rb") as jsonfile: + data = json.load(jsonfile) - return node(tree_data, parent) + return node(data, parent) -def channel(name="testchannel", create_main_tree=True, main_tree_data=None): +def channel(name="testchannel"): channel = cc.Channel.objects.create(name=name) channel.save() - if create_main_tree: - channel.main_tree = tree(tree_data=main_tree_data) - channel.save() + channel.main_tree = tree() + channel.save() return channel diff --git a/contentcuration/search/tests/test_search.py b/contentcuration/search/tests/test_search.py index 6b3d34cc41..8489ece577 100644 --- a/contentcuration/search/tests/test_search.py +++ b/contentcuration/search/tests/test_search.py @@ -3,28 +3,39 @@ from django.urls import reverse from contentcuration.models import Channel +from contentcuration.models import ContentNode from contentcuration.tests import testdata from contentcuration.tests.base import StudioAPITestCase +from contentcuration.utils.publish import sync_contentnode_and_channel_tsvectors + + +def dummy_publish(channel): + channel_nodes = ContentNode.objects.filter(tree_id=channel.main_tree.tree_id) + for node in channel_nodes: + node.published = True + node.changed = False + node.save() + sync_contentnode_and_channel_tsvectors(channel_id=channel.id) class SearchViewsetTestCase(StudioAPITestCase): + def setUp(self): + super().setUp() + self.channel = testdata.channel() + self.user = testdata.user() + self.channel.editors.add(self.user) + dummy_publish(self.channel) def test_filter_exclude_channels(self): - user = testdata.user() - self.client.force_authenticate(user=user) - channel = testdata.channel() - channel.editors.add(user) + self.client.force_authenticate(user=self.user) response = self.client.get( - reverse("search-list"), data={"exclude_channel": channel.id}, format="json", + reverse("search-list"), data={"exclude_channel": self.channel.id}, format="json", ) self.assertEqual(response.status_code, 200, response.content) self.assertEqual(response.data["results"], []) def test_filter_channels_by_edit(self): - user = testdata.user() - self.client.force_authenticate(user=user) - channel = testdata.channel() - channel.editors.add(user) + self.client.force_authenticate(user=self.user) response = self.client.get( reverse("search-list"), data={"channel_list": "edit"}, format="json", ) @@ -35,6 +46,7 @@ def test_search(self): users = [] channels = [] + # Create channels, users. 
for i in range(4): user = testdata.user(email="a{}@a.com".format(i)) users.append(user) @@ -46,20 +58,11 @@ def test_search(self): public_channel, editable_channel, viewable_channel, inaccessible_channel = channels - # Create public video node and publish it. + # Create public video node. public_video_node = testdata.node({ "title": "Kolibri video", "kind_id": "video", }, parent=public_channel.main_tree) - public_video_node.complete = True - public_video_node.published = True - public_video_node.changed = False - public_video_node.save() - - # Publish the public_channel. - public_channel.main_tree.published = True - public_channel.main_tree.changed = False - public_channel.main_tree.save() public_channel.public = True public_channel.save() @@ -72,6 +75,10 @@ def test_search(self): public_video_node.copy_to(target=viewable_channel.main_tree) public_video_node.copy_to(target=inaccessible_channel.main_tree) + # Publish all channels to make them searchable. + for channel in channels: + dummy_publish(channel) + # Get different nodes based on access. editable_channel.main_tree.refresh_from_db() editable_video_node = editable_channel.main_tree.get_descendants().first() @@ -93,12 +100,13 @@ def test_search(self): format="json", ) - for result in response.data["results"]: - self.assertNotEqual(result["id"], inaccessible_video_node.id) + result = response.data["results"][0] + + self.assertNotEqual(result["id"], inaccessible_video_node.id) - if channel_list == "public": - self.assertEqual(result["id"], public_video_node.id) - elif channel_list == "edit": - self.assertEqual(result["id"], editable_video_node.id) - elif channel_list == "view": - self.assertEqual(result["id"], viewable_video_node.id) + if channel_list == "public": + self.assertEqual(result["id"], public_video_node.id) + elif channel_list == "edit": + self.assertEqual(result["id"], editable_video_node.id) + elif channel_list == "view": + self.assertEqual(result["id"], viewable_video_node.id) From 1ffa30d7348a2bd2cd79961caf5b362cf0e1e39c Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Thu, 15 Sep 2022 01:41:51 +0530 Subject: [PATCH 18/25] fix: node command edge case; when published nodes go to trash tree, they remain as published --- .../management/commands/set_contentnode_tsvectors.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py index fbc862fae6..4e5673d9ec 100644 --- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py +++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py @@ -32,7 +32,7 @@ def handle(self, *args, **options): .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) - .filter(tsvector_not_already_inserted_query, published=True) + .filter(tsvector_not_already_inserted_query, published=True, channel_id__isnull=False) .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()) insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) @@ -43,10 +43,9 @@ def handle(self, *args, **options): insert_objs = list() for node in insertable_nodes_tsvector: - if node["channel_id"]: - obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], - keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) - insert_objs.append(obj) 
+ obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], + keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) + insert_objs.append(obj) inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) From 57724e0787a08a77d9ee28f6be064aa8c3cf1e0d Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 16 Sep 2022 16:01:51 +0530 Subject: [PATCH 19/25] Enforce only-one search entries --- contentcuration/search/migrations/0003_fulltextsearch.py | 6 +++--- contentcuration/search/models.py | 4 ++-- contentcuration/search/viewsets/contentnode.py | 5 +---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/contentcuration/search/migrations/0003_fulltextsearch.py b/contentcuration/search/migrations/0003_fulltextsearch.py index 2885a4655f..632df6a39e 100644 --- a/contentcuration/search/migrations/0003_fulltextsearch.py +++ b/contentcuration/search/migrations/0003_fulltextsearch.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.14 on 2022-09-08 07:19 +# Generated by Django 3.2.14 on 2022-09-16 08:55 import uuid import django.contrib.postgres.indexes @@ -28,7 +28,7 @@ class Migration(migrations.Migration): ('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)), ('author_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)), ('channel', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='channel_nodes_fts', to='contentcuration.channel')), - ('contentnode', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='node_fts', to='contentcuration.contentnode')), + ('contentnode', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='node_fts', to='contentcuration.contentnode')), ], ), migrations.CreateModel( @@ -36,7 +36,7 @@ class Migration(migrations.Migration): fields=[ ('id', contentcuration.models.UUIDField(default=uuid.uuid4, max_length=32, primary_key=True, serialize=False)), ('keywords_tsvector', django.contrib.postgres.search.SearchVectorField(blank=True, null=True)), - ('channel', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='channel_fts', to='contentcuration.channel')), + ('channel', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='channel_fts', to='contentcuration.channel')), ], ), AddIndexConcurrently( diff --git a/contentcuration/search/models.py b/contentcuration/search/models.py index 31793d4de7..9e121af509 100644 --- a/contentcuration/search/models.py +++ b/contentcuration/search/models.py @@ -25,7 +25,7 @@ class ContentNodeFullTextSearch(models.Model): id = StudioUUIDField(primary_key=True, default=uuid.uuid4) # The contentnode that this record points to. - contentnode = models.ForeignKey(ContentNode, on_delete=models.CASCADE, related_name="node_fts") + contentnode = models.OneToOneField(ContentNode, on_delete=models.CASCADE, related_name="node_fts") # The channel to which the contentnode belongs. Channel cannot be NULL because we only allow # searches to be made inside channels. @@ -46,7 +46,7 @@ class ChannelFullTextSearch(models.Model): id = StudioUUIDField(primary_key=True, default=uuid.uuid4) # The channel to which this record points. - channel = models.ForeignKey(Channel, on_delete=models.CASCADE, related_name="channel_fts") + channel = models.OneToOneField(Channel, on_delete=models.CASCADE, related_name="channel_fts") # This stores the channel keywords as tsvector for super fast searches. 
keywords_tsvector = SearchVectorField(null=True, blank=True) diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 691dfbe70e..aa6c7fe9b7 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -4,7 +4,6 @@ from django.db.models import F from django.db.models import IntegerField from django.db.models import OuterRef -from django.db.models import Q from django.db.models import Subquery from django.db.models import Value from django.db.models.functions import Coalesce @@ -62,9 +61,7 @@ def filter_channel_list(self, queryset, name, value): return queryset.filter(channel_id__in=list(channel_ids)) def filter_keywords(self, queryset, name, value): - search_query = get_fts_search_query(value) - return queryset.filter(Q(keywords_tsvector=search_query) - | Q(author_tsvector=search_query)) + return queryset.filter(keywords_tsvector=get_fts_search_query(value)) def filter_author(self, queryset, name, value): return queryset.filter(author_tsvector=get_fts_search_query(value)) From e53e56a2e300ea31c10f994dfe3cf5ff0ec25186 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 16 Sep 2022 16:41:27 +0530 Subject: [PATCH 20/25] Remove unnecessary select_related --- .../search/management/commands/set_channel_tsvectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contentcuration/search/management/commands/set_channel_tsvectors.py b/contentcuration/search/management/commands/set_channel_tsvectors.py index 305e9a8adb..d82f4848f5 100644 --- a/contentcuration/search/management/commands/set_channel_tsvectors.py +++ b/contentcuration/search/management/commands/set_channel_tsvectors.py @@ -27,8 +27,8 @@ def handle(self, *args, **options): channel_not_already_inserted_query = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id"))) - channel_query = (Channel.objects.select_related("main_tree") - .filter(channel_not_already_inserted_query, deleted=False, main_tree__published=True) + channel_query = (Channel.objects.filter(channel_not_already_inserted_query, + deleted=False, main_tree__published=True) .annotate(primary_channel_token=primary_token_subquery, keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) .values("id", "keywords_tsvector")) From 4b3d4c7d0a586afce5236de13bfb8ce12064e52e Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 16 Sep 2022 17:00:04 +0530 Subject: [PATCH 21/25] fix cache tests mock by setting ContentNodeFullTextSearch --- contentcuration/contentcuration/tests/utils/test_cache.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/contentcuration/contentcuration/tests/utils/test_cache.py b/contentcuration/contentcuration/tests/utils/test_cache.py index 1bbf69d580..22a2a6a62b 100644 --- a/contentcuration/contentcuration/tests/utils/test_cache.py +++ b/contentcuration/contentcuration/tests/utils/test_cache.py @@ -1,5 +1,6 @@ import mock from django.test import SimpleTestCase +from search.models import ContentNodeFullTextSearch from ..helpers import mock_class_instance from contentcuration.models import ContentNode @@ -9,7 +10,9 @@ class ResourceSizeCacheTestCase(SimpleTestCase): def setUp(self): super(ResourceSizeCacheTestCase, self).setUp() - self.node = mock.Mock(spec_set=ContentNode()) + c = ContentNode() + c.node_fts = ContentNodeFullTextSearch() + self.node = mock.Mock(spec_set=c) self.node.pk = "abcdefghijklmnopqrstuvwxyz" self.redis_client = mock_class_instance("redis.client.StrictRedis") self.cache_client = 
mock_class_instance("django_redis.client.DefaultClient") From 44ab74c05b06881dabacb9c7bdd9d6bdbc8f4a65 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Fri, 16 Sep 2022 21:33:54 +0530 Subject: [PATCH 22/25] fix cache & nodes tests by using db-friendly TestCase --- .../contentcuration/tests/utils/test_cache.py | 9 +++------ .../contentcuration/tests/utils/test_nodes.py | 4 ++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/contentcuration/contentcuration/tests/utils/test_cache.py b/contentcuration/contentcuration/tests/utils/test_cache.py index 22a2a6a62b..5327da19ac 100644 --- a/contentcuration/contentcuration/tests/utils/test_cache.py +++ b/contentcuration/contentcuration/tests/utils/test_cache.py @@ -1,18 +1,15 @@ import mock -from django.test import SimpleTestCase -from search.models import ContentNodeFullTextSearch +from django.test import TestCase from ..helpers import mock_class_instance from contentcuration.models import ContentNode from contentcuration.utils.cache import ResourceSizeCache -class ResourceSizeCacheTestCase(SimpleTestCase): +class ResourceSizeCacheTestCase(TestCase): def setUp(self): super(ResourceSizeCacheTestCase, self).setUp() - c = ContentNode() - c.node_fts = ContentNodeFullTextSearch() - self.node = mock.Mock(spec_set=c) + self.node = mock.Mock(spec_set=ContentNode()) self.node.pk = "abcdefghijklmnopqrstuvwxyz" self.redis_client = mock_class_instance("redis.client.StrictRedis") self.cache_client = mock_class_instance("django_redis.client.DefaultClient") diff --git a/contentcuration/contentcuration/tests/utils/test_nodes.py b/contentcuration/contentcuration/tests/utils/test_nodes.py index 3b96c30a3c..83171288d6 100644 --- a/contentcuration/contentcuration/tests/utils/test_nodes.py +++ b/contentcuration/contentcuration/tests/utils/test_nodes.py @@ -6,7 +6,7 @@ from dateutil.parser import isoparse from django.db.models import F from django.db.models import Max -from django.test import SimpleTestCase +from django.test import TestCase from ..base import StudioTestCase from contentcuration.models import ContentNode @@ -42,7 +42,7 @@ def test_modified_since(self): @mock.patch("contentcuration.utils.nodes.ResourceSizeHelper") @mock.patch("contentcuration.utils.nodes.ResourceSizeCache") -class CalculateResourceSizeTestCase(SimpleTestCase): +class CalculateResourceSizeTestCase(TestCase): def setUp(self): super(CalculateResourceSizeTestCase, self).setUp() self.node = mock.Mock(spec_set=ContentNode()) From 001e788f38a486b883009a988faf9082e74eec5d Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Wed, 21 Sep 2022 22:53:33 +0530 Subject: [PATCH 23/25] Use command for tsv insertion & simpler tsv update on publish --- Makefile | 2 +- .../views/ImportFromChannels/BrowsingCard.vue | 2 +- .../contentcuration/tests/helpers.py | 8 ++ .../contentcuration/tests/utils/test_cache.py | 7 +- .../contentcuration/tests/utils/test_nodes.py | 8 +- .../contentcuration/utils/publish.py | 78 +++++++------------ .../commands/set_channel_tsvectors.py | 11 +-- .../commands/set_contentnode_tsvectors.py | 45 +++++++---- contentcuration/search/utils.py | 39 ++++++++++ .../search/viewsets/contentnode.py | 4 +- 10 files changed, 118 insertions(+), 86 deletions(-) diff --git a/Makefile b/Makefile index 29fe984285..99b55d3762 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ learningactivities: set-tsvectors: python contentcuration/manage.py set_channel_tsvectors - python contentcuration/manage.py set_contentnode_tsvectors + python contentcuration/manage.py 
set_contentnode_tsvectors --published ############################################################### # END PRODUCTION COMMANDS ##################################### diff --git a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue index 70f1ccafb7..5d0447df50 100644 --- a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue +++ b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue @@ -149,7 +149,7 @@ if (this.isTopic) { return `${baseUrl}#/${this.node.id}`; } - return `${baseUrl}#/${this.node.parent}/${this.node.id}`; + return `${baseUrl}#/${this.node.parent_id}/${this.node.id}`; }, resourcesMsg() { let count; diff --git a/contentcuration/contentcuration/tests/helpers.py b/contentcuration/contentcuration/tests/helpers.py index 8e3172fcd4..cf1d54130e 100644 --- a/contentcuration/contentcuration/tests/helpers.py +++ b/contentcuration/contentcuration/tests/helpers.py @@ -2,7 +2,9 @@ from importlib import import_module import mock +from search.models import ContentNodeFullTextSearch +from contentcuration.models import ContentNode from contentcuration.models import TaskResult @@ -39,6 +41,12 @@ def mock_class_instance(target): else: target_cls = target + # ContentNode's node_fts field can be handled by Django when tests + # access the database but we mock it so that we don't need to query + # the database. By doing so we get faster test execution. + if type(target_cls) is ContentNode: + target_cls.node_fts = ContentNodeFullTextSearch() + class MockClass(target_cls): def __new__(cls, *args, **kwargs): return mock.Mock(spec_set=cls) diff --git a/contentcuration/contentcuration/tests/utils/test_cache.py b/contentcuration/contentcuration/tests/utils/test_cache.py index 5327da19ac..d16570648a 100644 --- a/contentcuration/contentcuration/tests/utils/test_cache.py +++ b/contentcuration/contentcuration/tests/utils/test_cache.py @@ -1,15 +1,14 @@ import mock -from django.test import TestCase +from django.test import SimpleTestCase from ..helpers import mock_class_instance -from contentcuration.models import ContentNode from contentcuration.utils.cache import ResourceSizeCache -class ResourceSizeCacheTestCase(TestCase): +class ResourceSizeCacheTestCase(SimpleTestCase): def setUp(self): super(ResourceSizeCacheTestCase, self).setUp() - self.node = mock.Mock(spec_set=ContentNode()) + self.node = mock_class_instance("contentcuration.models.ContentNode") self.node.pk = "abcdefghijklmnopqrstuvwxyz" self.redis_client = mock_class_instance("redis.client.StrictRedis") self.cache_client = mock_class_instance("django_redis.client.DefaultClient") diff --git a/contentcuration/contentcuration/tests/utils/test_nodes.py b/contentcuration/contentcuration/tests/utils/test_nodes.py index 83171288d6..be43d295dd 100644 --- a/contentcuration/contentcuration/tests/utils/test_nodes.py +++ b/contentcuration/contentcuration/tests/utils/test_nodes.py @@ -6,10 +6,10 @@ from dateutil.parser import isoparse from django.db.models import F from django.db.models import Max -from django.test import TestCase +from django.test import SimpleTestCase from ..base import StudioTestCase -from contentcuration.models import ContentNode +from contentcuration.tests.helpers import mock_class_instance from contentcuration.utils.nodes import calculate_resource_size from contentcuration.utils.nodes import ResourceSizeHelper from 
contentcuration.utils.nodes import SlowCalculationError @@ -42,10 +42,10 @@ def test_modified_since(self): @mock.patch("contentcuration.utils.nodes.ResourceSizeHelper") @mock.patch("contentcuration.utils.nodes.ResourceSizeCache") -class CalculateResourceSizeTestCase(TestCase): +class CalculateResourceSizeTestCase(SimpleTestCase): def setUp(self): super(CalculateResourceSizeTestCase, self).setUp() - self.node = mock.Mock(spec_set=ContentNode()) + self.node = mock_class_instance("contentcuration.models.ContentNode") def assertCalculation(self, cache, helper, force=False): helper().get_size.return_value = 456 diff --git a/contentcuration/contentcuration/utils/publish.py b/contentcuration/contentcuration/utils/publish.py index 2b49e13b24..017bf4a561 100644 --- a/contentcuration/contentcuration/utils/publish.py +++ b/contentcuration/contentcuration/utils/publish.py @@ -14,7 +14,6 @@ from itertools import chain from django.conf import settings -from django.contrib.postgres.aggregates import StringAgg from django.core.files import File from django.core.files.storage import default_storage as storage from django.core.management import call_command @@ -23,8 +22,8 @@ from django.db.models import Max from django.db.models import OuterRef from django.db.models import Q +from django.db.models import Subquery from django.db.models import Sum -from django.db.models import Value from django.db.utils import IntegrityError from django.template.loader import render_to_string from django.utils import timezone @@ -41,11 +40,10 @@ from le_utils.constants import roles from past.builtins import basestring from past.utils import old_div -from search.constants import CHANNEL_KEYWORDS_TSVECTOR -from search.constants import CONTENTNODE_AUTHOR_TSVECTOR -from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR from search.models import ChannelFullTextSearch from search.models import ContentNodeFullTextSearch +from search.utils import get_fts_annotated_channel_qs +from search.utils import get_fts_annotated_contentnode_qs from contentcuration import models as ccmodels from contentcuration.decorators import delay_user_storage_calculation @@ -823,35 +821,22 @@ def sync_contentnode_and_channel_tsvectors(channel_id): to reflect the current state of channel's main tree. """ # Update or create channel tsvector entry. 
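Because ChannelFullTextSearch.channel is now a one-to-one relation (see the migration earlier in this series), the exists()/update/create branching removed just below collapses into a single upsert. A minimal sketch of the pattern, with channel_id and channel["keywords_tsvector"] taken from the enclosing function (illustrative only):

    from search.models import ChannelFullTextSearch

    # update_or_create either rewrites the one existing row for this channel or
    # inserts it; the one-to-one constraint guarantees there is never more than
    # one tsvector row per channel.
    record, created = ChannelFullTextSearch.objects.update_or_create(
        channel_id=channel_id,
        defaults={"keywords_tsvector": channel["keywords_tsvector"]},
    )
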
- logging.info("Starting to set tsvectors for channel with id {}.".format(channel_id)) + logging.info("Setting tsvector for channel with id {}.".format(channel_id)) - from contentcuration.viewsets.channel import primary_token_subquery - - channel = (ccmodels.Channel.objects - .filter(pk=channel_id) - .annotate(primary_channel_token=primary_token_subquery, - keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) + channel = (get_fts_annotated_channel_qs() .values("keywords_tsvector", "main_tree__tree_id") - .get()) + .get(pk=channel_id)) - if ChannelFullTextSearch.objects.filter(channel_id=channel_id).exists(): - update_count = ChannelFullTextSearch.objects.filter(channel_id=channel_id).update(keywords_tsvector=channel["keywords_tsvector"]) - logging.info("Updated {} channel tsvector.".format(update_count)) - else: - obj = ChannelFullTextSearch(channel_id=channel_id, keywords_tsvector=channel["keywords_tsvector"]) - obj.save() + obj, is_created = ChannelFullTextSearch.objects.update_or_create(channel_id=channel_id, defaults={"keywords_tsvector": channel["keywords_tsvector"]}) + del obj + + if is_created: logging.info("Created 1 channel tsvector.") + else: + logging.info("Updated 1 channel tsvector.") # Update or create contentnodes tsvector entry for channel_id. - logging.info("Starting to set tsvectors for all contentnodes in channel {}.".format(channel_id)) - - nodes_tsvector_query = (ccmodels.ContentNode.objects - .filter(tree_id=channel["main_tree__tree_id"]) - .annotate(channel_id=Value(channel_id), - contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), - keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, - author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) - .order_by()) + logging.info("Setting tsvectors for all main tree contentnodes in channel {}.".format(channel_id)) if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists(): # First, delete nodes that are no longer in main_tree. @@ -860,31 +845,22 @@ def sync_contentnode_and_channel_tsvectors(channel_id): # Now, all remaining nodes are in main_tree, so let's update them. # Update only changed nodes. - nodes_to_update = ContentNodeFullTextSearch.objects.filter(channel_id=channel_id, contentnode__changed=True) - - update_objs = list() - for node in nodes_to_update: - corresponding_contentnode = nodes_tsvector_query.filter(pk=node.contentnode_id).values("keywords_tsvector", "author_tsvector").first() - if corresponding_contentnode: - node.keywords_tsvector = corresponding_contentnode["keywords_tsvector"] - node.author_tsvector = corresponding_contentnode["author_tsvector"] - update_objs.append(node) - ContentNodeFullTextSearch.objects.bulk_update(update_objs, ["keywords_tsvector", "author_tsvector"]) - del update_objs + changed_nodes_subquery = (get_fts_annotated_contentnode_qs(channel_id) + .filter(id=OuterRef("contentnode_id"), + tree_id=channel["main_tree__tree_id"], + complete=True, + changed=True) + .order_by()) + ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).update( + keywords_tsvector=Subquery(changed_nodes_subquery.values_list("keywords_tsvector")[:1]), + author_tsvector=Subquery(changed_nodes_subquery.values_list("author_tsvector")[:1]) + ) # Insert newly created nodes. 
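The "not yet indexed" rows, both in the code removed below and in the two management commands, are selected with an Exists() anti-join rather than a NOT IN list. A small self-contained sketch of that pattern (illustrative only):

    from django.db.models import Exists, OuterRef
    from search.models import ContentNodeFullTextSearch

    from contentcuration.models import ContentNode

    # Keep only content nodes that have no ContentNodeFullTextSearch row yet;
    # the subquery is correlated on the node id and compiles to NOT EXISTS.
    missing_tsvector = ~Exists(
        ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))
    )
    nodes_to_index = ContentNode.objects.filter(missing_tsvector)
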
- nodes_not_having_tsvector_record = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"), channel_id=channel_id)) - nodes_to_insert = (nodes_tsvector_query - .filter(nodes_not_having_tsvector_record) - .values("id", "channel_id", "keywords_tsvector", "author_tsvector")) - - insert_objs = list() - for node in nodes_to_insert: - obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], - keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) - insert_objs.append(obj) - inserted_nodes_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) - logging.info("Successfully inserted {} contentnode tsvectors.".format(len(inserted_nodes_list))) + call_command("set_contentnode_tsvectors", + "--channel-id={}".format(channel_id), + "--tree-id={}".format(channel["main_tree__tree_id"]), + "--complete") @delay_user_storage_calculation diff --git a/contentcuration/search/management/commands/set_channel_tsvectors.py b/contentcuration/search/management/commands/set_channel_tsvectors.py index d82f4848f5..68d7e17b51 100644 --- a/contentcuration/search/management/commands/set_channel_tsvectors.py +++ b/contentcuration/search/management/commands/set_channel_tsvectors.py @@ -7,11 +7,8 @@ from django.core.management.base import BaseCommand from django.db.models import Exists from django.db.models import OuterRef -from search.constants import CHANNEL_KEYWORDS_TSVECTOR from search.models import ChannelFullTextSearch - -from contentcuration.models import Channel -from contentcuration.viewsets.channel import primary_token_subquery +from search.utils import get_fts_annotated_channel_qs logmodule.basicConfig(level=logmodule.INFO) @@ -27,10 +24,8 @@ def handle(self, *args, **options): channel_not_already_inserted_query = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id"))) - channel_query = (Channel.objects.filter(channel_not_already_inserted_query, - deleted=False, main_tree__published=True) - .annotate(primary_channel_token=primary_token_subquery, - keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) + channel_query = (get_fts_annotated_channel_qs().filter(channel_not_already_inserted_query, + deleted=False, main_tree__published=True) .values("id", "keywords_tsvector")) insertable_channels = list(channel_query[:CHUNKSIZE]) diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py index 4e5673d9ec..c5e78ca8d1 100644 --- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py +++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py @@ -4,15 +4,11 @@ import logging as logmodule import time -from django.contrib.postgres.aggregates import StringAgg from django.core.management.base import BaseCommand from django.db.models import Exists from django.db.models import OuterRef -from search.constants import CONTENTNODE_AUTHOR_TSVECTOR -from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR from search.models import ContentNodeFullTextSearch - -from contentcuration.models import ContentNode +from search.utils import get_fts_annotated_contentnode_qs logmodule.basicConfig(level=logmodule.INFO) @@ -22,20 +18,39 @@ class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument("--channel-id", type=str, dest="channel_id", + help="The channel_id to annotate to the nodes. 
If not specified then each node's channel_id is queried and then annotated.") + parser.add_argument("--tree-id", type=int, dest="tree_id", + help="Set tsvectors for a specific tree_id nodes only. If not specified then tsvectors for all nodes of ContentNode table are set.") + parser.add_argument("--published", dest="published", action="store_true", help="Filters on whether node is published or not.") + parser.add_argument("--complete", dest="complete", action="store_true", help="Filters on whether node is complete or not.") - def handle(self, *args, **options): - start = time.time() + def get_tsvector_nodes_queryset(self, *args, **options): + tsvector_nodes_queryset = get_fts_annotated_contentnode_qs(channel_id=options["channel_id"]) + + if options["tree_id"]: + tsvector_nodes_queryset = tsvector_nodes_queryset.filter(tree_id=options["tree_id"]) + + if options["complete"]: + tsvector_nodes_queryset = tsvector_nodes_queryset.filter(complete=True) + + if options["published"]: + tsvector_nodes_queryset = tsvector_nodes_queryset.filter(published=True) tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) + tsvector_nodes_queryset = (tsvector_nodes_queryset + .filter(tsvector_not_already_inserted_query, channel_id__isnull=False) + .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()) + + return tsvector_nodes_queryset + + def handle(self, *args, **options): + start = time.time() - tsvector_node_query = (ContentNode._annotate_channel_id(ContentNode.objects) - .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), - keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, - author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) - .filter(tsvector_not_already_inserted_query, published=True, channel_id__isnull=False) - .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()) + tsvector_nodes_queryset = self.get_tsvector_nodes_queryset(*args, **options) - insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) + insertable_nodes_tsvector = list(tsvector_nodes_queryset[:CHUNKSIZE]) total_tsvectors_inserted = 0 while insertable_nodes_tsvector: @@ -54,6 +69,6 @@ def handle(self, *args, **options): logging.info("Inserted {} contentnode tsvectors.".format(current_inserts_count)) - insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) + insertable_nodes_tsvector = list(tsvector_nodes_queryset[:CHUNKSIZE]) logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(total_tsvectors_inserted, time.time() - start)) diff --git a/contentcuration/search/utils.py b/contentcuration/search/utils.py index 9cb205bb47..4f6768f650 100644 --- a/contentcuration/search/utils.py +++ b/contentcuration/search/utils.py @@ -1,4 +1,9 @@ +from django.contrib.postgres.aggregates import StringAgg from django.contrib.postgres.search import SearchQuery +from django.db.models import Value +from search.constants import CHANNEL_KEYWORDS_TSVECTOR +from search.constants import CONTENTNODE_AUTHOR_TSVECTOR +from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR from search.constants import POSTGRES_FTS_CONFIG @@ -7,3 +12,37 @@ def get_fts_search_query(value): Returns a `SearchQuery` with our postgres full text search config set on it. """ return SearchQuery(value=value, config=POSTGRES_FTS_CONFIG) + + +def get_fts_annotated_contentnode_qs(channel_id=None): + """ + Returns a `ContentNode` queryset annotated with fields required for full text search. 
+ + If `channel_id` is provided, annotates that specific `channel_id` else annotates + the `channel_id` to which the contentnode belongs. + """ + from contentcuration.models import ContentNode + + if channel_id: + queryset = ContentNode.objects.annotate(channel_id=Value(channel_id)) + else: + queryset = ContentNode._annotate_channel_id(ContentNode.objects) + + queryset = queryset.annotate( + contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), + keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, + author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR + ) + + return queryset + + +def get_fts_annotated_channel_qs(): + """ + Returns a `Channel` queryset annotated with fields required for full text search. + """ + from contentcuration.models import Channel + from contentcuration.viewsets.channel import primary_token_subquery + + return Channel.objects.annotate(primary_channel_token=primary_token_subquery, + keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index aa6c7fe9b7..58660fce5b 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -17,7 +17,7 @@ from contentcuration.models import Channel from contentcuration.models import File -from contentcuration.utils.pagination import CachedListPagination +from contentcuration.utils.pagination import ValuesViewsetPageNumberPagination from contentcuration.viewsets.base import ReadOnlyValuesViewset from contentcuration.viewsets.base import RequiredFilterSet from contentcuration.viewsets.common import NotNullMapArrayAgg @@ -25,7 +25,7 @@ from contentcuration.viewsets.common import UUIDInFilter -class ListPagination(CachedListPagination): +class ListPagination(ValuesViewsetPageNumberPagination): page_size = 25 page_size_query_param = "page_size" max_page_size = 100 From 9ec60cf200e8f445556e2d89f6eee1a2d9e1bb52 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Thu, 29 Sep 2022 00:09:07 +0530 Subject: [PATCH 24/25] fixes the strict update subquery, lightens it up --- contentcuration/contentcuration/utils/publish.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/contentcuration/contentcuration/utils/publish.py b/contentcuration/contentcuration/utils/publish.py index 017bf4a561..5b38d85576 100644 --- a/contentcuration/contentcuration/utils/publish.py +++ b/contentcuration/contentcuration/utils/publish.py @@ -845,18 +845,14 @@ def sync_contentnode_and_channel_tsvectors(channel_id): # Now, all remaining nodes are in main_tree, so let's update them. # Update only changed nodes. - changed_nodes_subquery = (get_fts_annotated_contentnode_qs(channel_id) - .filter(id=OuterRef("contentnode_id"), - tree_id=channel["main_tree__tree_id"], - complete=True, - changed=True) - .order_by()) - ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).update( - keywords_tsvector=Subquery(changed_nodes_subquery.values_list("keywords_tsvector")[:1]), - author_tsvector=Subquery(changed_nodes_subquery.values_list("author_tsvector")[:1]) + node_tsv_subquery = get_fts_annotated_contentnode_qs(channel_id).filter(id=OuterRef("contentnode_id")).order_by() + ContentNodeFullTextSearch.objects.filter(channel_id=channel_id, contentnode__complete=True, contentnode__changed=True).update( + keywords_tsvector=Subquery(node_tsv_subquery.values("keywords_tsvector")[:1]), + author_tsvector=Subquery(node_tsv_subquery.values("author_tsvector")[:1]) ) # Insert newly created nodes. 
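Context for the subquery change above: a correlated Subquery that matches no row resolves to NULL, and the previous UPDATE touched every row of the channel, so keeping complete/changed inside the subquery blanked the tsvectors of all unchanged nodes. Filtering the outer queryset instead simply skips those rows. A short sketch of the failure mode this commit removes, with channel_id as in the enclosing function (illustrative only, not part of the patch):

    from django.db.models import OuterRef, Subquery
    from search.models import ContentNodeFullTextSearch
    from search.utils import get_fts_annotated_contentnode_qs

    # Too strict: an unchanged node matches nothing in the subquery ...
    too_strict = (get_fts_annotated_contentnode_qs(channel_id)
                  .filter(id=OuterRef("contentnode_id"), complete=True, changed=True)
                  .order_by())
    # ... so this writes NULL into keywords_tsvector for every unchanged node's row.
    ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).update(
        keywords_tsvector=Subquery(too_strict.values("keywords_tsvector")[:1]),
    )
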
+ # "set_contentnode_tsvectors" command is defined in "search/management/commands" directory. call_command("set_contentnode_tsvectors", "--channel-id={}".format(channel_id), "--tree-id={}".format(channel["main_tree__tree_id"]), From aae3be1891e635a8d87ab58814f0b625dbee4d77 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Thu, 29 Sep 2022 01:51:33 +0530 Subject: [PATCH 25/25] Do not output deleted channel nodes on search --- contentcuration/search/viewsets/contentnode.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 58660fce5b..676698031d 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -52,11 +52,11 @@ def filter_channel_list(self, queryset, name, value): if value == "public": channel_ids = Channel.get_public_channels().values_list("id", flat=True) elif value == "edit" and user: - channel_ids = user.editable_channels.values_list("id", flat=True) + channel_ids = user.editable_channels.filter(deleted=False).values_list("id", flat=True) elif value == "bookmark" and user: - channel_ids = user.bookmarked_channels.values_list("id", flat=True) + channel_ids = user.bookmarked_channels.filter(deleted=False).values_list("id", flat=True) elif value == "view" and user: - channel_ids = user.view_only_channels.values_list("id", flat=True) + channel_ids = user.view_only_channels.filter(deleted=False).values_list("id", flat=True) return queryset.filter(channel_id__in=list(channel_ids))
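
The deleted=False filters above land without a test; a hedged sketch of a regression test that could be added to the search test module, reusing the helpers seen in the earlier test patches (the method name, the direct deleted=True toggle, and the assumption that dummy_publish indexes the node are all illustrative, not part of the patch):

    def test_deleted_channel_excluded_from_edit_search(self):
        user = testdata.user(email="trash@test.com")
        channel = testdata.channel(name="soon_deleted_channel")
        channel.editors.add(user)
        testdata.node({"title": "Kolibri video", "kind_id": "video"}, parent=channel.main_tree)
        dummy_publish(channel)

        # Soft-delete the channel; the row stays, but search must now ignore it.
        channel.deleted = True
        channel.save()

        self.client.force_authenticate(user=user)
        response = self.client.get(
            reverse("search-list"),
            data={"channel_list": "edit", "keywords": "video"},
            format="json",
        )
        self.assertEqual(response.status_code, 200, response.content)
        self.assertEqual(response.data["results"], [])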