Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import from other channels search optimized #3399

Merged
merged 32 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
357204c
Optimized import from other channels search
vkWeb Jun 2, 2022
9028b22
Merge branch 'unstable' into optimize/search
vkWeb Jun 2, 2022
d24416c
Add search test for channel filtering and location_ids handling
bjester Jun 15, 2022
676526e
Fix autodiscovery of search tests
vkWeb Jun 30, 2022
6d40c55
Remove location_ids, zero db queries on descendant resource count
vkWeb Jul 9, 2022
094f05f
Upgrade django debug toolbar and fix its settings
vkWeb Jul 12, 2022
701ddc9
Remove unnecessary dev settings
vkWeb Jul 22, 2022
1a14749
Add .envrc to .gitignore
vkWeb Jul 22, 2022
bff595c
Add vector search column & indexes, also GiST trigram index
vkWeb Aug 12, 2022
8b7152e
Merge branch 'unstable' into optimize/search
vkWeb Aug 12, 2022
c0e55ee
Remove cyclic migration conflicts
vkWeb Aug 12, 2022
34e8436
Fix wrong indentation happened due to merge conflict
vkWeb Aug 12, 2022
e00c512
Add a command for setting tsvectors and fix tests
vkWeb Aug 14, 2022
f3280d9
Remove grade_level default to pass failing tests
vkWeb Aug 14, 2022
475aeef
Merge branch 'unstable' into optimize/search
vkWeb Aug 24, 2022
3ff0edd
Full text search models and data migrations
vkWeb Sep 8, 2022
fffd912
Merge branch 'unstable' into optimize/search
vkWeb Sep 8, 2022
974b69e
Resolve conflicts and remove old index refs
vkWeb Sep 8, 2022
e15b015
feat: full text search!
vkWeb Sep 10, 2022
aa62dc2
Sync tsvectors on publish!
vkWeb Sep 14, 2022
2672512
fix: tests and ready for merge! <3
vkWeb Sep 14, 2022
1ffa30d
fix: node command edge case; when published nodes go to trash tree, t…
vkWeb Sep 14, 2022
55e4acf
Merge branch 'unstable' into optimize/search
vkWeb Sep 16, 2022
57724e0
Enforce only-one search entries
vkWeb Sep 16, 2022
e53e56a
Remove unnecessary select_related
vkWeb Sep 16, 2022
4b3d4c7
fix cache tests mock by setting ContentNodeFullTextSearch
vkWeb Sep 16, 2022
44ab74c
fix cache & nodes tests by using db-friendly TestCase
vkWeb Sep 16, 2022
4beaf3c
Merge branch 'unstable' into optimize/search
vkWeb Sep 16, 2022
001e788
Use command for tsv insertion & simpler tsv update on publish
vkWeb Sep 21, 2022
ff59495
Merge branch 'unstable' into optimize/search
vkWeb Sep 27, 2022
9ec60cf
fixes the strict update subquery, lightens it up
vkWeb Sep 28, 2022
aae3be1
Do not output deleted channel nodes on search
vkWeb Sep 28, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ learningactivities:

set-tsvectors:
python contentcuration/manage.py set_channel_tsvectors
python contentcuration/manage.py set_contentnode_tsvectors
python contentcuration/manage.py set_contentnode_tsvectors --published

###############################################################
# END PRODUCTION COMMANDS #####################################
Expand Down
47 changes: 0 additions & 47 deletions contentcuration/contentcuration/debug/middleware.py

This file was deleted.

11 changes: 8 additions & 3 deletions contentcuration/contentcuration/debug_panel_settings.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from .dev_settings import * # noqa

# These endpoints will throw an error on the django debug panel
# These endpoints will throw an error on the django debug panel.
EXCLUDED_DEBUG_URLS = [
"/content/storage",

# Disabling sync API because as soon as the sync API gets polled
# the current request data gets overwritten.
# Can be removed after websockets deployment.
"/api/sync",
]

DEBUG_PANEL_ACTIVE = True
Expand All @@ -14,10 +19,10 @@ def custom_show_toolbar(request):
) # noqa F405


# if debug_panel exists, add it to our INSTALLED_APPS
# if debug_panel exists, add it to our INSTALLED_APPS.
INSTALLED_APPS += ("debug_panel", "debug_toolbar", "pympler") # noqa F405
MIDDLEWARE += ( # noqa F405
"contentcuration.debug.middleware.CustomDebugPanelMiddleware",
"debug_toolbar.middleware.DebugToolbarMiddleware",
)
DEBUG_TOOLBAR_CONFIG = {
"SHOW_TOOLBAR_CALLBACK": custom_show_toolbar,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@
if (this.isTopic) {
return `${baseUrl}#/${this.node.id}`;
}
return `${baseUrl}#/${this.node.parent}/${this.node.id}`;
return `${baseUrl}#/${this.node.parent_id}/${this.node.id}`;
},
resourcesMsg() {
let count;
Expand All @@ -160,13 +160,8 @@
}
return this.$tr('resourcesCount', { count });
},
numLocations() {
return this.node.location_ids.length;
},
goToLocationLabel() {
return this.numLocations > 1
? this.$tr('goToPluralLocationsAction', { count: this.numLocations })
: this.$tr('goToSingleLocationAction');
return this.$tr('goToSingleLocationAction');
},
isTopic() {
return this.node.kind === ContentKindsNames.TOPIC;
Expand All @@ -189,8 +184,6 @@
$trs: {
tagsList: 'Tags: {tags}',
goToSingleLocationAction: 'Go to location',
goToPluralLocationsAction:
'In {count, number} {count, plural, one {location} other {locations}}',
addToClipboardAction: 'Copy to clipboard',
resourcesCount: '{count, number} {count, plural, one {resource} other {resources}}',
coach: 'Resource for coaches',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@
[this.channelFilter]: true,
page: this.$route.query.page || 1,
exclude: this.currentChannelId,
published: true,
}).then(page => {
this.pageCount = page.total_pages;
this.channels = page.results;
Expand Down
8 changes: 8 additions & 0 deletions contentcuration/contentcuration/tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from importlib import import_module

import mock
from search.models import ContentNodeFullTextSearch

from contentcuration.models import ContentNode
from contentcuration.models import TaskResult


Expand Down Expand Up @@ -39,6 +41,12 @@ def mock_class_instance(target):
else:
target_cls = target

# ContentNode's node_fts field can be handled by Django when tests
# access the database but we mock it so that we don't need to query
# the database. By doing so we get faster test execution.
if type(target_cls) is ContentNode:
target_cls.node_fts = ContentNodeFullTextSearch()

class MockClass(target_cls):
def __new__(cls, *args, **kwargs):
return mock.Mock(spec_set=cls)
Expand Down
7 changes: 3 additions & 4 deletions contentcuration/contentcuration/tests/utils/test_cache.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import mock
from django.test import TestCase
from django.test import SimpleTestCase

from ..helpers import mock_class_instance
from contentcuration.models import ContentNode
from contentcuration.utils.cache import ResourceSizeCache


class ResourceSizeCacheTestCase(TestCase):
class ResourceSizeCacheTestCase(SimpleTestCase):
def setUp(self):
super(ResourceSizeCacheTestCase, self).setUp()
self.node = mock.Mock(spec_set=ContentNode())
self.node = mock_class_instance("contentcuration.models.ContentNode")
self.node.pk = "abcdefghijklmnopqrstuvwxyz"
self.redis_client = mock_class_instance("redis.client.StrictRedis")
self.cache_client = mock_class_instance("django_redis.client.DefaultClient")
Expand Down
8 changes: 4 additions & 4 deletions contentcuration/contentcuration/tests/utils/test_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from dateutil.parser import isoparse
from django.db.models import F
from django.db.models import Max
from django.test import TestCase
from django.test import SimpleTestCase

from ..base import StudioTestCase
from contentcuration.models import ContentNode
from contentcuration.tests.helpers import mock_class_instance
from contentcuration.utils.nodes import calculate_resource_size
from contentcuration.utils.nodes import ResourceSizeHelper
from contentcuration.utils.nodes import SlowCalculationError
Expand Down Expand Up @@ -42,10 +42,10 @@ def test_modified_since(self):

@mock.patch("contentcuration.utils.nodes.ResourceSizeHelper")
@mock.patch("contentcuration.utils.nodes.ResourceSizeCache")
class CalculateResourceSizeTestCase(TestCase):
class CalculateResourceSizeTestCase(SimpleTestCase):
def setUp(self):
super(CalculateResourceSizeTestCase, self).setUp()
self.node = mock.Mock(spec_set=ContentNode())
self.node = mock_class_instance("contentcuration.models.ContentNode")

def assertCalculation(self, cache, helper, force=False):
helper().get_size.return_value = 456
Expand Down
54 changes: 53 additions & 1 deletion contentcuration/contentcuration/utils/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
from django.core.files.storage import default_storage as storage
from django.core.management import call_command
from django.db.models import Count
from django.db.models import Exists
from django.db.models import Max
from django.db.models import OuterRef
from django.db.models import Q
from django.db.models import Subquery
from django.db.models import Sum
from django.db.utils import IntegrityError
from django.template.loader import render_to_string
Expand All @@ -37,6 +40,10 @@
from le_utils.constants import roles
from past.builtins import basestring
from past.utils import old_div
from search.models import ChannelFullTextSearch
from search.models import ContentNodeFullTextSearch
from search.utils import get_fts_annotated_channel_qs
from search.utils import get_fts_annotated_contentnode_qs

from contentcuration import models as ccmodels
from contentcuration.decorators import delay_user_storage_calculation
Expand Down Expand Up @@ -808,6 +815,50 @@ def fill_published_fields(channel, version_notes):
channel.save()


def sync_contentnode_and_channel_tsvectors(channel_id):
"""
Creates, deletes and updates tsvectors of the channel and all its content nodes
to reflect the current state of channel's main tree.
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
"""
# Update or create channel tsvector entry.
logging.info("Setting tsvector for channel with id {}.".format(channel_id))

channel = (get_fts_annotated_channel_qs()
.values("keywords_tsvector", "main_tree__tree_id")
.get(pk=channel_id))

obj, is_created = ChannelFullTextSearch.objects.update_or_create(channel_id=channel_id, defaults={"keywords_tsvector": channel["keywords_tsvector"]})
del obj

if is_created:
logging.info("Created 1 channel tsvector.")
else:
logging.info("Updated 1 channel tsvector.")

# Update or create contentnodes tsvector entry for channel_id.
logging.info("Setting tsvectors for all main tree contentnodes in channel {}.".format(channel_id))

vkWeb marked this conversation as resolved.
Show resolved Hide resolved
if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists():
# First, delete nodes that are no longer in main_tree.
nodes_no_longer_in_main_tree = ~Exists(ccmodels.ContentNode.objects.filter(id=OuterRef("contentnode_id"), tree_id=channel["main_tree__tree_id"]))
ContentNodeFullTextSearch.objects.filter(nodes_no_longer_in_main_tree, channel_id=channel_id).delete()

vkWeb marked this conversation as resolved.
Show resolved Hide resolved
# Now, all remaining nodes are in main_tree, so let's update them.
# Update only changed nodes.
node_tsv_subquery = get_fts_annotated_contentnode_qs(channel_id).filter(id=OuterRef("contentnode_id")).order_by()
ContentNodeFullTextSearch.objects.filter(channel_id=channel_id, contentnode__complete=True, contentnode__changed=True).update(
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
keywords_tsvector=Subquery(node_tsv_subquery.values("keywords_tsvector")[:1]),
author_tsvector=Subquery(node_tsv_subquery.values("author_tsvector")[:1])
)

# Insert newly created nodes.
# "set_contentnode_tsvectors" command is defined in "search/management/commands" directory.
call_command("set_contentnode_tsvectors",
"--channel-id={}".format(channel_id),
"--tree-id={}".format(channel["main_tree__tree_id"]),
"--complete")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason not to pass the command options as keyword args? It would be slightly cleaner. The only caveat is that it bypasses the argument parser, but for your purposes, I don't see that there would be a difference. https://docs.djangoproject.com/en/3.2/ref/django-admin/#django.core.management.call_command

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I chose this was to let the argument parser get invoked. It helped me manual test (and gave me the confidence) on what would happen when this command gets run from the commandline.



@delay_user_storage_calculation
def publish_channel(
user_id,
Expand All @@ -829,8 +880,9 @@ def publish_channel(
set_channel_icon_encoding(channel)
kolibri_temp_db = create_content_database(channel, force, user_id, force_exercises, progress_tracker=progress_tracker)
increment_channel_version(channel)
mark_all_nodes_as_published(channel)
add_tokens_to_channel(channel)
sync_contentnode_and_channel_tsvectors(channel_id=channel.id)
mark_all_nodes_as_published(channel)
fill_published_fields(channel, version_notes)

# Attributes not getting set for some reason, so just save it here
Expand Down
29 changes: 12 additions & 17 deletions contentcuration/contentcuration/viewsets/channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
from rest_framework.serializers import CharField
from rest_framework.serializers import FloatField
from rest_framework.serializers import IntegerField
from search.models import ChannelFullTextSearch
from search.models import ContentNodeFullTextSearch
from search.utils import get_fts_search_query

from contentcuration.decorators import cache_no_user_data
from contentcuration.models import Change
Expand Down Expand Up @@ -119,23 +122,15 @@ def filter_deleted(self, queryset, name, value):
return queryset.filter(deleted=value)

def filter_keywords(self, queryset, name, value):
# TODO: Wait until we show more metadata on cards to add this back in
# keywords_query = self.main_tree_query.filter(
# Q(tags__tag_name__icontains=value)
# | Q(author__icontains=value)
# | Q(aggregator__icontains=value)
# | Q(provider__icontains=value)
# )
return queryset.annotate(
# keyword_match_count=SQCount(keywords_query, field="content_id"),
primary_token=primary_token_subquery,
).filter(
Q(name__icontains=value)
| Q(description__icontains=value)
| Q(pk__istartswith=value)
| Q(primary_token=value.replace("-", ""))
# | Q(keyword_match_count__gt=0)
)
search_query = get_fts_search_query(value)
dash_replaced_search_query = get_fts_search_query(value.replace("-", ""))

channel_keywords_query = (Exists(ChannelFullTextSearch.objects.filter(
Q(keywords_tsvector=search_query) | Q(keywords_tsvector=dash_replaced_search_query), channel_id=OuterRef("id"))))
contentnode_search_query = (Exists(ContentNodeFullTextSearch.objects.filter(
Q(keywords_tsvector=search_query) | Q(author_tsvector=search_query), channel_id=OuterRef("id"))))

return queryset.filter(Q(channel_keywords_query) | Q(contentnode_search_query))

def filter_languages(self, queryset, name, value):
languages = value.split(",")
Expand Down
1 change: 0 additions & 1 deletion contentcuration/search/admin.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@
from django.core.management.base import BaseCommand
from django.db.models import Exists
from django.db.models import OuterRef
from search.constants import CHANNEL_KEYWORDS_TSVECTOR
from search.models import ChannelFullTextSearch

from contentcuration.models import Channel
from contentcuration.viewsets.channel import primary_token_subquery
from search.utils import get_fts_annotated_channel_qs


logmodule.basicConfig(level=logmodule.INFO)
Expand All @@ -27,10 +24,8 @@ def handle(self, *args, **options):

channel_not_already_inserted_query = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id")))

channel_query = (Channel.objects.filter(channel_not_already_inserted_query,
deleted=False, main_tree__published=True)
.annotate(primary_channel_token=primary_token_subquery,
keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR)
channel_query = (get_fts_annotated_channel_qs().filter(channel_not_already_inserted_query,
deleted=False, main_tree__published=True)
.values("id", "keywords_tsvector"))

insertable_channels = list(channel_query[:CHUNKSIZE])
Expand Down
Loading