Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import from other channels search optimized #3399

Merged
merged 32 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
357204c
Optimized import from other channels search
vkWeb Jun 2, 2022
9028b22
Merge branch 'unstable' into optimize/search
vkWeb Jun 2, 2022
d24416c
Add search test for channel filtering and location_ids handling
bjester Jun 15, 2022
676526e
Fix autodiscovery of search tests
vkWeb Jun 30, 2022
6d40c55
Remove location_ids, zero db queries on descendant resource count
vkWeb Jul 9, 2022
094f05f
Upgrade django debug toolbar and fix its settings
vkWeb Jul 12, 2022
701ddc9
Remove unnecessary dev settings
vkWeb Jul 22, 2022
1a14749
Add .envrc to .gitignore
vkWeb Jul 22, 2022
bff595c
Add vector search column & indexes, also GiST trigram index
vkWeb Aug 12, 2022
8b7152e
Merge branch 'unstable' into optimize/search
vkWeb Aug 12, 2022
c0e55ee
Remove cyclic migration conflicts
vkWeb Aug 12, 2022
34e8436
Fix wrong indentation happened due to merge conflict
vkWeb Aug 12, 2022
e00c512
Add a command for setting tsvectors and fix tests
vkWeb Aug 14, 2022
f3280d9
Remove grade_level default to pass failing tests
vkWeb Aug 14, 2022
475aeef
Merge branch 'unstable' into optimize/search
vkWeb Aug 24, 2022
3ff0edd
Full text search models and data migrations
vkWeb Sep 8, 2022
fffd912
Merge branch 'unstable' into optimize/search
vkWeb Sep 8, 2022
974b69e
Resolve conflicts and remove old index refs
vkWeb Sep 8, 2022
e15b015
feat: full text search!
vkWeb Sep 10, 2022
aa62dc2
Sync tsvectors on publish!
vkWeb Sep 14, 2022
2672512
fix: tests and ready for merge! <3
vkWeb Sep 14, 2022
1ffa30d
fix: node command edge case; when published nodes go to trash tree, t…
vkWeb Sep 14, 2022
55e4acf
Merge branch 'unstable' into optimize/search
vkWeb Sep 16, 2022
57724e0
Enforce only-one search entries
vkWeb Sep 16, 2022
e53e56a
Remove unnecessary select_related
vkWeb Sep 16, 2022
4b3d4c7
fix cache tests mock by setting ContentNodeFullTextSearch
vkWeb Sep 16, 2022
44ab74c
fix cache & nodes tests by using db-friendly TestCase
vkWeb Sep 16, 2022
4beaf3c
Merge branch 'unstable' into optimize/search
vkWeb Sep 16, 2022
001e788
Use command for tsv insertion & simpler tsv update on publish
vkWeb Sep 21, 2022
ff59495
Merge branch 'unstable' into optimize/search
vkWeb Sep 27, 2022
9ec60cf
fixes the strict update subquery, lightens it up
vkWeb Sep 28, 2022
aae3be1
Do not output deleted channel nodes on search
vkWeb Sep 28, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 0 additions & 47 deletions contentcuration/contentcuration/debug/middleware.py

This file was deleted.

10 changes: 7 additions & 3 deletions contentcuration/contentcuration/debug_panel_settings.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from .dev_settings import * # noqa

# These endpoints will throw an error on the django debug panel
# These endpoints will throw an error on the django debug panel.
EXCLUDED_DEBUG_URLS = [
"/content/storage",

# Disabling task API because as soon as the task API gets polled
# the current request data gets overwritten.
"/api/task",
]

DEBUG_PANEL_ACTIVE = True
Expand All @@ -14,10 +18,10 @@ def custom_show_toolbar(request):
) # noqa F405


# if debug_panel exists, add it to our INSTALLED_APPS
# if debug_panel exists, add it to our INSTALLED_APPS.
INSTALLED_APPS += ("debug_panel", "debug_toolbar", "pympler") # noqa F405
MIDDLEWARE += ( # noqa F405
"contentcuration.debug.middleware.CustomDebugPanelMiddleware",
"debug_toolbar.middleware.DebugToolbarMiddleware",
)
DEBUG_TOOLBAR_CONFIG = {
"SHOW_TOOLBAR_CALLBACK": custom_show_toolbar,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,8 @@
}
return this.$tr('resourcesCount', { count });
},
numLocations() {
return this.node.location_ids.length;
},
goToLocationLabel() {
return this.numLocations > 1
? this.$tr('goToPluralLocationsAction', { count: this.numLocations })
: this.$tr('goToSingleLocationAction');
return this.$tr('goToSingleLocationAction');
},
isTopic() {
return this.node.kind === ContentKindsNames.TOPIC;
Expand All @@ -189,8 +184,6 @@
$trs: {
tagsList: 'Tags: {tags}',
goToSingleLocationAction: 'Go to location',
goToPluralLocationsAction:
'In {count, number} {count, plural, one {location} other {locations}}',
addToClipboardAction: 'Copy to clipboard',
resourcesCount: '{count, number} {count, plural, one {resource} other {resources}}',
coach: 'Resource for coaches',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
This command sets tsvector in title_description_search_vector field in batches.
The batches are created on the basis of channel_id. This enables resumption. Also helps
in cases of failure or memory overflow.
"""
import logging as logmodule

from django.core.cache import cache
from django.core.management.base import BaseCommand

from contentcuration.models import Channel
from contentcuration.models import ContentNode
from contentcuration.models import POSTGRES_SEARCH_VECTOR


logmodule.basicConfig(level=logmodule.INFO)
logging = logmodule.getLogger(__name__)


UPDATED_TS_VECTORS_CACHE_KEY = "tsvectors_updated_for_channel_ids"
UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY = "tsvectors_updated_for_null_channels"


class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument(
"--public",
action="store_true",
help="Set tsvector for only the public channel nodes instead of all nodes.",
)
parser.add_argument(
"--no-cache",
action="store_true",
help="Disables the cache. This updates all previously updated nodes.",
)

def handle(self, *args, **options):
if options["no_cache"]:
updated_channel_ids = []
do_update_nodes_with_null_channel_id = True
else:
updated_channel_ids = [] if cache.get(UPDATED_TS_VECTORS_CACHE_KEY) is None else cache.get(UPDATED_TS_VECTORS_CACHE_KEY)
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
do_update_nodes_with_null_channel_id = not cache.get(UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY)

if options["public"]:
to_update_channel_ids = list(Channel.get_public_channels().exclude(id__in=updated_channel_ids).values_list("id", flat=True))
do_update_nodes_with_null_channel_id = False
logging.info("Started setting tsvector for public channel nodes.")
else:
to_update_channel_ids = list(Channel.objects.exclude(id__in=updated_channel_ids).values_list("id", flat=True))
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
logging.info("Started setting tsvector for all nodes.")

annotated_contentnode_qs = ContentNode._annotate_channel_id(ContentNode.objects)

for channel_id in to_update_channel_ids:
logging.info("Setting tsvector for nodes of channel {}.".format(channel_id))
annotated_contentnode_qs.filter(channel_id=channel_id).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR)
updated_channel_ids.append(channel_id)
cache.set(UPDATED_TS_VECTORS_CACHE_KEY, updated_channel_ids, None)
logging.info("Finished setting tsvector for nodes of channel {}.".format(channel_id))

if do_update_nodes_with_null_channel_id:
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
logging.info("Setting tsvector for nodes with NULL channel_id.")
annotated_contentnode_qs.filter(channel_id__isnull=True).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR)
cache.set(UPDATED_TS_VECTORS_FOR_NULL_CHANNEL_CACHE_KEY, True, None)
logging.info("Finished setting tsvector for nodes with NULL channel_id.")

if options["public"]:
logging.info("Finished setting tsvector for public channel nodes.")
else:
logging.info("Finished setting tsvector for all nodes.")
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Generated by Django 3.2.13 on 2022-08-10 19:20
import django.contrib.postgres.indexes
import django.contrib.postgres.search
from django.contrib.postgres.operations import AddIndexConcurrently
from django.contrib.postgres.operations import TrigramExtension
from django.db import migrations


class Migration(migrations.Migration):

atomic = False

dependencies = [
('contentcuration', '0140_delete_task'),
]

operations = [
# Installs the pg_trgm module that comes pre-bundled with PostgreSQL 9.6.
TrigramExtension(),

migrations.AddField(
model_name='contentnode',
name='title_description_search_vector',
field=django.contrib.postgres.search.SearchVectorField(blank=True, null=True),
),
AddIndexConcurrently(
model_name='contentnode',
index=django.contrib.postgres.indexes.GinIndex(fields=['title_description_search_vector'], name='node_search_vector_gin_idx'),
),
AddIndexConcurrently(
model_name='contenttag',
index=django.contrib.postgres.indexes.GistIndex(fields=['tag_name'], name='contenttag_tag_name_gist_idx', opclasses=['gist_trgm_ops']),
),
]
32 changes: 31 additions & 1 deletion contentcuration/contentcuration/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
from datetime import datetime

import pytz
from celery import states
from django.conf import settings
from django.contrib.auth.base_user import AbstractBaseUser
from django.contrib.auth.base_user import BaseUserManager
from django.contrib.auth.models import PermissionsMixin
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.indexes import GistIndex
from django.contrib.postgres.search import SearchQuery
from django.contrib.postgres.search import SearchVector
from django.contrib.postgres.search import SearchVectorField
from django.contrib.sessions.models import Session
from django.core.cache import cache
from django.core.exceptions import MultipleObjectsReturned
Expand Down Expand Up @@ -1093,6 +1097,9 @@ def delete(self, *args, **kwargs):
self.secret_token.delete()


CONTENT_TAG_NAME__INDEX_NAME = "contenttag_tag_name_gist_idx"


class ContentTag(models.Model):
id = UUIDField(primary_key=True, default=uuid.uuid4)
tag_name = models.CharField(max_length=50)
Expand All @@ -1104,6 +1111,7 @@ def __str__(self):

class Meta:
unique_together = ['tag_name', 'channel']
indexes = [GistIndex(fields=["tag_name"], name=CONTENT_TAG_NAME__INDEX_NAME, opclasses=["gist_trgm_ops"])]


def delegate_manager(method):
Expand Down Expand Up @@ -1147,6 +1155,12 @@ def __str__(self):
NODE_ID_INDEX_NAME = "node_id_idx"
NODE_MODIFIED_INDEX_NAME = "node_modified_idx"
NODE_MODIFIED_DESC_INDEX_NAME = "node_modified_desc_idx"
NODE_SEARCH_VECTOR_GIN_INDEX_NAME = "node_search_vector_gin_idx"

# Ours postgres full text search configuration.
POSTGRES_FTS_CONFIG = "simple"
# Search vector to create tsvector of title and description concatenated.
POSTGRES_SEARCH_VECTOR = SearchVector("title", "description", config=POSTGRES_FTS_CONFIG)
CONTENTNODE_TREE_ID_CACHE_KEY = "contentnode_{pk}__tree_id"


Expand Down Expand Up @@ -1243,6 +1257,10 @@ class ContentNode(MPTTModel, models.Model):
# this duration should be in seconds.
suggested_duration = models.IntegerField(blank=True, null=True, help_text="Suggested duration for the content node (in seconds)")

# A field to store the ts_vector form of (title + ' ' + description).
# This significantly increases the search performance.
title_description_search_vector = SearchVectorField(blank=True, null=True)
vkWeb marked this conversation as resolved.
Show resolved Hide resolved

objects = CustomContentNodeTreeManager()

# Track all updates and ignore a blacklist of attributes
Expand Down Expand Up @@ -1339,6 +1357,12 @@ def filter_view_queryset(cls, queryset, user):
| Q(public=True)
)

@classmethod
def search(self, queryset, search_term):
search_query = Q(title_description_search_vector=SearchQuery(value=search_term, config=POSTGRES_FTS_CONFIG, search_type="plain"))
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
tags_query = Q(tags__tag_name__icontains=search_term)
return queryset.filter(search_query | tags_query)

@raise_if_unsaved
def get_root(self):
# Only topics can be root nodes
Expand Down Expand Up @@ -1822,8 +1846,10 @@ def set_default_learning_activity(self):

def save(self, skip_lock=False, *args, **kwargs):
if self._state.adding:
is_create = True
self.on_create()
else:
is_create = False
self.on_update()

# Logic borrowed from mptt - do a simple check to see if we have changed
Expand Down Expand Up @@ -1867,6 +1893,9 @@ def save(self, skip_lock=False, *args, **kwargs):
if changed_ids:
ContentNode.objects.filter(id__in=changed_ids).update(changed=True)

if is_create:
ContentNode.filter_by_pk(pk=self.id).update(title_description_search_vector=POSTGRES_SEARCH_VECTOR)
vkWeb marked this conversation as resolved.
Show resolved Hide resolved

# Copied from MPTT
save.alters_data = True

Expand Down Expand Up @@ -1909,6 +1938,7 @@ class Meta:
indexes = [
models.Index(fields=["node_id"], name=NODE_ID_INDEX_NAME),
models.Index(fields=["-modified"], name=NODE_MODIFIED_DESC_INDEX_NAME),
GinIndex(fields=["title_description_search_vector"], name=NODE_SEARCH_VECTOR_GIN_INDEX_NAME),
]


Expand Down
2 changes: 1 addition & 1 deletion contentcuration/contentcuration/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
'webpack_loader',
'django_filters',
'mathfilters',
'django.contrib.postgres',
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
'django_celery_results',
)

Expand Down Expand Up @@ -221,7 +222,6 @@

IS_CONTENTNODE_TABLE_PARTITIONED = os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False


DATABASE_ROUTERS = [
"kolibri_content.router.ContentDBRouter",
]
Expand Down
18 changes: 10 additions & 8 deletions contentcuration/contentcuration/tests/testdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,21 +195,23 @@ def node(data, parent=None):
return new_node


def tree(parent=None):
def tree(parent=None, tree_data=None):
# Read from json fixture
filepath = os.path.sep.join([os.path.dirname(__file__), "fixtures", "tree.json"])
with open(filepath, "rb") as jsonfile:
data = json.load(jsonfile)
if tree_data is None:
filepath = os.path.sep.join([os.path.dirname(__file__), "fixtures", "tree.json"])
with open(filepath, "rb") as jsonfile:
tree_data = json.load(jsonfile)

return node(data, parent)
return node(tree_data, parent)


def channel(name="testchannel"):
def channel(name="testchannel", create_main_tree=True, main_tree_data=None):
channel = cc.Channel.objects.create(name=name)
channel.save()

channel.main_tree = tree()
channel.save()
if create_main_tree:
channel.main_tree = tree(tree_data=main_tree_data)
channel.save()

return channel

Expand Down
4 changes: 4 additions & 0 deletions contentcuration/contentcuration/utils/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ def queue_get_return_none_when_empty():
logging.debug("Mapping node with id {id}".format(
id=node.pk))

# Update tsvector for this node.
node.title_description_search_vector = ccmodels.POSTGRES_SEARCH_VECTOR
node.save(update_fields=["title_description_search_vector"])

if node.get_descendants(include_self=True).exclude(kind_id=content_kinds.TOPIC).exists() and node.complete:
children = (node.children.all())
node_queue.extend(children)
Expand Down
3 changes: 2 additions & 1 deletion contentcuration/contentcuration/viewsets/contentnode.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,8 @@ def delete_from_changes(self, changes):


def dict_if_none(obj, field_name=None):
return obj[field_name] if obj[field_name] else {}
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
value = obj.get(field_name)
return value if value else {}


# Apply mixin first to override ValuesViewset
Expand Down
Loading