Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import from other channels search optimized #3399

Merged
merged 32 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
357204c
Optimized import from other channels search
vkWeb Jun 2, 2022
9028b22
Merge branch 'unstable' into optimize/search
vkWeb Jun 2, 2022
d24416c
Add search test for channel filtering and location_ids handling
bjester Jun 15, 2022
676526e
Fix autodiscovery of search tests
vkWeb Jun 30, 2022
6d40c55
Remove location_ids, zero db queries on descendant resource count
vkWeb Jul 9, 2022
094f05f
Upgrade django debug toolbar and fix its settings
vkWeb Jul 12, 2022
701ddc9
Remove unnecessary dev settings
vkWeb Jul 22, 2022
1a14749
Add .envrc to .gitignore
vkWeb Jul 22, 2022
bff595c
Add vector search column & indexes, also GiST trigram index
vkWeb Aug 12, 2022
8b7152e
Merge branch 'unstable' into optimize/search
vkWeb Aug 12, 2022
c0e55ee
Remove cyclic migration conflicts
vkWeb Aug 12, 2022
34e8436
Fix wrong indentation happened due to merge conflict
vkWeb Aug 12, 2022
e00c512
Add a command for setting tsvectors and fix tests
vkWeb Aug 14, 2022
f3280d9
Remove grade_level default to pass failing tests
vkWeb Aug 14, 2022
475aeef
Merge branch 'unstable' into optimize/search
vkWeb Aug 24, 2022
3ff0edd
Full text search models and data migrations
vkWeb Sep 8, 2022
fffd912
Merge branch 'unstable' into optimize/search
vkWeb Sep 8, 2022
974b69e
Resolve conflicts and remove old index refs
vkWeb Sep 8, 2022
e15b015
feat: full text search!
vkWeb Sep 10, 2022
aa62dc2
Sync tsvectors on publish!
vkWeb Sep 14, 2022
2672512
fix: tests and ready for merge! <3
vkWeb Sep 14, 2022
1ffa30d
fix: node command edge case; when published nodes go to trash tree, t…
vkWeb Sep 14, 2022
55e4acf
Merge branch 'unstable' into optimize/search
vkWeb Sep 16, 2022
57724e0
Enforce only-one search entries
vkWeb Sep 16, 2022
e53e56a
Remove unnecessary select_related
vkWeb Sep 16, 2022
4b3d4c7
fix cache tests mock by setting ContentNodeFullTextSearch
vkWeb Sep 16, 2022
44ab74c
fix cache & nodes tests by using db-friendly TestCase
vkWeb Sep 16, 2022
4beaf3c
Merge branch 'unstable' into optimize/search
vkWeb Sep 16, 2022
001e788
Use command for tsv insertion & simpler tsv update on publish
vkWeb Sep 21, 2022
ff59495
Merge branch 'unstable' into optimize/search
vkWeb Sep 27, 2022
9ec60cf
fixes the strict update subquery, lightens it up
vkWeb Sep 28, 2022
aae3be1
Do not output deleted channel nodes on search
vkWeb Sep 28, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 0 additions & 47 deletions contentcuration/contentcuration/debug/middleware.py

This file was deleted.

11 changes: 8 additions & 3 deletions contentcuration/contentcuration/debug_panel_settings.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from .dev_settings import * # noqa

# These endpoints will throw an error on the django debug panel
# These endpoints will throw an error on the django debug panel.
EXCLUDED_DEBUG_URLS = [
"/content/storage",

# Disabling sync API because as soon as the sync API gets polled
# the current request data gets overwritten.
# Can be removed after websockets deployment.
"/api/sync",
]

DEBUG_PANEL_ACTIVE = True
Expand All @@ -14,10 +19,10 @@ def custom_show_toolbar(request):
) # noqa F405


# if debug_panel exists, add it to our INSTALLED_APPS
# if debug_panel exists, add it to our INSTALLED_APPS.
INSTALLED_APPS += ("debug_panel", "debug_toolbar", "pympler") # noqa F405
MIDDLEWARE += ( # noqa F405
"contentcuration.debug.middleware.CustomDebugPanelMiddleware",
"debug_toolbar.middleware.DebugToolbarMiddleware",
)
DEBUG_TOOLBAR_CONFIG = {
"SHOW_TOOLBAR_CALLBACK": custom_show_toolbar,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,8 @@
}
return this.$tr('resourcesCount', { count });
},
numLocations() {
return this.node.location_ids.length;
},
goToLocationLabel() {
return this.numLocations > 1
? this.$tr('goToPluralLocationsAction', { count: this.numLocations })
: this.$tr('goToSingleLocationAction');
return this.$tr('goToSingleLocationAction');
},
isTopic() {
return this.node.kind === ContentKindsNames.TOPIC;
Expand All @@ -189,8 +184,6 @@
$trs: {
tagsList: 'Tags: {tags}',
goToSingleLocationAction: 'Go to location',
goToPluralLocationsAction:
'In {count, number} {count, plural, one {location} other {locations}}',
addToClipboardAction: 'Copy to clipboard',
resourcesCount: '{count, number} {count, plural, one {resource} other {resources}}',
coach: 'Resource for coaches',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@
[this.channelFilter]: true,
page: this.$route.query.page || 1,
exclude: this.currentChannelId,
published: true,
}).then(page => {
this.pageCount = page.total_pages;
this.channels = page.results;
Expand Down
1 change: 0 additions & 1 deletion contentcuration/contentcuration/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from datetime import datetime

import pytz
from celery import states
from django.conf import settings
from django.contrib.auth.base_user import AbstractBaseUser
from django.contrib.auth.base_user import BaseUserManager
Expand Down
2 changes: 1 addition & 1 deletion contentcuration/contentcuration/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
'webpack_loader',
'django_filters',
'mathfilters',
'django.contrib.postgres',
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
'django_celery_results',
)

Expand Down Expand Up @@ -220,7 +221,6 @@

IS_CONTENTNODE_TABLE_PARTITIONED = os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False


DATABASE_ROUTERS = [
"kolibri_content.router.ContentDBRouter",
]
Expand Down
82 changes: 81 additions & 1 deletion contentcuration/contentcuration/utils/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@
from itertools import chain

from django.conf import settings
from django.contrib.postgres.aggregates import StringAgg
from django.core.files import File
from django.core.files.storage import default_storage as storage
from django.core.management import call_command
from django.db.models import Count
from django.db.models import Exists
from django.db.models import Max
from django.db.models import OuterRef
from django.db.models import Q
from django.db.models import Sum
from django.db.models import Value
from django.db.utils import IntegrityError
from django.template.loader import render_to_string
from django.utils import timezone
Expand All @@ -37,6 +41,11 @@
from le_utils.constants import roles
from past.builtins import basestring
from past.utils import old_div
from search.constants import CHANNEL_KEYWORDS_TSVECTOR
from search.constants import CONTENTNODE_AUTHOR_TSVECTOR
from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR
from search.models import ChannelFullTextSearch
from search.models import ContentNodeFullTextSearch

from contentcuration import models as ccmodels
from contentcuration.decorators import delay_user_storage_calculation
Expand Down Expand Up @@ -808,6 +817,76 @@ def fill_published_fields(channel, version_notes):
channel.save()


def sync_contentnode_and_channel_tsvectors(channel_id):
"""
Creates, deletes and updates tsvectors of the channel and all its content nodes
to reflect the current state of channel's main tree.
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
"""
# Update or create channel tsvector entry.
logging.info("Starting to set tsvectors for channel with id {}.".format(channel_id))

from contentcuration.viewsets.channel import primary_token_subquery

channel = (ccmodels.Channel.objects
.filter(pk=channel_id)
.annotate(primary_channel_token=primary_token_subquery,
keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR)
.values("keywords_tsvector", "main_tree__tree_id")
.get())

if ChannelFullTextSearch.objects.filter(channel_id=channel_id).exists():
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
update_count = ChannelFullTextSearch.objects.filter(channel_id=channel_id).update(keywords_tsvector=channel["keywords_tsvector"])
logging.info("Updated {} channel tsvector.".format(update_count))
else:
obj = ChannelFullTextSearch(channel_id=channel_id, keywords_tsvector=channel["keywords_tsvector"])
obj.save()
logging.info("Created 1 channel tsvector.")

# Update or create contentnodes tsvector entry for channel_id.
logging.info("Starting to set tsvectors for all contentnodes in channel {}.".format(channel_id))

nodes_tsvector_query = (ccmodels.ContentNode.objects
.filter(tree_id=channel["main_tree__tree_id"])
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
.annotate(channel_id=Value(channel_id),
contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR)
.order_by())

vkWeb marked this conversation as resolved.
Show resolved Hide resolved
if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists():
# First, delete nodes that are no longer in main_tree.
nodes_no_longer_in_main_tree = ~Exists(ccmodels.ContentNode.objects.filter(id=OuterRef("contentnode_id"), tree_id=channel["main_tree__tree_id"]))
ContentNodeFullTextSearch.objects.filter(nodes_no_longer_in_main_tree, channel_id=channel_id).delete()

vkWeb marked this conversation as resolved.
Show resolved Hide resolved
# Now, all remaining nodes are in main_tree, so let's update them.
# Update only changed nodes.
nodes_to_update = ContentNodeFullTextSearch.objects.filter(channel_id=channel_id, contentnode__changed=True)

update_objs = list()
for node in nodes_to_update:
corresponding_contentnode = nodes_tsvector_query.filter(pk=node.contentnode_id).values("keywords_tsvector", "author_tsvector").first()
if corresponding_contentnode:
node.keywords_tsvector = corresponding_contentnode["keywords_tsvector"]
node.author_tsvector = corresponding_contentnode["author_tsvector"]
update_objs.append(node)
ContentNodeFullTextSearch.objects.bulk_update(update_objs, ["keywords_tsvector", "author_tsvector"])
del update_objs

# Insert newly created nodes.
nodes_not_having_tsvector_record = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"), channel_id=channel_id))
nodes_to_insert = (nodes_tsvector_query
.filter(nodes_not_having_tsvector_record)
.values("id", "channel_id", "keywords_tsvector", "author_tsvector"))

insert_objs = list()
for node in nodes_to_insert:
obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"],
keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"])
insert_objs.append(obj)
inserted_nodes_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs)
logging.info("Successfully inserted {} contentnode tsvectors.".format(len(inserted_nodes_list)))


@delay_user_storage_calculation
def publish_channel(
user_id,
Expand All @@ -829,8 +908,9 @@ def publish_channel(
set_channel_icon_encoding(channel)
kolibri_temp_db = create_content_database(channel, force, user_id, force_exercises, progress_tracker=progress_tracker)
increment_channel_version(channel)
mark_all_nodes_as_published(channel)
add_tokens_to_channel(channel)
sync_contentnode_and_channel_tsvectors(channel_id=channel.id)
mark_all_nodes_as_published(channel)
fill_published_fields(channel, version_notes)

# Attributes not getting set for some reason, so just save it here
Expand Down
29 changes: 12 additions & 17 deletions contentcuration/contentcuration/viewsets/channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
from rest_framework.serializers import CharField
from rest_framework.serializers import FloatField
from rest_framework.serializers import IntegerField
from search.models import ChannelFullTextSearch
from search.models import ContentNodeFullTextSearch
from search.utils import get_fts_search_query

from contentcuration.decorators import cache_no_user_data
from contentcuration.models import Change
Expand Down Expand Up @@ -119,23 +122,15 @@ def filter_deleted(self, queryset, name, value):
return queryset.filter(deleted=value)

def filter_keywords(self, queryset, name, value):
# TODO: Wait until we show more metadata on cards to add this back in
# keywords_query = self.main_tree_query.filter(
# Q(tags__tag_name__icontains=value)
# | Q(author__icontains=value)
# | Q(aggregator__icontains=value)
# | Q(provider__icontains=value)
# )
return queryset.annotate(
# keyword_match_count=SQCount(keywords_query, field="content_id"),
primary_token=primary_token_subquery,
).filter(
Q(name__icontains=value)
| Q(description__icontains=value)
| Q(pk__istartswith=value)
| Q(primary_token=value.replace("-", ""))
# | Q(keyword_match_count__gt=0)
)
search_query = get_fts_search_query(value)
dash_replaced_search_query = get_fts_search_query(value.replace("-", ""))

channel_keywords_query = (Exists(ChannelFullTextSearch.objects.filter(
Q(keywords_tsvector=search_query) | Q(keywords_tsvector=dash_replaced_search_query), channel_id=OuterRef("id"))))
contentnode_search_query = (Exists(ContentNodeFullTextSearch.objects.filter(
Q(keywords_tsvector=search_query) | Q(author_tsvector=search_query), channel_id=OuterRef("id"))))

return queryset.filter(Q(channel_keywords_query) | Q(contentnode_search_query))

def filter_languages(self, queryset, name, value):
languages = value.split(",")
Expand Down
1 change: 0 additions & 1 deletion contentcuration/search/admin.py

This file was deleted.

16 changes: 16 additions & 0 deletions contentcuration/search/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from django.contrib.postgres.search import SearchVector

# Postgres full text search configuration. We use "simple" to make search
# language agnostic.
POSTGRES_FTS_CONFIG = "simple"

# ContentNode vectors and search fields.
CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS = ("id", "channel_id", "node_id", "content_id", "tree_id", "title", "description", "contentnode_tags")
CONTENTNODE_KEYWORDS_TSVECTOR = SearchVector(*CONTENTNODE_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)

CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = ("author", "aggregator", "provider")
CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)

# Channel vector and search fields.
CHANNEL_KEYWORDS_TSVECTOR_FIELDS = ("id", "main_tree__tree_id", "name", "description", "tagline", "primary_channel_token")
CHANNEL_KEYWORDS_TSVECTOR = SearchVector(*CHANNEL_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
This command inserts in bulk channel tsvectors to the ChannelFullTextSearch table.
"""
import logging as logmodule
import time

from django.core.management.base import BaseCommand
from django.db.models import Exists
from django.db.models import OuterRef
from search.constants import CHANNEL_KEYWORDS_TSVECTOR
from search.models import ChannelFullTextSearch

from contentcuration.models import Channel
from contentcuration.viewsets.channel import primary_token_subquery


logmodule.basicConfig(level=logmodule.INFO)
logging = logmodule.getLogger("command")

CHUNKSIZE = 5000


class Command(BaseCommand):

def handle(self, *args, **options):
start = time.time()

channel_not_already_inserted_query = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id")))

channel_query = (Channel.objects.select_related("main_tree")
.filter(channel_not_already_inserted_query, deleted=False, main_tree__published=True)
.annotate(primary_channel_token=primary_token_subquery,
keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR)
.values("id", "keywords_tsvector"))

insertable_channels = list(channel_query[:CHUNKSIZE])
total_channel_tsvectors_inserted = 0

while insertable_channels:
logging.info("Inserting channel tsvectors.")

insert_objs = list()
for channel in insertable_channels:
obj = ChannelFullTextSearch(channel_id=channel["id"], keywords_tsvector=channel["keywords_tsvector"])
insert_objs.append(obj)

inserted_objs_list = ChannelFullTextSearch.objects.bulk_create(insert_objs)

current_inserts_count = len(inserted_objs_list)
total_channel_tsvectors_inserted = total_channel_tsvectors_inserted + current_inserts_count

logging.info("Inserted {} channel tsvectors.".format(current_inserts_count))

insertable_channels = list(channel_query[:CHUNKSIZE])

logging.info("Completed! successfully inserted total of {} channel tsvectors in {} seconds.".format(
total_channel_tsvectors_inserted, time.time() - start))
Loading