Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import from other channels search optimized #3399

Merged
merged 32 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
357204c
Optimized import from other channels search
vkWeb Jun 2, 2022
9028b22
Merge branch 'unstable' into optimize/search
vkWeb Jun 2, 2022
d24416c
Add search test for channel filtering and location_ids handling
bjester Jun 15, 2022
676526e
Fix autodiscovery of search tests
vkWeb Jun 30, 2022
6d40c55
Remove location_ids, zero db queries on descendant resource count
vkWeb Jul 9, 2022
094f05f
Upgrade django debug toolbar and fix its settings
vkWeb Jul 12, 2022
701ddc9
Remove unnecessary dev settings
vkWeb Jul 22, 2022
1a14749
Add .envrc to .gitignore
vkWeb Jul 22, 2022
bff595c
Add vector search column & indexes, also GiST trigram index
vkWeb Aug 12, 2022
8b7152e
Merge branch 'unstable' into optimize/search
vkWeb Aug 12, 2022
c0e55ee
Remove cyclic migration conflicts
vkWeb Aug 12, 2022
34e8436
Fix wrong indentation happened due to merge conflict
vkWeb Aug 12, 2022
e00c512
Add a command for setting tsvectors and fix tests
vkWeb Aug 14, 2022
f3280d9
Remove grade_level default to pass failing tests
vkWeb Aug 14, 2022
475aeef
Merge branch 'unstable' into optimize/search
vkWeb Aug 24, 2022
3ff0edd
Full text search models and data migrations
vkWeb Sep 8, 2022
fffd912
Merge branch 'unstable' into optimize/search
vkWeb Sep 8, 2022
974b69e
Resolve conflicts and remove old index refs
vkWeb Sep 8, 2022
e15b015
feat: full text search!
vkWeb Sep 10, 2022
aa62dc2
Sync tsvectors on publish!
vkWeb Sep 14, 2022
2672512
fix: tests and ready for merge! <3
vkWeb Sep 14, 2022
1ffa30d
fix: node command edge case; when published nodes go to trash tree, t…
vkWeb Sep 14, 2022
55e4acf
Merge branch 'unstable' into optimize/search
vkWeb Sep 16, 2022
57724e0
Enforce only-one search entries
vkWeb Sep 16, 2022
e53e56a
Remove unnecessary select_related
vkWeb Sep 16, 2022
4b3d4c7
fix cache tests mock by setting ContentNodeFullTextSearch
vkWeb Sep 16, 2022
44ab74c
fix cache & nodes tests by using db-friendly TestCase
vkWeb Sep 16, 2022
4beaf3c
Merge branch 'unstable' into optimize/search
vkWeb Sep 16, 2022
001e788
Use command for tsv insertion & simpler tsv update on publish
vkWeb Sep 21, 2022
ff59495
Merge branch 'unstable' into optimize/search
vkWeb Sep 27, 2022
9ec60cf
fixes the strict update subquery, lightens it up
vkWeb Sep 28, 2022
aae3be1
Do not output deleted channel nodes on search
vkWeb Sep 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@
[this.channelFilter]: true,
page: this.$route.query.page || 1,
exclude: this.currentChannelId,
published: true,
}).then(page => {
this.pageCount = page.total_pages;
this.channels = page.results;
Expand Down
65 changes: 35 additions & 30 deletions contentcuration/contentcuration/utils/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@
from search.constants import CHANNEL_KEYWORDS_TSVECTOR
from search.constants import CONTENTNODE_AUTHOR_TSVECTOR
from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR
from search.constants import CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR
from search.constants import CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR
from search.models import ChannelFullTextSearch
from search.models import ContentNodeFullTextSearch

Expand Down Expand Up @@ -819,62 +817,74 @@ def fill_published_fields(channel, version_notes):
channel.save()


def create_or_update_tsvectors(channel_id):
def sync_contentnode_and_channel_tsvectors(channel_id):
"""
Create or update tsvectors for the channel and all its content nodes.
Creates, deletes and updates the tsvectors of the channel and all its content nodes
to reflect the current state of the channel's main tree.
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
"""
# Update or create channel tsvector entry.
logging.info("Starting to set tsvectors for channel with id {}.".format(channel_id))

from contentcuration.viewsets.channel import primary_token_subquery

channel = (ccmodels.Channel.objects
.filter(pk=channel_id)
.annotate(primary_channel_token=primary_token_subquery,
keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR)
.get(pk=channel_id))
.values("keywords_tsvector", "main_tree__tree_id")
.get())

if ChannelFullTextSearch.objects.filter(channel_id=channel_id).exists():
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
update_count = ChannelFullTextSearch.objects.filter(channel_id=channel_id).update(keywords_tsvector=channel.keywords_tsvector)
update_count = ChannelFullTextSearch.objects.filter(channel_id=channel_id).update(keywords_tsvector=channel["keywords_tsvector"])
logging.info("Updated {} channel tsvector.".format(update_count))
else:
obj = ChannelFullTextSearch(channel_id=channel_id, keywords_tsvector=channel.keywords_tsvector)
obj = ChannelFullTextSearch(channel_id=channel_id, keywords_tsvector=channel["keywords_tsvector"])
obj.save()
logging.info("Created 1 channel tsvector.")

# Update or create contentnodes tsvector entry for channel_id.
logging.info("Setting tsvectors for all contentnodes in channel {}.".format(channel_id))
logging.info("Starting to set tsvectors for all contentnodes in channel {}.".format(channel_id))

if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists():
nodes_tsvector_query = (ccmodels.ContentNode.objects
.filter(tree_id=channel["main_tree__tree_id"])
vkWeb marked this conversation as resolved.
Show resolved Hide resolved
.annotate(channel_id=Value(channel_id),
contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR)
.order_by())

vkWeb marked this conversation as resolved.
Show resolved Hide resolved
if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists():
# First, delete nodes that are no longer in main_tree.
nodes_no_longer_in_main_tree = ~Exists(channel.main_tree.get_family().filter(id=OuterRef("contentnode_id")))
nodes_no_longer_in_main_tree = ~Exists(ccmodels.ContentNode.objects.filter(id=OuterRef("contentnode_id"), tree_id=channel["main_tree__tree_id"]))
ContentNodeFullTextSearch.objects.filter(nodes_no_longer_in_main_tree, channel_id=channel_id).delete()

vkWeb marked this conversation as resolved.
Show resolved Hide resolved
# Now, all remaining nodes are in main_tree, so let's update them.
update_count = (ContentNodeFullTextSearch.objects.filter(channel_id=channel_id)
.annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "))
.update(keywords_tsvector=CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR, author_tsvector=CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR))
# Update only changed nodes.
nodes_to_update = ContentNodeFullTextSearch.objects.filter(channel_id=channel_id, contentnode__changed=True)

update_objs = list()
for node in nodes_to_update:
corresponding_contentnode = nodes_tsvector_query.filter(pk=node.contentnode_id).values("keywords_tsvector", "author_tsvector").first()
if corresponding_contentnode:
node.keywords_tsvector = corresponding_contentnode["keywords_tsvector"]
node.author_tsvector = corresponding_contentnode["author_tsvector"]
update_objs.append(node)
ContentNodeFullTextSearch.objects.bulk_update(update_objs, ["keywords_tsvector", "author_tsvector"])
del update_objs

# Insert newly created nodes.
nodes_not_having_tsvector_record = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id")))
nodes_to_insert = (channel.main_tree.get_family()
nodes_not_having_tsvector_record = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"), channel_id=channel_id))
nodes_to_insert = (nodes_tsvector_query
.filter(nodes_not_having_tsvector_record)
.annotate(channel_id=Value(channel_id),
contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR)
.values("id", "channel_id", "keywords_tsvector", "author_tsvector"))

insert_objs = list()

for node in nodes_to_insert:
obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"],
keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"])
insert_objs.append(obj)

inserted_nodes_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs)

logging.info("Successfully inserted {} and updated {} contentnode tsvectors.".format(len(inserted_nodes_list), update_count))
logging.info("Successfully inserted {} contentnode tsvectors.".format(len(inserted_nodes_list)))


@delay_user_storage_calculation
Expand All @@ -887,8 +897,6 @@ def publish_channel(
send_email=False,
progress_tracker=None,
language=settings.LANGUAGE_CODE,


):
"""
:type progress_tracker: contentcuration.utils.celery.ProgressTracker|None
Expand All @@ -900,8 +908,9 @@ def publish_channel(
set_channel_icon_encoding(channel)
kolibri_temp_db = create_content_database(channel, force, user_id, force_exercises, progress_tracker=progress_tracker)
increment_channel_version(channel)
mark_all_nodes_as_published(channel)
add_tokens_to_channel(channel)
sync_contentnode_and_channel_tsvectors(channel_id=channel.id)
mark_all_nodes_as_published(channel)
fill_published_fields(channel, version_notes)

# Attributes not getting set for some reason, so just save it here
Expand All @@ -914,10 +923,6 @@ def publish_channel(
if channel.public:
delete_public_channel_cache_keys()

# Update or create the channel's tsvector entry and the tsvector entries
# of all its contentnodes.
create_or_update_tsvectors(channel_id=channel_id)

if send_email:
with override(language):
send_emails(channel, user_id, version_notes=version_notes)
Expand Down
7 changes: 5 additions & 2 deletions contentcuration/contentcuration/viewsets/channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,13 @@ def filter_deleted(self, queryset, name, value):
return queryset.filter(deleted=value)

def filter_keywords(self, queryset, name, value):
search_query = get_fts_search_query(value)
dash_replaced_search_query = get_fts_search_query(value.replace("-", ""))

channel_keywords_query = (Exists(ChannelFullTextSearch.objects.filter(
keywords_tsvector=get_fts_search_query(value.replace("-", "")), channel_id=OuterRef("id"))))
Q(keywords_tsvector=search_query) | Q(keywords_tsvector=dash_replaced_search_query), channel_id=OuterRef("id"))))
contentnode_search_query = (Exists(ContentNodeFullTextSearch.objects.filter(
Q(keywords_tsvector=get_fts_search_query(value)) | Q(author_tsvector=get_fts_search_query(value)), channel_id=OuterRef("id"))))
Q(keywords_tsvector=search_query) | Q(author_tsvector=search_query), channel_id=OuterRef("id"))))

return queryset.filter(Q(channel_keywords_query) | Q(contentnode_search_query))

Expand Down
7 changes: 0 additions & 7 deletions contentcuration/search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,6 @@
CONTENTNODE_AUTHOR_TSVECTOR_FIELDS = ("author", "aggregator", "provider")
CONTENTNODE_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)

CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR_FIELDS = ("contentnode__id", "channel_id", "contentnode__node_id", "contentnode__content_id",
"contentnode__tree_id", "contentnode__title", "contentnode__description", "contentnode_tags")
CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR = SearchVector(*CONTENTNODE_PREFIXED_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)

CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR_FIELDS = ("contentnode__author", "contentnode__aggregator", "contentnode__provider")
CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR = SearchVector(*CONTENTNODE_PREFIXED_AUTHOR_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)

# Channel vector and search fields.
CHANNEL_KEYWORDS_TSVECTOR_FIELDS = ("id", "main_tree__tree_id", "name", "description", "tagline", "primary_channel_token")
CHANNEL_KEYWORDS_TSVECTOR = SearchVector(*CHANNEL_KEYWORDS_TSVECTOR_FIELDS, config=POSTGRES_FTS_CONFIG)
21 changes: 11 additions & 10 deletions contentcuration/search/viewsets/contentnode.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,22 @@ class ContentNodeFilter(RequiredFilterSet):
def filter_channel_list(self, queryset, name, value):
user = not self.request.user.is_anonymous and self.request.user
channel_ids = []

if value == "public":
channel_ids = Channel.objects.filter(public=True, deleted=False, main_tree__published=True).values_list("id", flat=True)
channel_ids = Channel.get_public_channels().values_list("id", flat=True)
elif value == "edit" and user:
channel_ids = user.editable_channels.filter(main_tree__published=True).values_list("id", flat=True)
channel_ids = user.editable_channels.values_list("id", flat=True)
elif value == "bookmark" and user:
channel_ids = user.bookmarked_channels.filter(main_tree__published=True).values_list("id", flat=True)
channel_ids = user.bookmarked_channels.values_list("id", flat=True)
elif value == "view" and user:
channel_ids = user.view_only_channels.filter(main_tree__published=True).values_list("id", flat=True)
channel_ids = user.view_only_channels.values_list("id", flat=True)

return queryset.filter(channel_id__in=list(channel_ids))
vkWeb marked this conversation as resolved.
Show resolved Hide resolved

def filter_keywords(self, queryset, name, value):
return queryset.filter(Q(keywords_tsvector=get_fts_search_query(value))
| Q(author_tsvector=get_fts_search_query(value)))
search_query = get_fts_search_query(value)
return queryset.filter(Q(keywords_tsvector=search_query)
| Q(author_tsvector=search_query))
vkWeb marked this conversation as resolved.
Show resolved Hide resolved

def filter_author(self, queryset, name, value):
return queryset.filter(author_tsvector=get_fts_search_query(value))
Expand Down Expand Up @@ -98,6 +101,7 @@ class SearchContentNodeViewSet(ReadOnlyValuesViewset):
filterset_class = ContentNodeFilter
pagination_class = ListPagination
permission_classes = [IsAuthenticated]
queryset = ContentNodeFullTextSearch.objects.all()

field_map = {
"id": "contentnode__id",
Expand Down Expand Up @@ -137,12 +141,9 @@ class SearchContentNodeViewSet(ReadOnlyValuesViewset):
"original_channel_name",
)

def get_queryset(self):
return ContentNodeFullTextSearch.objects.select_related("contentnode")

def annotate_queryset(self, queryset):
"""
Annotates thumbnails, resources count and channel name.
Annotates thumbnails, resources count and original channel name.
"""
thumbnails = File.objects.filter(
contentnode=OuterRef("contentnode__id"), preset__thumbnail=True
Expand Down