-
-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[fix #4265] Port Document search API for Elasticsearch 6.x #4309
Changes from 8 commits
bb4e5aa
4dc3e35
fe2aef1
41a67a3
85b4686
b01981f
75de7f6
e2e8cbb
733b030
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from rest_framework import generics | ||
from rest_framework import exceptions | ||
from rest_framework.exceptions import ValidationError | ||
|
||
from readthedocs.projects.models import Project | ||
from readthedocs.search.documents import PageDocument | ||
from readthedocs.search.filters import SearchFilterBackend | ||
from readthedocs.search.pagination import SearchPagination | ||
from readthedocs.search.serializers import PageSearchSerializer | ||
|
||
|
||
class PageSearchAPIView(generics.ListAPIView): | ||
pagination_class = SearchPagination | ||
filter_backends = [SearchFilterBackend] | ||
serializer_class = PageSearchSerializer | ||
|
||
def get_queryset(self): | ||
""" | ||
Return Elasticsearch DSL Search object instead of Django Queryset. | ||
|
||
Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern. | ||
So for searching, its possible to return ``Search`` object instead of queryset. | ||
The ``filter_backends`` and ``pagination_class`` is compatible with ``Search`` | ||
""" | ||
# Validate all the required params are there | ||
self.validate_query_params() | ||
query = self.request.query_params.get('query', '') | ||
queryset = PageDocument.simple_search(query=query) | ||
return queryset | ||
|
||
def validate_query_params(self): | ||
required_query_params = {'query', 'project', 'version'} # python `set` literal is `{}` | ||
request_params = set(self.request.query_params.keys()) | ||
missing_params = required_query_params - request_params | ||
if missing_params: | ||
errors = {} | ||
for param in missing_params: | ||
errors[param] = ["This query param is required"] | ||
|
||
raise ValidationError(errors) | ||
|
||
def get_serializer_context(self): | ||
context = super(PageSearchAPIView, self).get_serializer_context() | ||
context['projects_info'] = self.get_projects_info() | ||
return context | ||
|
||
def _get_all_projects(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this the same logic as in |
||
"""Return list of project and its subprojects.""" | ||
project_slug = self.request.query_params.get('project') | ||
queryset = Project.objects.api(self.request.user).only('slug') | ||
|
||
project = generics.get_object_or_404(queryset, slug=project_slug) | ||
subprojects = queryset.filter(superprojects__parent_id=project.id) | ||
|
||
project_list = list(subprojects) + [project] | ||
return project_list | ||
|
||
def get_projects_info(self): | ||
version_slug = self.request.query_params.get('version') | ||
all_projects = self._get_all_projects() | ||
projects_info = {} | ||
|
||
for project in all_projects: | ||
data = {'docs_url': project.get_docs_url(version_slug=version_slug)} | ||
projects_info[project.slug] = data | ||
|
||
return projects_info |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from django.conf import settings | ||
from django_elasticsearch_dsl import DocType, Index, fields | ||
from elasticsearch_dsl.query import SimpleQueryString, Bool | ||
|
||
from readthedocs.projects.models import Project, HTMLFile | ||
from .conf import SEARCH_EXCLUDED_FILE | ||
|
@@ -60,14 +61,19 @@ class Meta(object): | |
content = fields.TextField(attr='processed_json.content') | ||
path = fields.TextField(attr='processed_json.path') | ||
|
||
# Fields to perform search with weight | ||
search_fields = ['title^10', 'headers^5', 'content'] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this used by the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Its not actually used by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hrm, good point. I guess it's fine for now then. |
||
|
||
@classmethod | ||
def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None): | ||
es_query = cls.get_es_query(query=query) | ||
kwargs = { | ||
'using': using or cls._doc_type.using, | ||
'index': index or cls._doc_type.index, | ||
'doc_types': [cls], | ||
'model': cls._doc_type.model, | ||
'query': query | ||
'query': es_query, | ||
'fields': cls.search_fields | ||
} | ||
filters = {} | ||
|
||
|
@@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non | |
|
||
return FileSearch(**kwargs) | ||
|
||
@classmethod | ||
def simple_search(cls, query, using=None, index=None): | ||
es_search = cls.search(using=using, index=index) | ||
es_query = cls.get_es_query(query=query) | ||
highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields] | ||
|
||
es_search = es_search.query(es_query).highlight(*highlighted_fields) | ||
return es_search | ||
|
||
@classmethod | ||
def get_es_query(cls, query): | ||
"""Return the Elasticsearch query generated from the query string""" | ||
all_queries = [] | ||
|
||
# Need to search for both 'AND' and 'OR' operations | ||
# The score of AND should be higher as it satisfies both OR and AND | ||
for operator in ['AND', 'OR']: | ||
query_string = SimpleQueryString(query=query, fields=cls.search_fields, | ||
default_operator=operator) | ||
all_queries.append(query_string) | ||
|
||
# Run bool query with should, so it returns result where either of the query matches | ||
bool_query = Bool(should=all_queries) | ||
|
||
return bool_query | ||
|
||
def get_queryset(self): | ||
"""Overwrite default queryset to filter certain files to index""" | ||
queryset = super(PageDocument, self).get_queryset() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch): | |
# TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916 | ||
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916 | ||
|
||
def __init__(self, using, index, doc_types, model, **kwargs): | ||
def __init__(self, using, index, doc_types, model, fields=None, **kwargs): | ||
self.using = using | ||
self.index = index | ||
self.doc_types = doc_types | ||
self._model = model | ||
if fields: | ||
self.fields = fields | ||
super(RTDFacetedSearch, self).__init__(**kwargs) | ||
|
||
|
||
|
@@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch): | |
|
||
|
||
class FileSearch(RTDFacetedSearch): | ||
fields = ['title^10', 'headers^5', 'content'] | ||
facets = { | ||
'project': TermsFacet(field='project'), | ||
'version': TermsFacet(field='version') | ||
} | ||
|
||
def query(self, search, query): | ||
"""Add query part to ``search``""" | ||
""" | ||
Add query part to ``search`` | ||
Overriding because we pass ES Query object instead of string | ||
""" | ||
if query: | ||
all_queries = [] | ||
|
||
# Need to search for both 'AND' and 'OR' operations | ||
# The score of AND should be higher as it comes first | ||
for operator in ['AND', 'OR']: | ||
query_string = SimpleQueryString(query=query, fields=self.fields, | ||
default_operator=operator) | ||
all_queries.append(query_string) | ||
|
||
# Run bool query with should, so it returns result where either of the query matches | ||
bool_query = Bool(should=all_queries) | ||
search = search.query(bool_query) | ||
search = search.query(query) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Feels like we shouldn't be overwriting the object here. Will we return something invalid if there is no There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have actually a mixed feeling about this. Do you have any idea about how to do this withtout overriding the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean mostly just the name of the object. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah. |
||
|
||
return search |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from rest_framework import filters | ||
|
||
|
||
class SearchFilterBackend(filters.BaseFilterBackend): | ||
|
||
"""Filter search result with project""" | ||
|
||
def filter_queryset(self, request, queryset, view): | ||
"""Overwrite the method to compatible with Elasticsearch DSL Search object.""" | ||
# ``queryset`` is actually a Elasticsearch DSL ``Search`` object. | ||
# So change the variable name | ||
es_search = queryset | ||
version_slug = request.query_params.get('version') | ||
projects_info = view.get_projects_info() | ||
project_slug_list = projects_info.keys() | ||
# Elasticsearch ``terms`` query can take multiple values as list, | ||
# while ``term`` query takes single value. | ||
filtered_es_search = (es_search.filter('terms', project=project_slug_list) | ||
.filter('term', version=version_slug)) | ||
return filtered_es_search |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from rest_framework.pagination import PageNumberPagination | ||
|
||
|
||
class SearchPagination(PageNumberPagination): | ||
page_size = 10 | ||
page_size_query_param = 'page_size' | ||
max_page_size = 100 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from rest_framework import serializers | ||
|
||
from readthedocs.projects.models import Project | ||
|
||
|
||
class PageSearchSerializer(serializers.Serializer): | ||
project = serializers.CharField() | ||
version = serializers.CharField() | ||
title = serializers.CharField() | ||
path = serializers.CharField() | ||
link = serializers.SerializerMethodField() | ||
highlight = serializers.SerializerMethodField() | ||
|
||
def get_link(self, obj): | ||
projects_info = self.context.get('projects_info') | ||
if projects_info: | ||
project_data = projects_info[obj.project] | ||
docs_url = project_data['docs_url'] | ||
return docs_url + obj.path | ||
|
||
def get_highlight(self, obj): | ||
highlight = getattr(obj.meta, 'highlight', None) | ||
if highlight: | ||
return highlight.to_dict() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
projects_info
I think is a bad name. It seems like we're just getting thedocs_url
from it for now, so we should probably be more clear about that. Perhapscontext['project_urls']
or something?