-
-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[fix #4265] Port Document search API for Elasticsearch 6.x #4309
Changes from all commits
bb4e5aa
4dc3e35
fe2aef1
41a67a3
85b4686
b01981f
75de7f6
e2e8cbb
733b030
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from rest_framework import generics | ||
from rest_framework import exceptions | ||
from rest_framework.exceptions import ValidationError | ||
|
||
from readthedocs.projects.models import Project | ||
from readthedocs.search.documents import PageDocument | ||
from readthedocs.search.filters import SearchFilterBackend | ||
from readthedocs.search.pagination import SearchPagination | ||
from readthedocs.search.serializers import PageSearchSerializer | ||
from readthedocs.search.utils import get_project_list_or_404 | ||
|
||
|
||
class PageSearchAPIView(generics.ListAPIView): | ||
pagination_class = SearchPagination | ||
filter_backends = [SearchFilterBackend] | ||
serializer_class = PageSearchSerializer | ||
|
||
def get_queryset(self): | ||
""" | ||
Return Elasticsearch DSL Search object instead of Django Queryset. | ||
Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern. | ||
So for searching, its possible to return ``Search`` object instead of queryset. | ||
The ``filter_backends`` and ``pagination_class`` is compatible with ``Search`` | ||
""" | ||
# Validate all the required params are there | ||
self.validate_query_params() | ||
query = self.request.query_params.get('query', '') | ||
queryset = PageDocument.simple_search(query=query) | ||
return queryset | ||
|
||
def validate_query_params(self): | ||
required_query_params = {'query', 'project', 'version'} # python `set` literal is `{}` | ||
request_params = set(self.request.query_params.keys()) | ||
missing_params = required_query_params - request_params | ||
if missing_params: | ||
errors = {} | ||
for param in missing_params: | ||
errors[param] = ["This query param is required"] | ||
|
||
raise ValidationError(errors) | ||
|
||
def get_serializer_context(self): | ||
context = super(PageSearchAPIView, self).get_serializer_context() | ||
context['projects_url'] = self.get_all_projects_url() | ||
return context | ||
|
||
def get_all_projects_url(self): | ||
version_slug = self.request.query_params.get('version') | ||
project_slug = self.request.query_params.get('project') | ||
all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user) | ||
projects_url = {} | ||
|
||
for project in all_projects: | ||
projects_url[project.slug] = project.get_docs_url(version_slug=version_slug) | ||
|
||
return projects_url |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch): | |
# TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916 | ||
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916 | ||
|
||
def __init__(self, using, index, doc_types, model, **kwargs): | ||
def __init__(self, using, index, doc_types, model, fields=None, **kwargs): | ||
self.using = using | ||
self.index = index | ||
self.doc_types = doc_types | ||
self._model = model | ||
if fields: | ||
self.fields = fields | ||
super(RTDFacetedSearch, self).__init__(**kwargs) | ||
|
||
|
||
|
@@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch): | |
|
||
|
||
class FileSearch(RTDFacetedSearch): | ||
fields = ['title^10', 'headers^5', 'content'] | ||
facets = { | ||
'project': TermsFacet(field='project'), | ||
'version': TermsFacet(field='version') | ||
} | ||
|
||
def query(self, search, query): | ||
"""Add query part to ``search``""" | ||
""" | ||
Add query part to ``search`` | ||
Overriding because we pass ES Query object instead of string | ||
""" | ||
if query: | ||
all_queries = [] | ||
|
||
# Need to search for both 'AND' and 'OR' operations | ||
# The score of AND should be higher as it comes first | ||
for operator in ['AND', 'OR']: | ||
query_string = SimpleQueryString(query=query, fields=self.fields, | ||
default_operator=operator) | ||
all_queries.append(query_string) | ||
|
||
# Run bool query with should, so it returns result where either of the query matches | ||
bool_query = Bool(should=all_queries) | ||
search = search.query(bool_query) | ||
search = search.query(query) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Feels like we shouldn't be overwriting the object here. Will we return something invalid if there is no There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have actually a mixed feeling about this. Do you have any idea about how to do this withtout overriding the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean mostly just the name of the object. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah. |
||
|
||
return search |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from rest_framework import filters | ||
|
||
|
||
class SearchFilterBackend(filters.BaseFilterBackend): | ||
|
||
"""Filter search result with project""" | ||
|
||
def filter_queryset(self, request, queryset, view): | ||
"""Overwrite the method to compatible with Elasticsearch DSL Search object.""" | ||
# ``queryset`` is actually a Elasticsearch DSL ``Search`` object. | ||
# So change the variable name | ||
es_search = queryset | ||
version_slug = request.query_params.get('version') | ||
projects_info = view.get_all_projects_url() | ||
project_slug_list = list(projects_info.keys()) | ||
# Elasticsearch ``terms`` query can take multiple values as list, | ||
# while ``term`` query takes single value. | ||
filtered_es_search = (es_search.filter('terms', project=project_slug_list) | ||
.filter('term', version=version_slug)) | ||
return filtered_es_search |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from rest_framework.pagination import PageNumberPagination | ||
|
||
|
||
class SearchPagination(PageNumberPagination): | ||
page_size = 10 | ||
page_size_query_param = 'page_size' | ||
max_page_size = 100 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from rest_framework import serializers | ||
|
||
from readthedocs.projects.models import Project | ||
|
||
|
||
class PageSearchSerializer(serializers.Serializer): | ||
project = serializers.CharField() | ||
version = serializers.CharField() | ||
title = serializers.CharField() | ||
path = serializers.CharField() | ||
link = serializers.SerializerMethodField() | ||
highlight = serializers.SerializerMethodField() | ||
|
||
def get_link(self, obj): | ||
projects_url = self.context.get('projects_url') | ||
if projects_url: | ||
docs_url = projects_url[obj.project] | ||
return docs_url + obj.path | ||
|
||
def get_highlight(self, obj): | ||
highlight = getattr(obj.meta, 'highlight', None) | ||
if highlight: | ||
return highlight.to_dict() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import pytest | ||
from django.core.urlresolvers import reverse | ||
from django_dynamic_fixture import G | ||
|
||
from readthedocs.builds.models import Version | ||
from readthedocs.projects.models import HTMLFile | ||
from readthedocs.search.tests.utils import get_search_query_from_project_file | ||
|
||
|
||
@pytest.mark.django_db | ||
@pytest.mark.search | ||
class TestDocumentSearch(object): | ||
url = reverse('doc_search') | ||
|
||
@pytest.mark.parametrize('data_type', ['content', 'headers', 'title']) | ||
@pytest.mark.parametrize('page_num', [0, 1]) | ||
def test_search_works(self, api_client, project, data_type, page_num): | ||
query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num, | ||
data_type=data_type) | ||
|
||
version = project.versions.all()[0] | ||
search_params = {'project': project.slug, 'version': version.slug, 'query': query} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
data = resp.data['results'] | ||
assert len(data) == 1 | ||
project_data = data[0] | ||
assert project_data['project'] == project.slug | ||
|
||
# Check highlight return correct object | ||
all_highlights = project_data['highlight'][data_type] | ||
for highlight in all_highlights: | ||
# Make it lower because our search is case insensitive | ||
assert query.lower() in highlight.lower() | ||
|
||
def test_doc_search_filter_by_project(self, api_client): | ||
"""Test Doc search result are filtered according to project""" | ||
|
||
# `Github` word is present both in `kuma` and `pipeline` files | ||
# so search with this phrase but filter through `kuma` project | ||
search_params = {'query': 'GitHub', 'project': 'kuma', 'version': 'latest'} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
data = resp.data['results'] | ||
assert len(data) == 1 | ||
assert data[0]['project'] == 'kuma' | ||
|
||
def test_doc_search_filter_by_version(self, api_client, project): | ||
"""Test Doc search result are filtered according to version""" | ||
query = get_search_query_from_project_file(project_slug=project.slug) | ||
latest_version = project.versions.all()[0] | ||
# Create another version | ||
dummy_version = G(Version, project=project) | ||
# Create HTMLFile same as the latest version | ||
latest_version_files = HTMLFile.objects.all().filter(version=latest_version) | ||
for f in latest_version_files: | ||
f.version = dummy_version | ||
# Make primary key to None, so django will create new object | ||
f.pk = None | ||
f.save() | ||
|
||
search_params = {'query': query, 'project': project.slug, 'version': dummy_version.slug} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
data = resp.data['results'] | ||
assert len(data) == 1 | ||
assert data[0]['project'] == project.slug | ||
|
||
def test_doc_search_pagination(self, api_client, project): | ||
"""Test Doc search result can be paginated""" | ||
latest_version = project.versions.all()[0] | ||
html_file = HTMLFile.objects.filter(version=latest_version)[0] | ||
title = html_file.processed_json['title'] | ||
query = title.split()[0] | ||
|
||
# Create 15 more same html file | ||
for _ in range(15): | ||
# Make primary key to None, so django will create new object | ||
html_file.pk = None | ||
html_file.save() | ||
|
||
search_params = {'query': query, 'project': project.slug, 'version': latest_version.slug} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
# Check the count is 16 (1 existing and 15 new created) | ||
assert resp.data['count'] == 16 | ||
# Check there are next url | ||
assert resp.data['next'] is not None | ||
# There should be only 10 data as the pagination is 10 by default | ||
assert len(resp.data['results']) == 10 | ||
|
||
# Add `page_size` parameter and check the data is paginated accordingly | ||
search_params['page_size'] = 5 | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
assert len(resp.data['results']) == 5 | ||
|
||
def test_doc_search_without_parameters(self, api_client, project): | ||
"""Hitting Document Search endpoint without query parameters should return error""" | ||
resp = api_client.get(self.url) | ||
assert resp.status_code == 400 | ||
# Check error message is there | ||
assert sorted(['query', 'project', 'version']) == sorted(resp.data.keys()) | ||
|
||
def test_doc_search_subprojects(self, api_client, all_projects): | ||
"""Test Document search return results from subprojects also""" | ||
project = all_projects[0] | ||
subproject = all_projects[1] | ||
version = project.versions.all()[0] | ||
# Add another project as subproject of the project | ||
project.add_subproject(subproject) | ||
|
||
# Now search with subproject content but explicitly filter by the parent project | ||
query = get_search_query_from_project_file(project_slug=subproject.slug) | ||
search_params = {'query': query, 'project': project.slug, 'version': version.slug} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
data = resp.data['results'] | ||
assert len(data) == 1 | ||
assert data[0]['project'] == subproject.slug | ||
# Check the link is the subproject document link | ||
document_link = subproject.get_docs_url(version_slug=version.slug) | ||
assert document_link in data[0]['link'] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this used by the
DocType
class directly, or are we storing it here to add later? If the prior, we should probably set it in an__init__
method or somewhere else, so it doesn't get confused with the other data here.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Its not actually used by
DocType
class, but we are using it in class method. I dont know if keeping it in__init__
method will make it work in class method!There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hrm, good point. I guess it's fine for now then.