-
-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[fix #4265] Port Document search API for Elasticsearch 6.x #4309
Changes from 6 commits
bb4e5aa
4dc3e35
fe2aef1
41a67a3
85b4686
b01981f
75de7f6
e2e8cbb
733b030
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from rest_framework import generics | ||
from rest_framework import exceptions | ||
from rest_framework.exceptions import ValidationError | ||
|
||
from readthedocs.search.documents import PageDocument | ||
from readthedocs.search.filters import SearchFilterBackend | ||
from readthedocs.search.pagination import SearchPagination | ||
from readthedocs.search.serializers import PageSearchSerializer | ||
|
||
|
||
class PageSearchAPIView(generics.ListAPIView): | ||
pagination_class = SearchPagination | ||
filter_backends = [SearchFilterBackend] | ||
serializer_class = PageSearchSerializer | ||
|
||
def get_queryset(self): | ||
""" | ||
Return Elasticsearch DSL Search object instead of Django Queryset. | ||
|
||
Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern. | ||
So for searching, its possible to return ``Search`` object instead of queryset. | ||
The ``filter_backends`` and ``pagination_class`` is compatible with ``Search`` | ||
""" | ||
# Validate all the required params are there | ||
self.validate_query_params() | ||
query = self.request.query_params.get('query', '') | ||
queryset = PageDocument.simple_search(query=query) | ||
return queryset | ||
|
||
def validate_query_params(self): | ||
required_query_params = {'query', 'project', 'version'} # python `set` literal is `{}` | ||
request_params = set(self.request.query_params.keys()) | ||
missing_params = required_query_params - request_params | ||
if missing_params: | ||
errors = {} | ||
for param in missing_params: | ||
errors[param] = ["This query param is required"] | ||
|
||
raise ValidationError(errors) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from django.conf import settings | ||
from django_elasticsearch_dsl import DocType, Index, fields | ||
from elasticsearch_dsl.query import SimpleQueryString, Bool | ||
|
||
from readthedocs.projects.models import Project, HTMLFile | ||
from .conf import SEARCH_EXCLUDED_FILE | ||
|
@@ -60,14 +61,19 @@ class Meta(object): | |
content = fields.TextField(attr='processed_json.content') | ||
path = fields.TextField(attr='processed_json.path') | ||
|
||
# Fields to perform search with weight | ||
search_fields = ['title^10', 'headers^5', 'content'] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this used by the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Its not actually used by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hrm, good point. I guess it's fine for now then. |
||
|
||
@classmethod | ||
def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None): | ||
es_query = cls.get_es_query(query=query) | ||
kwargs = { | ||
'using': using or cls._doc_type.using, | ||
'index': index or cls._doc_type.index, | ||
'doc_types': [cls], | ||
'model': cls._doc_type.model, | ||
'query': query | ||
'query': es_query, | ||
'fields': cls.search_fields | ||
} | ||
filters = {} | ||
|
||
|
@@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non | |
|
||
return FileSearch(**kwargs) | ||
|
||
@classmethod | ||
def simple_search(cls, query, using=None, index=None): | ||
es_search = cls.search(using=using, index=index) | ||
es_query = cls.get_es_query(query=query) | ||
highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields] | ||
|
||
es_search = es_search.query(es_query).highlight(*highlighted_fields) | ||
return es_search | ||
|
||
@classmethod | ||
def get_es_query(cls, query): | ||
"""Return the Elasticsearch query generated from the query string""" | ||
all_queries = [] | ||
|
||
# Need to search for both 'AND' and 'OR' operations | ||
# The score of AND should be higher as it satisfies both OR and AND | ||
for operator in ['AND', 'OR']: | ||
query_string = SimpleQueryString(query=query, fields=cls.search_fields, | ||
default_operator=operator) | ||
all_queries.append(query_string) | ||
|
||
# Run bool query with should, so it returns result where either of the query matches | ||
bool_query = Bool(should=all_queries) | ||
|
||
return bool_query | ||
|
||
def get_queryset(self): | ||
"""Overwrite default queryset to filter certain files to index""" | ||
queryset = super(PageDocument, self).get_queryset() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch): | |
# TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916 | ||
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916 | ||
|
||
def __init__(self, using, index, doc_types, model, **kwargs): | ||
def __init__(self, using, index, doc_types, model, fields=None, **kwargs): | ||
self.using = using | ||
self.index = index | ||
self.doc_types = doc_types | ||
self._model = model | ||
if fields: | ||
self.fields = fields | ||
super(RTDFacetedSearch, self).__init__(**kwargs) | ||
|
||
|
||
|
@@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch): | |
|
||
|
||
class FileSearch(RTDFacetedSearch): | ||
fields = ['title^10', 'headers^5', 'content'] | ||
facets = { | ||
'project': TermsFacet(field='project'), | ||
'version': TermsFacet(field='version') | ||
} | ||
|
||
def query(self, search, query): | ||
"""Add query part to ``search``""" | ||
""" | ||
Add query part to ``search`` | ||
Overriding because we pass ES Query object instead of string | ||
""" | ||
if query: | ||
all_queries = [] | ||
|
||
# Need to search for both 'AND' and 'OR' operations | ||
# The score of AND should be higher as it comes first | ||
for operator in ['AND', 'OR']: | ||
query_string = SimpleQueryString(query=query, fields=self.fields, | ||
default_operator=operator) | ||
all_queries.append(query_string) | ||
|
||
# Run bool query with should, so it returns result where either of the query matches | ||
bool_query = Bool(should=all_queries) | ||
search = search.query(bool_query) | ||
search = search.query(query) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Feels like we shouldn't be overwriting the object here. Will we return something invalid if there is no There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have actually a mixed feeling about this. Do you have any idea about how to do this withtout overriding the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean mostly just the name of the object. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah. |
||
|
||
return search |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from rest_framework import filters | ||
|
||
from readthedocs.search.utils import get_project_slug_list_or_404 | ||
|
||
|
||
class SearchFilterBackend(filters.BaseFilterBackend): | ||
|
||
"""Filter search result with project""" | ||
|
||
def filter_queryset(self, request, queryset, view): | ||
"""Overwrite the method to compatible with Elasticsearch DSL Search object.""" | ||
# ``queryset`` is actually a Elasticsearch DSL ``Search`` object. | ||
# So change the variable name | ||
es_search = queryset | ||
project_slug = request.query_params.get('project') | ||
version_slug = request.query_params.get('version') | ||
project_slug_list = get_project_slug_list_or_404(project_slug=project_slug, | ||
user=request.user) | ||
# Elasticsearch ``terms`` query can take multiple values as list, | ||
# while ``term`` query takes single value. | ||
filtered_es_search = (es_search.filter('terms', project=project_slug_list) | ||
.filter('term', version=version_slug)) | ||
return filtered_es_search |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from rest_framework.pagination import PageNumberPagination | ||
|
||
|
||
class SearchPagination(PageNumberPagination): | ||
page_size = 10 | ||
page_size_query_param = 'page_size' | ||
max_page_size = 100 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from rest_framework import serializers | ||
|
||
|
||
class PageSearchSerializer(serializers.Serializer): | ||
project = serializers.CharField() | ||
version = serializers.CharField() | ||
title = serializers.CharField() | ||
path = serializers.CharField() | ||
highlight = serializers.SerializerMethodField() | ||
|
||
def get_highlight(self, obj): | ||
highlight = getattr(obj.meta, 'highlight', None) | ||
if highlight: | ||
return highlight.to_dict() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
import pytest | ||
from django.core.urlresolvers import reverse | ||
from django_dynamic_fixture import G | ||
|
||
from readthedocs.builds.models import Version | ||
from readthedocs.projects.models import HTMLFile | ||
from readthedocs.search.tests.utils import get_search_query_from_project_file | ||
|
||
|
||
@pytest.mark.django_db | ||
@pytest.mark.search | ||
class TestDocumentSearch(object): | ||
url = reverse('doc_search') | ||
|
||
@pytest.mark.parametrize('data_type', ['content', 'headers', 'title']) | ||
@pytest.mark.parametrize('page_num', [0, 1]) | ||
def test_search_works(self, api_client, project, data_type, page_num): | ||
query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num, | ||
data_type=data_type) | ||
|
||
version = project.versions.all()[0] | ||
search_params = {'project': project.slug, 'version': version.slug, 'query': query} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
data = resp.data['results'] | ||
assert len(data) == 1 | ||
project_data = data[0] | ||
assert project_data['project'] == project.slug | ||
|
||
# Check highlight return correct object | ||
all_highlights = project_data['highlight'][data_type] | ||
for highlight in all_highlights: | ||
# Make it lower because our search is case insensitive | ||
assert query.lower() in highlight.lower() | ||
|
||
def test_doc_search_filter_by_project(self, api_client): | ||
"""Test Doc search result are filtered according to project""" | ||
|
||
# `Github` word is present both in `kuma` and `pipeline` files | ||
# so search with this phrase but filter through `kuma` project | ||
search_params = {'query': 'GitHub', 'project': 'kuma', 'version': 'latest'} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
data = resp.data['results'] | ||
assert len(data) == 1 | ||
assert data[0]['project'] == 'kuma' | ||
|
||
def test_doc_search_filter_by_version(self, api_client, project): | ||
"""Test Doc search result are filtered according to version""" | ||
query = get_search_query_from_project_file(project_slug=project.slug) | ||
latest_version = project.versions.all()[0] | ||
# Create another version | ||
dummy_version = G(Version, project=project) | ||
# Create HTMLFile same as the latest version | ||
latest_version_files = HTMLFile.objects.all().filter(version=latest_version) | ||
for f in latest_version_files: | ||
f.version = dummy_version | ||
# Make primary key to None, so django will create new object | ||
f.pk = None | ||
f.save() | ||
|
||
search_params = {'query': query, 'project': project.slug, 'version': dummy_version.slug} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
data = resp.data['results'] | ||
assert len(data) == 1 | ||
assert data[0]['project'] == project.slug | ||
|
||
def test_doc_search_pagination(self, api_client, project): | ||
"""Test Doc search result can be paginated""" | ||
latest_version = project.versions.all()[0] | ||
html_file = HTMLFile.objects.filter(version=latest_version)[0] | ||
title = html_file.processed_json['title'] | ||
query = title.split()[0] | ||
|
||
# Create 15 more same html file | ||
for _ in range(15): | ||
# Make primary key to None, so django will create new object | ||
html_file.pk = None | ||
html_file.save() | ||
|
||
search_params = {'query': query, 'project': project.slug, 'version': latest_version.slug} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
# Check the count is 16 (1 existing and 15 new created) | ||
assert resp.data['count'] == 16 | ||
# Check there are next url | ||
assert resp.data['next'] is not None | ||
# There should be only 10 data as the pagination is 10 by default | ||
assert len(resp.data['results']) == 10 | ||
|
||
# Add `page_size` parameter and check the data is paginated accordingly | ||
search_params['page_size'] = 5 | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
assert len(resp.data['results']) == 5 | ||
|
||
def test_doc_search_without_parameters(self, api_client, project): | ||
"""Hitting Document Search endpoint without query parameters should return error""" | ||
resp = api_client.get(self.url) | ||
assert resp.status_code == 400 | ||
# Check error message is there | ||
assert sorted(['query', 'project', 'version']) == sorted(resp.data.keys()) | ||
|
||
def test_doc_search_subprojects(self, api_client, all_projects): | ||
"""Test Document search return results from subprojects also""" | ||
project = all_projects[0] | ||
subproject = all_projects[1] | ||
version = project.versions.all()[0] | ||
# Add another project as subproject of the project | ||
project.add_subproject(subproject) | ||
|
||
# Now search with subproject content but explicitly filter by the parent project | ||
query = get_search_query_from_project_file(project_slug=subproject.slug) | ||
search_params = {'query': query, 'project': project.slug, 'version': version.slug} | ||
resp = api_client.get(self.url, search_params) | ||
assert resp.status_code == 200 | ||
|
||
data = resp.data['results'] | ||
assert len(data) == 1 | ||
assert data[0]['project'] == subproject.slug |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's not remove this yet, until we've deployed a full version to replace it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I dont know, but with this, the
reverse
function act differently.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Without having comment, its returns something.
But it should return
I have no idea why this is happening. maybe a bug in django?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Believe it's a bad regex. It should be
r'^search/$'
-- now it's just catching anything that ends in/api/v2/<anythinghere>search/
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @ericholscher . I did not suspect the regex can be wrong! It worked!