Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix #4265] Port Document search API for Elasticsearch 6.x #4309

Merged
merged 9 commits into from
Jul 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging

import pytest
from rest_framework.test import APIClient


def pytest_addoption(parser):
Expand All @@ -17,3 +18,8 @@ def pytest_configure(config):
@pytest.fixture(autouse=True)
def settings_modification(settings):
settings.CELERY_ALWAYS_EAGER = True


@pytest.fixture
def api_client():
return APIClient()
14 changes: 3 additions & 11 deletions readthedocs/restapi/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
url(r'index_search/',
search_views.index_search,
name='index_search'),
url(r'search/$', views.search_views.search, name='api_search'),
url(r'^search/$', views.search_views.search, name='api_search'),
url(r'search/project/$',
search_views.project_search,
name='api_project_search'),
Expand Down Expand Up @@ -85,20 +85,12 @@
name='api_webhook'),
]


urlpatterns += function_urls
urlpatterns += search_urls
urlpatterns += task_urls
urlpatterns += search_urls
urlpatterns += integration_urls

try:
from readthedocsext.search.docsearch import DocSearch
api_search_urls = [
url(r'^docsearch/$', DocSearch.as_view(), name='doc_search'),
]
urlpatterns += api_search_urls
except ImportError:
pass

try:
from readthedocsext.donate.restapi.urls import urlpatterns as sustainability_urls

Expand Down
57 changes: 57 additions & 0 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from rest_framework import generics
from rest_framework import exceptions
from rest_framework.exceptions import ValidationError

from readthedocs.projects.models import Project
from readthedocs.search.documents import PageDocument
from readthedocs.search.filters import SearchFilterBackend
from readthedocs.search.pagination import SearchPagination
from readthedocs.search.serializers import PageSearchSerializer
from readthedocs.search.utils import get_project_list_or_404


class PageSearchAPIView(generics.ListAPIView):
pagination_class = SearchPagination
filter_backends = [SearchFilterBackend]
serializer_class = PageSearchSerializer

def get_queryset(self):
"""
Return Elasticsearch DSL Search object instead of Django Queryset.
Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern.
So for searching, its possible to return ``Search`` object instead of queryset.
The ``filter_backends`` and ``pagination_class`` is compatible with ``Search``
"""
# Validate all the required params are there
self.validate_query_params()
query = self.request.query_params.get('query', '')
queryset = PageDocument.simple_search(query=query)
return queryset

def validate_query_params(self):
required_query_params = {'query', 'project', 'version'} # python `set` literal is `{}`
request_params = set(self.request.query_params.keys())
missing_params = required_query_params - request_params
if missing_params:
errors = {}
for param in missing_params:
errors[param] = ["This query param is required"]

raise ValidationError(errors)

def get_serializer_context(self):
context = super(PageSearchAPIView, self).get_serializer_context()
context['projects_url'] = self.get_all_projects_url()
return context

def get_all_projects_url(self):
version_slug = self.request.query_params.get('version')
project_slug = self.request.query_params.get('project')
all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user)
projects_url = {}

for project in all_projects:
projects_url[project.slug] = project.get_docs_url(version_slug=version_slug)

return projects_url
34 changes: 33 additions & 1 deletion readthedocs/search/documents.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from django.conf import settings
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl.query import SimpleQueryString, Bool

from readthedocs.projects.models import Project, HTMLFile
from .conf import SEARCH_EXCLUDED_FILE
Expand Down Expand Up @@ -60,14 +61,19 @@ class Meta(object):
content = fields.TextField(attr='processed_json.content')
path = fields.TextField(attr='processed_json.path')

# Fields to perform search with weight
search_fields = ['title^10', 'headers^5', 'content']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this used by the DocType class directly, or are we storing it here to add later? If the prior, we should probably set it in an __init__ method or somewhere else, so it doesn't get confused with the other data here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its not actually used by DocType class, but we are using it in class method. I dont know if keeping it in __init__ method will make it work in class method!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hrm, good point. I guess it's fine for now then.


@classmethod
def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
es_query = cls.get_es_query(query=query)
kwargs = {
'using': using or cls._doc_type.using,
'index': index or cls._doc_type.index,
'doc_types': [cls],
'model': cls._doc_type.model,
'query': query
'query': es_query,
'fields': cls.search_fields
}
filters = {}

Expand All @@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non

return FileSearch(**kwargs)

@classmethod
def simple_search(cls, query, using=None, index=None):
es_search = cls.search(using=using, index=index)
es_query = cls.get_es_query(query=query)
highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields]

es_search = es_search.query(es_query).highlight(*highlighted_fields)
return es_search

@classmethod
def get_es_query(cls, query):
"""Return the Elasticsearch query generated from the query string"""
all_queries = []

# Need to search for both 'AND' and 'OR' operations
# The score of AND should be higher as it satisfies both OR and AND
for operator in ['AND', 'OR']:
query_string = SimpleQueryString(query=query, fields=cls.search_fields,
default_operator=operator)
all_queries.append(query_string)

# Run bool query with should, so it returns result where either of the query matches
bool_query = Bool(should=all_queries)

return bool_query

def get_queryset(self):
"""Overwrite default queryset to filter certain files to index"""
queryset = super(PageDocument, self).get_queryset()
Expand Down
24 changes: 9 additions & 15 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch):
# TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916

def __init__(self, using, index, doc_types, model, **kwargs):
def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
self.using = using
self.index = index
self.doc_types = doc_types
self._model = model
if fields:
self.fields = fields
super(RTDFacetedSearch, self).__init__(**kwargs)


Expand All @@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch):


class FileSearch(RTDFacetedSearch):
fields = ['title^10', 'headers^5', 'content']
facets = {
'project': TermsFacet(field='project'),
'version': TermsFacet(field='version')
}

def query(self, search, query):
"""Add query part to ``search``"""
"""
Add query part to ``search``
Overriding because we pass ES Query object instead of string
"""
if query:
all_queries = []

# Need to search for both 'AND' and 'OR' operations
# The score of AND should be higher as it comes first
for operator in ['AND', 'OR']:
query_string = SimpleQueryString(query=query, fields=self.fields,
default_operator=operator)
all_queries.append(query_string)

# Run bool query with should, so it returns result where either of the query matches
bool_query = Bool(should=all_queries)
search = search.query(bool_query)
search = search.query(query)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like we shouldn't be overwriting the object here. Will we return something invalid if there is no query because we have the same object name?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have actually a mixed feeling about this. Do you have any idea about how to do this withtout overriding the search method?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean mostly just the name of the object. search = search.query(query) -- is that similar to a queryset = queryset.filter()? It just feels like a weird bit of syntax to rewrite the name of the object over again.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. search = search.query(query) -- is that similar to a queryset = queryset.filter(). If we do not overwrite the same object, then another variable need to be assigned and check if that is not None


return search
20 changes: 20 additions & 0 deletions readthedocs/search/filters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from rest_framework import filters


class SearchFilterBackend(filters.BaseFilterBackend):

"""Filter search result with project"""

def filter_queryset(self, request, queryset, view):
"""Overwrite the method to compatible with Elasticsearch DSL Search object."""
# ``queryset`` is actually a Elasticsearch DSL ``Search`` object.
# So change the variable name
es_search = queryset
version_slug = request.query_params.get('version')
projects_info = view.get_all_projects_url()
project_slug_list = list(projects_info.keys())
# Elasticsearch ``terms`` query can take multiple values as list,
# while ``term`` query takes single value.
filtered_es_search = (es_search.filter('terms', project=project_slug_list)
.filter('term', version=version_slug))
return filtered_es_search
7 changes: 7 additions & 0 deletions readthedocs/search/pagination.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from rest_framework.pagination import PageNumberPagination


class SearchPagination(PageNumberPagination):
page_size = 10
page_size_query_param = 'page_size'
max_page_size = 100
23 changes: 23 additions & 0 deletions readthedocs/search/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from rest_framework import serializers

from readthedocs.projects.models import Project


class PageSearchSerializer(serializers.Serializer):
project = serializers.CharField()
version = serializers.CharField()
title = serializers.CharField()
path = serializers.CharField()
link = serializers.SerializerMethodField()
highlight = serializers.SerializerMethodField()

def get_link(self, obj):
projects_url = self.context.get('projects_url')
if projects_url:
docs_url = projects_url[obj.project]
return docs_url + obj.path

def get_highlight(self, obj):
highlight = getattr(obj.meta, 'highlight', None)
if highlight:
return highlight.to_dict()
129 changes: 129 additions & 0 deletions readthedocs/search/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import pytest
from django.core.urlresolvers import reverse
from django_dynamic_fixture import G

from readthedocs.builds.models import Version
from readthedocs.projects.models import HTMLFile
from readthedocs.search.tests.utils import get_search_query_from_project_file


@pytest.mark.django_db
@pytest.mark.search
class TestDocumentSearch(object):
url = reverse('doc_search')

@pytest.mark.parametrize('data_type', ['content', 'headers', 'title'])
@pytest.mark.parametrize('page_num', [0, 1])
def test_search_works(self, api_client, project, data_type, page_num):
query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num,
data_type=data_type)

version = project.versions.all()[0]
search_params = {'project': project.slug, 'version': version.slug, 'query': query}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) == 1
project_data = data[0]
assert project_data['project'] == project.slug

# Check highlight return correct object
all_highlights = project_data['highlight'][data_type]
for highlight in all_highlights:
# Make it lower because our search is case insensitive
assert query.lower() in highlight.lower()

def test_doc_search_filter_by_project(self, api_client):
"""Test Doc search result are filtered according to project"""

# `Github` word is present both in `kuma` and `pipeline` files
# so search with this phrase but filter through `kuma` project
search_params = {'query': 'GitHub', 'project': 'kuma', 'version': 'latest'}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) == 1
assert data[0]['project'] == 'kuma'

def test_doc_search_filter_by_version(self, api_client, project):
"""Test Doc search result are filtered according to version"""
query = get_search_query_from_project_file(project_slug=project.slug)
latest_version = project.versions.all()[0]
# Create another version
dummy_version = G(Version, project=project)
# Create HTMLFile same as the latest version
latest_version_files = HTMLFile.objects.all().filter(version=latest_version)
for f in latest_version_files:
f.version = dummy_version
# Make primary key to None, so django will create new object
f.pk = None
f.save()

search_params = {'query': query, 'project': project.slug, 'version': dummy_version.slug}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) == 1
assert data[0]['project'] == project.slug

def test_doc_search_pagination(self, api_client, project):
"""Test Doc search result can be paginated"""
latest_version = project.versions.all()[0]
html_file = HTMLFile.objects.filter(version=latest_version)[0]
title = html_file.processed_json['title']
query = title.split()[0]

# Create 15 more same html file
for _ in range(15):
# Make primary key to None, so django will create new object
html_file.pk = None
html_file.save()

search_params = {'query': query, 'project': project.slug, 'version': latest_version.slug}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

# Check the count is 16 (1 existing and 15 new created)
assert resp.data['count'] == 16
# Check there are next url
assert resp.data['next'] is not None
# There should be only 10 data as the pagination is 10 by default
assert len(resp.data['results']) == 10

# Add `page_size` parameter and check the data is paginated accordingly
search_params['page_size'] = 5
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

assert len(resp.data['results']) == 5

def test_doc_search_without_parameters(self, api_client, project):
"""Hitting Document Search endpoint without query parameters should return error"""
resp = api_client.get(self.url)
assert resp.status_code == 400
# Check error message is there
assert sorted(['query', 'project', 'version']) == sorted(resp.data.keys())

def test_doc_search_subprojects(self, api_client, all_projects):
"""Test Document search return results from subprojects also"""
project = all_projects[0]
subproject = all_projects[1]
version = project.versions.all()[0]
# Add another project as subproject of the project
project.add_subproject(subproject)

# Now search with subproject content but explicitly filter by the parent project
query = get_search_query_from_project_file(project_slug=subproject.slug)
search_params = {'query': query, 'project': project.slug, 'version': version.slug}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) == 1
assert data[0]['project'] == subproject.slug
# Check the link is the subproject document link
document_link = subproject.get_docs_url(version_slug=version.slug)
assert document_link in data[0]['link']
Loading