Skip to content

Commit

Permalink
Merge pull request readthedocs#4309 from safwanrahman/search_api
Browse files Browse the repository at this point in the history
[fix readthedocs#4265] Port Document search API for Elasticsearch 6.x
  • Loading branch information
ericholscher authored and safwanrahman committed Jul 16, 2018
2 parents 38143e3 + 30f0bb7 commit 96681e2
Show file tree
Hide file tree
Showing 12 changed files with 310 additions and 36 deletions.
6 changes: 6 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import pytest
from rest_framework.test import APIClient

try:
# TODO: this file is read/executed even when called from ``readthedocsinc``,
Expand Down Expand Up @@ -44,3 +45,8 @@ def pytest_configure(config):
@pytest.fixture(autouse=True)
def settings_modification(settings):
settings.CELERY_ALWAYS_EAGER = True


@pytest.fixture
def api_client():
return APIClient()
14 changes: 3 additions & 11 deletions readthedocs/restapi/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
url(r'index_search/',
search_views.index_search,
name='index_search'),
url(r'search/$', views.search_views.search, name='api_search'),
url(r'^search/$', views.search_views.search, name='api_search'),
url(r'search/project/$',
search_views.project_search,
name='api_project_search'),
Expand Down Expand Up @@ -85,20 +85,12 @@
name='api_webhook'),
]


urlpatterns += function_urls
urlpatterns += search_urls
urlpatterns += task_urls
urlpatterns += search_urls
urlpatterns += integration_urls

try:
from readthedocsext.search.docsearch import DocSearch
api_search_urls = [
url(r'^docsearch/$', DocSearch.as_view(), name='doc_search'),
]
urlpatterns += api_search_urls
except ImportError:
pass

try:
from readthedocsext.donate.restapi.urls import urlpatterns as sustainability_urls

Expand Down
57 changes: 57 additions & 0 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from rest_framework import generics
from rest_framework import exceptions
from rest_framework.exceptions import ValidationError

from readthedocs.projects.models import Project
from readthedocs.search.documents import PageDocument
from readthedocs.search.filters import SearchFilterBackend
from readthedocs.search.pagination import SearchPagination
from readthedocs.search.serializers import PageSearchSerializer
from readthedocs.search.utils import get_project_list_or_404


class PageSearchAPIView(generics.ListAPIView):
pagination_class = SearchPagination
filter_backends = [SearchFilterBackend]
serializer_class = PageSearchSerializer

def get_queryset(self):
"""
Return Elasticsearch DSL Search object instead of Django Queryset.
Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern.
So for searching, its possible to return ``Search`` object instead of queryset.
The ``filter_backends`` and ``pagination_class`` is compatible with ``Search``
"""
# Validate all the required params are there
self.validate_query_params()
query = self.request.query_params.get('query', '')
queryset = PageDocument.simple_search(query=query)
return queryset

def validate_query_params(self):
required_query_params = {'query', 'project', 'version'} # python `set` literal is `{}`
request_params = set(self.request.query_params.keys())
missing_params = required_query_params - request_params
if missing_params:
errors = {}
for param in missing_params:
errors[param] = ["This query param is required"]

raise ValidationError(errors)

def get_serializer_context(self):
context = super(PageSearchAPIView, self).get_serializer_context()
context['projects_url'] = self.get_all_projects_url()
return context

def get_all_projects_url(self):
version_slug = self.request.query_params.get('version')
project_slug = self.request.query_params.get('project')
all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user)
projects_url = {}

for project in all_projects:
projects_url[project.slug] = project.get_docs_url(version_slug=version_slug)

return projects_url
34 changes: 33 additions & 1 deletion readthedocs/search/documents.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from django.conf import settings
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl.query import SimpleQueryString, Bool

from readthedocs.projects.models import Project, HTMLFile
from .conf import SEARCH_EXCLUDED_FILE
Expand Down Expand Up @@ -60,14 +61,19 @@ class Meta(object):
content = fields.TextField(attr='processed_json.content')
path = fields.TextField(attr='processed_json.path')

# Fields to perform search with weight
search_fields = ['title^10', 'headers^5', 'content']

@classmethod
def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
es_query = cls.get_es_query(query=query)
kwargs = {
'using': using or cls._doc_type.using,
'index': index or cls._doc_type.index,
'doc_types': [cls],
'model': cls._doc_type.model,
'query': query
'query': es_query,
'fields': cls.search_fields
}
filters = {}

Expand All @@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non

return FileSearch(**kwargs)

@classmethod
def simple_search(cls, query, using=None, index=None):
es_search = cls.search(using=using, index=index)
es_query = cls.get_es_query(query=query)
highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields]

es_search = es_search.query(es_query).highlight(*highlighted_fields)
return es_search

@classmethod
def get_es_query(cls, query):
"""Return the Elasticsearch query generated from the query string"""
all_queries = []

# Need to search for both 'AND' and 'OR' operations
# The score of AND should be higher as it satisfies both OR and AND
for operator in ['AND', 'OR']:
query_string = SimpleQueryString(query=query, fields=cls.search_fields,
default_operator=operator)
all_queries.append(query_string)

# Run bool query with should, so it returns result where either of the query matches
bool_query = Bool(should=all_queries)

return bool_query

def get_queryset(self):
"""Overwrite default queryset to filter certain files to index"""
queryset = super(PageDocument, self).get_queryset()
Expand Down
24 changes: 9 additions & 15 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch):
# TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916

def __init__(self, using, index, doc_types, model, **kwargs):
def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
self.using = using
self.index = index
self.doc_types = doc_types
self._model = model
if fields:
self.fields = fields
super(RTDFacetedSearch, self).__init__(**kwargs)


Expand All @@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch):


class FileSearch(RTDFacetedSearch):
fields = ['title^10', 'headers^5', 'content']
facets = {
'project': TermsFacet(field='project'),
'version': TermsFacet(field='version')
}

def query(self, search, query):
"""Add query part to ``search``"""
"""
Add query part to ``search``
Overriding because we pass ES Query object instead of string
"""
if query:
all_queries = []

# Need to search for both 'AND' and 'OR' operations
# The score of AND should be higher as it comes first
for operator in ['AND', 'OR']:
query_string = SimpleQueryString(query=query, fields=self.fields,
default_operator=operator)
all_queries.append(query_string)

# Run bool query with should, so it returns result where either of the query matches
bool_query = Bool(should=all_queries)
search = search.query(bool_query)
search = search.query(query)

return search
20 changes: 20 additions & 0 deletions readthedocs/search/filters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from rest_framework import filters


class SearchFilterBackend(filters.BaseFilterBackend):

"""Filter search result with project"""

def filter_queryset(self, request, queryset, view):
"""Overwrite the method to compatible with Elasticsearch DSL Search object."""
# ``queryset`` is actually a Elasticsearch DSL ``Search`` object.
# So change the variable name
es_search = queryset
version_slug = request.query_params.get('version')
projects_info = view.get_all_projects_url()
project_slug_list = list(projects_info.keys())
# Elasticsearch ``terms`` query can take multiple values as list,
# while ``term`` query takes single value.
filtered_es_search = (es_search.filter('terms', project=project_slug_list)
.filter('term', version=version_slug))
return filtered_es_search
7 changes: 7 additions & 0 deletions readthedocs/search/pagination.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from rest_framework.pagination import PageNumberPagination


class SearchPagination(PageNumberPagination):
page_size = 10
page_size_query_param = 'page_size'
max_page_size = 100
23 changes: 23 additions & 0 deletions readthedocs/search/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from rest_framework import serializers

from readthedocs.projects.models import Project


class PageSearchSerializer(serializers.Serializer):
project = serializers.CharField()
version = serializers.CharField()
title = serializers.CharField()
path = serializers.CharField()
link = serializers.SerializerMethodField()
highlight = serializers.SerializerMethodField()

def get_link(self, obj):
projects_url = self.context.get('projects_url')
if projects_url:
docs_url = projects_url[obj.project]
return docs_url + obj.path

def get_highlight(self, obj):
highlight = getattr(obj.meta, 'highlight', None)
if highlight:
return highlight.to_dict()
129 changes: 129 additions & 0 deletions readthedocs/search/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import pytest
from django.core.urlresolvers import reverse
from django_dynamic_fixture import G

from readthedocs.builds.models import Version
from readthedocs.projects.models import HTMLFile
from readthedocs.search.tests.utils import get_search_query_from_project_file


@pytest.mark.django_db
@pytest.mark.search
class TestDocumentSearch(object):
url = reverse('doc_search')

@pytest.mark.parametrize('data_type', ['content', 'headers', 'title'])
@pytest.mark.parametrize('page_num', [0, 1])
def test_search_works(self, api_client, project, data_type, page_num):
query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num,
data_type=data_type)

version = project.versions.all()[0]
search_params = {'project': project.slug, 'version': version.slug, 'query': query}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) == 1
project_data = data[0]
assert project_data['project'] == project.slug

# Check highlight return correct object
all_highlights = project_data['highlight'][data_type]
for highlight in all_highlights:
# Make it lower because our search is case insensitive
assert query.lower() in highlight.lower()

def test_doc_search_filter_by_project(self, api_client):
"""Test Doc search result are filtered according to project"""

# `Github` word is present both in `kuma` and `pipeline` files
# so search with this phrase but filter through `kuma` project
search_params = {'query': 'GitHub', 'project': 'kuma', 'version': 'latest'}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) == 1
assert data[0]['project'] == 'kuma'

def test_doc_search_filter_by_version(self, api_client, project):
"""Test Doc search result are filtered according to version"""
query = get_search_query_from_project_file(project_slug=project.slug)
latest_version = project.versions.all()[0]
# Create another version
dummy_version = G(Version, project=project)
# Create HTMLFile same as the latest version
latest_version_files = HTMLFile.objects.all().filter(version=latest_version)
for f in latest_version_files:
f.version = dummy_version
# Make primary key to None, so django will create new object
f.pk = None
f.save()

search_params = {'query': query, 'project': project.slug, 'version': dummy_version.slug}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) == 1
assert data[0]['project'] == project.slug

def test_doc_search_pagination(self, api_client, project):
"""Test Doc search result can be paginated"""
latest_version = project.versions.all()[0]
html_file = HTMLFile.objects.filter(version=latest_version)[0]
title = html_file.processed_json['title']
query = title.split()[0]

# Create 15 more same html file
for _ in range(15):
# Make primary key to None, so django will create new object
html_file.pk = None
html_file.save()

search_params = {'query': query, 'project': project.slug, 'version': latest_version.slug}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

# Check the count is 16 (1 existing and 15 new created)
assert resp.data['count'] == 16
# Check there are next url
assert resp.data['next'] is not None
# There should be only 10 data as the pagination is 10 by default
assert len(resp.data['results']) == 10

# Add `page_size` parameter and check the data is paginated accordingly
search_params['page_size'] = 5
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

assert len(resp.data['results']) == 5

def test_doc_search_without_parameters(self, api_client, project):
"""Hitting Document Search endpoint without query parameters should return error"""
resp = api_client.get(self.url)
assert resp.status_code == 400
# Check error message is there
assert sorted(['query', 'project', 'version']) == sorted(resp.data.keys())

def test_doc_search_subprojects(self, api_client, all_projects):
"""Test Document search return results from subprojects also"""
project = all_projects[0]
subproject = all_projects[1]
version = project.versions.all()[0]
# Add another project as subproject of the project
project.add_subproject(subproject)

# Now search with subproject content but explicitly filter by the parent project
query = get_search_query_from_project_file(project_slug=subproject.slug)
search_params = {'query': query, 'project': project.slug, 'version': version.slug}
resp = api_client.get(self.url, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) == 1
assert data[0]['project'] == subproject.slug
# Check the link is the subproject document link
document_link = subproject.get_docs_url(version_slug=version.slug)
assert document_link in data[0]['link']
Loading

0 comments on commit 96681e2

Please sign in to comment.