Merge pull request readthedocs#4309 from safwanrahman/search_api

[fix readthedocs#4265] Port Document search API for Elasticsearch 6.x
safwanrahman · Jul 16, 2018 · 1b47227 · 1b47227
2 parents e9bfeee + d30bac3
commit 1b47227
Show file tree

Hide file tree

Showing 12 changed files with 310 additions and 36 deletions.
diff --git a/conftest.py b/conftest.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import pytest
+from rest_framework.test import APIClient
 
 try:
     # TODO: this file is read/executed even when called from ``readthedocsinc``,
@@ -44,3 +45,8 @@ def pytest_configure(config):
 @pytest.fixture(autouse=True)
 def settings_modification(settings):
     settings.CELERY_ALWAYS_EAGER = True
+
+
+@pytest.fixture
+def api_client():
+    return APIClient()
diff --git a/readthedocs/restapi/urls.py b/readthedocs/restapi/urls.py
@@ -48,7 +48,7 @@
     url(r'index_search/',
         search_views.index_search,
         name='index_search'),
-    url(r'search/$', views.search_views.search, name='api_search'),
+    url(r'^search/$', views.search_views.search, name='api_search'),
     url(r'search/project/$',
         search_views.project_search,
         name='api_project_search'),
@@ -85,20 +85,12 @@
         name='api_webhook'),
 ]
 
+
 urlpatterns += function_urls
-urlpatterns += search_urls
 urlpatterns += task_urls
+urlpatterns += search_urls
 urlpatterns += integration_urls
 
-try:
-    from readthedocsext.search.docsearch import DocSearch
-    api_search_urls = [
-        url(r'^docsearch/$', DocSearch.as_view(), name='doc_search'),
-    ]
-    urlpatterns += api_search_urls
-except ImportError:
-    pass
-
 try:
     from readthedocsext.donate.restapi.urls import urlpatterns as sustainability_urls
 

diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
@@ -0,0 +1,57 @@
+from rest_framework import generics
+from rest_framework import exceptions
+from rest_framework.exceptions import ValidationError
+
+from readthedocs.projects.models import Project
+from readthedocs.search.documents import PageDocument
+from readthedocs.search.filters import SearchFilterBackend
+from readthedocs.search.pagination import SearchPagination
+from readthedocs.search.serializers import PageSearchSerializer
+from readthedocs.search.utils import get_project_list_or_404
+
+
+class PageSearchAPIView(generics.ListAPIView):
+    pagination_class = SearchPagination
+    filter_backends = [SearchFilterBackend]
+    serializer_class = PageSearchSerializer
+
+    def get_queryset(self):
+        """
+        Return Elasticsearch DSL Search object instead of Django Queryset.
+
+        Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern.
+        So for searching, its possible to return ``Search`` object instead of queryset.
+        The ``filter_backends`` and ``pagination_class`` is compatible with ``Search``
+        """
+        # Validate all the required params are there
+        self.validate_query_params()
+        query = self.request.query_params.get('query', '')
+        queryset = PageDocument.simple_search(query=query)
+        return queryset
+
+    def validate_query_params(self):
+        required_query_params = {'query', 'project', 'version'}  # python `set` literal is `{}`
+        request_params = set(self.request.query_params.keys())
+        missing_params = required_query_params - request_params
+        if missing_params:
+            errors = {}
+            for param in missing_params:
+                errors[param] = ["This query param is required"]
+
+            raise ValidationError(errors)
+
+    def get_serializer_context(self):
+        context = super(PageSearchAPIView, self).get_serializer_context()
+        context['projects_url'] = self.get_all_projects_url()
+        return context
+
+    def get_all_projects_url(self):
+        version_slug = self.request.query_params.get('version')
+        project_slug = self.request.query_params.get('project')
+        all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user)
+        projects_url = {}
+
+        for project in all_projects:
+            projects_url[project.slug] = project.get_docs_url(version_slug=version_slug)
+
+        return projects_url
diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
@@ -1,5 +1,6 @@
 from django.conf import settings
 from django_elasticsearch_dsl import DocType, Index, fields
+from elasticsearch_dsl.query import SimpleQueryString, Bool
 
 from readthedocs.projects.models import Project, HTMLFile
 from .conf import SEARCH_EXCLUDED_FILE
@@ -60,14 +61,19 @@ class Meta(object):
     content = fields.TextField(attr='processed_json.content')
     path = fields.TextField(attr='processed_json.path')
 
+    # Fields to perform search with weight
+    search_fields = ['title^10', 'headers^5', 'content']
+
     @classmethod
     def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
+        es_query = cls.get_es_query(query=query)
         kwargs = {
             'using': using or cls._doc_type.using,
             'index': index or cls._doc_type.index,
             'doc_types': [cls],
             'model': cls._doc_type.model,
-            'query': query
+            'query': es_query,
+            'fields': cls.search_fields
         }
         filters = {}
 
@@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non
 
         return FileSearch(**kwargs)
 
+    @classmethod
+    def simple_search(cls, query, using=None, index=None):
+        es_search = cls.search(using=using, index=index)
+        es_query = cls.get_es_query(query=query)
+        highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields]
+
+        es_search = es_search.query(es_query).highlight(*highlighted_fields)
+        return es_search
+
+    @classmethod
+    def get_es_query(cls, query):
+        """Return the Elasticsearch query generated from the query string"""
+        all_queries = []
+
+        # Need to search for both 'AND' and 'OR' operations
+        # The score of AND should be higher as it satisfies both OR and AND
+        for operator in ['AND', 'OR']:
+            query_string = SimpleQueryString(query=query, fields=cls.search_fields,
+                                             default_operator=operator)
+            all_queries.append(query_string)
+
+        # Run bool query with should, so it returns result where either of the query matches
+        bool_query = Bool(should=all_queries)
+
+        return bool_query
+
     def get_queryset(self):
         """Overwrite default queryset to filter certain files to index"""
         queryset = super(PageDocument, self).get_queryset()

diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py
@@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch):
     # TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
     # See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916
 
-    def __init__(self, using, index, doc_types, model, **kwargs):
+    def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
         self.using = using
         self.index = index
         self.doc_types = doc_types
         self._model = model
+        if fields:
+            self.fields = fields
         super(RTDFacetedSearch, self).__init__(**kwargs)
 
 
@@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch):
 
 
 class FileSearch(RTDFacetedSearch):
-    fields = ['title^10', 'headers^5', 'content']
     facets = {
         'project': TermsFacet(field='project'),
         'version': TermsFacet(field='version')
     }
 
     def query(self, search, query):
-        """Add query part to ``search``"""
+        """
+        Add query part to ``search``
+
+        Overriding because we pass ES Query object instead of string
+        """
         if query:
-            all_queries = []
-
-            # Need to search for both 'AND' and 'OR' operations
-            # The score of AND should be higher as it comes first
-            for operator in ['AND', 'OR']:
-                query_string = SimpleQueryString(query=query, fields=self.fields,
-                                                 default_operator=operator)
-                all_queries.append(query_string)
-
-            # Run bool query with should, so it returns result where either of the query matches
-            bool_query = Bool(should=all_queries)
-            search = search.query(bool_query)
+            search = search.query(query)
 
         return search
diff --git a/readthedocs/search/filters.py b/readthedocs/search/filters.py
@@ -0,0 +1,20 @@
+from rest_framework import filters
+
+
+class SearchFilterBackend(filters.BaseFilterBackend):
+
+    """Filter search result with project"""
+
+    def filter_queryset(self, request, queryset, view):
+        """Overwrite the method to compatible with Elasticsearch DSL Search object."""
+        # ``queryset`` is actually a Elasticsearch DSL ``Search`` object.
+        # So change the variable name
+        es_search = queryset
+        version_slug = request.query_params.get('version')
+        projects_info = view.get_all_projects_url()
+        project_slug_list = list(projects_info.keys())
+        # Elasticsearch ``terms`` query can take multiple values as list,
+        # while ``term`` query takes single value.
+        filtered_es_search = (es_search.filter('terms', project=project_slug_list)
+                                       .filter('term', version=version_slug))
+        return filtered_es_search
diff --git a/readthedocs/search/pagination.py b/readthedocs/search/pagination.py
@@ -0,0 +1,7 @@
+from rest_framework.pagination import PageNumberPagination
+
+
+class SearchPagination(PageNumberPagination):
+    page_size = 10
+    page_size_query_param = 'page_size'
+    max_page_size = 100
diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py
@@ -0,0 +1,23 @@
+from rest_framework import serializers
+
+from readthedocs.projects.models import Project
+
+
+class PageSearchSerializer(serializers.Serializer):
+    project = serializers.CharField()
+    version = serializers.CharField()
+    title = serializers.CharField()
+    path = serializers.CharField()
+    link = serializers.SerializerMethodField()
+    highlight = serializers.SerializerMethodField()
+
+    def get_link(self, obj):
+        projects_url = self.context.get('projects_url')
+        if projects_url:
+            docs_url = projects_url[obj.project]
+            return docs_url + obj.path
+
+    def get_highlight(self, obj):
+        highlight = getattr(obj.meta, 'highlight', None)
+        if highlight:
+            return highlight.to_dict()
diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
@@ -0,0 +1,129 @@
+import pytest
+from django.core.urlresolvers import reverse
+from django_dynamic_fixture import G
+
+from readthedocs.builds.models import Version
+from readthedocs.projects.models import HTMLFile
+from readthedocs.search.tests.utils import get_search_query_from_project_file
+
+
+@pytest.mark.django_db
+@pytest.mark.search
+class TestDocumentSearch(object):
+    url = reverse('doc_search')
+
+    @pytest.mark.parametrize('data_type', ['content', 'headers', 'title'])
+    @pytest.mark.parametrize('page_num', [0, 1])
+    def test_search_works(self, api_client, project, data_type, page_num):
+        query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num,
+                                                   data_type=data_type)
+
+        version = project.versions.all()[0]
+        search_params = {'project': project.slug, 'version': version.slug, 'query': query}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) == 1
+        project_data = data[0]
+        assert project_data['project'] == project.slug
+
+        # Check highlight return correct object
+        all_highlights = project_data['highlight'][data_type]
+        for highlight in all_highlights:
+            # Make it lower because our search is case insensitive
+            assert query.lower() in highlight.lower()
+
+    def test_doc_search_filter_by_project(self, api_client):
+        """Test Doc search result are filtered according to project"""
+
+        # `Github` word is present both in `kuma` and `pipeline` files
+        # so search with this phrase but filter through `kuma` project
+        search_params = {'query': 'GitHub', 'project': 'kuma', 'version': 'latest'}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) == 1
+        assert data[0]['project'] == 'kuma'
+
+    def test_doc_search_filter_by_version(self, api_client, project):
+        """Test Doc search result are filtered according to version"""
+        query = get_search_query_from_project_file(project_slug=project.slug)
+        latest_version = project.versions.all()[0]
+        # Create another version
+        dummy_version = G(Version, project=project)
+        # Create HTMLFile same as the latest version
+        latest_version_files = HTMLFile.objects.all().filter(version=latest_version)
+        for f in latest_version_files:
+            f.version = dummy_version
+            # Make primary key to None, so django will create new object
+            f.pk = None
+            f.save()
+
+        search_params = {'query': query, 'project': project.slug, 'version': dummy_version.slug}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) == 1
+        assert data[0]['project'] == project.slug
+
+    def test_doc_search_pagination(self, api_client, project):
+        """Test Doc search result can be paginated"""
+        latest_version = project.versions.all()[0]
+        html_file = HTMLFile.objects.filter(version=latest_version)[0]
+        title = html_file.processed_json['title']
+        query = title.split()[0]
+
+        # Create 15 more same html file
+        for _ in range(15):
+            # Make primary key to None, so django will create new object
+            html_file.pk = None
+            html_file.save()
+
+        search_params = {'query': query, 'project': project.slug, 'version': latest_version.slug}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        # Check the count is 16 (1 existing and 15 new created)
+        assert resp.data['count'] == 16
+        # Check there are next url
+        assert resp.data['next'] is not None
+        # There should be only 10 data as the pagination is 10 by default
+        assert len(resp.data['results']) == 10
+
+        # Add `page_size` parameter and check the data is paginated accordingly
+        search_params['page_size'] = 5
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        assert len(resp.data['results']) == 5
+
+    def test_doc_search_without_parameters(self, api_client, project):
+        """Hitting Document Search endpoint without query parameters should return error"""
+        resp = api_client.get(self.url)
+        assert resp.status_code == 400
+        # Check error message is there
+        assert sorted(['query', 'project', 'version']) == sorted(resp.data.keys())
+
+    def test_doc_search_subprojects(self, api_client, all_projects):
+        """Test Document search return results from subprojects also"""
+        project = all_projects[0]
+        subproject = all_projects[1]
+        version = project.versions.all()[0]
+        # Add another project as subproject of the project
+        project.add_subproject(subproject)
+
+        # Now search with subproject content but explicitly filter by the parent project
+        query = get_search_query_from_project_file(project_slug=subproject.slug)
+        search_params = {'query': query, 'project': project.slug, 'version': version.slug}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) == 1
+        assert data[0]['project'] == subproject.slug
+        # Check the link is the subproject document link
+        document_link = subproject.get_docs_url(version_slug=version.slug)
+        assert document_link in data[0]['link']