diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index c36ba7c37cf..ad26a15244c 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -78,6 +78,8 @@ MEDIA_TYPES, PRIVACY_CHOICES, SPHINX, + SPHINX_HTMLDIR, + SPHINX_SINGLEHTML, ) from readthedocs.projects.models import APIProject, Project from readthedocs.projects.version_handling import determine_stable_version @@ -361,6 +363,10 @@ def supports_wipe(self): """Return True if version is not external.""" return not self.type == EXTERNAL + @property + def is_sphinx_type(self): + return self.documentation_type in {SPHINX, SPHINX_HTMLDIR, SPHINX_SINGLEHTML} + def get_subdomain_url(self): external = self.type == EXTERNAL return self.project.get_docs_url( diff --git a/readthedocs/doc_builder/backends/mkdocs.py b/readthedocs/doc_builder/backends/mkdocs.py index 2b0842829d0..2239eb75c15 100644 --- a/readthedocs/doc_builder/backends/mkdocs.py +++ b/readthedocs/doc_builder/backends/mkdocs.py @@ -11,10 +11,10 @@ import yaml from django.conf import settings from django.template import loader as template_loader -from readthedocs.projects.constants import MKDOCS_HTML, MKDOCS from readthedocs.doc_builder.base import BaseBuilder from readthedocs.doc_builder.exceptions import MkDocsYAMLParseError +from readthedocs.projects.constants import MKDOCS, MKDOCS_HTML from readthedocs.projects.models import Feature @@ -314,17 +314,12 @@ def get_theme_name(self, mkdocs_config): class MkdocsHTML(BaseMkdocs): + type = 'mkdocs' builder = 'build' build_dir = '_build/html' -class MkdocsJSON(BaseMkdocs): - type = 'mkdocs_json' - builder = 'json' - build_dir = '_build/json' - - class SafeLoaderIgnoreUnknown(yaml.SafeLoader): # pylint: disable=too-many-ancestors """ diff --git a/readthedocs/doc_builder/loader.py b/readthedocs/doc_builder/loader.py index 1f8d8256567..bf826fd17c3 100644 --- a/readthedocs/doc_builder/loader.py +++ b/readthedocs/doc_builder/loader.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- 
- """Lookup tables for builders and backends.""" from importlib import import_module @@ -21,7 +19,6 @@ 'sphinx_singlehtmllocalmedia': sphinx.LocalMediaBuilder, # Other markup 'mkdocs': mkdocs.MkdocsHTML, - 'mkdocs_json': mkdocs.MkdocsJSON, } diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 65af1bc7688..b31d76f69de 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -39,7 +39,7 @@ validate_repository_url, ) from readthedocs.projects.version_handling import determine_stable_version -from readthedocs.search.parse_json import process_file +from readthedocs.search.parse_json import process_file, process_mkdocs_index_file from readthedocs.vcs_support.backends import backend_cls from readthedocs.vcs_support.utils import Lock, NonBlockingLock @@ -1329,7 +1329,7 @@ class Meta: objects = HTMLFileManager.from_queryset(HTMLFileQuerySet)() - def get_processed_json(self): + def get_processed_json_sphinx(self): """ Get the parsed JSON for search indexing. @@ -1373,6 +1373,52 @@ def get_processed_json(self): 'domain_data': {}, } + def get_processed_json_mkdocs(self): + log.debug('Processing mkdocs index') + storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)() + storage_path = self.project.get_storage_path( + type_='html', version_slug=self.version.slug, include_file=False + ) + try: + file_path = storage.join(storage_path, 'search/search_index.json') + if storage.exists(file_path): + index_data = process_mkdocs_index_file(file_path, page=self.path) + if index_data: + return index_data + except Exception: + log.warning( + 'Unhandled exception during search processing file: %s', + file_path, + ) + return { + 'path': self.path, + 'title': '', + 'sections': [], + 'domain_data': {}, + } + + def get_processed_json(self): + """ + Get the parsed JSON for search indexing. + + Returns a dictionary with the following structure. 
+ { + 'path': 'file path', + 'title': 'Title', + 'sections': [ + { + 'id': 'section-anchor', + 'title': 'Section title', + 'content': 'Section content', + }, + ], + 'domain_data': {}, + } + """ + if self.version.is_sphinx_type: + return self.get_processed_json_sphinx() + return self.get_processed_json_mkdocs() + @cached_property def processed_json(self): return self.get_processed_json() diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 98025185c5a..39afb6d7711 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -1234,12 +1234,14 @@ def get_final_doctype(self): return html_builder.get_final_doctype() def build_docs_search(self): - """Build search data.""" - # Search is always run in sphinx using the rtd-sphinx-extension. - # Mkdocs has no search currently. - if self.is_type_sphinx() and self.version.type != EXTERNAL: - return True - return False + """ + Build search data. + + .. note:: + For MkDocs, search is indexed from its ``html`` artifacts. + In Sphinx, it is run using the rtd-sphinx-extension. 
+ """ + return self.is_type_sphinx() and self.version.type != EXTERNAL def build_docs_localmedia(self): """Get local media files with separate build.""" @@ -1593,6 +1595,9 @@ def _create_intersphinx_data(version, commit, build): :param commit: Commit that updated path :param build: Build id """ + if not version.is_sphinx_type: + return + storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)() html_storage_path = version.project.get_storage_path( diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py index 9ea3b704f65..68dd80ee8d3 100644 --- a/readthedocs/search/documents.py +++ b/readthedocs/search/documents.py @@ -109,8 +109,10 @@ class Meta: def prepare_domains(self, html_file): """Prepares and returns the values for domains field.""" - all_domains = [] + if not html_file.version.is_sphinx_type: + return [] + all_domains = [] try: domains_qs = html_file.sphinx_domains.exclude( domain='std', @@ -172,11 +174,8 @@ def get_queryset(self): """Overwrite default queryset to filter certain files to index.""" queryset = super().get_queryset() - # Do not index files that belong to non sphinx project - # Also do not index certain files - queryset = queryset.internal().filter( - project__documentation_type__contains='sphinx' - ) + # Do not index files from external versions + queryset = queryset.internal().all() # TODO: Make this smarter # This was causing issues excluding some valid user documentation pages diff --git a/readthedocs/search/parse_json.py b/readthedocs/search/parse_json.py index 6bc5659a581..e703321e89f 100644 --- a/readthedocs/search/parse_json.py +++ b/readthedocs/search/parse_json.py @@ -1,6 +1,7 @@ """Functions related to converting content into dict/JSON structures.""" import logging +from urllib.parse import urlparse import orjson as json from django.conf import settings @@ -195,5 +196,56 @@ def parse_content(content, remove_first_line=False): content = content[1:] # converting newlines to ". 
" - content = ' '.join([text.strip() for text in content if text]) + content = ' '.join(text.strip() for text in content if text) return content + + +def process_mkdocs_index_file(json_storage_path, page): + """Reads the json index file and parses it into a structured dict.""" + log.debug('Processing JSON index file: %s', json_storage_path) + + storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)() + try: + with storage.open(json_storage_path, mode='r') as f: + file_contents = f.read() + except IOError: + log.info('Unable to read file: %s', json_storage_path) + raise + + data = json.loads(file_contents) + page_data = {} + + for section in data.get('docs', []): + parsed_path = urlparse(section.get('location', '')) + fragment = parsed_path.fragment + path = parsed_path.path + + # Some old versions of mkdocs + # index the pages as ``/page.html`` instead of ``page.html``. + path = path.lstrip('/') + + if path == '' or path.endswith('/'): + path += 'index.html' + + if page != path: + continue + + title = HTMLParser(section.get('title')).text() + content = parse_content( + HTMLParser(section.get('text')).text() + ) + + if not fragment: + page_data.update({ + 'path': path, + 'title': title, + 'domain_data': {}, + }) + else: + page_data.setdefault('sections', []).append({ + 'id': fragment, + 'title': title, + 'content': content, + }) + + return page_data diff --git a/readthedocs/search/tests/conftest.py b/readthedocs/search/tests/conftest.py index 450959033c9..1705118b380 100644 --- a/readthedocs/search/tests/conftest.py +++ b/readthedocs/search/tests/conftest.py @@ -14,7 +14,7 @@ from .dummy_data import ALL_PROJECTS, PROJECT_DATA_FILES -@pytest.fixture() +@pytest.fixture def es_index(): call_command('search_index', '--delete', '-f') call_command('search_index', '--create') @@ -23,7 +23,7 @@ def es_index(): call_command('search_index', '--delete', '-f') -@pytest.fixture(autouse=True) +@pytest.fixture def all_projects(es_index, mock_processed_json, db, settings): 
settings.ELASTICSEARCH_DSL_AUTOSYNC = True projects_list = [] @@ -95,7 +95,7 @@ def get_dummy_processed_json(instance): return json.load(f) -@pytest.fixture(autouse=True) +@pytest.fixture def mock_processed_json(mocker): mocked_function = mocker.patch.object(HTMLFile, 'get_processed_json', autospec=True) mocked_function.side_effect = get_dummy_processed_json diff --git a/readthedocs/search/tests/data/mkdocs/in/search_index.json b/readthedocs/search/tests/data/mkdocs/in/search_index.json new file mode 100644 index 00000000000..3d1988872ea --- /dev/null +++ b/readthedocs/search/tests/data/mkdocs/in/search_index.json @@ -0,0 +1,31 @@ +{ + "config": { + "lang": [ + "en" + ], + "prebuild_index": false, + "separator": "[\\s\\-]+" + }, + "docs": [ + { + "location": "", + "text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.", + "title": "Read the Docs MkDocs Test Project" + }, + { + "location": "#read-the-docs-mkdocs-test-project", + "text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.", + "title": "Read the Docs MkDocs Test Project" + }, + { + "location": "versions/", + "text": "Versions & Themes There are a number of versions and themes for mkdocs.", + "title": "Versions & Themes" + }, + { + "location": "versions/#versions-themes", + "text": "Versions & Themes There are a number of versions and themes for mkdocs.", + "title": "Versions & Themes" + } + ] +} diff --git a/readthedocs/search/tests/data/mkdocs/in/search_index_old.json b/readthedocs/search/tests/data/mkdocs/in/search_index_old.json new file mode 100644 index 00000000000..29a3b63811b --- /dev/null +++ b/readthedocs/search/tests/data/mkdocs/in/search_index_old.json @@ -0,0 +1,24 @@ +{ + "docs": [ + { + "location": "/", + "text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.", + "title": "Read the Docs MkDocs Test Project" + }, + { + "location": 
"/#read-the-docs-mkdocs-test-project", + "text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.", + "title": "Read the Docs MkDocs Test Project" + }, + { + "location": "/versions/", + "text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.", + "title": "Versions & Themes" + }, + { + "location": "/versions/#versions-themes", + "text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.", + "title": "Versions & Themes" + } + ] +} diff --git a/readthedocs/search/tests/data/mkdocs/out/search_index.json b/readthedocs/search/tests/data/mkdocs/out/search_index.json new file mode 100644 index 00000000000..a9621131697 --- /dev/null +++ b/readthedocs/search/tests/data/mkdocs/out/search_index.json @@ -0,0 +1,26 @@ +[ + { + "title": "Read the Docs MkDocs Test Project", + "path": "index.html", + "sections": [ + { + "id": "read-the-docs-mkdocs-test-project", + "title": "Read the Docs MkDocs Test Project", + "content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs." + } + ], + "domain_data": {} + }, + { + "title": "Versions & Themes", + "path": "versions/index.html", + "sections": [ + { + "id": "versions-themes", + "title": "Versions & Themes", + "content": "Versions & Themes There are a number of versions and themes for mkdocs." 
+ } + ], + "domain_data": {} + } +] diff --git a/readthedocs/search/tests/data/mkdocs/out/search_index_old.json b/readthedocs/search/tests/data/mkdocs/out/search_index_old.json new file mode 100644 index 00000000000..16483016b42 --- /dev/null +++ b/readthedocs/search/tests/data/mkdocs/out/search_index_old.json @@ -0,0 +1,26 @@ +[ + { + "title": "Read the Docs MkDocs Test Project", + "path": "index.html", + "sections": [ + { + "id": "read-the-docs-mkdocs-test-project", + "title": "Read the Docs MkDocs Test Project", + "content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs ." + } + ], + "domain_data": {} + }, + { + "title": "Versions & Themes", + "path": "versions/index.html", + "sections": [ + { + "id": "versions-themes", + "title": "Versions & Themes", + "content": "Versions & Themes There are a number of versions and themes for mkdocs." + } + ], + "domain_data": {} + } +] diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py index aea84e5c782..a258545b29c 100644 --- a/readthedocs/search/tests/test_api.py +++ b/readthedocs/search/tests/test_api.py @@ -19,6 +19,7 @@ @pytest.mark.django_db @pytest.mark.search +@pytest.mark.usefixtures("all_projects") class BaseTestDocumentSearch: def setup_method(self, method): diff --git a/readthedocs/search/tests/test_parse_json.py b/readthedocs/search/tests/test_parse_json.py new file mode 100644 index 00000000000..704be9fa1e7 --- /dev/null +++ b/readthedocs/search/tests/test_parse_json.py @@ -0,0 +1,92 @@ +import json +from contextlib import contextmanager +from pathlib import Path +from unittest import mock + +import pytest +from django_dynamic_fixture import get + +from readthedocs.builds.storage import BuildMediaFileSystemStorage +from readthedocs.projects.constants import MKDOCS +from readthedocs.projects.models import HTMLFile, Project + +data_path = Path(__file__).parent.resolve() / 'data' + + +@pytest.mark.django_db +@pytest.mark.search 
+class TestParseJSON: + + def setup_method(self): + self.project = get( + Project, + slug='test', + main_language_project=None, + ) + self.version = self.project.versions.first() + + def _mock_open(self, content): + @contextmanager + def f(*args, **kwargs): + read_mock = mock.MagicMock() + read_mock.read.return_value = content + yield read_mock + return f + + @mock.patch.object(BuildMediaFileSystemStorage, 'exists') + @mock.patch.object(BuildMediaFileSystemStorage, 'open') + def test_mkdocs(self, storage_open, storage_exists): + json_file = data_path / 'mkdocs/in/search_index.json' + storage_open.side_effect = self._mock_open( + json_file.open().read() + ) + storage_exists.return_value = True + + self.version.documentation_type = MKDOCS + self.version.save() + + index_file = get( + HTMLFile, + project=self.project, + version=self.version, + path='index.html', + ) + versions_file = get( + HTMLFile, + project=self.project, + version=self.version, + path='versions/index.html', + ) + + parsed_json = [index_file.processed_json, versions_file.processed_json] + expected_json = json.load(open(data_path / 'mkdocs/out/search_index.json')) + assert parsed_json == expected_json + + @mock.patch.object(BuildMediaFileSystemStorage, 'exists') + @mock.patch.object(BuildMediaFileSystemStorage, 'open') + def test_mkdocs_old_version(self, storage_open, storage_exists): + json_file = data_path / 'mkdocs/in/search_index_old.json' + storage_open.side_effect = self._mock_open( + json_file.open().read() + ) + storage_exists.return_value = True + + self.version.documentation_type = MKDOCS + self.version.save() + + index_file = get( + HTMLFile, + project=self.project, + version=self.version, + path='index.html', + ) + versions_file = get( + HTMLFile, + project=self.project, + version=self.version, + path='versions/index.html', + ) + + parsed_json = [index_file.processed_json, versions_file.processed_json] + expected_json = json.load(open(data_path / 'mkdocs/out/search_index_old.json')) + 
assert parsed_json == expected_json diff --git a/readthedocs/search/tests/test_search_tasks.py b/readthedocs/search/tests/test_search_tasks.py index 75c01e35eeb..92501dfd367 100644 --- a/readthedocs/search/tests/test_search_tasks.py +++ b/readthedocs/search/tests/test_search_tasks.py @@ -14,6 +14,7 @@ @pytest.mark.django_db @pytest.mark.search +@pytest.mark.usefixtures("all_projects") class TestSearchTasks: @classmethod diff --git a/readthedocs/search/tests/test_views.py b/readthedocs/search/tests/test_views.py index 8ae7d9e4631..648f029e960 100644 --- a/readthedocs/search/tests/test_views.py +++ b/readthedocs/search/tests/test_views.py @@ -86,6 +86,7 @@ def test_search_project_filter_language(self, client, project): @pytest.mark.django_db @pytest.mark.search +@pytest.mark.usefixtures("all_projects") class TestPageSearch: @pytest.fixture(autouse=True) diff --git a/readthedocs/search/tests/utils.py b/readthedocs/search/tests/utils.py index d3cf1530996..8a82a42ccc5 100644 --- a/readthedocs/search/tests/utils.py +++ b/readthedocs/search/tests/utils.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - import random from readthedocs.projects.models import HTMLFile