Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for Mkdocs search #6937

Merged
merged 18 commits into from
Apr 29, 2020
Merged
6 changes: 6 additions & 0 deletions readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@
MEDIA_TYPES,
PRIVACY_CHOICES,
SPHINX,
SPHINX_HTMLDIR,
SPHINX_SINGLEHTML,
)
from readthedocs.projects.models import APIProject, Project
from readthedocs.projects.version_handling import determine_stable_version
Expand Down Expand Up @@ -361,6 +363,10 @@ def supports_wipe(self):
"""Return True if version is not external."""
return not self.type == EXTERNAL

@property
def is_sphinx_type(self):
    """Return True if ``documentation_type`` is one of the Sphinx types."""
    sphinx_types = {SPHINX, SPHINX_HTMLDIR, SPHINX_SINGLEHTML}
    return self.documentation_type in sphinx_types

def get_subdomain_url(self):
external = self.type == EXTERNAL
return self.project.get_docs_url(
Expand Down
9 changes: 2 additions & 7 deletions readthedocs/doc_builder/backends/mkdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
import yaml
from django.conf import settings
from django.template import loader as template_loader
from readthedocs.projects.constants import MKDOCS_HTML, MKDOCS

from readthedocs.doc_builder.base import BaseBuilder
from readthedocs.doc_builder.exceptions import MkDocsYAMLParseError
from readthedocs.projects.constants import MKDOCS, MKDOCS_HTML
from readthedocs.projects.models import Feature


Expand Down Expand Up @@ -314,17 +314,12 @@ def get_theme_name(self, mkdocs_config):


# BaseMkdocs subclass declaring type 'mkdocs', builder 'build'
# and build directory '_build/html'.
class MkdocsHTML(BaseMkdocs):

type = 'mkdocs'
builder = 'build'
build_dir = '_build/html'


# BaseMkdocs subclass declaring type 'mkdocs_json', builder 'json'
# and build directory '_build/json'.
class MkdocsJSON(BaseMkdocs):
type = 'mkdocs_json'
builder = 'json'
build_dir = '_build/json'


class SafeLoaderIgnoreUnknown(yaml.SafeLoader): # pylint: disable=too-many-ancestors

"""
Expand Down
3 changes: 0 additions & 3 deletions readthedocs/doc_builder/loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

"""Lookup tables for builders and backends."""
from importlib import import_module

Expand All @@ -21,7 +19,6 @@
'sphinx_singlehtmllocalmedia': sphinx.LocalMediaBuilder,
# Other markup
'mkdocs': mkdocs.MkdocsHTML,
'mkdocs_json': mkdocs.MkdocsJSON,
}


Expand Down
50 changes: 48 additions & 2 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
validate_repository_url,
)
from readthedocs.projects.version_handling import determine_stable_version
from readthedocs.search.parse_json import process_file
from readthedocs.search.parse_json import process_file, process_mkdocs_index_file
from readthedocs.vcs_support.backends import backend_cls
from readthedocs.vcs_support.utils import Lock, NonBlockingLock

Expand Down Expand Up @@ -1329,7 +1329,7 @@ class Meta:

objects = HTMLFileManager.from_queryset(HTMLFileQuerySet)()

def get_processed_json(self):
def get_processed_json_sphinx(self):
"""
Get the parsed JSON for search indexing.

Expand Down Expand Up @@ -1373,6 +1373,52 @@ def get_processed_json(self):
'domain_data': {},
}

def get_processed_json_mkdocs(self):
    """
    Get the parsed JSON for search indexing from the MkDocs search index.

    Looks for ``search/search_index.json`` under the ``html`` storage path
    of this file's version and extracts the entry for ``self.path``.

    :returns: the data produced by ``process_mkdocs_index_file`` when it
        is non-empty; otherwise a dictionary with ``path`` set to
        ``self.path`` and empty ``title``, ``sections`` and ``domain_data``
        (index file missing, page not in the index, or processing failed).
    """
    log.debug('Processing mkdocs index')
    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
    storage_path = self.project.get_storage_path(
        type_='html', version_slug=self.version.slug, include_file=False
    )
    # Built before the ``try`` so the name is always bound
    # when the exception handler below logs it.
    file_path = storage.join(storage_path, 'search/search_index.json')
    try:
        if storage.exists(file_path):
            index_data = process_mkdocs_index_file(file_path, page=self.path)
            if index_data:
                return index_data
    except Exception:
        # Best effort: indexing must not fail because of one bad index file,
        # but keep the traceback so the failure can be diagnosed.
        log.warning(
            'Unhandled exception during search processing file: %s',
            file_path,
            exc_info=True,
        )
    return {
        'path': self.path,
        'title': '',
        'sections': [],
        'domain_data': {},
    }

def get_processed_json(self):
    """
    Return the parsed JSON used for search indexing.

    Dispatches on ``self.version.is_sphinx_type``: Sphinx versions use
    ``get_processed_json_sphinx``, everything else uses
    ``get_processed_json_mkdocs``.

    The result is a dictionary with the following structure.
    {
        'path': 'file path',
        'title': 'Title',
        'sections': [
            {
                'id': 'section-anchor',
                'title': 'Section title',
                'content': 'Section content',
            },
        ],
        'domain_data': {},
    }
    """
    version = self.version
    if not version.is_sphinx_type:
        return self.get_processed_json_mkdocs()
    return self.get_processed_json_sphinx()

@cached_property
def processed_json(self):
    """Search-indexing data for this file, as built by ``get_processed_json``."""
    payload = self.get_processed_json()
    return payload
Expand Down
17 changes: 11 additions & 6 deletions readthedocs/projects/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,12 +1234,14 @@ def get_final_doctype(self):
return html_builder.get_final_doctype()

def build_docs_search(self):
    """
    Build search data.

    Returns a truthy value only for Sphinx projects whose version
    is not an external one.

    .. note::
       For MkDocs, search is indexed from its ``html`` artifacts.
       For Sphinx, it is run using the rtd-sphinx-extension.
    """
    return self.is_type_sphinx() and self.version.type != EXTERNAL

def build_docs_localmedia(self):
"""Get local media files with separate build."""
Expand Down Expand Up @@ -1593,6 +1595,9 @@ def _create_intersphinx_data(version, commit, build):
:param commit: Commit that updated path
:param build: Build id
"""
if not version.is_sphinx_type:
return

storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

html_storage_path = version.project.get_storage_path(
Expand Down
11 changes: 5 additions & 6 deletions readthedocs/search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,10 @@ class Meta:

def prepare_domains(self, html_file):
"""Prepares and returns the values for domains field."""
all_domains = []
if not html_file.version.is_sphinx_type:
return []

all_domains = []
try:
domains_qs = html_file.sphinx_domains.exclude(
domain='std',
Expand Down Expand Up @@ -172,11 +174,8 @@ def get_queryset(self):
"""Overwrite default queryset to filter certain files to index."""
queryset = super().get_queryset()

# Do not index files that belong to non sphinx project
# Also do not index certain files
queryset = queryset.internal().filter(
project__documentation_type__contains='sphinx'
)
# Do not index files from external versions
queryset = queryset.internal().all()

# TODO: Make this smarter
# This was causing issues excluding some valid user documentation pages
Expand Down
54 changes: 53 additions & 1 deletion readthedocs/search/parse_json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Functions related to converting content into dict/JSON structures."""

import logging
from urllib.parse import urlparse
import orjson as json

from django.conf import settings
Expand Down Expand Up @@ -195,5 +196,56 @@ def parse_content(content, remove_first_line=False):
content = content[1:]

# converting newlines to ". "
content = ' '.join([text.strip() for text in content if text])
content = ' '.join(text.strip() for text in content if text)
return content


def process_mkdocs_index_file(json_storage_path, page):
    """
    Read the MkDocs JSON search index and extract the data of one page.

    Every entry under ``docs`` whose ``location`` resolves to ``page`` is
    used: the entry without a fragment provides the page title, and each
    entry with a fragment becomes one section.

    :param json_storage_path: storage path of the ``search_index.json`` file.
    :param page: path of the page to extract, e.g. ``index.html``
        or ``versions/index.html``.
    :returns: an empty dict when no entry matches ``page``; otherwise a dict
        with the keys ``path``, ``title``, ``sections`` and ``domain_data``.
    :raises IOError: re-raised when the index file cannot be read.
    """
    log.debug('Processing JSON index file: %s', json_storage_path)

    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
    try:
        with storage.open(json_storage_path, mode='r') as f:
            file_contents = f.read()
    except IOError:
        log.info('Unable to read file: %s', json_storage_path)
        raise

    data = json.loads(file_contents)
    page_data = {}

    for section in data.get('docs', []):
        parsed_path = urlparse(section.get('location', ''))
        fragment = parsed_path.fragment
        path = parsed_path.path

        # Some old versions of mkdocs
        # index the pages as ``/page.html`` instead of ``page.html``.
        path = path.lstrip('/')

        # Directory-style locations ('' or 'versions/') point to an index page.
        if path == '' or path.endswith('/'):
            path += 'index.html'

        if page != path:
            continue

        title = HTMLParser(section.get('title')).text()
        content = parse_content(
            HTMLParser(section.get('text')).text()
        )

        if not fragment:
            # No fragment: this entry describes the page itself.
            page_data.update({
                'path': path,
                'title': title,
                'domain_data': {},
            })
        else:
            page_data.setdefault('sections', []).append({
                'id': fragment,
                'title': title,
                'content': content,
            })

    if page_data:
        # The index may hold only the page-level entry or only anchored
        # entries for a page; fill in whatever is missing so a non-empty
        # result always carries the complete structure.
        page_data.setdefault('path', page)
        page_data.setdefault('title', '')
        page_data.setdefault('sections', [])
        page_data.setdefault('domain_data', {})

    return page_data
6 changes: 3 additions & 3 deletions readthedocs/search/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .dummy_data import ALL_PROJECTS, PROJECT_DATA_FILES


@pytest.fixture()
@pytest.fixture
def es_index():
call_command('search_index', '--delete', '-f')
call_command('search_index', '--create')
Expand All @@ -23,7 +23,7 @@ def es_index():
call_command('search_index', '--delete', '-f')


@pytest.fixture(autouse=True)
@pytest.fixture
def all_projects(es_index, mock_processed_json, db, settings):
settings.ELASTICSEARCH_DSL_AUTOSYNC = True
projects_list = []
Expand Down Expand Up @@ -95,7 +95,7 @@ def get_dummy_processed_json(instance):
return json.load(f)


@pytest.fixture(autouse=True)
@pytest.fixture
def mock_processed_json(mocker):
mocked_function = mocker.patch.object(HTMLFile, 'get_processed_json', autospec=True)
mocked_function.side_effect = get_dummy_processed_json
31 changes: 31 additions & 0 deletions readthedocs/search/tests/data/mkdocs/in/search_index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"config": {
"lang": [
"en"
],
"prebuild_index": false,
"separator": "[\\s\\-]+"
},
"docs": [
{
"location": "",
"text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.",
"title": "Read the Docs MkDocs Test Project"
},
{
"location": "#read-the-docs-mkdocs-test-project",
"text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.",
"title": "Read the Docs MkDocs Test Project"
},
{
"location": "versions/",
"text": "Versions & Themes There are a number of versions and themes for mkdocs.",
"title": "Versions & Themes"
},
{
"location": "versions/#versions-themes",
"text": "Versions & Themes There are a number of versions and themes for mkdocs.",
"title": "Versions & Themes"
}
]
}
24 changes: 24 additions & 0 deletions readthedocs/search/tests/data/mkdocs/in/search_index_old.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"docs": [
{
"location": "/",
"text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.",
"title": "Read the Docs MkDocs Test Project"
},
{
"location": "/#read-the-docs-mkdocs-test-project",
"text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.",
"title": "Read the Docs MkDocs Test Project"
},
{
"location": "/versions/",
"text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.",
"title": "Versions & Themes"
},
{
"location": "/versions/#versions-themes",
"text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.",
"title": "Versions & Themes"
}
]
}
26 changes: 26 additions & 0 deletions readthedocs/search/tests/data/mkdocs/out/search_index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[
{
"title": "Read the Docs MkDocs Test Project",
"path": "index.html",
"sections": [
{
"id": "read-the-docs-mkdocs-test-project",
"title": "Read the Docs MkDocs Test Project",
"content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs."
}
],
"domain_data": {}
},
{
"title": "Versions & Themes",
"path": "versions/index.html",
"sections": [
{
"id": "versions-themes",
"title": "Versions & Themes",
"content": "Versions & Themes There are a number of versions and themes for mkdocs."
}
],
"domain_data": {}
}
]
26 changes: 26 additions & 0 deletions readthedocs/search/tests/data/mkdocs/out/search_index_old.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[
{
"title": "Read the Docs MkDocs Test Project",
"path": "index.html",
"sections": [
{
"id": "read-the-docs-mkdocs-test-project",
"title": "Read the Docs MkDocs Test Project",
"content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs ."
}
],
"domain_data": {}
},
{
"title": "Versions & Themes",
"path": "versions/index.html",
"sections": [
{
"id": "versions-themes",
"title": "Versions & Themes",
"content": "Versions & Themes There are a number of versions and themes for mkdocs."
}
],
"domain_data": {}
}
]
Loading