From 92a183a56dd754287ae02774c2af013101818ab7 Mon Sep 17 00:00:00 2001
From: Eric Holscher
Date: Mon, 13 Oct 2014 12:56:52 -0700
Subject: [PATCH] Clean up mkdocs builder and search

---
 readthedocs/doc_builder/backends/mkdocs.py |  79 +++++-------
 readthedocs/doc_builder/loader.py          |   3 +-
 readthedocs/projects/models.py             |   5 +-
 readthedocs/projects/tasks.py              |  38 ++++--
 readthedocs/restapi/utils.py               |   4 +-
 readthedocs/search/utils.py                | 136 ++++++++++++++-------
 6 files changed, 159 insertions(+), 106 deletions(-)

diff --git a/readthedocs/doc_builder/backends/mkdocs.py b/readthedocs/doc_builder/backends/mkdocs.py
index 307a14fa252..41e71dadb32 100644
--- a/readthedocs/doc_builder/backends/mkdocs.py
+++ b/readthedocs/doc_builder/backends/mkdocs.py
@@ -17,31 +17,25 @@
 log = logging.getLogger(__name__)
 
 
-class Builder(BaseBuilder):
+class BaseMkdocs(BaseBuilder):
     """
     Mkdocs builder
     """
-    type = 'mkdocs'
 
     def __init__(self, *args, **kwargs):
-        super(Builder, self).__init__(*args, **kwargs)
-        self.old_artifact_path = os.path.join(
-            self.version.project.checkout_path(self.version.slug), 'site')
+        super(BaseMkdocs, self).__init__(*args, **kwargs)
+        self.old_artifact_path = os.path.join(self.version.project.checkout_path(self.version.slug), self.build_dir)
 
-    @restoring_chdir
-    def build(self, **kwargs):
-        project = self.version.project
-        checkout_path = project.checkout_path(self.version.slug)
-        site_path = os.path.join(checkout_path, 'site')
-        os.chdir(checkout_path)
+    def append_conf(self, **kwargs):
+        """
+        Set mkdocs config values
+        """
         # Pull mkdocs config data
         user_config = yaml.safe_load(open('mkdocs.yml', 'r'))
         docs_dir = user_config.get('docs_dir', 'docs')
 
-        # Set mkdocs config values
-
         MEDIA_URL = getattr(
             settings, 'MEDIA_URL', 'https://media.readthedocs.org')
         if 'extra_javascript' in user_config:
@@ -84,9 +78,9 @@ def build(self, **kwargs):
 
         # RTD javascript writing
         READTHEDOCS_DATA = {
-            'project': project.slug,
+            'project': self.version.project.slug,
             'version': self.version.slug,
-            'language': project.language,
+            'language': self.version.project.language,
             'page': None,
             'theme': "readthedocs",
             'docroot': docs_dir,
@@ -112,7 +106,7 @@ def build(self, **kwargs):
         include_ctx = Context({
             'global_analytics_code': getattr(settings, 'GLOBAL_ANALYTICS_CODE', 'UA-17997319-1'),
-            'user_analytics_code': project.analytics_code,
+            'user_analytics_code': self.version.project.analytics_code,
         })
         include_string = template_loader.get_template(
             'doc_builder/include.js.tmpl'
@@ -121,40 +115,29 @@ def build(self, **kwargs):
         ).render(include_ctx)
         include_file.write(include_string)
         include_file.close()
 
+    @restoring_chdir
+    def build(self, **kwargs):
+        checkout_path = self.version.project.checkout_path(self.version.slug)
+        #site_path = os.path.join(checkout_path, 'site')
+        os.chdir(checkout_path)
+        self.append_conf()
         # Actual build
-
-        build_command = "%s build --site-dir=site --theme=mkdocs" % (
-            project.venv_bin(version=self.version.slug,
-                             bin='mkdocs')
+        build_command = "{command} {builder} --site-dir={build_dir} --theme=mkdocs".format(
+            command=self.version.project.venv_bin(version=self.version.slug, bin='mkdocs'),
+            builder=self.builder,
+            build_dir=self.build_dir,
         )
         results = run(build_command, shell=True)
+        return results
 
-        try:
-            # Index Search
-            page_list = []
-            log.info(LOG_TEMPLATE.format(project=self.version.project.slug, version=self.version.slug, msg='Indexing files'))
-            for root, dirnames, filenames in os.walk(site_path):
-                for filename in filenames:
-                    if fnmatch.fnmatch(filename, '*.html'):
-                        full_path = os.path.join(root, filename.lstrip('/'))
-                        relative_path = os.path.join(root.replace(site_path, '').lstrip('/'), filename.lstrip('/'))
-                        relative_path = re.sub('.html$', '', relative_path)
-                        html = parse_content_from_file(documentation_type='mkdocs', file_path=full_path)
-                        headers = parse_headers_from_file(documentation_type='mkdocs', file_path=full_path)
-                        sections = parse_sections_from_file(documentation_type='mkdocs', file_path=full_path)
-                        page_list.append(
-                            {'content': html, 'path': relative_path, 'title': sections[0]['title'], 'headers': headers, 'sections': sections}
-                        )
-
-            data = {
-                'page_list': page_list,
-                'version_pk': self.version.pk,
-                'project_pk': self.version.project.pk
-            }
-            log_msg = ' '.join([page['path'] for page in page_list])
-            log.info("(Search Index) Sending Data: %s [%s]" % (self.version.project.slug, log_msg))
-            apiv2.index_search.post({'data': data})
-        except:
-            log.error('Search indexing failed')
-            return results
+class MkdocsHTML(BaseMkdocs):
+    type = 'mkdocs'
+    builder = 'build'
+    build_dir = '_build/html'
+
+
+class MkdocsJSON(BaseMkdocs):
+    type = 'mkdocs_json'
+    builder = 'json'
+    build_dir = '_build/json'
diff --git a/readthedocs/doc_builder/loader.py b/readthedocs/doc_builder/loader.py
index fe818bfe5f2..6029d94f87f 100644
--- a/readthedocs/doc_builder/loader.py
+++ b/readthedocs/doc_builder/loader.py
@@ -16,5 +16,6 @@
     'sphinx_search': sphinx.SearchBuilder,
     'sphinx_singlehtmllocalmedia': sphinx.LocalMediaBuilder,
     # Other markup
-    'mkdocs': mkdocs.Builder,
+    'mkdocs': mkdocs.MkdocsHTML,
+    'mkdocs_json': mkdocs.MkdocsJSON,
 }
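
Note: the two new builder classes only differ in the mkdocs sub-command and the output directory, and the HTML and JSON passes are meant to be driven back to back from the build task. A minimal sketch of that flow, assuming the BaseBuilder interface (force/append_conf/build/move) and the loader mapping above; the helper name build_mkdocs is illustrative only and is not part of this patch:

    # Illustrative sketch -- mirrors the flow added to build_docs() in
    # readthedocs/projects/tasks.py below; `builder_loading` is the loader
    # mapping from readthedocs/doc_builder/loader.py.
    def build_mkdocs(builder_loading, version, search=True):
        results = {}
        html_builder = builder_loading.get('mkdocs')(version)             # MkdocsHTML
        html_builder.append_conf()                                         # rewrite mkdocs.yml in place
        results['html'] = html_builder.build()                             # mkdocs build --site-dir=_build/html --theme=mkdocs
        if results['html'][0] == 0:
            html_builder.move()
        if search:
            search_builder = builder_loading.get('mkdocs_json')(version)  # MkdocsJSON
            results['search'] = search_builder.build()                     # mkdocs json --site-dir=_build/json --theme=mkdocs
            if results['search'][0] == 0:
                search_builder.move()
        return results
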
""" - return os.path.join(self.conf_dir(version), "_build", "json") + if 'sphinx' in self.documentation_type: + return os.path.join(self.conf_dir(version), "_build", "json") + elif 'mkdocs' in self.documentation_type: + return os.path.join(self.checkout_path(version), "_build", "json") def full_singlehtml_path(self, version='latest'): """ diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 81bd7e9d586..2d3e70609ff 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -29,6 +29,7 @@ run_on_app_servers) from core import utils as core_utils from search.parse_json import process_all_json_files +from search.utils import process_mkdocs_json from vcs_support import utils as vcs_support_utils log = logging.getLogger(__name__) @@ -451,15 +452,24 @@ def build_docs(version, force, pdf, man, epub, dash, search, localmedia): html_builder = builder_loading.get(project.documentation_type)(version) if force: html_builder.force() - # html_builder.clean() - if 'sphinx' in project.documentation_type: - html_builder.append_conf() + html_builder.append_conf() results['html'] = html_builder.build() if results['html'][0] == 0: html_builder.move() fake_results = (999, "Project Skipped, Didn't build", "Project Skipped, Didn't build") + if 'mkdocs' in project.documentation_type: + if search: + try: + search_builder = builder_loading.get('mkdocs_json')(version) + results['search'] = search_builder.build() + if results['search'][0] == 0: + search_builder.move() + except: + log.error(LOG_TEMPLATE.format( + project=project.slug, version=version.slug, msg="JSON Build Error"), exc_info=True) + if 'sphinx' in project.documentation_type: # Search builder. Creates JSON from docs and sends it to the # server. @@ -689,16 +699,18 @@ def record_pdf(api, record, results, state, version): def update_search(version, build): if 'sphinx' in version.project.documentation_type: page_list = process_all_json_files(version) - data = { - 'page_list': page_list, - 'version_pk': version.pk, - 'project_pk': version.project.pk, - 'commit': build.get('commit'), - } - log_msg = ' '.join([page['path'] for page in page_list]) - log.info("(Search Index) Sending Data: %s [%s]" % ( - version.project.slug, log_msg)) - apiv2.index_search.post({'data': data}) + if 'mkdocs' in version.project.documentation_type: + page_list = process_mkdocs_json(version) + + data = { + 'page_list': page_list, + 'version_pk': version.pk, + 'project_pk': version.project.pk, + 'commit': build.get('commit'), + } + log_msg = ' '.join([page['path'] for page in page_list]) + log.info("(Search Index) Sending Data: %s [%s]" % (version.project.slug, log_msg)) + apiv2.index_search.post({'data': data}) @task() diff --git a/readthedocs/restapi/utils.py b/readthedocs/restapi/utils.py index d6ab2f7011a..e015762cb97 100644 --- a/readthedocs/restapi/utils.py +++ b/readthedocs/restapi/utils.py @@ -93,7 +93,7 @@ def index_search_request(version, page_list, commit): page_obj = PageIndex() project_scale = 1 - tags = [tag.name for tag in project.tags.all()] + #tags = [tag.name for tag in project.tags.all()] project_obj = ProjectIndex() project_obj.index_document(data={ @@ -104,7 +104,7 @@ def index_search_request(version, page_list, commit): 'lang': project.language, 'author': [user.username for user in project.users.all()], 'url': project.get_absolute_url(), - 'tags': tags, + 'tags': None, '_boost': project_scale, }) diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py index 18d1d6b4e0d..b66270e060f 100644 --- 
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index 18d1d6b4e0d..b66270e060f 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -1,13 +1,39 @@
 # -*- coding: utf-8 -*-
+import os
+import fnmatch
+import re
 import codecs
 import logging
+import json
 
 from pyquery import PyQuery
 
 log = logging.getLogger(__name__)
 
 
+def process_mkdocs_json(version, build_dir=True):
+    if build_dir:
+        full_path = version.project.full_json_path(version.slug)
+    else:
+        full_path = version.project.get_production_media_path(type='json', version_slug=version.slug, include_file=False)
+
+    html_files = []
+    for root, dirs, files in os.walk(full_path):
+        for filename in fnmatch.filter(files, '*.json'):
+            html_files.append(os.path.join(root, filename))
+    page_list = []
+    for filename in html_files:
+        relative_path = parse_path_from_file(documentation_type='mkdocs', file_path=filename)
+        html = parse_content_from_file(documentation_type='mkdocs', file_path=filename)
+        headers = parse_headers_from_file(documentation_type='mkdocs', file_path=filename)
+        sections = parse_sections_from_file(documentation_type='mkdocs', file_path=filename)
+        page_list.append(
+            {'content': html, 'path': relative_path, 'title': sections[0]['title'], 'headers': headers, 'sections': sections}
+        )
+    return page_list
+
+
 def recurse_while_none(element):
     if element.text is None:
         return recurse_while_none(element.getchildren()[0])
@@ -15,6 +41,21 @@ def recurse_while_none(element):
     return element.text
 
 
+def parse_path_from_file(documentation_type, file_path):
+    try:
+        with codecs.open(file_path, encoding='utf-8', mode='r') as f:
+            content = f.read()
+    except IOError as e:
+        log.info('(Search Index) Unable to index file: %s, error :%s' % (file_path, e))
+        return ''
+
+    page_json = json.loads(content)
+    path = page_json['url']
+    path = re.sub('/$', '/index', path)
+
+    return path
+
+
 def parse_content_from_file(documentation_type, file_path):
     try:
         with codecs.open(file_path, encoding='utf-8', mode='r') as f:
@@ -23,7 +64,9 @@ def parse_content_from_file(documentation_type, file_path):
         log.info('(Search Index) Unable to index file: %s, error :%s' % (file_path, e))
         return ''
 
-    content = parse_content(documentation_type, content)
+    page_json = json.loads(content)
+    page_content = page_json['content']
+    content = parse_content(documentation_type, page_content)
 
     if not content:
         log.info('(Search Index) Unable to index file: %s, empty file' % (file_path))
@@ -38,7 +81,7 @@ def parse_content(documentation_type, content):
     Returns the body text of a document
     """
     try:
-        to_index = PyQuery(content)('div[role="main"]').text()
+        to_index = PyQuery(content).text()
     except ValueError:
         return ''
     return to_index
@@ -52,7 +95,11 @@ def parse_headers_from_file(documentation_type, file_path):
     except IOError as e:
         log.info('(Search Index) Unable to index file: %s, error :%s' % (file_path, e))
         return ''
-    headers = parse_headers(documentation_type, content)
+
+    page_json = json.loads(content)
+    page_content = page_json['content']
+    headers = parse_headers(documentation_type, page_content)
+
     if not headers:
         log.error('Unable to index file headers for: %s' % file_path)
     return headers
@@ -74,7 +121,11 @@ def parse_sections_from_file(documentation_type, file_path):
     except IOError as e:
         log.info('(Search Index) Unable to index file: %s, error :%s' % (file_path, e))
         return ''
-    sections = parse_sections(documentation_type, content)
+
+    page_json = json.loads(content)
+    page_content = page_json['content']
+    sections = parse_sections(documentation_type, page_content)
+
     if not sections:
         log.error('Unable to index file sections for: %s' % file_path)
     return sections
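
Note: the parse_*_from_file helpers above assume each file written by `mkdocs json` is a JSON document carrying at least a 'url' and an HTML 'content' field; those are the only two keys this patch reads. A hypothetical example of such a page file, with a made-up path and values, showing the path rewriting done by parse_path_from_file():

    # Hypothetical contents of _build/json/install/index.json (the file name
    # and values are invented; only the 'url' and 'content' keys are read by
    # the code above).
    page_json = {
        'url': '/install/',
        'content': '<h1 id="installation">Installation</h1><p>pip install readthedocs</p>',
    }
    # parse_path_from_file() applies re.sub('/$', '/index', path), so the
    # trailing-slash URL '/install/' is stored as the page path '/install/index'.
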
@@ -120,51 +171,54 @@ def parse_sections(documentation_type, content):
             log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, title, content))
 
     if 'mkdocs' in documentation_type:
         try:
-            body = PyQuery(content)('div[role="main"]')
+            body = PyQuery(content)
         except ValueError:
             return ''
 
-        # H1 content
-        h1 = body('h1')
-        h1_id = h1.attr('id')
-        h1_title = h1.text().strip()
-        h1_content = ""
-        next_p = body('h1').next()
-        while next_p:
-            if next_p[0].tag == 'h2':
-                break
-            h1_html = next_p.html()
-            if h1_html:
-                h1_content += "\n%s\n" % h1_html
-            next_p = next_p.next()
-        if h1_content:
-            sections.append({
-                'id': h1_id,
-                'title': h1_title,
-                'content': h1_content,
-            })
-
-        # H2 content
-        section_list = body('h2')
-        for num in range(len(section_list)):
-            h2 = section_list.eq(num)
-            h2_title = h2.text().strip()
-            section_id = h2.attr('id')
-            h2_content = ""
-            next_p = body('h2').next()
+        try:
+            # H1 content
+            h1 = body('h1')
+            h1_id = h1.attr('id')
+            h1_title = h1.text().strip()
+            h1_content = ""
+            next_p = body('h1').next()
             while next_p:
                 if next_p[0].tag == 'h2':
                     break
-                h2_html = next_p.html()
-                if h2_html:
-                    h2_content += "\n%s\n" % h2_html
+                h1_html = next_p.html()
+                if h1_html:
+                    h1_content += "\n%s\n" % h1_html
                 next_p = next_p.next()
-            if h2_content:
+            if h1_content:
                 sections.append({
-                    'id': section_id,
-                    'title': h2_title,
-                    'content': h2_content,
+                    'id': h1_id,
+                    'title': h1_title,
+                    'content': h1_content,
                 })
-            log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, h2_title, h2_content))
+
+            # H2 content
+            section_list = body('h2')
+            for num in range(len(section_list)):
+                h2 = section_list.eq(num)
+                h2_title = h2.text().strip()
+                section_id = h2.attr('id')
+                h2_content = ""
+                next_p = body('h2').next()
+                while next_p:
+                    if next_p[0].tag == 'h2':
+                        break
+                    h2_html = next_p.html()
+                    if h2_html:
+                        h2_content += "\n%s\n" % h2_html
+                    next_p = next_p.next()
+                if h2_content:
+                    sections.append({
+                        'id': section_id,
+                        'title': h2_title,
+                        'content': h2_content,
+                    })
+                log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, h2_title, h2_content))
+        except:
+            log.error('Failed indexing', exc_info=True)
 
     return sections
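
Note: parse_sections() builds one section per heading by walking sibling elements with PyQuery until the next h2. A rough usage sketch on a single-h1/single-h2 page; the input HTML and expected output are illustrative only:

    # Illustrative usage of the mkdocs branch of parse_sections().
    from search.utils import parse_sections

    html = (
        '<h1 id="install">Install</h1><p>pip install foo</p>'
        '<h2 id="from-source">From source</h2><p>python setup.py install</p>'
    )
    sections = parse_sections(documentation_type='mkdocs', content=html)
    # Expected result (PyQuery .html() returns the inner HTML of each sibling):
    # [{'id': 'install', 'title': 'Install', 'content': '\npip install foo\n'},
    #  {'id': 'from-source', 'title': 'From source', 'content': '\npython setup.py install\n'}]
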