From 92a183a56dd754287ae02774c2af013101818ab7 Mon Sep 17 00:00:00 2001
From: Eric Holscher
Date: Mon, 13 Oct 2014 12:56:52 -0700
Subject: [PATCH] Clean up mkdocs builder and search

---
 readthedocs/doc_builder/backends/mkdocs.py |  79 +++++-------
 readthedocs/doc_builder/loader.py          |   3 +-
 readthedocs/projects/models.py             |   5 +-
 readthedocs/projects/tasks.py              |  38 ++++--
 readthedocs/restapi/utils.py               |   4 +-
 readthedocs/search/utils.py                | 136 ++++++++++++++-------
 6 files changed, 159 insertions(+), 106 deletions(-)

diff --git a/readthedocs/doc_builder/backends/mkdocs.py b/readthedocs/doc_builder/backends/mkdocs.py
index 307a14fa252..41e71dadb32 100644
--- a/readthedocs/doc_builder/backends/mkdocs.py
+++ b/readthedocs/doc_builder/backends/mkdocs.py
@@ -17,31 +17,25 @@
 log = logging.getLogger(__name__)
 
 
-class Builder(BaseBuilder):
+class BaseMkdocs(BaseBuilder):
     """
     Mkdocs builder
     """
-    type = 'mkdocs'
 
     def __init__(self, *args, **kwargs):
-        super(Builder, self).__init__(*args, **kwargs)
-        self.old_artifact_path = os.path.join(
-            self.version.project.checkout_path(self.version.slug), 'site')
+        super(BaseMkdocs, self).__init__(*args, **kwargs)
+        self.old_artifact_path = os.path.join(self.version.project.checkout_path(self.version.slug), self.build_dir)
 
-    @restoring_chdir
-    def build(self, **kwargs):
-        project = self.version.project
-        checkout_path = project.checkout_path(self.version.slug)
-        site_path = os.path.join(checkout_path, 'site')
-        os.chdir(checkout_path)
+    def append_conf(self, **kwargs):
+        """
+        Set mkdocs config values
+        """
         # Pull mkdocs config data
         user_config = yaml.safe_load(open('mkdocs.yml', 'r'))
         docs_dir = user_config.get('docs_dir', 'docs')
 
-        # Set mkdocs config values
-
         MEDIA_URL = getattr(
             settings, 'MEDIA_URL', 'https://media.readthedocs.org')
         if 'extra_javascript' in user_config:
@@ -84,9 +78,9 @@ def build(self, **kwargs):
 
         # RTD javascript writing
         READTHEDOCS_DATA = {
-            'project': project.slug,
+            'project': self.version.project.slug,
             'version': self.version.slug,
-            'language': project.language,
+            'language': self.version.project.language,
             'page': None,
             'theme': "readthedocs",
             'docroot': docs_dir,
@@ -112,7 +106,7 @@ def build(self, **kwargs):
         include_ctx = Context({
             'global_analytics_code': getattr(settings, 'GLOBAL_ANALYTICS_CODE', 'UA-17997319-1'),
-            'user_analytics_code': project.analytics_code,
+            'user_analytics_code': self.version.project.analytics_code,
         })
         include_string = template_loader.get_template(
             'doc_builder/include.js.tmpl'
@@ -121,40 +115,29 @@ def build(self, **kwargs):
         ).render(include_ctx)
         include_file.write(include_string)
         include_file.close()
 
+    @restoring_chdir
+    def build(self, **kwargs):
+        checkout_path = self.version.project.checkout_path(self.version.slug)
+        #site_path = os.path.join(checkout_path, 'site')
+        os.chdir(checkout_path)
+        self.append_conf()
         # Actual build
-
-        build_command = "%s build --site-dir=site --theme=mkdocs" % (
-            project.venv_bin(version=self.version.slug,
-                             bin='mkdocs')
+        build_command = "{command} {builder} --site-dir={build_dir} --theme=mkdocs".format(
+            command=self.version.project.venv_bin(version=self.version.slug, bin='mkdocs'),
+            builder=self.builder,
+            build_dir=self.build_dir,
         )
         results = run(build_command, shell=True)
+        return results
 
-        try:
-            # Index Search
-            page_list = []
-            log.info(LOG_TEMPLATE.format(project=self.version.project.slug, version=self.version.slug, msg='Indexing files'))
-            for root, dirnames, filenames in os.walk(site_path):
-                for filename in filenames:
-                    if fnmatch.fnmatch(filename, '*.html'):
-                        full_path = os.path.join(root, filename.lstrip('/'))
-                        relative_path = os.path.join(root.replace(site_path, '').lstrip('/'), filename.lstrip('/'))
-                        relative_path = re.sub('.html$', '', relative_path)
-                        html = parse_content_from_file(documentation_type='mkdocs', file_path=full_path)
-                        headers = parse_headers_from_file(documentation_type='mkdocs', file_path=full_path)
-                        sections = parse_sections_from_file(documentation_type='mkdocs', file_path=full_path)
-                        page_list.append(
-                            {'content': html, 'path': relative_path, 'title': sections[0]['title'], 'headers': headers, 'sections': sections}
-                        )
-
-            data = {
-                'page_list': page_list,
-                'version_pk': self.version.pk,
-                'project_pk': self.version.project.pk
-            }
-            log_msg = ' '.join([page['path'] for page in page_list])
-            log.info("(Search Index) Sending Data: %s [%s]" % (self.version.project.slug, log_msg))
-            apiv2.index_search.post({'data': data})
-        except:
-            log.error('Search indexing failed')
-            return results
+class MkdocsHTML(BaseMkdocs):
+    type = 'mkdocs'
+    builder = 'build'
+    build_dir = '_build/html'
+
+
+class MkdocsJSON(BaseMkdocs):
+    type = 'mkdocs_json'
+    builder = 'json'
+    build_dir = '_build/json'
diff --git a/readthedocs/doc_builder/loader.py b/readthedocs/doc_builder/loader.py
index fe818bfe5f2..6029d94f87f 100644
--- a/readthedocs/doc_builder/loader.py
+++ b/readthedocs/doc_builder/loader.py
@@ -16,5 +16,6 @@
     'sphinx_search': sphinx.SearchBuilder,
     'sphinx_singlehtmllocalmedia': sphinx.LocalMediaBuilder,
     # Other markup
-    'mkdocs': mkdocs.Builder,
+    'mkdocs': mkdocs.MkdocsHTML,
+    'mkdocs_json': mkdocs.MkdocsJSON,
 }
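
Note: the two new builder classes only differ in the mkdocs sub-command and the output directory, and the HTML and JSON passes are meant to be driven back to back from the build task. A minimal sketch of that flow, assuming the BaseBuilder interface (force/append_conf/build/move) and the loader mapping above; the helper name build_mkdocs is illustrative only and is not part of this patch:

    # Illustrative sketch -- mirrors the flow added to build_docs() in
    # readthedocs/projects/tasks.py below; `builder_loading` is the loader
    # mapping from readthedocs/doc_builder/loader.py.
    def build_mkdocs(builder_loading, version, search=True):
        results = {}
        html_builder = builder_loading.get('mkdocs')(version)             # MkdocsHTML
        html_builder.append_conf()                                         # rewrite mkdocs.yml in place
        results['html'] = html_builder.build()                             # mkdocs build --site-dir=_build/html --theme=mkdocs
        if results['html'][0] == 0:
            html_builder.move()
        if search:
            search_builder = builder_loading.get('mkdocs_json')(version)  # MkdocsJSON
            results['search'] = search_builder.build()                     # mkdocs json --site-dir=_build/json --theme=mkdocs
            if results['search'][0] == 0:
                search_builder.move()
        return results
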
""" - return os.path.join(self.conf_dir(version), "_build", "json") + if 'sphinx' in self.documentation_type: + return os.path.join(self.conf_dir(version), "_build", "json") + elif 'mkdocs' in self.documentation_type: + return os.path.join(self.checkout_path(version), "_build", "json") def full_singlehtml_path(self, version='latest'): """ diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 81bd7e9d586..2d3e70609ff 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -29,6 +29,7 @@ run_on_app_servers) from core import utils as core_utils from search.parse_json import process_all_json_files +from search.utils import process_mkdocs_json from vcs_support import utils as vcs_support_utils log = logging.getLogger(__name__) @@ -451,15 +452,24 @@ def build_docs(version, force, pdf, man, epub, dash, search, localmedia): html_builder = builder_loading.get(project.documentation_type)(version) if force: html_builder.force() - # html_builder.clean() - if 'sphinx' in project.documentation_type: - html_builder.append_conf() + html_builder.append_conf() results['html'] = html_builder.build() if results['html'][0] == 0: html_builder.move() fake_results = (999, "Project Skipped, Didn't build", "Project Skipped, Didn't build") + if 'mkdocs' in project.documentation_type: + if search: + try: + search_builder = builder_loading.get('mkdocs_json')(version) + results['search'] = search_builder.build() + if results['search'][0] == 0: + search_builder.move() + except: + log.error(LOG_TEMPLATE.format( + project=project.slug, version=version.slug, msg="JSON Build Error"), exc_info=True) + if 'sphinx' in project.documentation_type: # Search builder. Creates JSON from docs and sends it to the # server. @@ -689,16 +699,18 @@ def record_pdf(api, record, results, state, version): def update_search(version, build): if 'sphinx' in version.project.documentation_type: page_list = process_all_json_files(version) - data = { - 'page_list': page_list, - 'version_pk': version.pk, - 'project_pk': version.project.pk, - 'commit': build.get('commit'), - } - log_msg = ' '.join([page['path'] for page in page_list]) - log.info("(Search Index) Sending Data: %s [%s]" % ( - version.project.slug, log_msg)) - apiv2.index_search.post({'data': data}) + if 'mkdocs' in version.project.documentation_type: + page_list = process_mkdocs_json(version) + + data = { + 'page_list': page_list, + 'version_pk': version.pk, + 'project_pk': version.project.pk, + 'commit': build.get('commit'), + } + log_msg = ' '.join([page['path'] for page in page_list]) + log.info("(Search Index) Sending Data: %s [%s]" % (version.project.slug, log_msg)) + apiv2.index_search.post({'data': data}) @task() diff --git a/readthedocs/restapi/utils.py b/readthedocs/restapi/utils.py index d6ab2f7011a..e015762cb97 100644 --- a/readthedocs/restapi/utils.py +++ b/readthedocs/restapi/utils.py @@ -93,7 +93,7 @@ def index_search_request(version, page_list, commit): page_obj = PageIndex() project_scale = 1 - tags = [tag.name for tag in project.tags.all()] + #tags = [tag.name for tag in project.tags.all()] project_obj = ProjectIndex() project_obj.index_document(data={ @@ -104,7 +104,7 @@ def index_search_request(version, page_list, commit): 'lang': project.language, 'author': [user.username for user in project.users.all()], 'url': project.get_absolute_url(), - 'tags': tags, + 'tags': None, '_boost': project_scale, }) diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py index 18d1d6b4e0d..b66270e060f 100644 --- 
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index 18d1d6b4e0d..b66270e060f 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -1,13 +1,39 @@
 # -*- coding: utf-8 -*-
+import os
+import fnmatch
+import re
 import codecs
 import logging
+import json
 
 from pyquery import PyQuery
 
 log = logging.getLogger(__name__)
 
 
+def process_mkdocs_json(version, build_dir=True):
+    if build_dir:
+        full_path = version.project.full_json_path(version.slug)
+    else:
+        full_path = version.project.get_production_media_path(type='json', version_slug=version.slug, include_file=False)
+
+    html_files = []
+    for root, dirs, files in os.walk(full_path):
+        for filename in fnmatch.filter(files, '*.json'):
+            html_files.append(os.path.join(root, filename))
+    page_list = []
+    for filename in html_files:
+        relative_path = parse_path_from_file(documentation_type='mkdocs', file_path=filename)
+        html = parse_content_from_file(documentation_type='mkdocs', file_path=filename)
+        headers = parse_headers_from_file(documentation_type='mkdocs', file_path=filename)
+        sections = parse_sections_from_file(documentation_type='mkdocs', file_path=filename)
+        page_list.append(
+            {'content': html, 'path': relative_path, 'title': sections[0]['title'], 'headers': headers, 'sections': sections}
+        )
+    return page_list
+
+
 def recurse_while_none(element):
     if element.text is None:
         return recurse_while_none(element.getchildren()[0])
@@ -15,6 +41,21 @@ def recurse_while_none(element):
     return element.text
 
 
+def parse_path_from_file(documentation_type, file_path):
+    try:
+        with codecs.open(file_path, encoding='utf-8', mode='r') as f:
+            content = f.read()
+    except IOError as e:
+        log.info('(Search Index) Unable to index file: %s, error :%s' % (file_path, e))
+        return ''
+
+    page_json = json.loads(content)
+    path = page_json['url']
+    path = re.sub('/$', '/index', path)
+
+    return path
+
+
 def parse_content_from_file(documentation_type, file_path):
     try:
         with codecs.open(file_path, encoding='utf-8', mode='r') as f:
@@ -23,7 +64,9 @@ def parse_content_from_file(documentation_type, file_path):
         log.info('(Search Index) Unable to index file: %s, error :%s' % (file_path, e))
         return ''
 
-    content = parse_content(documentation_type, content)
+    page_json = json.loads(content)
+    page_content = page_json['content']
+    content = parse_content(documentation_type, page_content)
 
     if not content:
         log.info('(Search Index) Unable to index file: %s, empty file' % (file_path))
@@ -38,7 +81,7 @@ def parse_content(documentation_type, content):
     Returns the body text of a document
     """
     try:
-        to_index = PyQuery(content)('div[role="main"]').text()
+        to_index = PyQuery(content).text()
     except ValueError:
         return ''
     return to_index
@@ -52,7 +95,11 @@ def parse_headers_from_file(documentation_type, file_path):
     except IOError as e:
         log.info('(Search Index) Unable to index file: %s, error :%s' % (file_path, e))
         return ''
-    headers = parse_headers(documentation_type, content)
+
+    page_json = json.loads(content)
+    page_content = page_json['content']
+    headers = parse_headers(documentation_type, page_content)
+
     if not headers:
         log.error('Unable to index file headers for: %s' % file_path)
     return headers
@@ -74,7 +121,11 @@ def parse_sections_from_file(documentation_type, file_path):
     except IOError as e:
         log.info('(Search Index) Unable to index file: %s, error :%s' % (file_path, e))
         return ''
-    sections = parse_sections(documentation_type, content)
+
+    page_json = json.loads(content)
+    page_content = page_json['content']
+    sections = parse_sections(documentation_type, page_content)
+
     if not sections:
         log.error('Unable to index file sections for: %s' % file_path)
     return sections
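
Note: the parse_*_from_file helpers above assume each file written by `mkdocs json` is a JSON document carrying at least a 'url' and an HTML 'content' field; those are the only two keys this patch reads. A hypothetical example of such a page file, with a made-up path and values, showing the path rewriting done by parse_path_from_file():

    # Hypothetical contents of _build/json/install/index.json (the file name
    # and values are invented; only the 'url' and 'content' keys are read by
    # the code above).
    page_json = {
        'url': '/install/',
        'content': '<h1 id="installation">Installation</h1><p>pip install readthedocs</p>',
    }
    # parse_path_from_file() applies re.sub('/$', '/index', path), so the
    # trailing-slash URL '/install/' is stored as the page path '/install/index'.
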
@@ -120,51 +171,54 @@ def parse_sections(documentation_type, content):
             log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, title, content))
 
     if 'mkdocs' in documentation_type:
         try:
-            body = PyQuery(content)('div[role="main"]')
+            body = PyQuery(content)
         except ValueError:
             return ''
 
-        # H1 content
-        h1 = body('h1')
-        h1_id = h1.attr('id')
-        h1_title = h1.text().strip()
-        h1_content = ""
-        next_p = body('h1').next()
-        while next_p:
-            if next_p[0].tag == 'h2':
-                break
-            h1_html = next_p.html()
-            if h1_html:
-                h1_content += "\n%s\n" % h1_html
-            next_p = next_p.next()
-        if h1_content:
-            sections.append({
-                'id': h1_id,
-                'title': h1_title,
-                'content': h1_content,
-            })
-
-        # H2 content
-        section_list = body('h2')
-        for num in range(len(section_list)):
-            h2 = section_list.eq(num)
-            h2_title = h2.text().strip()
-            section_id = h2.attr('id')
-            h2_content = ""
-            next_p = body('h2').next()
+        try:
+            # H1 content
+            h1 = body('h1')
+            h1_id = h1.attr('id')
+            h1_title = h1.text().strip()
+            h1_content = ""
+            next_p = body('h1').next()
             while next_p:
                 if next_p[0].tag == 'h2':
                     break
-                h2_html = next_p.html()
-                if h2_html:
-                    h2_content += "\n%s\n" % h2_html
+                h1_html = next_p.html()
+                if h1_html:
+                    h1_content += "\n%s\n" % h1_html
                 next_p = next_p.next()
-            if h2_content:
+            if h1_content:
                 sections.append({
-                    'id': section_id,
-                    'title': h2_title,
-                    'content': h2_content,
+                    'id': h1_id,
+                    'title': h1_title,
+                    'content': h1_content,
                 })
-            log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, h2_title, h2_content))
+
+            # H2 content
+            section_list = body('h2')
+            for num in range(len(section_list)):
+                h2 = section_list.eq(num)
+                h2_title = h2.text().strip()
+                section_id = h2.attr('id')
+                h2_content = ""
+                next_p = body('h2').next()
+                while next_p:
+                    if next_p[0].tag == 'h2':
+                        break
+                    h2_html = next_p.html()
+                    if h2_html:
+                        h2_content += "\n%s\n" % h2_html
+                    next_p = next_p.next()
+                if h2_content:
+                    sections.append({
+                        'id': section_id,
+                        'title': h2_title,
+                        'content': h2_content,
+                    })
+                log.debug("(Search Index) Section [%s:%s]: %s" % (section_id, h2_title, h2_content))
+        except:
+            log.error('Failed indexing', exc_info=True)
 
     return sections
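
Note: parse_sections() builds one section per heading by walking sibling elements with PyQuery until the next h2. A rough usage sketch on a single-h1/single-h2 page; the input HTML and expected output are illustrative only:

    # Illustrative usage of the mkdocs branch of parse_sections().
    from search.utils import parse_sections

    html = (
        '<h1 id="install">Install</h1><p>pip install foo</p>'
        '<h2 id="from-source">From source</h2><p>python setup.py install</p>'
    )
    sections = parse_sections(documentation_type='mkdocs', content=html)
    # Expected result (PyQuery .html() returns the inner HTML of each sibling):
    # [{'id': 'install', 'title': 'Install', 'content': '\npip install foo\n'},
    #  {'id': 'from-source', 'title': 'From source', 'content': '\npython setup.py install\n'}]
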