Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index path with original path name #5785

Merged
merged 12 commits into from
Jun 18, 2019
5 changes: 2 additions & 3 deletions readthedocs/core/static-src/core/js/doc-embed/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,9 @@ function attach_elastic_search_query(data) {
var list_item = $('<li style="display: none;"></li>');

// Creating the result from elements
var link = doc.link + DOCUMENTATION_OPTIONS.FILE_SUFFIX +
'?highlight=' + $.urlencode(query);
var url = doc.url + '?highlight=' + $.urlencode(query);

var item = $('<a>', {'href': link});
var item = $('<a>', {'href': url});
item.html(doc.title);
list_item.append(item);

Expand Down
14 changes: 7 additions & 7 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,21 +1231,21 @@ def get_processed_json(self):
Both lead to `foo/index.html`
https://github.com/rtfd/readthedocs.org/issues/5368
"""
paths = []
fjson_paths = []
basename = os.path.splitext(self.path)[0]
paths.append(basename + '.fjson')
fjson_paths.append(basename + '.fjson')
if basename.endswith('/index'):
new_basename = re.sub(r'\/index$', '', basename)
paths.append(new_basename + '.fjson')
fjson_paths.append(new_basename + '.fjson')

full_json_path = self.project.get_production_media_path(
type_='json', version_slug=self.version.slug, include_file=False
)
try:
for path in paths:
file_path = os.path.join(full_json_path, path)
for fjson_path in fjson_paths:
file_path = os.path.join(full_json_path, fjson_path)
if os.path.exists(file_path):
return process_file(file_path)
return process_file(file_path, self.path)
except Exception:
log.warning(
'Unhandled exception during search processing file: %s',
Expand All @@ -1254,7 +1254,7 @@ def get_processed_json(self):
return {
'headers': [],
'content': '',
'path': file_path,
'path': self.path,
'title': '',
'sections': [],
}
Expand Down
1 change: 1 addition & 0 deletions readthedocs/rtd_tests/tests/test_search_json_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def test_h2_parsing(self):
base_dir,
'files/api.fjson',
),
'files/api.html',
)
self.assertEqual(data['sections'][1]['id'], 'a-basic-api-client-using-slumber')
# Only capture h2's after the first section
Expand Down
24 changes: 22 additions & 2 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import logging
import os
from pprint import pformat

from rest_framework import generics
from rest_framework import serializers
from rest_framework import generics, serializers
from rest_framework.exceptions import ValidationError
from rest_framework.pagination import PageNumberPagination

from readthedocs.search.faceted_search import PageSearch
from readthedocs.search.utils import get_project_list_or_404


log = logging.getLogger(__name__)


Expand All @@ -23,10 +24,29 @@ class PageSearchSerializer(serializers.Serializer):
version = serializers.CharField()
title = serializers.CharField()
path = serializers.CharField()
# Doc url without extension
link = serializers.SerializerMethodField()
# Doc url with extension
url = serializers.SerializerMethodField()
highlight = serializers.SerializerMethodField()

def get_link(self, obj):
    """
    Return the document URL without its file extension.

    .. warning::
        This is only used to keep compatibility with
        the previous search implementation.
        Use `url` instead.
    """
    projects_url = self.context.get('projects_url')
    if not projects_url:
        # No URL mapping available in the serializer context; mirror the
        # implicit-None behavior of the original implementation.
        return None
    base_url = projects_url[obj.project]
    path_without_ext = os.path.splitext(obj.path)[0]
    return base_url + path_without_ext

def get_url(self, obj):
"""Gets the full url."""
projects_url = self.context.get('projects_url')
if projects_url:
docs_url = projects_url[obj.project]
Expand Down
27 changes: 12 additions & 15 deletions readthedocs/search/parse_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,40 +59,37 @@ def generate_sections_from_pyquery(body):
}


def process_file(filename):
"""Read a file from disk and parse it into a structured dict."""
def process_file(fjson_filename, filename):
"""Read the fjson file from disk and parse it into a structured dict."""
try:
with codecs.open(filename, encoding='utf-8', mode='r') as f:
with codecs.open(fjson_filename, encoding='utf-8', mode='r') as f:
file_contents = f.read()
except IOError:
log.info('Unable to read file: %s', filename)
return None
log.info('Unable to read file: %s', fjson_filename)
raise
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't this cause all indexing to fail if a single file is missing?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see we're catching it at a higher level, 👍

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We were returning None, but we expect to always have a dict.

This is caught by https://github.com/stsewd/readthedocs.org/blob/96a85fa8af3cac8b139bdf99598d119eae0e0163/readthedocs/projects/models.py#L1244-L1260

Which returns a default dict

data = json.loads(file_contents)
sections = []
title = ''
body_content = ''
if 'current_page_name' in data:
path = data['current_page_name']
else:
log.info('Unable to index file due to no name %s', filename)
return None
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we return None from this function, get_processed_json is going to return None too, and we always expect a dict from here. get_processed_json has a default: https://github.com/stsewd/readthedocs.org/blob/a2e0e3f5e442b0072a4b7cbac9efadbe2a41c224/readthedocs/projects/models.py#L1254-L1261

if 'body' in data and data['body']:

if data.get('body'):
body = PyQuery(data['body'])
body_content = body.text().replace('¶', '')
sections.extend(generate_sections_from_pyquery(body))
else:
log.info('Unable to index content for: %s', filename)
log.info('Unable to index content for: %s', fjson_filename)

if 'title' in data:
title = data['title']
if title.startswith('<'):
title = PyQuery(data['title']).text()
else:
log.info('Unable to index title for: %s', filename)
log.info('Unable to index title for: %s', fjson_filename)

return {
'headers': process_headers(data, filename),
'headers': process_headers(data, fjson_filename),
'content': body_content,
'path': path,
'path': filename,
'title': title,
'sections': sections,
}
Expand Down