Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addons + Proxito: return X-RTD-Resolver-Filename and inject via CF #11100

Merged
merged 12 commits into from
Feb 27, 2024
Merged
2 changes: 2 additions & 0 deletions dockerfiles/nginx/proxito.conf.template
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ server {
add_header X-RTD-Project-Method $rtd_project_method always;
set $rtd_redirect $upstream_http_x_rtd_redirect;
add_header X-RTD-Redirect $rtd_redirect always;
set $rtd_resolver_filename $upstream_http_x_rtd_resolver_filename;
add_header X-RTD-Resolver-Filename $rtd_resolver_filename always;
set $cdn_cache_control $upstream_http_cdn_cache_control;
add_header CDN-Cache-Control $cdn_cache_control always;
set $cache_tag $upstream_http_cache_tag;
Expand Down
25 changes: 25 additions & 0 deletions readthedocs/proxito/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
)
from django.conf import settings
from django.core.exceptions import SuspiciousOperation
from django.http.response import BadHeaderError, ResponseHeaders
from django.shortcuts import redirect
from django.urls import reverse
from django.utils.deprecation import MiddlewareMixin
from django.utils.html import escape

from readthedocs.builds.models import Version
from readthedocs.core.unresolver import (
Expand Down Expand Up @@ -198,6 +200,7 @@ def _set_request_attributes(self, request, unresolved_domain):
def process_request(self, request): # noqa
# Initialize our custom request attributes.
request.unresolved_domain = None
request.unresolved_url = None

skip = any(request.path.startswith(reverse(view)) for view in self.skip_views)
if skip:
Expand Down Expand Up @@ -367,11 +370,33 @@ def _get_https_redirect(self, request):

return None

def add_resolver_headers(self, request, response):
if request.unresolved_url is not None:
# TODO: add more ``X-RTD-Resolver-*`` headers
header_value = escape(request.unresolved_url.filename)
try:
# Use Django internals to validate the header's value before injecting it.
ResponseHeaders({})._convert_to_charset(
header_value,
"latin-1",
mime_encode=True,
)
humitos marked this conversation as resolved.
Show resolved Hide resolved

response["X-RTD-Resolver-Filename"] = header_value
except BadHeaderError:
# Skip adding the header because it fails validation
log.info(
"Skip adding X-RTD-Resolver-Filename header due to invalid value.",
filename=request.unresolved_url.filename,
value=header_value,
)

def process_response(self, request, response): # noqa
self.add_proxito_headers(request, response)
self.add_cache_headers(request, response)
self.add_hsts_headers(request, response)
self.add_user_headers(request, response)
self.add_hosting_integrations_headers(request, response)
self.add_resolver_headers(request, response)
self.add_cors_headers(request, response)
return response
20 changes: 20 additions & 0 deletions readthedocs/proxito/tests/test_headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,30 @@ def test_serve_headers(self):
self.assertEqual(r["X-RTD-Project-Method"], "public_domain")
self.assertEqual(r["X-RTD-Version"], "latest")
self.assertEqual(r["X-RTD-version-Method"], "path")
self.assertEqual(r["X-RTD-Resolver-Filename"], "/")
humitos marked this conversation as resolved.
Show resolved Hide resolved
self.assertEqual(
r["X-RTD-Path"], "/proxito/media/html/project/latest/index.html"
)

def test_serve_headers_with_path(self):
r = self.client.get(
"/en/latest/guides/jupyter/gallery.html",
secure=True,
headers={"host": "project.dev.readthedocs.io"},
)
self.assertEqual(r.status_code, 200)
self.assertEqual(r["Cache-Tag"], "project,project:latest")
self.assertEqual(r["X-RTD-Domain"], "project.dev.readthedocs.io")
self.assertEqual(r["X-RTD-Project"], "project")
self.assertEqual(r["X-RTD-Project-Method"], "public_domain")
self.assertEqual(r["X-RTD-Version"], "latest")
self.assertEqual(r["X-RTD-version-Method"], "path")
self.assertEqual(r["X-RTD-Resolver-Filename"], "/guides/jupyter/gallery.html")
self.assertEqual(
r["X-RTD-Path"],
"/proxito/media/html/project/latest/guides/jupyter/gallery.html",
)

def test_subproject_serve_headers(self):
r = self.client.get(
"/projects/subproject/en/latest/",
Expand Down
1 change: 1 addition & 0 deletions readthedocs/proxito/tests/test_old_redirects.py
Original file line number Diff line number Diff line change
Expand Up @@ -1352,6 +1352,7 @@ def test_redirect_exact_with_wildcard_crossdomain(self):
r = self.client.get(url, headers={"host": "project.dev.readthedocs.io"})
self.assertEqual(r.status_code, 302, url)
self.assertEqual(r["Location"], expected_location, url)
self.assertNotIn("X-RTD-Resolver-Filename", r.headers)

def test_redirect_html_to_clean_url_crossdomain(self):
"""
Expand Down
99 changes: 58 additions & 41 deletions readthedocs/proxito/views/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ def serve_path(self, request, path):
version = unresolved.version
filename = unresolved.filename

# Inject the UnresolvedURL into the HttpRequest so we can access from the middleware.
# We could resolve it again from the middleware, but we would duplicating DB queries.
request.unresolved_url = unresolved
Comment on lines +269 to +270
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should also be in the 404 view.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍🏼 -- I put it there.

I found there could be cases where the unresolve_path could raise and exception making the request.unresolved_url=None but there would be a filename anyways, since it's got from exc.path.

I think we could eventually refactor the proxito/unresolver code to support UnresolvedURL without all the components. This is out of the scope of this PR, tho and probably also related to #10456


# Check if the old language code format was used, and redirect to the new one.
# NOTE: we may have some false positives here, for example for an URL like:
# /pt-br/latest/pt_BR/index.html, but our protection for infinite redirects
Expand Down Expand Up @@ -391,7 +395,7 @@ def get(self, request, proxito_path):
the Docs default page (Maze Found) is rendered by Django and served.
"""
log.bind(proxito_path=proxito_path)
log.debug('Executing 404 handler.')
log.debug("Executing 404 handler.")
unresolved_domain = request.unresolved_domain
# We force all storage calls to use the external versions storage,
# since we are serving an external version.
Expand Down Expand Up @@ -420,6 +424,11 @@ def get(self, request, proxito_path):
path=proxito_path,
append_indexhtml=False,
)

# Inject the UnresolvedURL into the HttpRequest so we can access from the middleware.
# We could resolve it again from the middleware, but we would duplicating DB queries.
request.unresolved_url = unresolved

project = unresolved.project
version = unresolved.version
filename = unresolved.filename
Expand Down Expand Up @@ -708,11 +717,12 @@ def get(self, request):
# Verify if the project is marked as spam and return a custom robots.txt
if "readthedocsext.spamfighting" in settings.INSTALLED_APPS:
from readthedocsext.spamfighting.utils import is_robotstxt_denied # noqa

if is_robotstxt_denied(project):
return render(
request,
'robots.spam.txt',
content_type='text/plain',
"robots.spam.txt",
content_type="text/plain",
)

# Use the ``robots.txt`` file from the default version configured
Expand Down Expand Up @@ -747,33 +757,30 @@ def get(self, request):
filename="robots.txt",
check_if_exists=True,
)
log.info('Serving custom robots.txt file.')
log.info("Serving custom robots.txt file.")
return response
except StorageFileNotFound:
pass

# Serve default robots.txt
sitemap_url = '{scheme}://{domain}/sitemap.xml'.format(
scheme='https',
sitemap_url = "{scheme}://{domain}/sitemap.xml".format(
scheme="https",
domain=project.subdomain(),
)
context = {
'sitemap_url': sitemap_url,
'hidden_paths': self._get_hidden_paths(project),
"sitemap_url": sitemap_url,
"hidden_paths": self._get_hidden_paths(project),
}
return render(
request,
'robots.txt',
"robots.txt",
context,
content_type='text/plain',
content_type="text/plain",
)

def _get_hidden_paths(self, project):
"""Get the absolute paths of the public hidden versions of `project`."""
hidden_versions = (
Version.internal.public(project=project)
.filter(hidden=True)
)
hidden_versions = Version.internal.public(project=project).filter(hidden=True)
resolver = Resolver()
hidden_paths = [
resolver.resolve_path(project, version_slug=version.slug)
Expand Down Expand Up @@ -846,8 +853,8 @@ def hreflang_formatter(lang):
Use hyphen instead of underscore in language and country value.
ref: https://en.wikipedia.org/wiki/Hreflang#Common_Mistakes
"""
if '_' in lang:
return lang.replace('_', '-')
if "_" in lang:
return lang.replace("_", "-")
return lang

def changefreqs_generator():
Expand All @@ -861,8 +868,8 @@ def changefreqs_generator():
aggressive. If the tag is removed and a branch is created with the same
name, we will want bots to revisit this.
"""
changefreqs = ['weekly', 'daily']
yield from itertools.chain(changefreqs, itertools.repeat('monthly'))
changefreqs = ["weekly", "daily"]
yield from itertools.chain(changefreqs, itertools.repeat("monthly"))

project = request.unresolved_domain.project
public_versions = Version.internal.public(
Expand All @@ -879,28 +886,34 @@ def changefreqs_generator():
# We want stable with priority=1 and changefreq='weekly' and
# latest with priority=0.9 and changefreq='daily'
# More details on this: https://github.com/rtfd/readthedocs.org/issues/5447
if (len(sorted_versions) >= 2 and sorted_versions[0].slug == LATEST and
sorted_versions[1].slug == STABLE):
sorted_versions[0], sorted_versions[1] = sorted_versions[1], sorted_versions[0]
if (
len(sorted_versions) >= 2
and sorted_versions[0].slug == LATEST
and sorted_versions[1].slug == STABLE
):
sorted_versions[0], sorted_versions[1] = (
sorted_versions[1],
sorted_versions[0],
)

versions = []
for version, priority, changefreq in zip(
sorted_versions,
priorities_generator(),
changefreqs_generator(),
sorted_versions,
priorities_generator(),
changefreqs_generator(),
):
element = {
'loc': version.get_subdomain_url(),
'priority': priority,
'changefreq': changefreq,
'languages': [],
"loc": version.get_subdomain_url(),
"priority": priority,
"changefreq": changefreq,
"languages": [],
}

# Version can be enabled, but not ``built`` yet. We want to show the
# link without a ``lastmod`` attribute
last_build = version.builds.order_by('-date').first()
last_build = version.builds.order_by("-date").first()
if last_build:
element['lastmod'] = last_build.date.isoformat()
element["lastmod"] = last_build.date.isoformat()

resolver = Resolver()
if project.translations.exists():
Expand All @@ -915,27 +928,31 @@ def changefreqs_generator():
project=translation,
version=translated_version,
)
element['languages'].append({
'hreflang': hreflang_formatter(translation.language),
'href': href,
})
element["languages"].append(
{
"hreflang": hreflang_formatter(translation.language),
"href": href,
}
)

# Add itself also as protocol requires
element['languages'].append({
'hreflang': project.language,
'href': element['loc'],
})
element["languages"].append(
{
"hreflang": project.language,
"href": element["loc"],
}
)

versions.append(element)

context = {
'versions': versions,
"versions": versions,
}
return render(
request,
'sitemap.xml',
"sitemap.xml",
context,
content_type='application/xml',
content_type="application/xml",
)

def _get_project(self):
Expand Down