Skip to content

Commit

Permalink
Addons + Proxito: return X-RTD-Resolver-Filename and inject via CF (#…
Browse files Browse the repository at this point in the history
…11100)

* Development: use `wrangler` locally (update NGINX/Dockerfile config)

Related readthedocs/addons#217

* Add CF Worker .js script

* Proxy JS files to Addons GitHub's repository

* Point common to wrangler branch

* Addons + Proxito: return `X-RTD-Resolver-Filename` and inject via CF

Return `X-RTD-Resolver-Filename` from El Proxito and inject it as HTML `meta`
tag from Cloudflare Worker.

Required by readthedocs/addons#211

* Feedback from review

* Validate the header value before injecting it

* Initialize `request.unresolved_url` as `None`

* Add test for longer filename

* Inject the `unresolved_url` on 404 pages
  • Loading branch information
humitos authored Feb 27, 2024
1 parent 5279a90 commit 48203f2
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 41 deletions.
2 changes: 2 additions & 0 deletions dockerfiles/nginx/proxito.conf.template
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ server {
add_header X-RTD-Project-Method $rtd_project_method always;
set $rtd_redirect $upstream_http_x_rtd_redirect;
add_header X-RTD-Redirect $rtd_redirect always;
set $rtd_resolver_filename $upstream_http_x_rtd_resolver_filename;
add_header X-RTD-Resolver-Filename $rtd_resolver_filename always;
set $cdn_cache_control $upstream_http_cdn_cache_control;
add_header CDN-Cache-Control $cdn_cache_control always;
set $cache_tag $upstream_http_cache_tag;
Expand Down
25 changes: 25 additions & 0 deletions readthedocs/proxito/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
)
from django.conf import settings
from django.core.exceptions import SuspiciousOperation
from django.http.response import BadHeaderError, ResponseHeaders
from django.shortcuts import redirect
from django.urls import reverse
from django.utils.deprecation import MiddlewareMixin
from django.utils.html import escape

from readthedocs.builds.models import Version
from readthedocs.core.unresolver import (
Expand Down Expand Up @@ -198,6 +200,7 @@ def _set_request_attributes(self, request, unresolved_domain):
def process_request(self, request): # noqa
# Initialize our custom request attributes.
request.unresolved_domain = None
request.unresolved_url = None

skip = any(request.path.startswith(reverse(view)) for view in self.skip_views)
if skip:
Expand Down Expand Up @@ -367,11 +370,33 @@ def _get_https_redirect(self, request):

return None

def add_resolver_headers(self, request, response):
if request.unresolved_url is not None:
# TODO: add more ``X-RTD-Resolver-*`` headers
header_value = escape(request.unresolved_url.filename)
try:
# Use Django internals to validate the header's value before injecting it.
ResponseHeaders({})._convert_to_charset(
header_value,
"latin-1",
mime_encode=True,
)

response["X-RTD-Resolver-Filename"] = header_value
except BadHeaderError:
# Skip adding the header because it fails validation
log.info(
"Skip adding X-RTD-Resolver-Filename header due to invalid value.",
filename=request.unresolved_url.filename,
value=header_value,
)

def process_response(self, request, response): # noqa
self.add_proxito_headers(request, response)
self.add_cache_headers(request, response)
self.add_hsts_headers(request, response)
self.add_user_headers(request, response)
self.add_hosting_integrations_headers(request, response)
self.add_resolver_headers(request, response)
self.add_cors_headers(request, response)
return response
20 changes: 20 additions & 0 deletions readthedocs/proxito/tests/test_headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,30 @@ def test_serve_headers(self):
self.assertEqual(r["X-RTD-Project-Method"], "public_domain")
self.assertEqual(r["X-RTD-Version"], "latest")
self.assertEqual(r["X-RTD-version-Method"], "path")
self.assertEqual(r["X-RTD-Resolver-Filename"], "/")
self.assertEqual(
r["X-RTD-Path"], "/proxito/media/html/project/latest/index.html"
)

def test_serve_headers_with_path(self):
r = self.client.get(
"/en/latest/guides/jupyter/gallery.html",
secure=True,
headers={"host": "project.dev.readthedocs.io"},
)
self.assertEqual(r.status_code, 200)
self.assertEqual(r["Cache-Tag"], "project,project:latest")
self.assertEqual(r["X-RTD-Domain"], "project.dev.readthedocs.io")
self.assertEqual(r["X-RTD-Project"], "project")
self.assertEqual(r["X-RTD-Project-Method"], "public_domain")
self.assertEqual(r["X-RTD-Version"], "latest")
self.assertEqual(r["X-RTD-version-Method"], "path")
self.assertEqual(r["X-RTD-Resolver-Filename"], "/guides/jupyter/gallery.html")
self.assertEqual(
r["X-RTD-Path"],
"/proxito/media/html/project/latest/guides/jupyter/gallery.html",
)

def test_subproject_serve_headers(self):
r = self.client.get(
"/projects/subproject/en/latest/",
Expand Down
1 change: 1 addition & 0 deletions readthedocs/proxito/tests/test_old_redirects.py
Original file line number Diff line number Diff line change
Expand Up @@ -1352,6 +1352,7 @@ def test_redirect_exact_with_wildcard_crossdomain(self):
r = self.client.get(url, headers={"host": "project.dev.readthedocs.io"})
self.assertEqual(r.status_code, 302, url)
self.assertEqual(r["Location"], expected_location, url)
self.assertNotIn("X-RTD-Resolver-Filename", r.headers)

def test_redirect_html_to_clean_url_crossdomain(self):
"""
Expand Down
99 changes: 58 additions & 41 deletions readthedocs/proxito/views/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ def serve_path(self, request, path):
version = unresolved.version
filename = unresolved.filename

# Inject the UnresolvedURL into the HttpRequest so we can access from the middleware.
# We could resolve it again from the middleware, but we would duplicating DB queries.
request.unresolved_url = unresolved

# Check if the old language code format was used, and redirect to the new one.
# NOTE: we may have some false positives here, for example for an URL like:
# /pt-br/latest/pt_BR/index.html, but our protection for infinite redirects
Expand Down Expand Up @@ -391,7 +395,7 @@ def get(self, request, proxito_path):
the Docs default page (Maze Found) is rendered by Django and served.
"""
log.bind(proxito_path=proxito_path)
log.debug('Executing 404 handler.')
log.debug("Executing 404 handler.")
unresolved_domain = request.unresolved_domain
# We force all storage calls to use the external versions storage,
# since we are serving an external version.
Expand Down Expand Up @@ -420,6 +424,11 @@ def get(self, request, proxito_path):
path=proxito_path,
append_indexhtml=False,
)

# Inject the UnresolvedURL into the HttpRequest so we can access from the middleware.
# We could resolve it again from the middleware, but we would duplicating DB queries.
request.unresolved_url = unresolved

project = unresolved.project
version = unresolved.version
filename = unresolved.filename
Expand Down Expand Up @@ -708,11 +717,12 @@ def get(self, request):
# Verify if the project is marked as spam and return a custom robots.txt
if "readthedocsext.spamfighting" in settings.INSTALLED_APPS:
from readthedocsext.spamfighting.utils import is_robotstxt_denied # noqa

if is_robotstxt_denied(project):
return render(
request,
'robots.spam.txt',
content_type='text/plain',
"robots.spam.txt",
content_type="text/plain",
)

# Use the ``robots.txt`` file from the default version configured
Expand Down Expand Up @@ -747,33 +757,30 @@ def get(self, request):
filename="robots.txt",
check_if_exists=True,
)
log.info('Serving custom robots.txt file.')
log.info("Serving custom robots.txt file.")
return response
except StorageFileNotFound:
pass

# Serve default robots.txt
sitemap_url = '{scheme}://{domain}/sitemap.xml'.format(
scheme='https',
sitemap_url = "{scheme}://{domain}/sitemap.xml".format(
scheme="https",
domain=project.subdomain(),
)
context = {
'sitemap_url': sitemap_url,
'hidden_paths': self._get_hidden_paths(project),
"sitemap_url": sitemap_url,
"hidden_paths": self._get_hidden_paths(project),
}
return render(
request,
'robots.txt',
"robots.txt",
context,
content_type='text/plain',
content_type="text/plain",
)

def _get_hidden_paths(self, project):
"""Get the absolute paths of the public hidden versions of `project`."""
hidden_versions = (
Version.internal.public(project=project)
.filter(hidden=True)
)
hidden_versions = Version.internal.public(project=project).filter(hidden=True)
resolver = Resolver()
hidden_paths = [
resolver.resolve_path(project, version_slug=version.slug)
Expand Down Expand Up @@ -846,8 +853,8 @@ def hreflang_formatter(lang):
Use hyphen instead of underscore in language and country value.
ref: https://en.wikipedia.org/wiki/Hreflang#Common_Mistakes
"""
if '_' in lang:
return lang.replace('_', '-')
if "_" in lang:
return lang.replace("_", "-")
return lang

def changefreqs_generator():
Expand All @@ -861,8 +868,8 @@ def changefreqs_generator():
aggressive. If the tag is removed and a branch is created with the same
name, we will want bots to revisit this.
"""
changefreqs = ['weekly', 'daily']
yield from itertools.chain(changefreqs, itertools.repeat('monthly'))
changefreqs = ["weekly", "daily"]
yield from itertools.chain(changefreqs, itertools.repeat("monthly"))

project = request.unresolved_domain.project
public_versions = Version.internal.public(
Expand All @@ -879,28 +886,34 @@ def changefreqs_generator():
# We want stable with priority=1 and changefreq='weekly' and
# latest with priority=0.9 and changefreq='daily'
# More details on this: https://github.com/rtfd/readthedocs.org/issues/5447
if (len(sorted_versions) >= 2 and sorted_versions[0].slug == LATEST and
sorted_versions[1].slug == STABLE):
sorted_versions[0], sorted_versions[1] = sorted_versions[1], sorted_versions[0]
if (
len(sorted_versions) >= 2
and sorted_versions[0].slug == LATEST
and sorted_versions[1].slug == STABLE
):
sorted_versions[0], sorted_versions[1] = (
sorted_versions[1],
sorted_versions[0],
)

versions = []
for version, priority, changefreq in zip(
sorted_versions,
priorities_generator(),
changefreqs_generator(),
sorted_versions,
priorities_generator(),
changefreqs_generator(),
):
element = {
'loc': version.get_subdomain_url(),
'priority': priority,
'changefreq': changefreq,
'languages': [],
"loc": version.get_subdomain_url(),
"priority": priority,
"changefreq": changefreq,
"languages": [],
}

# Version can be enabled, but not ``built`` yet. We want to show the
# link without a ``lastmod`` attribute
last_build = version.builds.order_by('-date').first()
last_build = version.builds.order_by("-date").first()
if last_build:
element['lastmod'] = last_build.date.isoformat()
element["lastmod"] = last_build.date.isoformat()

resolver = Resolver()
if project.translations.exists():
Expand All @@ -915,27 +928,31 @@ def changefreqs_generator():
project=translation,
version=translated_version,
)
element['languages'].append({
'hreflang': hreflang_formatter(translation.language),
'href': href,
})
element["languages"].append(
{
"hreflang": hreflang_formatter(translation.language),
"href": href,
}
)

# Add itself also as protocol requires
element['languages'].append({
'hreflang': project.language,
'href': element['loc'],
})
element["languages"].append(
{
"hreflang": project.language,
"href": element["loc"],
}
)

versions.append(element)

context = {
'versions': versions,
"versions": versions,
}
return render(
request,
'sitemap.xml',
"sitemap.xml",
context,
content_type='application/xml',
content_type="application/xml",
)

def _get_project(self):
Expand Down

0 comments on commit 48203f2

Please sign in to comment.