From 32e3ddc9999d538cbab41a69d8c79c3a61edb594 Mon Sep 17 00:00:00 2001
From: Corry Haines
Date: Sat, 28 Jan 2023 16:12:51 -0800
Subject: [PATCH] Templatize robots.txt

The existing static robots.txt did not allow any method of adding
additional disallowed agents. Since a substantial majority of web
traffic to my test instance is a few SEO crawlers, and this is likely
to be true for others as well, add a method by which well-behaved
robots can be blocked.
---
 core/views.py                        | 18 ++++++++++++++++++
 docker/nginx.conf.d/default.conf.tpl |  5 -----
 takahe/settings.py                   |  6 ++++++
 takahe/urls.py                       |  1 +
 templates/robots.txt                 | 13 +++++++++++++
 5 files changed, 38 insertions(+), 5 deletions(-)
 create mode 100644 templates/robots.txt

diff --git a/core/views.py b/core/views.py
index d8fddd48..c7827288 100644
--- a/core/views.py
+++ b/core/views.py
@@ -2,6 +2,7 @@
 from typing import ClassVar
 
 import markdown_it
+from django.conf import settings
 from django.http import HttpResponse
 from django.shortcuts import redirect
 from django.templatetags.static import static
@@ -69,6 +70,23 @@ def get_static_content(self) -> str | bytes:
         raise NotImplementedError()
 
 
+@method_decorator(cache_page(60 * 60), name="dispatch")
+class RobotsTxt(TemplateView):
+    """
+    Serves the robots.txt for Takahē
+
+    To specify additional user-agents to disallow, use TAKAHE_ROBOTS_TXT_DISALLOWED_USER_AGENTS
+    """
+
+    template_name = "robots.txt"
+    content_type = "text/plain"
+
+    def get_context_data(self):
+        return {
+            "user_agents": getattr(settings, "ROBOTS_TXT_DISALLOWED_USER_AGENTS", []),
+        }
+
+
 @method_decorator(cache_control(max_age=60 * 15), name="dispatch")
 class AppManifest(StaticContentView):
     """
diff --git a/docker/nginx.conf.d/default.conf.tpl b/docker/nginx.conf.d/default.conf.tpl
index 3bcb2b8e..36b73d0d 100644
--- a/docker/nginx.conf.d/default.conf.tpl
+++ b/docker/nginx.conf.d/default.conf.tpl
@@ -29,11 +29,6 @@ server {
         proxy_hide_header X-Takahe-User;
         proxy_hide_header X-Takahe-Identity;
 
-        # Serve robots.txt from the non-collected dir as a special case.
-        location /robots.txt {
-            alias /takahe/static/robots.txt;
-        }
-
         # Serves static files from the collected dir
         location /static/ {
             # Files in static have cache-busting hashes in the name, thus can be cached forever
diff --git a/takahe/settings.py b/takahe/settings.py
index 3b044eae..efd7861f 100644
--- a/takahe/settings.py
+++ b/takahe/settings.py
@@ -105,6 +105,10 @@ class Settings(BaseSettings):
     AUTO_ADMIN_EMAIL: EmailStr | None = None
     ERROR_EMAILS: list[EmailStr] | None = None
 
+    #: If set, a list of user agents to completely disallow in robots.txt
+    #: List formatting must be a valid JSON list, such as `["Agent1", "Agent2"]`
+    ROBOTS_TXT_DISALLOWED_USER_AGENTS: list[str] = Field(default_factory=list)
+
     MEDIA_URL: str = "/media/"
     MEDIA_ROOT: str = str(BASE_DIR / "media")
     MEDIA_BACKEND: MediaBackendUrl | None = None
@@ -313,6 +317,8 @@ class Config:
 STATOR_CONCURRENCY = SETUP.STATOR_CONCURRENCY
 STATOR_CONCURRENCY_PER_MODEL = SETUP.STATOR_CONCURRENCY_PER_MODEL
 
+ROBOTS_TXT_DISALLOWED_USER_AGENTS = SETUP.ROBOTS_TXT_DISALLOWED_USER_AGENTS
+
 CORS_ORIGIN_ALLOW_ALL = True  # Temporary
 CORS_ORIGIN_WHITELIST = SETUP.CORS_HOSTS
 CORS_ALLOW_CREDENTIALS = True
diff --git a/takahe/urls.py b/takahe/urls.py
index 97bb8d41..d4373083 100644
--- a/takahe/urls.py
+++ b/takahe/urls.py
@@ -19,6 +19,7 @@
 
 urlpatterns = [
     path("", core.homepage),
+    path("robots.txt", core.RobotsTxt.as_view()),
     path("manifest.json", core.AppManifest.as_view()),
     # Activity views
     path("notifications/", timelines.Notifications.as_view(), name="notifications"),
diff --git a/templates/robots.txt b/templates/robots.txt
new file mode 100644
index 00000000..3ea91127
--- /dev/null
+++ b/templates/robots.txt
@@ -0,0 +1,13 @@
+User-Agent: *
+
+# Don't allow any bot to crawl tags.
+Disallow: /tags/
+Disallow: /tags/*
+
+# Don't allow bots to crawl through the proxy
+Disallow: /proxy/*
+
+{% for user_agent in user_agents %}
+User-agent: {{user_agent}}
+Disallow: /
+{% endfor %}
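
Reviewer note (not part of the commit): a minimal sketch of how the new setting
could be exercised, assuming a standard Django/pytest-django test setup for
Takahē. The agent name "ExampleBot" and the test function below are hypothetical
illustrations, not code from this patch.

# Hypothetical test sketch; assumes Django settings and the templates/
# directory are configured as in a normal Takahē test run.
from django.test import RequestFactory, override_settings

from core.views import RobotsTxt


@override_settings(ROBOTS_TXT_DISALLOWED_USER_AGENTS=["ExampleBot"])
def test_robots_txt_disallows_configured_agents():
    # Call the view directly; cache_page wraps dispatch, so the returned
    # TemplateResponse still needs an explicit render() here.
    request = RequestFactory().get("/robots.txt")
    response = RobotsTxt.as_view()(request).render()
    body = response.content.decode()

    # The static rules from templates/robots.txt are always present...
    assert "Disallow: /tags/" in body
    assert "Disallow: /proxy/*" in body
    # ...and each configured agent gets its own blanket Disallow block.
    assert "User-agent: ExampleBot" in body
    assert "Disallow: /" in body

At deploy time the same list would be supplied through the environment as a
JSON list, per the comment in takahe/settings.py, for example
TAKAHE_ROBOTS_TXT_DISALLOWED_USER_AGENTS='["ExampleBot"]' (again, the agent
name is only an example).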