Templatize robots.txt (jointakahe#478)
tabletcorry authored and shuuji3 committed Feb 22, 2023
1 parent ff39af1 commit 3896ddc
Showing 5 changed files with 38 additions and 5 deletions.
18 changes: 18 additions & 0 deletions core/views.py
@@ -2,6 +2,7 @@
from typing import ClassVar

import markdown_it
from django.conf import settings
from django.http import HttpResponse
from django.shortcuts import redirect
from django.templatetags.static import static
@@ -69,6 +70,23 @@ def get_static_content(self) -> str | bytes:
raise NotImplementedError()


@method_decorator(cache_page(60 * 60), name="dispatch")
class RobotsTxt(TemplateView):
"""
Serves the robots.txt for Takahē
To specify additional user-agents to disallow, use TAKAHE_ROBOTS_TXT_DISALLOWED_USER_AGENTS
"""

template_name = "robots.txt"
content_type = "text/plain"

def get_context_data(self):
return {
"user_agents": getattr(settings, "ROBOTS_TXT_DISALLOWED_USER_AGENTS", []),
}


@method_decorator(cache_control(max_age=60 * 15), name="dispatch")
class AppManifest(StaticContentView):
"""
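For illustration only (not part of this commit): a minimal sketch of how the new view and setting could be exercised from Django's test client. The helper name and the agent "ExampleBot" are hypothetical, and this assumes the usual Takahē test settings.

from django.test import Client, override_settings

@override_settings(ROBOTS_TXT_DISALLOWED_USER_AGENTS=["ExampleBot"])
def check_robots_txt():
    # Fetch the templatized robots.txt and confirm the extra agent is disallowed.
    response = Client().get("/robots.txt")
    assert response["Content-Type"].startswith("text/plain")
    body = response.content.decode()
    assert "User-agent: ExampleBot" in body
    assert "Disallow: /tags/" in body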
5 changes: 0 additions & 5 deletions docker/nginx.conf.d/default.conf.tpl
@@ -29,11 +29,6 @@ server {
proxy_hide_header X-Takahe-User;
proxy_hide_header X-Takahe-Identity;
# Serve robots.txt from the non-collected dir as a special case.
location /robots.txt {
alias /takahe/static/robots.txt;
}

# Serves static files from the collected dir
location /static/ {
# Files in static have cache-busting hashes in the name, thus can be cached forever
6 changes: 6 additions & 0 deletions takahe/settings.py
@@ -105,6 +105,10 @@ class Settings(BaseSettings):
AUTO_ADMIN_EMAIL: EmailStr | None = None
ERROR_EMAILS: list[EmailStr] | None = None

#: If set, a list of user agents to completely disallow in robots.txt
#: List formatting must be a valid JSON list, such as `["Agent1", "Agent2"]`
ROBOTS_TXT_DISALLOWED_USER_AGENTS: list[str] = Field(default_factory=list)

MEDIA_URL: str = "/media/"
MEDIA_ROOT: str = str(BASE_DIR / "media")
MEDIA_BACKEND: MediaBackendUrl | None = None
@@ -313,6 +317,8 @@ class Config:
STATOR_CONCURRENCY = SETUP.STATOR_CONCURRENCY
STATOR_CONCURRENCY_PER_MODEL = SETUP.STATOR_CONCURRENCY_PER_MODEL

ROBOTS_TXT_DISALLOWED_USER_AGENTS = SETUP.ROBOTS_TXT_DISALLOWED_USER_AGENTS

CORS_ORIGIN_ALLOW_ALL = True # Temporary
CORS_ORIGIN_WHITELIST = SETUP.CORS_HOSTS
CORS_ALLOW_CREDENTIALS = True
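Configuration example (hedged; the agent names are placeholders): per the docstring in core/views.py, the setting is supplied through the TAKAHE_ROBOTS_TXT_DISALLOWED_USER_AGENTS environment variable as a JSON list, e.g.

TAKAHE_ROBOTS_TXT_DISALLOWED_USER_AGENTS='["ExampleBot", "OtherExampleBot"]'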
1 change: 1 addition & 0 deletions takahe/urls.py
@@ -19,6 +19,7 @@

urlpatterns = [
path("", core.homepage),
path("robots.txt", core.RobotsTxt.as_view()),
path("manifest.json", core.AppManifest.as_view()),
# Activity views
path("notifications/", timelines.Notifications.as_view(), name="notifications"),
13 changes: 13 additions & 0 deletions templates/robots.txt
@@ -0,0 +1,13 @@
User-Agent: *

# Don't allow any bot to crawl tags.
Disallow: /tags/
Disallow: /tags/*

# Don't allow bots to crawl through the proxy
Disallow: /proxy/*

{% for user_agent in user_agents %}
User-agent: {{user_agent}}
Disallow: /
{% endfor %}
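For reference (not part of the diff): with ROBOTS_TXT_DISALLOWED_USER_AGENTS set to the hypothetical ["ExampleBot"], the loop above renders an extra block along the lines of

User-agent: ExampleBot
Disallow: /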
