From ec6a3e2c9ab5e8d49ab4a5e876ec1e68bb1de587 Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Fri, 23 Jun 2023 10:56:52 -0400 Subject: [PATCH 1/8] Allow setting the number of nginx worker connections --- ansible/roles/nginx/README.md | 4 ++++ ansible/roles/nginx/defaults/main.yml | 1 + ansible/roles/nginx/tasks/main.yml | 10 ++++++++++ ansible/roles/nginx/templates/nginx.conf | 2 +- ansible/roles/nginx/templates/override.conf | 6 ++++++ 5 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 ansible/roles/nginx/templates/override.conf diff --git a/ansible/roles/nginx/README.md b/ansible/roles/nginx/README.md index fe298a14d..9b11119fd 100644 --- a/ansible/roles/nginx/README.md +++ b/ansible/roles/nginx/README.md @@ -9,6 +9,10 @@ this role as well. ```yaml - role: nginx + # The number of worker connections. [optional] + # https://nginx.org/en/docs/ngx_core_module.html#worker_connections + worker_connections: 123 + # Configures reverse proxies with HTTPS termination. [optional] proxied: # The domain to proxy from diff --git a/ansible/roles/nginx/defaults/main.yml b/ansible/roles/nginx/defaults/main.yml index 7a417cfda..fd0212240 100644 --- a/ansible/roles/nginx/defaults/main.yml +++ b/ansible/roles/nginx/defaults/main.yml @@ -1,4 +1,5 @@ --- # See this role's README for documentation about these defaults. 
+worker_connections: 768 proxied: {} diff --git a/ansible/roles/nginx/tasks/main.yml b/ansible/roles/nginx/tasks/main.yml index 4043415cc..fc88234e9 100644 --- a/ansible/roles/nginx/tasks/main.yml +++ b/ansible/roles/nginx/tasks/main.yml @@ -25,3 +25,13 @@ src: after-ssl-renew.sh dest: /etc/ssl/letsencrypt/after-renew.d mode: 0750 + +- name: create systemd override directory + file: + path: /etc/systemd/system/nginx.service.d + state: directory + +- name: create systemd override file + template: + src: override.conf + dest: /etc/systemd/system/nginx.service.d/override.conf diff --git a/ansible/roles/nginx/templates/nginx.conf b/ansible/roles/nginx/templates/nginx.conf index 7fd8762ae..f8f74dbd0 100644 --- a/ansible/roles/nginx/templates/nginx.conf +++ b/ansible/roles/nginx/templates/nginx.conf @@ -8,7 +8,7 @@ pid /run/nginx.pid; include /etc/nginx/modules-enabled/*.conf; events { - worker_connections 768; + worker_connections {{ worker_connections }}; } http { diff --git a/ansible/roles/nginx/templates/override.conf b/ansible/roles/nginx/templates/override.conf new file mode 100644 index 000000000..97b69538b --- /dev/null +++ b/ansible/roles/nginx/templates/override.conf @@ -0,0 +1,6 @@ +[Service] +# This assumes that the NGINX instance will usually be used as an +# upstream proxy. Each incoming connection takes one FD for the client +# and one FD for the proxy. We add a few extra FDs to account for +# things like config and log files. +LimitNOFILE={{ ((worker_connections | int) * 2) + 32 }} From f58d948157824de0a95af054582b0671ac01101a Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Fri, 23 Jun 2023 10:21:08 -0400 Subject: [PATCH 2/8] Reduce error rates by always routing to 127.0.0.1 The playground server only listens on an IPv4 address, not IPv6. Sending requests to `localhost` routes to both the IPv4 **and** IPv6, which results in a percentage of requests seemingly randomly failing. 
--- ansible/playbooks/playground.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/playbooks/playground.yml b/ansible/playbooks/playground.yml index 6dedc915c..cc2aba862 100644 --- a/ansible/playbooks/playground.yml +++ b/ansible/playbooks/playground.yml @@ -22,7 +22,7 @@ - role: nginx proxied: - domain: "{{ vars_playground_domain }}" - to: "http://localhost:{{ vars_playground_env_ui_port }}" + to: "http://127.0.0.1:{{ vars_playground_env_ui_port }}" websockets: - '/websocket' From 428ea8b39be0a1ce0b1e827d6688877da9dc804e Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Fri, 23 Jun 2023 10:57:54 -0400 Subject: [PATCH 3/8] Increase number of file descriptors available to the playground With the WebSocket functionality, we now have a number of latent connections hanging out. --- ansible/playbooks/playground.yml | 1 + ansible/roles/playground/defaults/main.yml | 5 +++++ ansible/roles/playground/templates/playground.service | 2 ++ 3 files changed, 8 insertions(+) diff --git a/ansible/playbooks/playground.yml b/ansible/playbooks/playground.yml index cc2aba862..0b3792761 100644 --- a/ansible/playbooks/playground.yml +++ b/ansible/playbooks/playground.yml @@ -20,6 +20,7 @@ - "{{ vars_playground_domain }}" - role: nginx + worker_connections: "{{ vars_playground_number_connections }}" proxied: - domain: "{{ vars_playground_domain }}" to: "http://127.0.0.1:{{ vars_playground_env_ui_port }}" diff --git a/ansible/roles/playground/defaults/main.yml b/ansible/roles/playground/defaults/main.yml index b478e6fa7..0f96a1d8b 100644 --- a/ansible/roles/playground/defaults/main.yml +++ b/ansible/roles/playground/defaults/main.yml @@ -19,3 +19,8 @@ vars_playground_env_cors_enabled: 1 vars_playground_env_ui_address: 127.0.0.1 vars_playground_env_ui_port: 8080 vars_playground_env_ui_root_path: "{{ vars_playground_artifacts_path }}/build" + +# The playground peaks at a bit less than 2000 open file descriptors +# over a normal week but `2560` resulted in 
sporadic errors, so there +# must be many non-WebSocket connections. +vars_playground_number_connections: 5120 diff --git a/ansible/roles/playground/templates/playground.service b/ansible/roles/playground/templates/playground.service index c0bc5638d..7b9477bb4 100644 --- a/ansible/roles/playground/templates/playground.service +++ b/ansible/roles/playground/templates/playground.service @@ -19,5 +19,7 @@ WorkingDirectory={{ vars_playground_artifacts_path }} ExecStart={{ vars_playground_executable_path }} +LimitNOFILE={{ vars_playground_number_connections }} + [Install] WantedBy=multi-user.target From 08f75d0401cb49196291655d5a1487af954bc24d Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Fri, 23 Jun 2023 11:28:56 -0400 Subject: [PATCH 4/8] Allow checking and diffing the Ansible run --- ansible/apply | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ansible/apply b/ansible/apply index 613b1a40e..39576cdf5 100755 --- a/ansible/apply +++ b/ansible/apply @@ -74,6 +74,10 @@ def run_playbook(args): ansible_args += ["-u", args.user] if args.start_at_task is not None: ansible_args += ["--start-at-task", args.start_at_task] + if args.check: + ansible_args += ["--check"] + if args.diff: + ansible_args += ["--diff"] res = subprocess.run(ansible_args, cwd=str(tempdir)) if res.returncode != 0: exit(1) @@ -92,6 +96,14 @@ if __name__ == "__main__": "--start-at-task", help="start at a task with the provided name", default=None, ) + parser.add_argument( + "--check", help="perform an Ansible check run", + action="store_true", + ) + parser.add_argument( + "--diff", help="perform an Ansible diff run", + action="store_true", + ) args = parser.parse_args() install_ansible() From 8b2cb1a3e0c928fb162246616e6b5cea380e82c7 Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Wed, 13 Sep 2023 13:40:37 -0400 Subject: [PATCH 5/8] Restart the playground if it dies --- ansible/roles/playground/templates/playground.service | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/ansible/roles/playground/templates/playground.service b/ansible/roles/playground/templates/playground.service index 7b9477bb4..eae6d6201 100644 --- a/ansible/roles/playground/templates/playground.service +++ b/ansible/roles/playground/templates/playground.service @@ -6,6 +6,8 @@ Description=The Rust Playground [Service] +Restart=on-failure + Environment=TMPDIR={{ vars_playground_mountpoint_path }} Environment=RUST_LOG=info Environment=PLAYGROUND_CORS_ENABLED={{ vars_playground_env_cors_enabled }} From 005ed1fec8423878224e3505819f4e85edb9f6cd Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Wed, 13 Sep 2023 13:41:03 -0400 Subject: [PATCH 6/8] Place playground processes into a systemd slice This allows us to influence the OOM killer's behavior, hopefully killing our processes instead of something more important to the system. --- ansible/roles/playground/handlers/main.yml | 6 ++++ ansible/roles/playground/tasks/main.yml | 35 +++++++++++++++++++ .../templates/containerd-override.conf | 2 ++ .../roles/playground/templates/daemon.json | 1 + .../playground/templates/docker-override.conf | 2 ++ .../playground/templates/playground.service | 2 ++ .../playground/templates/playground.slice | 3 ++ 7 files changed, 51 insertions(+) create mode 100644 ansible/roles/playground/templates/containerd-override.conf create mode 100644 ansible/roles/playground/templates/docker-override.conf create mode 100644 ansible/roles/playground/templates/playground.slice diff --git a/ansible/roles/playground/handlers/main.yml b/ansible/roles/playground/handlers/main.yml index 0692a3344..448e9d63a 100644 --- a/ansible/roles/playground/handlers/main.yml +++ b/ansible/roles/playground/handlers/main.yml @@ -6,6 +6,12 @@ state: restarted daemon_reload: true +- name: restart-containerd + systemd: + name: containerd + state: restarted + daemon_reload: true + - name: start-playground-update systemd: name: playground-update diff --git a/ansible/roles/playground/tasks/main.yml 
b/ansible/roles/playground/tasks/main.yml index 4e8c293c7..2f525143b 100644 --- a/ansible/roles/playground/tasks/main.yml +++ b/ansible/roles/playground/tasks/main.yml @@ -1,5 +1,16 @@ --- +# Create a Systemd slice to manage resources (Memory, CPU) across the +# processes spawned by the playground. + +- name: Configure playground slice + template: + src: playground.slice + dest: /etc/systemd/system/playground.slice + mode: 0644 + +# -------------------- + - name: Add Docker APT repository GPG key apt_key: state: present @@ -30,6 +41,30 @@ mode: 0600 notify: restart-docker +- name: Create the Docker systemd drop-in directory + file: + path: /etc/systemd/system/docker.service.d + state: directory + +- name: Add Docker to the Playground slice + template: + src: docker-override.conf + dest: /etc/systemd/system/docker.service.d/override.conf + mode: 0600 + notify: restart-docker + +- name: Create the ContainerD systemd drop-in directory + file: + path: /etc/systemd/system/containerd.service.d + state: directory + +- name: Add ContainerD to the Playground slice + template: + src: containerd-override.conf + dest: /etc/systemd/system/containerd.service.d/override.conf + mode: 0600 + notify: restart-containerd + # -------------------- # Set up a partition with limited space to avoid temporary diff --git a/ansible/roles/playground/templates/containerd-override.conf b/ansible/roles/playground/templates/containerd-override.conf new file mode 100644 index 000000000..84ccd28c4 --- /dev/null +++ b/ansible/roles/playground/templates/containerd-override.conf @@ -0,0 +1,2 @@ +[Service] +Slice=playground.slice diff --git a/ansible/roles/playground/templates/daemon.json b/ansible/roles/playground/templates/daemon.json index f37d9194c..fd393a62e 100644 --- a/ansible/roles/playground/templates/daemon.json +++ b/ansible/roles/playground/templates/daemon.json @@ -1,3 +1,4 @@ { + "cgroup-parent": "playground.slice", "storage-driver": "overlay2" } diff --git 
a/ansible/roles/playground/templates/docker-override.conf b/ansible/roles/playground/templates/docker-override.conf new file mode 100644 index 000000000..84ccd28c4 --- /dev/null +++ b/ansible/roles/playground/templates/docker-override.conf @@ -0,0 +1,2 @@ +[Service] +Slice=playground.slice diff --git a/ansible/roles/playground/templates/playground.service b/ansible/roles/playground/templates/playground.service index eae6d6201..8ef05cfc5 100644 --- a/ansible/roles/playground/templates/playground.service +++ b/ansible/roles/playground/templates/playground.service @@ -6,6 +6,8 @@ Description=The Rust Playground [Service] +Slice=playground.slice + Restart=on-failure Environment=TMPDIR={{ vars_playground_mountpoint_path }} diff --git a/ansible/roles/playground/templates/playground.slice b/ansible/roles/playground/templates/playground.slice new file mode 100644 index 000000000..ebc056b13 --- /dev/null +++ b/ansible/roles/playground/templates/playground.slice @@ -0,0 +1,3 @@ +[Unit] +Description=Resource management group for playground processes +Before=slices.target From 2499fe2a8077db8fdee0dc743390b99e75c3f301 Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Sat, 16 Dec 2023 16:57:04 -0500 Subject: [PATCH 7/8] Use Docker's `local` log driver The default driver can be inefficient [1] as it reads / parses / formats / writes a large JSON file over and over. Since all of the playground's communication goes over stdin / stdout, that can be a lot of junk logged! The `local` driver should be more efficient. 
[1]: https://github.com/docker/for-linux/issues/641 --- ansible/roles/playground/templates/daemon.json | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/playground/templates/daemon.json b/ansible/roles/playground/templates/daemon.json index fd393a62e..85591c834 100644 --- a/ansible/roles/playground/templates/daemon.json +++ b/ansible/roles/playground/templates/daemon.json @@ -1,4 +1,5 @@ { "cgroup-parent": "playground.slice", + "log-driver": "local", "storage-driver": "overlay2" } From ce8b5e48906ebd0d25bc544a33b01d052853a965 Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Tue, 2 Apr 2024 14:14:18 -0400 Subject: [PATCH 8/8] Garbage collect zombie containers This is expected to happen when the service restarts while a container is running, as we don't have a graceful cleanup in the service. It can also happen unexpectedly during a crash. For some reason, this also happens when the service hasn't restarted (naturally or unexpectedly) and I haven't had a chance to hunt that down yet. 
--- ansible/roles/playground/defaults/main.yml | 1 + ansible/roles/playground/handlers/main.yml | 12 +++++++ ansible/roles/playground/tasks/main.yml | 27 +++++++++++++++ ansible/roles/playground/templates/gc.sh | 33 +++++++++++++++++++ .../templates/playground-gc.service | 10 ++++++ .../playground/templates/playground-gc.timer | 13 ++++++++ 6 files changed, 96 insertions(+) create mode 100644 ansible/roles/playground/templates/gc.sh create mode 100644 ansible/roles/playground/templates/playground-gc.service create mode 100644 ansible/roles/playground/templates/playground-gc.timer diff --git a/ansible/roles/playground/defaults/main.yml b/ansible/roles/playground/defaults/main.yml index 0f96a1d8b..95a2dc298 100644 --- a/ansible/roles/playground/defaults/main.yml +++ b/ansible/roles/playground/defaults/main.yml @@ -12,6 +12,7 @@ vars_playground_repository_url: https://github.com/rust-lang/rust-playground.git vars_playground_checkout_path: "{{ vars_playground_home_path }}/rust-playground" vars_playground_update_path: "{{ vars_playground_home_path }}/update.sh" +vars_playground_gc_path: "{{ vars_playground_home_path }}/gc.sh" vars_playground_artifacts_path: "{{ vars_playground_home_path }}/playground-artifacts" vars_playground_executable_path: "{{ vars_playground_artifacts_path }}/ui" diff --git a/ansible/roles/playground/handlers/main.yml b/ansible/roles/playground/handlers/main.yml index 448e9d63a..615e5017d 100644 --- a/ansible/roles/playground/handlers/main.yml +++ b/ansible/roles/playground/handlers/main.yml @@ -24,6 +24,18 @@ state: restarted daemon_reload: true +- name: start-playground-gc + systemd: + name: playground-gc + state: started + daemon_reload: true + +- name: restart-playground-gc-timer + systemd: + name: playground-gc.timer + state: restarted + daemon_reload: true + - name: restart-playground systemd: name: playground diff --git a/ansible/roles/playground/tasks/main.yml b/ansible/roles/playground/tasks/main.yml index 2f525143b..192cbb53b 100644 --- 
a/ansible/roles/playground/tasks/main.yml +++ b/ansible/roles/playground/tasks/main.yml @@ -154,6 +154,33 @@ state: started enabled: true +- name: Configure garbage collection script + template: + src: gc.sh + dest: "{{ vars_playground_gc_path }}" + mode: 0755 + notify: start-playground-gc + +- name: Configure garbage collection script service + template: + src: playground-gc.service + dest: /etc/systemd/system/playground-gc.service + mode: 0644 + notify: start-playground-gc + +- name: Configure garbage collection script service timer + template: + src: playground-gc.timer + dest: /etc/systemd/system/playground-gc.timer + mode: 0644 + notify: restart-playground-gc-timer + +- name: Start and enable garbage collection script service timer + systemd: + name: playground-gc.timer + state: started + enabled: true + - name: Configure playground service template: src: playground.service diff --git a/ansible/roles/playground/templates/gc.sh b/ansible/roles/playground/templates/gc.sh new file mode 100644 index 000000000..2cc1b54cd --- /dev/null +++ b/ansible/roles/playground/templates/gc.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# +# {{ ansible_managed }} +# + +# {% raw %} + +set -euv -o pipefail + +# How long a container must be running to be killed. +# Number of seconds. 
+MAX_TIME=3600 + +now=$(date "+%s") +to_kill=() + +readarray -t container_ids < <(docker ps --format '{{ .ID }}' --no-trunc) + +while read -r id started_at; do + started_at=$(date --date "${started_at}" "+%s") + running_time=$((now - started_at)) + + if [[ "${running_time}" -gt "${MAX_TIME}" ]]; then + to_kill+=("${id}") + fi +done < <([[ ${#container_ids[@]} -eq 0 ]] || docker inspect "${container_ids[@]}" --format '{{ .ID }} {{ .State.StartedAt }}') + +if [[ ${#to_kill[@]} -gt 0 ]]; then + docker kill "${to_kill[@]}" +fi + +# {% endraw %} diff --git a/ansible/roles/playground/templates/playground-gc.service b/ansible/roles/playground/templates/playground-gc.service new file mode 100644 index 000000000..d103eba2b --- /dev/null +++ b/ansible/roles/playground/templates/playground-gc.service @@ -0,0 +1,10 @@ +# +# {{ ansible_managed }} +# + +[Unit] +Description=Garbage collect dead playground containers + +[Service] +Type=oneshot +ExecStart={{ vars_playground_gc_path }} diff --git a/ansible/roles/playground/templates/playground-gc.timer b/ansible/roles/playground/templates/playground-gc.timer new file mode 100644 index 000000000..5d10c1e60 --- /dev/null +++ b/ansible/roles/playground/templates/playground-gc.timer @@ -0,0 +1,13 @@ +# +# {{ ansible_managed }} +# + +[Unit] +Description = Garbage collect playground containers every 15 minutes + +[Timer] +OnBootSec = 15min +OnUnitActiveSec = 15min + +[Install] +WantedBy = timers.target