Skip to content

Commit

Permalink
feat: make signal configurable and add tests
Browse files Browse the repository at this point in the history
Signed-off-by: Christian Kotzbauer <[email protected]>
  • Loading branch information
ckotzbauer committed Aug 12, 2023
1 parent 0eab3e0 commit 865e4d5
Show file tree
Hide file tree
Showing 8 changed files with 215 additions and 13 deletions.
90 changes: 89 additions & 1 deletion .github/workflows/on-pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ jobs:
# - Ensure manifests work with the latest versions even with no manifest change
# (compared to helm charts, manifests cannot easily template changes based on versions)
# Helm charts are _trailing_ releases, while manifests are done during development.
e2e-manifests:
# This test uses the "command" reboot-method.
e2e-manifests-command:
name: End-to-End test with kured with code and manifests from HEAD
runs-on: ubuntu-latest
strategy:
Expand Down Expand Up @@ -179,3 +180,90 @@ jobs:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh
# This ensures the latest code works with the manifests built from tree.
# It is useful for two things:
# - Test manifests changes (obviously), ensuring they don't break existing clusters
# - Ensure manifests work with the latest versions even with no manifest change
# (compared to helm charts, manifests cannot easily template changes based on versions)
# Helm charts are _trailing_ releases, while manifests are done during development.
# This test uses the "signal" reboot-method.
e2e-manifests-signal:
name: End-to-End test with kured with code and manifests from HEAD
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
kubernetes:
- "1.25"
- "1.26"
- "1.27"
steps:
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Setup GoReleaser
run: make bootstrap-tools
- name: Find current tag version
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
id: tags
- name: Build artifacts
run: |
VERSION="${{ steps.tags.outputs.sha_short }}" make image
VERSION="${{ steps.tags.outputs.sha_short }}" make manifest
- name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
run: |
sudo bash << EOF
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
echo '{}' > /etc/docker/daemon.json
systemctl restart docker || journalctl --no-pager -n 500
systemctl status docker
EOF
# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create kind cluster with 5 nodes
uses: helm/[email protected]
with:
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
version: v0.14.0

- name: Preload previously built images onto kind cluster
run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing

- name: Do not wait for an hour before detecting the rebootSentinel
run: |
sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds-signal.yaml
- name: Install kured with kubectl
run: |
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds-signal.yaml
- name: Ensure kured is ready
uses: nick-invision/[email protected]
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 60
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"

- name: Create reboot sentinel files
run: |
./tests/kind/create-reboot-sentinels.sh
- name: Follow reboot until success
env:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh
9 changes: 7 additions & 2 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ var (
messageTemplateUncordon string
podSelectors []string
rebootCommand string
rebootSignal int
logFormat string
preRebootNodeLabels []string
postRebootNodeLabels []string
Expand Down Expand Up @@ -174,6 +175,8 @@ func NewRootCommand() *cobra.Command {
"command for which a zero return code will trigger a reboot command")
rootCmd.PersistentFlags().StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot",
"command to run when a reboot is required")
rootCmd.PersistentFlags().IntVar(&rebootSignal, "reboot-signal", 34+5,
"signal to use for reboot, SIGRTMIN+5 by default.")

rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
"slack hook URL for reboot notifications [deprecated in favor of --notify-url]")
Expand Down Expand Up @@ -522,7 +525,7 @@ func invokeReboot(nodeID string, rebootCommand []string) {
if rebootMethod == MethodCommand {
booter = reboot.NewCommandReboot(nodeID, rebootCommand)
} else if rebootMethod == MethodSignal {
booter = reboot.NewSignalReboot(nodeID)
booter = reboot.NewSignalReboot(nodeID, rebootSignal)
} else {
log.Fatalf("Invalid reboot-method configured: %s", rebootMethod)
}
Expand Down Expand Up @@ -829,8 +832,10 @@ func root(cmd *cobra.Command, args []string) {
log.Infof("Reboot schedule: %v", window)
log.Infof("Reboot check command: %s every %v", sentinelCommand, period)
log.Infof("Reboot method: %s", rebootMethod)
if rebootCommand == MethodSignal {
// Log the setting that belongs to the selected reboot method. The selector is
// rebootMethod; rebootCommand holds the command line itself and never equals
// one of the Method* constants.
if rebootMethod == MethodCommand {
	log.Infof("Reboot command: %s", restartCommand)
} else {
	log.Infof("Reboot signal: %v", rebootSignal)
}

if annotateNodes {
Expand Down
100 changes: 100 additions & 0 deletions kured-ds-signal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kured
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kured # Must match `--ds-name`
namespace: kube-system # Must match `--ds-namespace`
spec:
selector:
matchLabels:
name: kured
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: kured
spec:
serviceAccountName: kured
tolerations:
- key: node-role.kubernetes.io/control-plane
effect: NoSchedule
- key: node-role.kubernetes.io/master
effect: NoSchedule
hostPID: true # Share the host PID namespace so that PID 1 is the host's init, which receives the reboot signal
restartPolicy: Always
volumes:
- name: sentinel
hostPath:
path: /var/run
type: Directory
containers:
- name: kured
# If you find yourself here wondering why there is no
# :latest tag on Docker Hub, see the FAQ in the README
image: ghcr.io/kubereboot/kured:1.13.2
imagePullPolicy: IfNotPresent
securityContext:
privileged: false # Not required for the signal reboot-method; only CAP_KILL (added below) is needed
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["*"]
add: ["CAP_KILL"]
ports:
- containerPort: 8080
name: metrics
env:
# Pass in the name of the node on which this pod is scheduled
# for use with drain/uncordon operations and lock acquisition
- name: KURED_NODE_ID
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- mountPath: /sentinel
name: sentinel
readOnly: true
command:
- /usr/bin/kured
- --reboot-sentinel=/sentinel/reboot-required
- --reboot-method=signal
# - --reboot-signal=39
# - --force-reboot=false
# - --drain-grace-period=-1
# - --skip-wait-for-delete-timeout=0
# - --drain-timeout=0
# - --period=1h
# - --ds-namespace=kube-system
# - --ds-name=kured
# - --lock-annotation=weave.works/kured-node-lock
# - --lock-ttl=0
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --alert-filter-regexp=^RebootRequired$
# - --alert-firing-only=false
# - --prefer-no-schedule-taint=""
# - --reboot-sentinel-command=""
# - --slack-hook-url=https://hooks.slack.com/...
# - --slack-username=prod
# - --slack-channel=alerting
# - --notify-url="" # See also shoutrrr url format
# - --message-template-drain=Draining node %s
# - --message-template-reboot=Rebooting node %s
# - --message-template-uncordon=Node %s rebooted & uncordoned successfully!
# - --blocking-pod-selector=runtime=long,cost=expensive
# - --blocking-pod-selector=name=temperamental
# - --blocking-pod-selector=...
# - --reboot-days=sun,mon,tue,wed,thu,fri,sat
# - --reboot-delay=90s
# - --start-time=0:00
# - --end-time=23:59:59
# - --time-zone=UTC
# - --annotate-nodes=false
# - --lock-release-delay=30m
# - --log-format=text
3 changes: 3 additions & 0 deletions kured-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ spec:
- name: sentinel
hostPath:
path: /var/run
type: Directory
containers:
- name: kured
# If you find yourself here wondering why there is no
Expand Down Expand Up @@ -73,6 +74,8 @@ spec:
# - --alert-firing-only=false
# - --prefer-no-schedule-taint=""
# - --reboot-sentinel-command=""
# - --reboot-method=command
# - --reboot-signal=39
# - --slack-hook-url=https://hooks.slack.com/...
# - --slack-username=prod
# - --slack-channel=alerting
Expand Down
10 changes: 6 additions & 4 deletions pkg/reboot/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@ import (
log "github.com/sirupsen/logrus"
)

type commandRebootMethod struct {
// CommandRebootMethod holds context-information for a command reboot.
// It reboots the node by executing an external command.
type CommandRebootMethod struct {
// nodeID identifies the node in log output.
nodeID string
// rebootCommand is the command to execute: element 0 is the program,
// the remaining elements are its arguments.
rebootCommand []string
}

func NewCommandReboot(nodeID string, rebootCommand []string) *commandRebootMethod {
return &commandRebootMethod{nodeID: nodeID, rebootCommand: rebootCommand}
// NewCommandReboot returns a rebooter for nodeID that reboots by executing
// rebootCommand. This reboot method needs full privileges on the host.
func NewCommandReboot(nodeID string, rebootCommand []string) *CommandRebootMethod {
	method := new(CommandRebootMethod)
	method.nodeID = nodeID
	method.rebootCommand = rebootCommand
	return method
}

func (c *commandRebootMethod) Reboot() {
// Reboot runs the configured reboot command and logs fatally if it fails.
func (c *CommandRebootMethod) Reboot() {

Check failure on line 19 in pkg/reboot/command.go

View workflow job for this annotation

GitHub Actions / Lint golang code

exported method CommandRebootMethod.Reboot should have comment or be unexported

Check failure on line 19 in pkg/reboot/command.go

View workflow job for this annotation

GitHub Actions / Lint golang code

exported method CommandRebootMethod.Reboot should have comment or be unexported
log.Infof("Running command: %s for node: %s", c.rebootCommand, c.nodeID)
if err := util.NewCommand(c.rebootCommand[0], c.rebootCommand[1:]...).Run(); err != nil {
log.Fatalf("Error invoking reboot command: %v", err)
Expand Down
1 change: 1 addition & 0 deletions pkg/reboot/reboot.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package reboot

// Reboot is the interface implemented by every reboot method
// (see CommandRebootMethod and SignalRebootMethod).
type Reboot interface {
// Reboot triggers the reboot of the node. It returns no error; the
// implementations in this package log fatally when triggering fails.
Reboot()
}
13 changes: 8 additions & 5 deletions pkg/reboot/signal.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,26 @@ import (
log "github.com/sirupsen/logrus"
)

type signalRebootMethod struct {
// SignalRebootMethod holds context-information for a signal reboot.
// It reboots the node by sending a signal to process 1.
type SignalRebootMethod struct {
// nodeID identifies the node in log output.
nodeID string
// signal is the number of the signal that is sent to process 1.
signal int
}

func NewSignalReboot(nodeID string) *signalRebootMethod {
return &signalRebootMethod{nodeID: nodeID}
// NewSignalReboot returns a rebooter for nodeID that reboots by sending the
// given signal number. This reboot method can run unprivileged.
func NewSignalReboot(nodeID string, signal int) *SignalRebootMethod {
	method := new(SignalRebootMethod)
	method.nodeID = nodeID
	method.signal = signal
	return method
}

func (c *signalRebootMethod) Reboot() {
// Reboot sends the configured signal to process 1 and logs fatally if that fails.
func (c *SignalRebootMethod) Reboot() {

Check failure on line 21 in pkg/reboot/signal.go

View workflow job for this annotation

GitHub Actions / Lint golang code

exported method SignalRebootMethod.Reboot should have comment or be unexported

Check failure on line 21 in pkg/reboot/signal.go

View workflow job for this annotation

GitHub Actions / Lint golang code

exported method SignalRebootMethod.Reboot should have comment or be unexported
log.Infof("Emit reboot-signal for node: %s", c.nodeID)

process, err := os.FindProcess(1)
if err != nil {
log.Fatalf("There was no systemd process found: %v", err)
}

err = process.Signal(syscall.Signal(34 + 5)) // SIGRTMIN+5
err = process.Signal(syscall.Signal(c.signal))
if err != nil {
	// The signal is configurable, so report the number that was actually
	// sent instead of assuming the SIGRTMIN+5 default.
	log.Fatalf("Signal %d failed: %v", c.signal, err)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
log "github.com/sirupsen/logrus"
)

// newCommand creates a new Command with stdout/stderr wired to our standard logger
// NewCommand creates a new Command with stdout/stderr wired to our standard logger
func NewCommand(name string, arg ...string) *exec.Cmd {
cmd := exec.Command(name, arg...)
cmd.Stdout = log.NewEntry(log.StandardLogger()).
Expand Down

0 comments on commit 865e4d5

Please sign in to comment.