Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 WipeDisk annotation #1429

Merged
merged 16 commits into from
Aug 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions api/v1beta1/conditions_const.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ const (
ServerNotFoundReason = "ServerNotFound"
// LinuxOnOtherDiskFoundReason indicates that the server can't be provisioned on the given WWN, since the reboot would fail.
LinuxOnOtherDiskFoundReason = "LinuxOnOtherDiskFound"
// WipeDiskFailedReason indicates that erasing the disks before provisioning failed.
WipeDiskFailedReason = "WipeDiskFailed"
// SSHToRescueSystemFailedReason indicates that the rescue system can't be reached via ssh.
SSHToRescueSystemFailedReason = "SSHToRescueSystemFailed"
// RebootTimedOutReason indicates that the reboot timed out.
Expand Down
4 changes: 4 additions & 0 deletions api/v1beta1/hetznerbaremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ const (
// HostAnnotation is the key for an annotation that should go on a HetznerBareMetalMachine to
// reference what HetznerBareMetalHost it corresponds to.
HostAnnotation = "infrastructure.cluster.x-k8s.io/HetznerBareMetalHost"

// WipeDiskAnnotation indicates which disks (WWNs) to erase before provisioning.
// The value is a whitespace-separated list of WWNs, or "all" to erase every disk.
WipeDiskAnnotation = "wipedisk.hetznerbaremetalhost.infrastructure.cluster.x-k8s.io"
)

// RootDeviceHints holds the hints for specifying the storage location
Expand Down
2 changes: 1 addition & 1 deletion controllers/hcloudmachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ var _ = Describe("HCloudMachineReconciler", func() {
}

return len(servers) == 0
}, timeout, interval).Should(BeTrue())
}, 2*timeout, interval).Should(BeTrue())
janiskemper marked this conversation as resolved.
Show resolved Hide resolved

By("checking that bootstrap condition is not ready")

Expand Down
59 changes: 59 additions & 0 deletions pkg/services/baremetal/client/mocks/ssh/Client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 56 additions & 2 deletions pkg/services/baremetal/client/ssh/ssh_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,19 @@ package sshclient
import (
"bufio"
"bytes"
"context"
_ "embed"
"encoding/base64"
"errors"
"fmt"
"os"
"regexp"
"slices"
"strings"
"time"

"golang.org/x/crypto/ssh"
ctrl "sigs.k8s.io/controller-runtime"
)

const (
Expand All @@ -39,6 +42,9 @@ const (
//go:embed detect-linux-on-another-disk.sh
var detectLinuxOnAnotherDiskShellScript string

//go:embed wipe-disk.sh
var wipeDiskShellScript string

var downloadFromOciShellScript = `#!/bin/bash

# Copyright 2023 The Kubernetes Authors.
Expand Down Expand Up @@ -253,6 +259,10 @@ type Client interface {
ResetKubeadm() Output
UntarTGZ() Output
DetectLinuxOnAnotherDisk(sliceOfWwns []string) Output

// WipeDisk erases filesystem, RAID and partition-table signatures from the
// disks identified by the given WWNs. The special value "all" wipes all disks.
WipeDisk(ctx context.Context, sliceOfWwns []string) (string, error)
}

// Factory is the interface for creating new Client objects.
Expand Down Expand Up @@ -546,10 +556,54 @@ func (c *sshClient) ResetKubeadm() Output {
}

func (c *sshClient) DetectLinuxOnAnotherDisk(sliceOfWwns []string) Output {
return c.runSSH(fmt.Sprintf(`cat <<'EOF_VIA_SSH' | bash -s -- %s
return c.runSSH(fmt.Sprintf(`cat >/root/detect-linux-on-another-disk.sh <<'EOF_VIA_SSH'
%s
EOF_VIA_SSH
chmod a+rx /root/detect-linux-on-another-disk.sh
/root/detect-linux-on-another-disk.sh %s
`, detectLinuxOnAnotherDiskShellScript, strings.Join(sliceOfWwns, " ")))
}

// isValidWWNRegex is a deliberately permissive syntax check for WWNs.
// No specification of the format was found; the pattern was derived from
// these examples:
//
//	10:00:00:05:1e:7a:7a:00
//	eui.00253885910c8cec
//	0x500a07511bb48b25
var isValidWWNRegex = regexp.MustCompile(`^[0-9a-zA-Z.:-]{5,64}$`)

// ErrInvalidWWN indicates that a WWN has an invalid syntax.
var ErrInvalidWWN = fmt.Errorf("WWN does not match regex %q", isValidWWNRegex.String())

func (c *sshClient) WipeDisk(ctx context.Context, sliceOfWwns []string) (string, error) {
log := ctrl.LoggerFrom(ctx)
if len(sliceOfWwns) == 0 {
return "", nil
}
if slices.Contains(sliceOfWwns, "all") {
out := c.runSSH("lsblk --nodeps --noheadings -o WWN | sort -u")
if out.Err != nil {
return "", fmt.Errorf("failed to find WWNs of all disks: %w", out.Err)
}
log.Info("WipeDisk: 'all' was given. Found these WWNs", "WWNs", sliceOfWwns)
sliceOfWwns = strings.Fields(out.StdOut)
} else {
for _, wwn := range sliceOfWwns {
// validate WWN.
// It is unlikely, but someone could use this wwn: `"; do-nasty-things-here`
if !isValidWWNRegex.MatchString(wwn) {
return "", fmt.Errorf("WWN %q is invalid. %w", wwn, ErrInvalidWWN)
}
}
}
out := c.runSSH(fmt.Sprintf(`cat >/root/wipe-disk.sh <<'EOF_VIA_SSH'
%s
EOF_VIA_SSH
`, strings.Join(sliceOfWwns, " "), detectLinuxOnAnotherDiskShellScript))
chmod a+rx /root/wipe-disk.sh
/root/wipe-disk.sh %s
`, wipeDiskShellScript, strings.Join(sliceOfWwns, " ")))
if out.Err != nil {
return "", fmt.Errorf("WipeDisk for %+v failed: %s. %s: %w", sliceOfWwns, out.StdOut, out.StdErr, out.Err)
}
return out.String(), nil
}

func (c *sshClient) UntarTGZ() Output {
Expand Down
50 changes: 50 additions & 0 deletions pkg/services/baremetal/client/ssh/wipe-disk.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash

# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Report the failing line and exit with status 3 whenever a command fails.
# The trap body is single-quoted, so $0 and $LINENO are expanded when the trap
# fires, not when it is installed.
trap 'echo "ERROR: A command has failed. Exiting the script. Line was ($0:$LINENO): $(sed -n "${LINENO}p" "$0")"; exit 3' ERR
# -E: the ERR trap is inherited by functions, command substitutions and subshells.
# -e: exit on the first failing command.
# -u: expanding an unset variable is an error.
# -o pipefail: a pipeline fails if any of its commands fails.
set -Eeuo pipefail

# usage prints the command-line synopsis, a data-loss warning and the
# NAME/WWN table of the block devices of this machine (lines containing
# "loop" are filtered out).
function usage() {
    echo "$0 wwn1 [wwn2 ...]"
    echo " Wipe all filesystem, raid or partition-table signatures from the specified disks."
    echo " ATTENTION! THIS DELETES ALL DATA ON THE GIVEN DISK!"
    echo "Existing WWNs:"
    # grep exits non-zero if no line is left after filtering. "|| true" keeps
    # that from triggering the ERR trap.
    lsblk -oNAME,WWN | grep -vi loop || true
}

if [ $# -eq 0 ]; then
    echo "Error: No WWN was provided."
    echo
    usage
    exit 3
fi

# Read the lsblk output once, up front. Matching against variables instead of
# piping into "grep -q" avoids spurious failures under "pipefail" (grep -q may
# exit before lsblk has finished writing), and -n keeps the header line out of
# the data that gets matched.
known_wwns=$(lsblk -l -n -oWWN)
device_table=$(lsblk -l -n -oNAME,WWN,TYPE)

# Wipe every disk whose WWN was given on the command line.
for wwn in "$@"; do
    # -F -x: the WWN must match a complete line literally, not as a regex.
    if ! grep -qFx -- "${wwn}" <<<"${known_wwns}"; then
        echo "$wwn is not a WWN of this machine"
        echo
        usage
        exit 3
    fi

    # Select the devices by exact string comparison of the WWN and TYPE
    # columns. A substring/regex match could pick a wrong or an additional
    # disk, which must never happen for a destructive operation.
    devices=()
    while read -r dev_name dev_wwn dev_type; do
        if [ "$dev_wwn" = "$wwn" ] && [ "$dev_type" = "disk" ]; then
            devices+=("$dev_name")
        fi
    done <<<"${device_table}"

    if [ ${#devices[@]} -eq 0 ]; then
        echo "Failed to find device for WWN $wwn"
        exit 3
    fi

    # Usually there is exactly one device per WWN. If several device nodes
    # report the same WWN, each of them is wiped.
    for device in "${devices[@]}"; do
        echo "INFO: Calling wipefs for $wwn (/dev/$device)"
        wipefs -af "/dev/$device"
    done
done
42 changes: 42 additions & 0 deletions pkg/services/baremetal/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,48 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult {
}

func (s *Service) actionImageInstallingStartBackgroundProcess(ctx context.Context, sshClient sshclient.Client) actionResult {
// Call WipeDisk if the corresponding annotation is set.
sliceOfWwns := strings.Fields(s.scope.HetznerBareMetalHost.Annotations[infrav1.WipeDiskAnnotation])
if len(sliceOfWwns) > 0 {
output, err := sshClient.WipeDisk(ctx, sliceOfWwns)
if err != nil {
var exitErr *ssh.ExitError
if errors.As(err, &exitErr) || errors.Is(err, sshclient.ErrInvalidWWN) {
// The script was executed, but an error occurred.
// Do not retry. This needs manual intervention.
msg := fmt.Sprintf("WipeDisk failed (permanent error): %s",
err.Error())
conditions.MarkFalse(
s.scope.HetznerBareMetalHost,
infrav1.ProvisionSucceededCondition,
infrav1.WipeDiskFailedReason,
clusterv1.ConditionSeverityError,
msg,
)
record.Warn(s.scope.HetznerBareMetalHost, infrav1.WipeDiskFailedReason, msg)
s.scope.HetznerBareMetalHost.SetError(infrav1.PermanentError, msg)
return actionStop{}
}
// some other error happened. It is likely that the ssh connection failed.
msg := fmt.Sprintf("WipeDisk failed (Will retry): %s",
err.Error())
conditions.MarkFalse(
s.scope.HetznerBareMetalHost,
infrav1.ProvisionSucceededCondition,
infrav1.WipeDiskFailedReason,
clusterv1.ConditionSeverityWarning,
msg,
)
record.Warn(s.scope.HetznerBareMetalHost, infrav1.WipeDiskFailedReason, msg)
return actionContinue{
delay: 10 * time.Second,
}
}
delete(s.scope.HetznerBareMetalHost.Annotations, infrav1.WipeDiskAnnotation)
record.Eventf(s.scope.HetznerBareMetalHost, "WipeDiskDone", "WipeDisk %v was done. Annotation %q was removed.\n%s",
sliceOfWwns, infrav1.WipeDiskAnnotation, output)
}

// If there is a Linux OS on another disk, then the reboot after the provisioning
// will likely fail, because the machine boots into the other operating system.
// We want to detect that early, and not start the provisioning process.
Expand Down