Skip to content

Commit

Permalink
feat: provide several options to force boot from disk via iPXE
Browse files Browse the repository at this point in the history
If the server is configured to PXE boot by default, it might hit the
Sidero iPXE server after Talos install, so Sidero has to force the
server to boot from disk.

Until now Sidero supported only the default method, the iPXE `exit` command, but this
command doesn't always work
([details](https://ipxe.org/appnote/work_around_bios_halting_on_ipxe_exit)).

This PR provides two other ways to force booting from disk:

* `http-404` forces an HTTP 404 response from the iPXE server
* `ipxe-sanboot` uses the `sanboot` command to boot from the first disk

This should improve compatibility with some BIOSes.

This setting should eventually be provided per-server as well.

Fixes #287

Signed-off-by: Andrey Smirnov <[email protected]>
  • Loading branch information
smira authored and andrewrynhard committed May 19, 2021
1 parent 1e8096e commit a792890
Show file tree
Hide file tree
Showing 12 changed files with 420 additions and 106 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ type EnvironmentList struct {
func EnvironmentDefaultSpec(talosRelease, apiEndpoint string, apiPort uint16) *EnvironmentSpec {
args := make([]string, 0, len(kernel.DefaultArgs)+6)
args = append(args, kernel.DefaultArgs...)
args = append(args, "console=tty0", "console=ttyS1,115200n8", "earlyprintk=ttyS1,115200n8")
args = append(args, "console=tty0", "console=ttyS0", "earlyprintk=ttyS0")
args = append(args, "initrd=initramfs.xz", "talos.platform=metal")
args = append(args, fmt.Sprintf("talos.config=http://%s:%d/configdata?uuid=", apiEndpoint, apiPort))
sort.Strings(args)
Expand Down
1 change: 1 addition & 0 deletions app/metal-controller-manager/config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ spec:
- --api-endpoint=${SIDERO_CONTROLLER_MANAGER_API_ENDPOINT:=-}
- --api-port=${SIDERO_CONTROLLER_MANAGER_API_PORT:=8081}
- --extra-agent-kernel-args=${SIDERO_CONTROLLER_MANAGER_EXTRA_AGENT_KERNEL_ARGS:=-}
- --boot-from-disk-method=${SIDERO_CONTROLLER_MANAGER_BOOT_FROM_DISK_METHOD:=ipxe-exit}
- --auto-accept-servers=${SIDERO_CONTROLLER_MANAGER_AUTO_ACCEPT_SERVERS:=false}
- --insecure-wipe=${SIDERO_CONTROLLER_MANAGER_INSECURE_WIPE:=true}
- --auto-bmc-setup=${SIDERO_CONTROLLER_MANAGER_AUTO_BMC_SETUP:=true}
Expand Down
45 changes: 35 additions & 10 deletions app/metal-controller-manager/internal/ipxe/ipxe_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,25 +55,49 @@ initrd /env/{{ .Env.Name }}/{{ .InitrdAsset }}
boot
`))

// ipxeBootFromDisk script is used to skip PXE booting and boot from disk.
const ipxeBootFromDisk = `#!ipxe
// ipxeBootFromDiskExit is the iPXE script served to skip PXE booting and boot from disk.
// It consists of a single `exit` command.
//
// NOTE(review): `exit` reportedly does not hand control back to the next boot device on
// every BIOS (see https://ipxe.org/appnote/work_around_bios_halting_on_ipxe_exit).
const ipxeBootFromDiskExit = `#!ipxe
exit
`

// ipxeBootFromDiskSanboot is the iPXE script served to skip PXE booting and boot from disk
// via the `sanboot` command, pointed at drive 0x80.
//
// NOTE(review): 0x80 is presumably the first BIOS hard disk — confirm against the iPXE
// `sanboot` documentation.
const ipxeBootFromDiskSanboot = `#!ipxe
sanboot --no-describe --drive 0x80
`

// BootFromDisk defines a way to boot from disk.
//
// The underlying type is string, so a value can be produced by converting a textual
// setting directly; no validation happens at conversion time.
type BootFromDisk string

// Supported BootFromDisk methods.
const (
BootIPXEExit BootFromDisk = "ipxe-exit" // Use iPXE script with `exit` command.
Boot404 BootFromDisk = "http-404" // Return HTTP 404 response to iPXE.
BootSANDisk BootFromDisk = "ipxe-sanboot" // Use iPXE script with `sanboot` command.
)

var (
apiEndpoint string
apiPort int
extraAgentKernelArgs string
c client.Client
apiEndpoint string
apiPort int
extraAgentKernelArgs string
defaultBootFromDiskMethod BootFromDisk
c client.Client
)

// bootFileHandler writes the package-level bootFile value (defined outside this view)
// to the response. The request r is not inspected.
func bootFileHandler(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, bootFile)
}

//nolint:unparam
func bootFromDiskHandler(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, ipxeBootFromDisk)
// bootFromDiskHandler writes the response that makes a server boot from disk,
// according to the requested method:
//   - Boot404: reply with HTTP 404 and no body;
//   - BootSANDisk: serve the ipxeBootFromDiskSanboot script;
//   - BootIPXEExit, or any other value: serve the ipxeBootFromDiskExit script.
//
// The request r is not used.
func bootFromDiskHandler(method BootFromDisk, w http.ResponseWriter, r *http.Request) {
switch method { //nolint:exhaustive
case Boot404:
// Only the status line is sent; nothing is written to the body.
w.WriteHeader(http.StatusNotFound)
case BootSANDisk:
fmt.Fprint(w, ipxeBootFromDiskSanboot)
case BootIPXEExit:
fallthrough
default:
// Unrecognized (including empty) methods fall back to the `exit` script.
fmt.Fprint(w, ipxeBootFromDiskExit)
}
}

func ipxeHandler(w http.ResponseWriter, r *http.Request) {
Expand Down Expand Up @@ -104,7 +128,7 @@ func ipxeHandler(w http.ResponseWriter, r *http.Request) {
if err != nil {
if errors.Is(err, ErrBootFromDisk) {
log.Printf("Server %q booting from disk", uuid)
bootFromDiskHandler(w, r)
bootFromDiskHandler(defaultBootFromDiskMethod, w, r)

return
}
Expand Down Expand Up @@ -169,10 +193,11 @@ func ipxeHandler(w http.ResponseWriter, r *http.Request) {
}
}

func RegisterIPXE(mux *http.ServeMux, endpoint string, port int, args string, iPXEPort int, mgrClient client.Client) error {
func RegisterIPXE(mux *http.ServeMux, endpoint string, port int, args string, bootMethod BootFromDisk, iPXEPort int, mgrClient client.Client) error {
apiEndpoint = endpoint
apiPort = port
extraAgentKernelArgs = args
defaultBootFromDiskMethod = bootMethod
c = mgrClient

var embeddedScriptBuf bytes.Buffer
Expand Down
4 changes: 3 additions & 1 deletion app/metal-controller-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ func main() {
apiEndpoint string
apiPort int
extraAgentKernelArgs string
bootFromDiskMethod string
enableLeaderElection bool
autoAcceptServers bool
insecureWipe bool
Expand All @@ -86,6 +87,7 @@ func main() {
flag.IntVar(&apiPort, "api-port", httpPort, "The TCP port Sidero components can be reached at from the servers.")
flag.StringVar(&metricsAddr, "metrics-addr", ":8081", "The address the metric endpoint binds to.")
flag.StringVar(&extraAgentKernelArgs, "extra-agent-kernel-args", "", "A comma delimited list of key-value pairs to be added to the agent environment kernel parameters.")
flag.StringVar(&bootFromDiskMethod, "boot-from-disk-method", string(ipxe.BootIPXEExit), "Default method to use to boot server from disk if it hits iPXE endpoint after install.")
flag.BoolVar(&enableLeaderElection, "enable-leader-election", true, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
flag.BoolVar(&autoAcceptServers, "auto-accept-servers", false, "Add servers as 'accepted' when they register with Sidero API.")
flag.BoolVar(&insecureWipe, "insecure-wipe", true, "Wipe head of the disk only (if false, wipe whole disk).")
Expand Down Expand Up @@ -211,7 +213,7 @@ func main() {

setupLog.Info("starting iPXE server")

if err := ipxe.RegisterIPXE(httpMux, apiEndpoint, apiPort, extraAgentKernelArgs, apiPort, mgr.GetClient()); err != nil {
if err := ipxe.RegisterIPXE(httpMux, apiEndpoint, apiPort, extraAgentKernelArgs, ipxe.BootFromDisk(bootFromDiskMethod), apiPort, mgr.GetClient()); err != nil {
setupLog.Error(err, "unable to start iPXE server", "controller", "Environment")
os.Exit(1)
}
Expand Down
12 changes: 12 additions & 0 deletions sfyra/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ One can also run parts of the test flow:

See each command help on how to customize the operations.

## Testing Always PXE Boot

By default, QEMU VMs provisioned to emulate metal servers are configured to boot from the disk first, and Sidero uses an API
call to force PXE boot to run the agent.

Sometimes it's important to test the flow in which the servers are configured to always boot from the network first (e.g. if
the bare metal setup doesn't have IPMI); in that case the VMs must be forced to always boot from the network.
This can be achieved by adding the flag `--default-boot-order=nc` to the `sfyra` invocation.
In this case the Sidero iPXE server will force the VM to boot from disk via iPXE if the server is already provisioned.

> Note: because it depends on a newer `talosctl`, this feature will be available once Talos in Sfyra is updated to version >= 0.11.
## Running with Talos HEAD

Build the artifacts in Talos:
Expand Down
3 changes: 3 additions & 0 deletions sfyra/cmd/sfyra/cmd/bootstrap_servers.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ var bootstrapServersCmd = &cobra.Command{
CPUs: options.ManagementCPUs,
MemMB: options.ManagementMemMB,
DiskGB: options.ManagementDiskGB,

DefaultBootOrder: options.DefaultBootOrder,
})
if err != nil {
return err
Expand All @@ -52,4 +54,5 @@ func init() {
bootstrapServersCmd.Flags().IntVar(&options.ManagementNodes, "management-nodes", options.ManagementNodes, "number of PXE nodes to create for the management rack")
bootstrapServersCmd.Flags().StringVar(&options.ManagementCIDR, "management-cidr", options.ManagementCIDR, "management cluster network CIDR")
bootstrapServersCmd.Flags().StringVar(&bootSource, "boot-source", "172.24.0.2", "the boot source IP for the iPXE boot")
bootstrapServersCmd.Flags().StringVar(&options.DefaultBootOrder, "default-boot-order", options.DefaultBootOrder, "QEMU default boot order")
}
4 changes: 4 additions & 0 deletions sfyra/cmd/sfyra/cmd/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ type Options struct {
ManagementCPUs int64
ManagementDiskGB int64

DefaultBootOrder string

TalosctlPath string

PowerSimulatedExplicitFailureProb float64
Expand Down Expand Up @@ -77,6 +79,8 @@ func DefaultOptions() Options {
ManagementCPUs: 2,
ManagementDiskGB: 6,

DefaultBootOrder: "cn", // disk, then network; override to "nc" to force PXE boot each time

TalosctlPath: fmt.Sprintf("_out/%s/talosctl-linux-amd64", TalosRelease),
}
}
3 changes: 3 additions & 0 deletions sfyra/cmd/sfyra/cmd/test_integration.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ var testIntegrationCmd = &cobra.Command{
CPUs: options.ManagementCPUs,
MemMB: options.ManagementMemMB,
DiskGB: options.ManagementDiskGB,

DefaultBootOrder: options.DefaultBootOrder,
})
if err != nil {
return err
Expand Down Expand Up @@ -134,6 +136,7 @@ func init() {
testIntegrationCmd.Flags().StringVar(&options.TalosKernelURL, "talos-kernel-url", options.TalosKernelURL, "Talos kernel image URL for Cluster API Environment")
testIntegrationCmd.Flags().StringVar(&options.TalosInitrdURL, "talos-initrd-url", options.TalosInitrdURL, "Talos initramfs image URL for Cluster API Environment")
testIntegrationCmd.Flags().StringVar(&options.ClusterctlConfigPath, "clusterctl-config", options.ClusterctlConfigPath, "path to the clusterctl config file")
testIntegrationCmd.Flags().StringVar(&options.DefaultBootOrder, "default-boot-order", options.DefaultBootOrder, "QEMU default boot order")
testIntegrationCmd.Flags().Float64Var(&options.PowerSimulatedExplicitFailureProb, "power-simulated-explicit-failure-prob", options.PowerSimulatedExplicitFailureProb, "simulated power management explicit failure probability")
testIntegrationCmd.Flags().Float64Var(&options.PowerSimulatedSilentFailureProb, "power-simulated-silent-failure-prob", options.PowerSimulatedSilentFailureProb, "simulated power management silent failure probability")
testIntegrationCmd.Flags().StringVar(&runTestPattern, "test.run", "", "tests to run (regular expression)")
Expand Down
2 changes: 1 addition & 1 deletion sfyra/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ require (
github.com/talos-systems/go-retry v0.3.0
github.com/talos-systems/net v0.2.1-0.20210212213224-05190541b0fa
github.com/talos-systems/sidero v0.0.0-00010101000000-000000000000
github.com/talos-systems/talos v0.10.2
github.com/talos-systems/talos v0.10.0-alpha.2.0.20210518203841-d3d9112f288d // 0.11-alpha to get DefaultBootOrder
github.com/talos-systems/talos/pkg/machinery v0.0.0-20210513202018-8d73bc5999b4 // v0.10.2
google.golang.org/grpc v1.37.1
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
Expand Down
Loading

0 comments on commit a792890

Please sign in to comment.