From 8dc4367aaa55f302526fc0c44863fe4c9bbb5ed1 Mon Sep 17 00:00:00 2001 From: Jacob Davis-Hansson Date: Wed, 5 Sep 2018 15:10:35 -0500 Subject: [PATCH 1/3] Initial sketch of --exec This is a proposal to introduce alternative means of killing pods, outside of simply asking kubernetes to stop them. Specifically, it allows executing a shell command on a victim container; that shell command can then be a simple kill -9, a fork bomb, or any other means of destruction. There are, at least, two reasons why this is useful. First, it gives an option of more realistic failures. Simply deleting a pod is a rather kind way to stop a pod - in reality a pod is likely to go down in flames from lack of memory, segfaulting or instant hardware failure. This brings the chaos closer to reality. The second reason is that this helps test operators that directly manipulate pods, rather than rely on higher-level concepts like StatefulSets, Deployments and so on. In this case, the thing we want to test is the operators ability to realize a pod is severely broken, and to take appropriate action, depending on what software the operator is running in the failed pod. --- chaoskube/action.go | 103 ++++++++++++++++++++++++++++++++++++ chaoskube/chaoskube.go | 24 ++++----- chaoskube/chaoskube_test.go | 2 +- main.go | 34 ++++++++++-- 4 files changed, 144 insertions(+), 19 deletions(-) create mode 100644 chaoskube/action.go diff --git a/chaoskube/action.go b/chaoskube/action.go new file mode 100644 index 00000000..6c66fd24 --- /dev/null +++ b/chaoskube/action.go @@ -0,0 +1,103 @@ +package chaoskube + +import ( + "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/remotecommand" + "os" + restclient "k8s.io/client-go/rest" + "k8s.io/client-go/kubernetes/scheme" + "fmt" +) + +type ChaosAction interface { + // Imbue chaos in the given victim + ApplyChaos(victim v1.Pod) error + // Name of this action, ideally a verb - like "terminate pod" + Name() string +} + +func NewDryRunAction() ChaosAction { + return &dryRun{} +} + +func NewDeletePodAction(client kubernetes.Interface) ChaosAction { + return &deletePod{client} +} + +func NewExecAction(client restclient.Interface, config *restclient.Config, containerName string, command []string) ChaosAction { + return &execOnPod{client, config, containerName, command} +} + +// no-op +type dryRun struct { + +} +func (s *dryRun) ApplyChaos(victim v1.Pod) error { + return nil +} +func (s *dryRun) Name() string { return "dry run" } + +var _ ChaosAction = &dryRun{} + +// Simply ask k8s to delete the victim pod +type deletePod struct { + client kubernetes.Interface +} +func (s *deletePod) ApplyChaos(victim v1.Pod) error { + return s.client.CoreV1().Pods(victim.Namespace).Delete(victim.Name, nil) +} +func (s *deletePod) Name() string { return "terminate pod" } + +var _ ChaosAction = &deletePod{} + +// Execute the given command on victim pods +type execOnPod struct { + client restclient.Interface + config *restclient.Config + + containerName string + command []string +} + +// Based on https://github.com/kubernetes/kubernetes/blob/master/pkg/kubectl/cmd/exec.go +func (s *execOnPod) ApplyChaos(pod v1.Pod) error { + var container string + if s.containerName != "" { + for _, c := range pod.Spec.Containers { + container = c.Name; + } + } + + req := s.client.Post(). + Resource("pods"). + Name(pod.Name). + Namespace(pod.Namespace). + SubResource("exec"). + Param("container", container) + req.VersionedParams(&v1.PodExecOptions{ + Container: container, + Command: s.command, + Stdin: false, + Stdout: true, + Stderr: true, + TTY: false, + }, scheme.ParameterCodec) + + exec, err := remotecommand.NewSPDYExecutor(s.config, "POST", req.URL()) + if err != nil { + return err + } + // TODO: Collect stderr/stdout in RAM and log + err = exec.Stream(remotecommand.StreamOptions{ + Stdin: nil, + Stdout: os.Stdout, + Stderr: os.Stderr, + Tty: false, + TerminalSizeQueue: nil, + }) + + return err +} +func (s *execOnPod) Name() string { return fmt.Sprintf("exec '%v'", s.command) } +var _ ChaosAction = &execOnPod{} \ No newline at end of file diff --git a/chaoskube/chaoskube.go b/chaoskube/chaoskube.go index d734f8db..81253c56 100644 --- a/chaoskube/chaoskube.go +++ b/chaoskube/chaoskube.go @@ -40,8 +40,8 @@ type Chaoskube struct { MinimumAge time.Duration // an instance of logrus.StdLogger to write log messages to Logger log.FieldLogger - // dry run will not allow any pod terminations - DryRun bool + // action taken against victim pods + Action ChaosAction // a function to retrieve the current time Now func() time.Time } @@ -65,8 +65,8 @@ var ( // * a list of weekdays, times of day and/or days of a year when chaos mode is disabled // * a time zone to apply to the aforementioned time-based filters // * a logger implementing logrus.FieldLogger to send log output to -// * whether to enable/disable dry-run mode -func New(client kubernetes.Interface, labels, annotations, namespaces labels.Selector, excludedWeekdays []time.Weekday, excludedTimesOfDay []util.TimePeriod, excludedDaysOfYear []time.Time, timezone *time.Location, minimumAge time.Duration, logger log.FieldLogger, dryRun bool) *Chaoskube { +// * what specific action to use to imbue chaos on victim pods +func New(client kubernetes.Interface, labels, annotations, namespaces labels.Selector, excludedWeekdays []time.Weekday, excludedTimesOfDay []util.TimePeriod, excludedDaysOfYear []time.Time, timezone *time.Location, minimumAge time.Duration, logger log.FieldLogger, action ChaosAction) *Chaoskube { return &Chaoskube{ Client: client, Labels: labels, @@ -78,7 +78,7 @@ func New(client kubernetes.Interface, labels, annotations, namespaces labels.Sel Timezone: timezone, MinimumAge: minimumAge, Logger: logger, - DryRun: dryRun, + Action: action, Now: time.Now, } } @@ -135,7 +135,7 @@ func (c *Chaoskube) TerminateVictim() error { return err } - return c.DeletePod(victim) + return c.ApplyChaos(victim) } // Victim returns a random pod from the list of Candidates. @@ -179,19 +179,15 @@ func (c *Chaoskube) Candidates() ([]v1.Pod, error) { return pods, nil } -// DeletePod deletes the given pod. +// ApplyChaos deletes the given pod. // It will not delete the pod if dry-run mode is enabled. -func (c *Chaoskube) DeletePod(victim v1.Pod) error { +func (c *Chaoskube) ApplyChaos(victim v1.Pod) error { c.Logger.WithFields(log.Fields{ "namespace": victim.Namespace, "name": victim.Name, - }).Info("terminating pod") + }).Info(c.Action.Name()) - if c.DryRun { - return nil - } - - return c.Client.CoreV1().Pods(victim.Namespace).Delete(victim.Name, nil) + return c.Action.ApplyChaos(victim) } // filterByNamespaces filters a list of pods by a given namespace selector. diff --git a/chaoskube/chaoskube_test.go b/chaoskube/chaoskube_test.go index 6566f5f0..42aeee33 100644 --- a/chaoskube/chaoskube_test.go +++ b/chaoskube/chaoskube_test.go @@ -218,7 +218,7 @@ func (suite *Suite) TestDeletePod() { victim := util.NewPod("default", "foo", v1.PodRunning) - err := chaoskube.DeletePod(victim) + err := chaoskube.ApplyChaos(victim) suite.Require().NoError(err) suite.assertLog(log.InfoLevel, "terminating pod", log.Fields{"namespace": "default", "name": "foo"}) diff --git a/main.go b/main.go index 1a06581f..1ebacde3 100644 --- a/main.go +++ b/main.go @@ -18,9 +18,11 @@ import ( "k8s.io/client-go/kubernetes" _ "k8s.io/client-go/plugin/pkg/client/auth" "k8s.io/client-go/tools/clientcmd" + restclient "k8s.io/client-go/rest" "github.com/linki/chaoskube/chaoskube" "github.com/linki/chaoskube/util" + "strings" ) var ( @@ -42,6 +44,8 @@ var ( dryRun bool debug bool metricsAddress string + exec string + execContainer string ) func init() { @@ -59,6 +63,8 @@ func init() { kingpin.Flag("kubeconfig", "Path to a kubeconfig file").StringVar(&kubeconfig) kingpin.Flag("interval", "Interval between Pod terminations").Default("10m").DurationVar(&interval) kingpin.Flag("dry-run", "If true, don't actually do anything.").Default("true").BoolVar(&dryRun) + kingpin.Flag("exec", "Execute the given terminal command on victim pods, rather than deleting pods, eg killall -9 bash").StringVar(&exec) + kingpin.Flag("exec-container", "Name of container to run --exec command in, defaults to first container in spec").Default("").StringVar(&execContainer) kingpin.Flag("debug", "Enable debug logging.").BoolVar(&debug) kingpin.Flag("metrics-address", "Listening address for metrics handler").Default(":8080").StringVar(&metricsAddress) } @@ -84,9 +90,11 @@ func main() { "kubeconfig": kubeconfig, "interval": interval, "dryRun": dryRun, + "exec": exec, + "execContainer": execContainer, "debug": debug, "metricsAddress": metricsAddress, - }).Debug("reading config") + }).Info("reading config") log.WithFields(log.Fields{ "version": version, @@ -94,7 +102,12 @@ func main() { "interval": interval, }).Info("starting up") - client, err := newClient() + config, err := newConfig() + if err != nil { + log.WithField("err", err).Fatal("failed to determine k8s client config") + } + + client, err := newClient(config) if err != nil { log.WithField("err", err).Fatal("failed to connect to cluster") } @@ -149,6 +162,15 @@ func main() { "offset": offset / int(time.Hour/time.Second), }).Info("setting timezone") + var action chaoskube.ChaosAction + if dryRun { + action = chaoskube.NewDryRunAction() + } else if len(exec) > 0 { + action = chaoskube.NewExecAction(client.CoreV1().RESTClient(), config, execContainer, strings.Split(exec, " ")) + } else { + action = chaoskube.NewDeletePodAction(client) + } + chaoskube := chaoskube.New( client, labelSelector, @@ -160,7 +182,7 @@ func main() { parsedTimezone, minimumAge, log.StandardLogger(), - dryRun, + action, ) if metricsAddress != "" { @@ -205,7 +227,7 @@ func main() { chaoskube.Run(ctx, ticker.C) } -func newClient() (*kubernetes.Clientset, error) { +func newConfig() (*restclient.Config, error) { if kubeconfig == "" { if _, err := os.Stat(clientcmd.RecommendedHomeFile); err == nil { kubeconfig = clientcmd.RecommendedHomeFile @@ -222,6 +244,10 @@ func newClient() (*kubernetes.Clientset, error) { return nil, err } + return config, nil +} + +func newClient(config *restclient.Config) (*kubernetes.Clientset, error) { client, err := kubernetes.NewForConfig(config) if err != nil { return nil, err From 0a9f2af84dad21bd39febadc473a92d9f56f1e59 Mon Sep 17 00:00:00 2001 From: Martin Linkhorst Date: Mon, 22 Oct 2018 23:56:34 +0200 Subject: [PATCH 2/3] chore: remove gracePeriod from New as it's part of the action --- chaoskube/chaoskube.go | 3 +-- chaoskube/chaoskube_test.go | 8 ++------ main.go | 1 - 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/chaoskube/chaoskube.go b/chaoskube/chaoskube.go index 01abd7a7..b5bb451a 100644 --- a/chaoskube/chaoskube.go +++ b/chaoskube/chaoskube.go @@ -73,7 +73,7 @@ var ( // * a logger implementing logrus.FieldLogger to send log output to // * what specific action to use to imbue chaos on victim pods // * whether to enable/disable event creation -func New(client kubernetes.Interface, labels, annotations, namespaces labels.Selector, excludedWeekdays []time.Weekday, excludedTimesOfDay []util.TimePeriod, excludedDaysOfYear []time.Time, timezone *time.Location, minimumAge time.Duration, logger log.FieldLogger, action ChaosAction, createEvent bool, gracePeriod time.Duration) *Chaoskube { +func New(client kubernetes.Interface, labels, annotations, namespaces labels.Selector, excludedWeekdays []time.Weekday, excludedTimesOfDay []util.TimePeriod, excludedDaysOfYear []time.Time, timezone *time.Location, minimumAge time.Duration, logger log.FieldLogger, action ChaosAction, createEvent bool) *Chaoskube { return &Chaoskube{ Client: client, Labels: labels, @@ -87,7 +87,6 @@ func New(client kubernetes.Interface, labels, annotations, namespaces labels.Sel Logger: logger, Action: action, CreateEvent: createEvent, - GracePeriod: gracePeriod, Now: time.Now, } } diff --git a/chaoskube/chaoskube_test.go b/chaoskube/chaoskube_test.go index 47bb88c5..ac35caf0 100644 --- a/chaoskube/chaoskube_test.go +++ b/chaoskube/chaoskube_test.go @@ -43,8 +43,7 @@ func (suite *Suite) TestNew() { excludedTimesOfDay = []util.TimePeriod{util.TimePeriod{}} excludedDaysOfYear = []time.Time{time.Now()} minimumAge = time.Duration(42) - action = NewDeletePodAction(client, -1) - gracePeriod = 10 * time.Second + action = NewDeletePodAction(client, 10*time.Second) ) chaoskube := New( @@ -60,7 +59,6 @@ func (suite *Suite) TestNew() { logger, action, true, - gracePeriod, ) suite.Require().NotNil(chaoskube) @@ -75,7 +73,6 @@ func (suite *Suite) TestNew() { suite.Equal(minimumAge, chaoskube.MinimumAge) suite.Equal(logger, chaoskube.Logger) suite.Equal(action, chaoskube.Action) - suite.Equal(gracePeriod, chaoskube.GracePeriod) } // TestRunContextCanceled tests that a canceled context will exit the Run function. @@ -603,7 +600,7 @@ func (suite *Suite) setup(labelSelector labels.Selector, annotations labels.Sele if dryRun { action = NewDryRunAction() } else { - action = NewDeletePodAction(client, -1) + action = NewDeletePodAction(client, gracePeriod) } return New( @@ -619,7 +616,6 @@ func (suite *Suite) setup(labelSelector labels.Selector, annotations labels.Sele logger, action, createEvent, - gracePeriod, ) } diff --git a/main.go b/main.go index 8346ec50..e0b71114 100644 --- a/main.go +++ b/main.go @@ -191,7 +191,6 @@ func main() { log.StandardLogger(), action, createEvent, - gracePeriod, ) if metricsAddress != "" { From 1b5de0f342117975d3e10523ab95441065431deb Mon Sep 17 00:00:00 2001 From: Martin Linkhorst Date: Wed, 24 Oct 2018 11:56:43 +0200 Subject: [PATCH 3/3] fix: add missing dependency to lock file --- Gopkg.lock | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Gopkg.lock b/Gopkg.lock index a219e410..636e5028 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -66,6 +66,17 @@ pruneopts = "" revision = "01aeca54ebda6e0fbfafd0a524d234159c05ec20" +[[projects]] + branch = "master" + digest = "1:d6c13a378213e3de60445e49084b8a0a9ce582776dfc77927775dbeb3ff72a35" + name = "github.com/docker/spdystream" + packages = [ + ".", + "spdy", + ] + pruneopts = "" + revision = "6480d4af844c189cf5dd913db24ddd339d3a4f85" + [[projects]] digest = "1:a31fbb19d2b38d50bc125d97b7c3e7a286d3f6f37d18756011eb6e7d1a9fa7d0" name = "github.com/ghodss/yaml" @@ -428,9 +439,12 @@ "pkg/util/clock", "pkg/util/errors", "pkg/util/framer", + "pkg/util/httpstream", + "pkg/util/httpstream/spdy", "pkg/util/intstr", "pkg/util/json", "pkg/util/net", + "pkg/util/remotecommand", "pkg/util/runtime", "pkg/util/sets", "pkg/util/validation", @@ -439,6 +453,7 @@ "pkg/util/yaml", "pkg/version", "pkg/watch", + "third_party/forked/golang/netutil", "third_party/forked/golang/reflect", ] pruneopts = "" @@ -529,8 +544,11 @@ "tools/clientcmd/api/v1", "tools/metrics", "tools/reference", + "tools/remotecommand", "transport", + "transport/spdy", "util/cert", + "util/exec", "util/flowcontrol", "util/homedir", "util/integer", @@ -557,8 +575,10 @@ "k8s.io/client-go/kubernetes/fake", "k8s.io/client-go/kubernetes/scheme", "k8s.io/client-go/plugin/pkg/client/auth", + "k8s.io/client-go/rest", "k8s.io/client-go/tools/clientcmd", "k8s.io/client-go/tools/reference", + "k8s.io/client-go/tools/remotecommand", ] solver-name = "gps-cdcl" solver-version = 1