diff --git a/Makefile b/Makefile index f02e8df4..90258205 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ all: build ENVVAR = GOOS=linux GOARCH=amd64 CGO_ENABLED=0 -TAG = v0.2.0 +TAG = v0.2.1 .PHONY: all build container clean -build: clean +build: clean gofmt $(ENVVAR) go build -o kube-monkey # Supressing docker build avoids printing the env variables diff --git a/README.md b/README.md index fc8be8c2..c428b7f8 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,15 @@ Opt-in is done by setting the following labels on a Kubernetes k8 app: killed approximately every third weekday. **`kube-monkey/identifier`**: A unique identifier for the k8 app (eg. the k8 app's name). This is used to identify the pods that belong to a k8 app as Pods inherit labels from their k8 app. -**`kube-monkey/kill-all`**: Set this label's value to `"kill-all"` if you want kube-monkey to kill ALL of your pods. Default behavior in the absence of this label is to kill only ONE pod. **Use this label carefully.** +**`kube-monkey/kill-mode`**: Set this label's value to +* `"kill-all"` if you want kube-monkey to kill ALL of your pods regardless of status. Does not require kill-value. Default behavior in the absence of this label is to kill only ONE pod. **Use this label carefully.** +* `fixed` if you want to kill a specific number of running pods with kill-value. If you overspecify, it will kill all running pods and issue a warning. +* `random-max-percent` to specify a maximum % with kill-value that can be killed. At the scheduled time, a uniform random specified % of the running pods will be terminated. 
+**`kube-monkey/kill-value`**: Specify value for kill-mode +* if `fixed`, provide an integer of pods to kill +* if `random-max-percent`, provide a number from 0-100 to specify the max % of pods kube-monkey can kill - -#### Example of opted-in Deployment +#### Example of opted-in Deployment killing one pod per purge ```yaml --- @@ -41,6 +46,8 @@ spec: kube-monkey/enabled: enabled kube-monkey/identifier: monkey-victim-pods kube-monkey/mtbf: '2' + kube-monkey/kill-mode: "fixed" + kube-monkey/kill-value: 1 [... omitted ...] ``` @@ -57,6 +64,8 @@ metadata: kube-monkey/enabled: enabled kube-monkey/identifier: monkey-victim kube-monkey/mtbf: '2' + kube-monkey/kill-mode: "fixed" + kube-monkey/kill-value: 1 spec: template: metadata: @@ -82,16 +91,16 @@ host="https://your-apiserver-url.com" #### Scheduling time Scheduling happens once a day on Weekdays - this is when a schedule for terminations for the current day is generated. During scheduling, kube-monkey will: -1. Generate a list of eligible k8 apps (k8 apps that have opted-in and are not blacklisted) +1. Generate a list of eligible k8 apps (k8 apps that have opted-in and are not blacklisted, if specified, and are whitelisted, if specified) 2. For each eligible k8 app, flip a biased coin (bias determined by `kube-monkey/mtbf`) to determine if a pod for that k8 app should be killed today 3. For each victim, calculate a random time when a pod will be killed #### Termination time -This is the randomly generated time during the day when a victim k8 app will have a pod killed. -At termination time, kube-monkey will: -1. Check if the k8 app is still eligible (has not opted-out or been blacklisted since scheduling) -2. Get a list of running pods for the k8 app -3. Select one random pod and delete it +This is the randomly generated time during the day when a victim k8 app will have a pod killed. +At termination time, kube-monkey will: +1. 
Check if the k8 app is still eligible (has not opted-out or been blacklisted or removed from the whitelist since scheduling) +2. Check if the k8 app has updated kill-mode and kill-value +3. Depending on kill-mode and kill-value, execute pods ## Building @@ -123,7 +132,7 @@ time_zone = "America/New_York" # Set tzdata timezone example. Note the 1. First deploy the expected `kube-monkey-config-map` configmap in the namespace you intend to run kube-monkey in (for example, the `kube-system` namespace). Make sure to define the keyname as `config.toml` -> For example `kubectl create configmap km-config --from-file=config.toml=km-config.toml` +> For example `kubectl create configmap km-config --from-file=config.toml=km-config.toml` or `kubectl apply -f km-config.yaml` 2. Run kube-monkey as a k8 app within the Kubernetes cluster, in a namespace that has permissions to kill Pods in other namespaces (eg. `kube-system`). diff --git a/chaos/chaos.go b/chaos/chaos.go index 80897832..31d62260 100644 --- a/chaos/chaos.go +++ b/chaos/chaos.go @@ -2,10 +2,12 @@ package chaos import ( "fmt" + "math/rand" "time" "github.com/golang/glog" + "github.com/asobti/kube-monkey/config" "github.com/asobti/kube-monkey/kubernetes" "github.com/asobti/kube-monkey/victims" @@ -64,6 +66,7 @@ func (c *Chaos) Execute(resultchan chan<- *ChaosResult) { err = c.terminate(clientset) if err != nil { resultchan <- c.NewResult(err) + return } // Send a success msg @@ -96,53 +99,44 @@ func (c *Chaos) verifyExecution(clientset *kube.Clientset) error { return nil } -// The termination type and termination of pods happens here +// The termination type and value is processed here func (c *Chaos) terminate(clientset *kube.Clientset) error { - // Do the termination - killAll, err := c.Victim().HasKillAll(clientset) + killType, err := c.Victim().KillType(clientset) + if err != nil { + glog.Errorf("Failed to check KillType label for %s %s. Proceeding with termination of a single pod. 
Error: %v", c.Victim().Kind(), c.Victim().Name(), err.Error()) + return c.terminatePod(clientset) + } + if killType == config.KillAllLabelValue { + return c.Victim().TerminateAllPods(clientset) + } + + killValue, err := c.Victim().KillValue(clientset) if err != nil { - glog.Errorf("Failed to check KillAll label for %s %s. Proceeding with termination of a single pod. Error: %v", c.Victim().Kind(), c.Victim().Name(), err.Error()) + glog.Errorf("Failed to check KillValue label for %s %s. Proceeding with termination of a single pod. Error: %v", c.Victim().Kind(), c.Victim().Name(), err.Error()) + return c.terminatePod(clientset) } - if killAll { - err = c.terminateAll(clientset) - } else { - err = c.terminatePod(clientset) + // Validate killtype + switch killType { + case config.KillFixedLabelValue: + return c.Victim().DeleteRandomPods(clientset, killValue) + case config.KillRandomLabelValue: + r := rand.New(rand.NewSource(time.Now().UnixNano())) + return c.Victim().DeleteRandomPods(clientset, r.Intn(killValue+1)) + default: + return fmt.Errorf("Failed to recognize KillType label %s for %s %s", 
killType, c.Victim().Kind(), c.Victim().Name()) } // Send back termination success return nil } +// Redundant for DeleteRandomPods(clientset,1) but DeleteRandomPod is faster // Terminates one random pod func (c *Chaos) terminatePod(clientset *kube.Clientset) error { return c.Victim().DeleteRandomPod(clientset) } -// Terminates ALL pods for the victim -// Not the default, or recommended, behavior -func (c *Chaos) terminateAll(clientset *kube.Clientset) error { - glog.V(1).Infof("Terminating ALL pods for %s %s\n", c.Victim().Kind(), c.Victim().Name()) - - pods, err := c.Victim().Pods(clientset) - if err != nil { - return err - } - - if len(pods) == 0 { - return fmt.Errorf("%s %s has no pods at the moment", c.Victim().Kind(), c.Victim().Name()) - } - - for _, pod := range pods { - // In case of error, log it and move on to next pod - if err = c.Victim().DeletePod(clientset, pod.Name); err != nil { - glog.Errorf("Failed to delete pod %s for %s %s", pod.Name, c.Victim().Kind(), c.Victim().Name()) - } - } - - return nil -} - // Create a ChaosResult instance func (c *Chaos) NewResult(e error) *ChaosResult { return &ChaosResult{ diff --git a/config/config.go b/config/config.go index e01ad09f..777e558c 100644 --- a/config/config.go +++ b/config/config.go @@ -21,12 +21,15 @@ const ( // Currently, there does not appear to be // any value in making these configurable // so defining them as consts - IdentLabelKey = "kube-monkey/identifier" - EnabledLabelKey = "kube-monkey/enabled" - EnabledLabelValue = "enabled" - MtbfLabelKey = "kube-monkey/mtbf" - KillAllLabelKey = "kube-monkey/kill-all" - KillAllLabelValue = "kill-all" + IdentLabelKey = "kube-monkey/identifier" + EnabledLabelKey = "kube-monkey/enabled" + EnabledLabelValue = "enabled" + MtbfLabelKey = "kube-monkey/mtbf" + KillTypeLabelKey = "kube-monkey/kill-mode" + KillValueLabelKey = "kube-monkey/kill-value" + KillRandomLabelValue = "random-max-percent" + KillFixedLabelValue = "fixed" + KillAllLabelValue = 
"kill-all" ) func SetDefaults() { diff --git a/kubernetes/kubernetes.go b/kubernetes/kubernetes.go index 77823c27..5d7afd98 100644 --- a/kubernetes/kubernetes.go +++ b/kubernetes/kubernetes.go @@ -54,4 +54,4 @@ func NewInClusterClient() (*kube.Clientset, error) { func VerifyClient(client *kube.Clientset) bool { _, err := client.ServerVersion() return err == nil -} \ No newline at end of file +} diff --git a/victims/factory/deployments/deployments.go b/victims/factory/deployments/deployments.go index 634f40b7..0e01702b 100644 --- a/victims/factory/deployments/deployments.go +++ b/victims/factory/deployments/deployments.go @@ -24,7 +24,7 @@ func New(dep *v1beta1.Deployment) (*Deployment, error) { if err != nil { return nil, err } - kind := fmt.Sprintf("%T", dep) + kind := fmt.Sprintf("%T", *dep) return &Deployment{victims.New(kind, dep.Name, dep.Namespace, ident, mtbf)}, nil } diff --git a/victims/factory/deployments/eligible_deployments.go b/victims/factory/deployments/eligible_deployments.go index 59d46002..62ec3bb8 100644 --- a/victims/factory/deployments/eligible_deployments.go +++ b/victims/factory/deployments/eligible_deployments.go @@ -3,6 +3,9 @@ package deployments //All these functions require api access specific to the version of the app import ( + "fmt" + "strconv" + "github.com/golang/glog" "github.com/asobti/kube-monkey/config" @@ -51,13 +54,37 @@ func (d *Deployment) IsEnrolled(clientset *kube.Clientset) (bool, error) { return deployment.Labels[config.EnabledLabelKey] == config.EnabledLabelValue, nil } -// Checks if the deployment is flagged for killall at this time -func (d *Deployment) HasKillAll(clientset *kube.Clientset) (bool, error) { +// Returns current killtype config label for update +func (d *Deployment) KillType(clientset *kube.Clientset) (string, error) { deployment, err := clientset.ExtensionsV1beta1().Deployments(d.Namespace()).Get(d.Name(), metav1.GetOptions{}) if err != nil { - // Ran into some error: return 'false' for killAll to be 
safe - return false, nil + return "", err + } + + killType, ok := deployment.Labels[config.KillTypeLabelKey] + if !ok { + return "", fmt.Errorf("%s %s does not have %s label", d.Kind(), d.Name(), config.KillTypeLabelKey) + } + + return killType, nil +} + +// Returns current killvalue config label for update +func (d *Deployment) KillValue(clientset *kube.Clientset) (int, error) { + deployment, err := clientset.ExtensionsV1beta1().Deployments(d.Namespace()).Get(d.Name(), metav1.GetOptions{}) + if err != nil { + return -1, err + } + + killMode, ok := deployment.Labels[config.KillValueLabelKey] + if !ok { + return -1, fmt.Errorf("%s %s does not have %s label", d.Kind(), d.Name(), config.KillValueLabelKey) + } + + killModeInt, err := strconv.Atoi(killMode) + if err != nil || killModeInt < 1 { + return -1, fmt.Errorf("Invalid value for label %s: %d", config.KillValueLabelKey, killModeInt) } - return deployment.Labels[config.KillAllLabelKey] == config.KillAllLabelValue, nil + return killModeInt, nil } diff --git a/victims/factory/factory.go b/victims/factory/factory.go index fc026a62..f75a27ec 100644 --- a/victims/factory/factory.go +++ b/victims/factory/factory.go @@ -36,25 +36,25 @@ func EligibleVictims() (eligibleVictims []victims.Victim, err error) { return nil, err } - for _, namespace := range config.WhitelistedNamespaces().UnsortedList() { - // Fetch deployments - deployments, err := deployments.EligibleDeployments(clientset, namespace, filter) - if err != nil { - //allow pass through to schedule other kinds and namespaces - glog.Warningf("Failed to fetch eligible deployments for namespace %s due to error: %s", namespace, err.Error()) - continue - } - eligibleVictims = append(eligibleVictims, deployments...) 
+ for _, namespace := range config.WhitelistedNamespaces().UnsortedList() { + // Fetch deployments + deployments, err := deployments.EligibleDeployments(clientset, namespace, filter) + if err != nil { + //allow pass through to schedule other kinds and namespaces + glog.Warningf("Failed to fetch eligible deployments for namespace %s due to error: %s", namespace, err.Error()) + continue + } + eligibleVictims = append(eligibleVictims, deployments...) - // Fetch statefulsets - statefulsets, err := statefulsets.EligibleStatefulSets(clientset, namespace, filter) - if err != nil { - //allow pass through to schedule other kinds and namespaces - glog.Warningf("Failed to fetch eligible statefulsets for namespace %s due to error: %s", namespace, err.Error()) - continue - } - eligibleVictims = append(eligibleVictims, statefulsets...) - } + // Fetch statefulsets + statefulsets, err := statefulsets.EligibleStatefulSets(clientset, namespace, filter) + if err != nil { + //allow pass through to schedule other kinds and namespaces + glog.Warningf("Failed to fetch eligible statefulsets for namespace %s due to error: %s", namespace, err.Error()) + continue + } + eligibleVictims = append(eligibleVictims, statefulsets...) 
+ } return } diff --git a/victims/factory/statefulsets/eligible_statefulsets.go b/victims/factory/statefulsets/eligible_statefulsets.go index 8c9597c9..bebfff40 100644 --- a/victims/factory/statefulsets/eligible_statefulsets.go +++ b/victims/factory/statefulsets/eligible_statefulsets.go @@ -3,6 +3,9 @@ package statefulsets //All these functions require api access specific to the version of the app import ( + "fmt" + "strconv" + "github.com/golang/glog" "github.com/asobti/kube-monkey/config" @@ -15,7 +18,7 @@ import ( // Get all eligible statefulsets that opted in (filtered by config.EnabledLabel) func EligibleStatefulSets(clientset *kube.Clientset, namespace string, filter *metav1.ListOptions) (eligVictims []victims.Victim, err error) { - enabledVictims, err := clientset.AppsV1beta1().StatefulSets(namespace).List(*filter) + enabledVictims, err := clientset.AppsV1beta1().StatefulSets(namespace).List(*filter) if err != nil { return nil, err } @@ -27,9 +30,9 @@ func EligibleStatefulSets(clientset *kube.Clientset, namespace string, filter *m continue } - // TODO: After generating whitelisting ns list, this will move to factory. - // IsBlacklisted will change to something like IsAllowedNamespace - // and will only be used to verify at time of scheduled execution + // TODO: After generating whitelisting ns list, this will move to factory. 
+ // IsBlacklisted will change to something like IsAllowedNamespace + // and will only be used to verify at time of scheduled execution if victim.IsBlacklisted() { continue } @@ -51,13 +54,37 @@ func (ss *StatefulSet) IsEnrolled(clientset *kube.Clientset) (bool, error) { return statefulset.Labels[config.EnabledLabelKey] == config.EnabledLabelValue, nil } -// Checks if the statefulset is flagged for killall at this time -func (ss *StatefulSet) HasKillAll(clientset *kube.Clientset) (bool, error) { +// Returns current killtype config label for update +func (ss *StatefulSet) KillType(clientset *kube.Clientset) (string, error) { statefulset, err := clientset.AppsV1beta1().StatefulSets(ss.Namespace()).Get(ss.Name(), metav1.GetOptions{}) if err != nil { - // Ran into some error: return 'false' for killAll to be safe - return false, nil + return "", err + } + + killType, ok := statefulset.Labels[config.KillTypeLabelKey] + if !ok { + return "", fmt.Errorf("%s %s does not have %s label", ss.Kind(), ss.Name(), config.KillTypeLabelKey) + } + + return killType, nil +} + +// Returns current killvalue config label for update +func (ss *StatefulSet) KillValue(clientset *kube.Clientset) (int, error) { + statefulset, err := clientset.AppsV1beta1().StatefulSets(ss.Namespace()).Get(ss.Name(), metav1.GetOptions{}) + if err != nil { + return -1, err + } + + killMode, ok := statefulset.Labels[config.KillValueLabelKey] + if !ok { + return -1, fmt.Errorf("%s %s does not have %s label", ss.Kind(), ss.Name(), config.KillValueLabelKey) + } + + killModeInt, err := strconv.Atoi(killMode) + if err != nil || killModeInt < 1 { + return -1, fmt.Errorf("Invalid value for label %s: %d", config.KillValueLabelKey, killModeInt) } - return statefulset.Labels[config.KillAllLabelKey] == config.KillAllLabelValue, nil + return killModeInt, nil } diff --git a/victims/factory/statefulsets/statefulsets.go b/victims/factory/statefulsets/statefulsets.go index 04b9c26f..c39c8154 100644 --- 
a/victims/factory/statefulsets/statefulsets.go +++ b/victims/factory/statefulsets/statefulsets.go @@ -24,7 +24,7 @@ func New(ss *v1beta1.StatefulSet) (*StatefulSet, error) { if err != nil { return nil, err } - kind := fmt.Sprintf("%T", ss) + kind := fmt.Sprintf("%T", *ss) return &StatefulSet{victims.New(kind, ss.Name, ss.Namespace, ident, mtbf)}, nil } diff --git a/victims/victims.go b/victims/victims.go index cdb9ec10..ecaaf1e1 100644 --- a/victims/victims.go +++ b/victims/victims.go @@ -36,8 +36,9 @@ type VictimBaseTemplate interface { type VictimSpecificApiCalls interface { // Depends on which version i.e. apps/v1 or extensions/v1beta2 - IsEnrolled(*kube.Clientset) (bool, error) - HasKillAll(*kube.Clientset) (bool, error) + IsEnrolled(*kube.Clientset) (bool, error) // Get updated enroll status + KillType(*kube.Clientset) (string, error) // Get updated kill config type + KillValue(*kube.Clientset) (int, error) // Get updated kill config value } type VictimApiCalls interface { @@ -45,7 +46,9 @@ type VictimApiCalls interface { RunningPods(*kube.Clientset) ([]v1.Pod, error) Pods(*kube.Clientset) ([]v1.Pod, error) DeletePod(*kube.Clientset, string) error - DeleteRandomPod(*kube.Clientset) error + DeleteRandomPod(*kube.Clientset) error // Deprecated, but faster than DeleteRandomPods for single pod termination + DeleteRandomPods(*kube.Clientset, int) error + TerminateAllPods(*kube.Clientset) error IsBlacklisted() bool IsWhitelisted() bool } @@ -123,6 +126,79 @@ func (v *VictimBase) DeletePod(clientset *kube.Clientset, podName string) error return clientset.CoreV1().Pods(v.namespace).Delete(podName, deleteopts) } +// Removes specified number of random pods for the victim +func (v *VictimBase) DeleteRandomPods(clientset *kube.Clientset, killNum int) error { + // Pick a target pod to delete + pods, err := v.RunningPods(clientset) + if err != nil { + return err + } + + numPods := len(pods) + switch { + case numPods == 0: + return fmt.Errorf("%s %s has no running pods at 
the moment", v.kind, v.name) + case numPods < killNum: + glog.Warningf("%s %s has only %d currently running pods, but %d terminations requested", v.kind, v.name, numPods, killNum) + fallthrough + case numPods == killNum: + glog.V(6).Infof("Killing ALL %d running pods for %s %s", numPods, v.kind, v.name) + break + case killNum == 0: + return fmt.Errorf("No terminations requested for %s %s", v.kind, v.name) + case killNum < 0: + return fmt.Errorf("Cannot request negative terminations %d for %s %s", killNum, v.kind, v.name) + case numPods > killNum: + glog.V(6).Infof("Killing %d running pods for %s %s", killNum, v.kind, v.name) + break + default: + return fmt.Errorf("unexpected behavior for terminating %s %s", v.kind, v.name) + } + + r := rand.New(rand.NewSource(time.Now().UnixNano())) + killCount := 0 + for _, i := range r.Perm(numPods) { + if killCount == killNum { + // Report success + return nil + } + targetPod := pods[i].Name + glog.V(6).Infof("Terminating pod %s for %s %s\n", targetPod, v.kind, v.name) + err = v.DeletePod(clientset, targetPod) + if err != nil { + return err + } + killCount++ + } + + // Successful termination + return nil +} + +// Terminate all pods for the victim, regardless of status +func (v *VictimBase) TerminateAllPods(clientset *kube.Clientset) error { + glog.V(2).Infof("Terminating ALL pods for %s %s\n", v.kind, v.name) + + pods, err := v.Pods(clientset) + if err != nil { + return err + } + + if len(pods) == 0 { + return fmt.Errorf("%s %s has no pods at the moment", v.kind, v.name) + } + + for _, pod := range pods { + // In case of error, log it and move on to next pod + if err = v.DeletePod(clientset, pod.Name); err != nil { + glog.Errorf("Failed to delete pod %s for %s %s", pod.Name, v.kind, v.name) + } + } + + return nil +} + +// Deprecated for DeleteRandomPods(clientset, 1) // Remove a random pod for the victim func (v *VictimBase) DeleteRandomPod(clientset *kube.Clientset) error { // Pick a target pod to delete @@ -137,7 +213,7 @@ 
func (v *VictimBase) DeleteRandomPod(clientset *kube.Clientset) error { targetPod := RandomPodName(pods) - glog.Errorf("Terminating pod %s for %s %s\n", targetPod, v.kind, v.name) + glog.V(6).Infof("Terminating pod %s for %s %s\n", targetPod, v.kind, v.name) return v.DeletePod(clientset, targetPod) }