From 535fbd6cecb2b64e9f122f87ade1e870baf26b1b Mon Sep 17 00:00:00 2001 From: Jan Schlicht Date: Tue, 14 Apr 2020 10:42:51 +0200 Subject: [PATCH] Repair cassandra nodes per pod using the 'REPAIR_POD' parameter (#77) When updating the 'REPAIR_POD' parameter a plan will be triggered that runs 'nodetool repair' for the pod specified in the parameter. Signed-off-by: Jan Schlicht --- operator/operator.yaml | 15 +++++++ operator/params.yaml | 9 +++++ operator/templates/repair-job-rbac.yaml | 31 ++++++++++++++ operator/templates/repair-job.yaml | 18 +++++++++ templates/operator/operator.yaml.template | 15 +++++++ templates/operator/params.yaml.template | 9 +++++ tests/cassandra/cassandra.go | 49 ++++++++++++++++++----- tests/suites/sanity/sanity_test.go | 16 ++++++++ 8 files changed, 153 insertions(+), 9 deletions(-) create mode 100644 operator/templates/repair-job-rbac.yaml create mode 100644 operator/templates/repair-job.yaml diff --git a/operator/operator.yaml b/operator/operator.yaml index 97421677..b7bc17df 100644 --- a/operator/operator.yaml +++ b/operator/operator.yaml @@ -37,6 +37,12 @@ tasks: parameter: SERVICE_ACCOUNT_INSTALL resources: - node-resolver-rbac.yaml + - name: repair-pod + kind: Apply + spec: + resources: + - repair-job-rbac.yaml + - repair-job.yaml plans: deploy: strategy: serial @@ -54,3 +60,12 @@ plans: tasks: - node - ext-service + repair-pod: + strategy: serial + phases: + - name: nodes + strategy: parallel + steps: + - name: repair + tasks: + - repair-pod diff --git a/operator/params.yaml b/operator/params.yaml index 312bc943..15f4270d 100644 --- a/operator/params.yaml +++ b/operator/params.yaml @@ -892,3 +892,12 @@ parameters: - name: POD_MANAGEMENT_POLICY description: "podManagementPolicy of the Cassandra Statefulset" default: "OrderedReady" + + ################################################################################ + ################################ Repair options ################################ + ################################################################################ + + - name: REPAIR_POD + description: "Name of the pod on which 'nodetool repair' should be run." + default: "" + trigger: repair-pod diff --git a/operator/templates/repair-job-rbac.yaml b/operator/templates/repair-job-rbac.yaml new file mode 100644 index 00000000..d4ab448a --- /dev/null +++ b/operator/templates/repair-job-rbac.yaml @@ -0,0 +1,31 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ .Name }}-node-repair-role + namespace: {{ .Namespace }} +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get"] + - apiGroups: [""] + resources: ["pods/exec"] + verbs: ["create"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ .Name }}-node-repairer + namespace: {{ .Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ .Name }}-node-repairer-{{ .Namespace }}-binding +subjects: + - kind: ServiceAccount + name: {{ .Name }}-node-repairer + namespace: {{ .Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ .Name }}-node-repair-role diff --git a/operator/templates/repair-job.yaml b/operator/templates/repair-job.yaml new file mode 100644 index 00000000..124aa660 --- /dev/null +++ b/operator/templates/repair-job.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ $.Name }}-node-repair-job + namespace: {{ $.Namespace }} + labels: + cassandra: {{ $.OperatorName }} + app: {{ $.Name }} +spec: + template: + spec: + containers: + - name: repair-job + image: bitnami/kubectl:1.18.0 + command: [ "kubectl", "exec", "{{ $.Params.REPAIR_POD }}", "--", "nodetool", "repair" ] + restartPolicy: Never + serviceAccountName: {{ .Name }}-node-repairer diff --git a/templates/operator/operator.yaml.template b/templates/operator/operator.yaml.template index de9bd02a..876f0a32 100644 --- a/templates/operator/operator.yaml.template +++ b/templates/operator/operator.yaml.template @@ -37,6 +37,12 @@ tasks: parameter: SERVICE_ACCOUNT_INSTALL resources: - node-resolver-rbac.yaml + - name: repair-pod + kind: Apply + spec: + resources: + - repair-job-rbac.yaml + - repair-job.yaml plans: deploy: strategy: serial @@ -54,3 +60,12 @@ plans: tasks: - node - ext-service + repair-pod: + strategy: serial + phases: + - name: nodes + strategy: parallel + steps: + - name: repair + tasks: + - repair-pod diff --git a/templates/operator/params.yaml.template b/templates/operator/params.yaml.template index 13cc641a..248dc0d6 100644 --- a/templates/operator/params.yaml.template +++ b/templates/operator/params.yaml.template @@ -892,3 +892,12 @@ parameters: - name: POD_MANAGEMENT_POLICY description: "podManagementPolicy of the Cassandra Statefulset" default: "OrderedReady" + + ################################################################################ + ################################ Repair options ################################ + ################################################################################ + + - name: REPAIR_POD + description: "Name of the pod on which 'nodetool repair' should be run." + default: "" + trigger: repair-pod diff --git a/tests/cassandra/cassandra.go b/tests/cassandra/cassandra.go index ae74bf97..3aaf7819 100644 --- a/tests/cassandra/cassandra.go +++ b/tests/cassandra/cassandra.go @@ -87,7 +87,7 @@ func OverrideOperatorVersion( return operatorVersion, desiredOperatorVersion, nil } -func firstPodName(instance kudo.Instance) (string, error) { +func FirstPodName(instance kudo.Instance) (string, error) { if instance.Spec.Parameters["NODE_TOPOLOGY"] != "" { topology, err := TopologyFromYaml(instance.Spec.Parameters["NODE_TOPOLOGY"]) if err != nil { @@ -100,7 +100,7 @@ func firstPodName(instance kudo.Instance) (string, error) { } func Nodes(client client.Client, instance kudo.Instance) ([]map[string]string, error) { - podName, err := firstPodName(instance) + podName, err := FirstPodName(instance) if err != nil { return nil, err } @@ -180,7 +180,7 @@ func Nodes(client client.Client, instance kudo.Instance) ([]map[string]string, e // Cqlsh Wrapper to run cql commands in the cqlsh cli of cassandra 0th node func Cqlsh(client client.Client, instance kudo.Instance, cql string) (string, error) { - podName, err := firstPodName(instance) + podName, err := FirstPodName(instance) if err != nil { return "", err } @@ -249,11 +249,15 @@ func NodeJVMOptions(client client.Client, instance kudo.Instance) (map[string]st ",") } -func configurationFromNodeLogs( - client client.Client, - instance kudo.Instance, - regex string, - separator string) (map[string]string, error) { +func NodeWasRepaired(client client.Client, instance kudo.Instance) (bool, error) { + return nodeLogsContain( + client, + instance, + "o.a.cassandra.repair.RepairRunnable - Starting repair command", + ) +} + +func nodeLogs(client client.Client, instance kudo.Instance) ([]byte, error) { podName := fmt.Sprintf("%s-%s-%d", instance.Name, "node", 0) pod, err := kubernetes.GetPod(client, podName, instance.Namespace) @@ -261,7 +265,15 @@ func configurationFromNodeLogs( return nil, err } - logs, err := pod.ContainerLogs("cassandra") + return pod.ContainerLogs("cassandra") +} + +func configurationFromNodeLogs( + client client.Client, + instance kudo.Instance, + regex string, + separator string) (map[string]string, error) { + logs, err := nodeLogs(client, instance) if err != nil { return nil, err } @@ -288,3 +300,22 @@ func configurationFromNodeLogs( return configuration, nil } + +func nodeLogsContain(client client.Client, instance kudo.Instance, expected string) (bool, error) { + logs, err := nodeLogs(client, instance) + if err != nil { + return false, err + } + + scanner := bufio.NewScanner(bytes.NewReader(logs)) + + var inLogs bool + for scanner.Scan() { + inLogs = strings.Contains(scanner.Text(), expected) + if inLogs { + break + } + } + + return inLogs, nil +} diff --git a/tests/suites/sanity/sanity_test.go b/tests/suites/sanity/sanity_test.go index fe4d0823..a4672e70 100644 --- a/tests/suites/sanity/sanity_test.go +++ b/tests/suites/sanity/sanity_test.go @@ -219,6 +219,22 @@ var _ = Describe(TestName, func() { Expect(err).To(BeNil()) assertNumberOfCassandraNodes(NodeCount) + + By("Triggering a Cassandra node repair") + podName, err := cassandra.FirstPodName(Operator.Instance) + Expect(err).To(BeNil()) + + err = Operator.Instance.UpdateParameters(map[string]string{ + "REPAIR_POD": podName, + }) + Expect(err).To(BeNil()) + + err = Operator.Instance.WaitForPlanComplete("repair-pod") + Expect(err).To(BeNil()) + + repair, err := cassandra.NodeWasRepaired(Client, Operator.Instance) + Expect(err).To(BeNil()) + Expect(repair).To(BeTrue()) }) It("Uninstalls the operator", func() {