From 02ba934e8c646aa396a4dae2e13828dc0c16706f Mon Sep 17 00:00:00 2001 From: Yecheng Fu Date: Fri, 3 Apr 2020 16:07:17 +0800 Subject: [PATCH] support gke --- hack/run-e2e.sh | 28 ++---- tests/actions.go | 6 +- tests/e2e/tidbcluster/stability.go | 139 ++++++++++++++++++----------- tests/e2e/util/node/node.go | 24 +++++ 4 files changed, 122 insertions(+), 75 deletions(-) diff --git a/hack/run-e2e.sh b/hack/run-e2e.sh index c2f3879511c..a4bc6029746 100755 --- a/hack/run-e2e.sh +++ b/hack/run-e2e.sh @@ -29,7 +29,7 @@ GCP_REGION=${GCP_REGION:-} GCP_ZONE=${GCP_ZONE:-} GCP_CREDENTIALS=${GCP_CREDENTIALS:-} GCP_SDK=${GCP_SDK:-/google-cloud-sdk} -KUBE_SSH_USER=${KUBE_SSH_USER:-} +KUBE_SSH_USER=${KUBE_SSH_USER:-vagrant} IMAGE_TAG=${IMAGE_TAG:-} SKIP_IMAGE_LOAD=${SKIP_IMAGE_LOAD:-} TIDB_OPERATOR_IMAGE=${TIDB_OPERATOR_IMAGE:-localhost:5000/pingcap/tidb-operator:latest} @@ -125,27 +125,7 @@ for ((i = 1; i <= 32; i++)) { EOF done elif [ "$PROVIDER" == "gke" ]; then - # disks are created under /mnt/stateful_partition directory - # https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem - for n in $($KUBECTL_BIN --context "$KUBECONTEXT" get nodes -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do - gcloud compute ssh e2e@$n --command 'sudo bash -c '"'"' -test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks -df -h /mnt/stateful_partition/disks -test -d /mnt/disks || mkdir -p /mnt/disks -cd /mnt/disks -for ((i = 1; i <= 32; i++)) { - if [ ! -d vol$i ]; then - mkdir vol$i - fi - if ! mountpoint vol$i &>/dev/null; then - if [ ! -d /mnt/stateful_partition/disks/vol$i ]; then - mkdir /mnt/stateful_partition/disks/vol$i - fi - mount --bind /mnt/stateful_partition/disks/vol$i vol$i - fi -} -'"'" - done + echo "info: provider is $PROVIDER, skipped" elif [ "$PROVIDER" == "eks" ]; then echo "info: provider is $PROVIDER, skipped" fi @@ -352,7 +332,7 @@ if [ "$PROVIDER" == "eks" ]; then # aws credential is required to get token for EKS -v $HOME/.aws:/root/.aws # ~/.ssh/kube_aws_rsa must be mounted into e2e container to run ssh - -v $HOME/.ssh:/root/.ssh + -v $HOME/.ssh/kube_aws_rsa:/root/.ssh/kube_aws_rsa ) elif [ "$PROVIDER" == "gke" ]; then e2e_args+=( @@ -373,6 +353,8 @@ elif [ "$PROVIDER" == "gke" ]; then fi docker_args+=( -v ${GCP_SDK}:/google-cloud-sdk + # ~/.ssh/google_compute_engine must be mounted into e2e container to run ssh + -v $HOME/.ssh/google_compute_engine:/root/.ssh/google_compute_engine ) else e2e_args+=( diff --git a/tests/actions.go b/tests/actions.go index 87f2ca7f995..1b33a815724 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -294,7 +294,7 @@ type OperatorConfig struct { ValidatingEnabled bool Cabundle string BackupImage string - AutoFailover bool + AutoFailover *bool } type TidbClusterConfig struct { @@ -409,7 +409,6 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { set := map[string]string{ "operatorImage": oi.Image, "tidbBackupManagerImage": oi.BackupImage, - "controllerManager.autoFailover": strconv.FormatBool(oi.AutoFailover), "scheduler.logLevel": "4", "testMode": strconv.FormatBool(oi.TestMode), "admissionWebhook.cabundle": oi.Cabundle, @@ -443,6 +442,9 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { if oi.Enabled(features.AdvancedStatefulSet) { set["advancedStatefulset.create"] = "true" } + if oi.AutoFailover != nil { + set["controllerManager.autoFailover"] = strconv.FormatBool(*oi.AutoFailover) + } arr := make([]string, 0, len(set)) for k, v := range 
set { diff --git a/tests/e2e/tidbcluster/stability.go b/tests/e2e/tidbcluster/stability.go index 92faa63e221..8afa4c0ea01 100644 --- a/tests/e2e/tidbcluster/stability.go +++ b/tests/e2e/tidbcluster/stability.go @@ -48,6 +48,7 @@ import ( aggregatorclient "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset" "k8s.io/kubernetes/test/e2e/framework" e2enode "k8s.io/kubernetes/test/e2e/framework/node" + "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -203,7 +204,7 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { Tag: cfg.OperatorTag, LogLevel: "4", TestMode: true, - AutoFailover: false, + AutoFailover: pointer.BoolPtr(false), } oa = tests.NewOperatorActions(cli, c, asCli, aggrCli, apiExtCli, tests.DefaultPollInterval, ocfg, e2econfig.TestConfig, nil, fw, f) ginkgo.By("Installing CRDs") @@ -243,10 +244,10 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { // - in EKS, failed pods on deleted node will be recreated because // the node object is gone too (old pods is recycled by pod gc). But // the newly created pods will be stuck at Pending state because - // assocaited PVCs reference nonexistent PVs. Pods will be recreated - // by tidb-operator again when we delete associated PVCs. New PVCs - // will be created by statefulset controller and pods will be - // scheduled to feasible nodes. + // associated PVs are invalid now. Pods will be recreated by + // tidb-operator again when we delete associated PVCs. New PVCs will + // be created by statefulset controller and pods will be scheduled to + // feasible nodes. // - it's highly recommended to enable `setPVOwnerRef` in // local-volume-provisioner, then orphan PVs will be garbaged // collected and will not cause problem even if the name of deleted @@ -257,9 +258,7 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { // volumes are mounted at the same paths, PD/TiKV pods will be // running soon when underlying instance is recreated and running. // Otherwise, we need to delete failed pods and associated PVCs/PVs. - // - PVs must be cleaned because they are reference nonexistent - // storage now. (See - // https://github.com/kubernetes-sigs/sig-storage-local-static-provisioner/issues/6#issuecomment-484062771) + // PVs must be deleted because their paths does not exist now. 
// // Note that: // - We assume local storage is used, otherwise PV can be re-attached @@ -284,8 +283,7 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { // - https://github.com/pingcap/tidb-operator/issues/1546 // - https://github.com/pingcap/tidb-operator/issues/408 ginkgo.It("recover tidb cluster from node deletion", func() { - // TODO support GKE - supportedProviders := sets.NewString("aws") + supportedProviders := sets.NewString("aws", "gke") if !supportedProviders.Has(framework.TestContext.Provider) { framework.Skipf("current provider is not supported list %v, skipping", supportedProviders.List()) } @@ -322,7 +320,7 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { framework.ExpectNoError(err) for _, pod := range podList.Items { if v, ok := pod.Labels[label.ComponentLabelKey]; !ok { - framework.Failf("pod %s/%s does not have componet label key %q", pod.Namespace, pod.Name, label.ComponentLabelKey) + framework.Failf("pod %s/%s does not have component label key %q", pod.Namespace, pod.Name, label.ComponentLabelKey) } else if v == label.PDLabelVal { allPDNodes[pod.Name] = allNodes[pod.Spec.NodeName] } else if v == label.TiKVLabelVal { @@ -384,61 +382,95 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { framework.ExpectNoError(err) ginkgo.By("[AWS/EKS] Initialize newly created node") - nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{}) + nodeList, err = c.CoreV1().Nodes().List(metav1.ListOptions{}) framework.ExpectNoError(err) initialized := 0 for _, node := range nodeList.Items { if _, ok := allNodes[node.Name]; !ok { - utilnode.InitNode(&node) + framework.ExpectNoError(utilnode.InitNode(&node)) initialized++ } } gomega.Expect(initialized).To(gomega.BeNumerically("==", 1), "must have a node initialized") + } else if framework.TestContext.Provider == "gke" { + instanceIDAnn := "container.googleapis.com/instance_id" + oldInstanceID, ok := nodeToDelete.Annotations[instanceIDAnn] + if !ok { + framework.Failf("instance label %q not found on node object %q", instanceIDAnn, nodeToDelete.Name) + } - ginkgo.By("[AWS/EKS] Mark stores of failed tikv pods as tombstone") - pdClient, cancel, err := proxiedpdclient.NewProxiedPDClient(c, fw, ns, clusterName, false, nil) - framework.ExpectNoError(err) - defer func() { - if cancel != nil { - cancel() + ginkgo.By("[GCP/GKE] Wait for instance ID to be updated") + err = wait.PollImmediate(time.Second*5, time.Minute*10, func() (bool, error) { + node, err := c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{}) + if err != nil { + return false, nil } - }() - for _, pod := range tikvPodsOnDeletedNode { - framework.Logf("Mark tikv store of pod %s/%s as Tombstone", ns, pod.Name) - err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) { - storeID, err := utiltikv.GetStoreIDByPodName(cli, ns, clusterName, pod.Name) - if err != nil { - return false, nil - } - err = pdClient.SetStoreState(storeID, pdapi.StoreStateTombstone) - if err != nil { - return false, nil - } - return true, nil - }) - framework.ExpectNoError(err) - } - for _, pod := range pdPodsOnDeletedNode { - framework.Logf("Delete member of pod %s/%s", ns, pod.Name) - err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) { - err = pdClient.DeleteMember(pod.Name) - if err != nil { - return false, nil - } - return true, nil - }) - framework.ExpectNoError(err) + instanceID, ok := node.Annotations[instanceIDAnn] + if !ok { + return false, nil + } + if instanceID == oldInstanceID { + return false, nil + 
} + framework.Logf("instance ID of node %q changed from %q to %q", nodeToDelete.Name, oldInstanceID, instanceID) + return true, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("[GCP/GKE] Wait for the node to be ready") + e2enode.WaitForNodeToBeReady(c, nodeToDelete.Name, time.Minute*5) + + ginkgo.By(fmt.Sprintf("[GCP/GKE] Initialize underlying machine of node %s", nodeToDelete.Name)) + node, err := c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + framework.ExpectNoError(utilnode.InitNode(node)) + } + + ginkgo.By("Mark stores of failed tikv pods as tombstone") + pdClient, cancel, err := proxiedpdclient.NewProxiedPDClient(c, fw, ns, clusterName, false, nil) + framework.ExpectNoError(err) + defer func() { + if cancel != nil { + cancel() } - cancel() - cancel = nil + }() + for _, pod := range tikvPodsOnDeletedNode { + framework.Logf("Mark tikv store of pod %s/%s as Tombstone", ns, pod.Name) + err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) { + storeID, err := utiltikv.GetStoreIDByPodName(cli, ns, clusterName, pod.Name) + if err != nil { + return false, nil + } + err = pdClient.SetStoreState(storeID, pdapi.StoreStateTombstone) + if err != nil { + return false, nil + } + return true, nil + }) + framework.ExpectNoError(err) + } + for _, pod := range pdPodsOnDeletedNode { + framework.Logf("Delete member of pod %s/%s", ns, pod.Name) + err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) { + err = pdClient.DeleteMember(pod.Name) + if err != nil { + return false, nil + } + return true, nil + }) + framework.ExpectNoError(err) + } + cancel() + cancel = nil + if framework.TestContext.Provider == "aws" { // Local storage is gone with the node and local PVs on deleted // node will be unusable. // If `setPVOwnerRef` is enabled in local-volume-provisioner, // local PVs will be deleted when the node object is deleted // and permanently gone in apiserver when associated PVCs are // delete here. - ginkgo.By("[AWS/EKS] Delete associated PVCs if they reference local storage PVs") + ginkgo.By("[AWS/EKS] Delete associated PVCs if they are bound with local PVs") localPVs := make([]string, 0) for _, pvcName := range pvcNamesOnDeletedNode { pvc, err := c.CoreV1().PersistentVolumeClaims(ns).Get(pvcName, metav1.GetOptions{}) @@ -455,11 +487,18 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { } } } else if framework.TestContext.Provider == "gke" { - ginkgo.By("TODO support GKE") + framework.Logf("We are using fixed paths in local PVs in our e2e. PVs of the deleted node are usable though the underlying storage is empty now") + // Because of pod exponential crash loop back off, we can + // delete the failed pods to make it start soon. + // Note that this is optional. + ginkgo.By("Deleting the failed pods") + for _, pod := range append(tikvPodsOnDeletedNode, pdPodsOnDeletedNode...) { + framework.ExpectNoError(c.CoreV1().Pods(ns).Delete(pod.Name, &metav1.DeleteOptions{})) + } } ginkgo.By("Waiting for tidb cluster to be fully ready") - err = oa.WaitForTidbClusterReady(tc, 30*time.Minute, 15*time.Second) + err = oa.WaitForTidbClusterReady(tc, 5*time.Minute, 15*time.Second) framework.ExpectNoError(err) }) diff --git a/tests/e2e/util/node/node.go b/tests/e2e/util/node/node.go index dfe0b3d1929..b8a873706fb 100644 --- a/tests/e2e/util/node/node.go +++ b/tests/e2e/util/node/node.go @@ -44,6 +44,27 @@ if ! 
grep -qF "OPTIONS" /etc/sysconfig/docker; then fi systemctl restart docker ' +` + // disks are created under /mnt/stateful_partition directory + // https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem + gkeNodeInitCmd = ` +sudo bash -c ' +test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks +df -h /mnt/stateful_partition/disks +test -d /mnt/disks || mkdir -p /mnt/disks +cd /mnt/disks +for ((i = 1; i <= 32; i++)) { + if [ ! -d vol$i ]; then + mkdir vol$i + fi + if ! mountpoint vol$i &>/dev/null; then + if [ ! -d /mnt/stateful_partition/disks/vol$i ]; then + mkdir /mnt/stateful_partition/disks/vol$i + fi + mount --bind /mnt/stateful_partition/disks/vol$i vol$i + fi +} +' ` ) @@ -51,7 +72,10 @@ func InitNode(node *v1.Node) error { var initNodeCmd string if framework.TestContext.Provider == "aws" { initNodeCmd = awsNodeInitCmd + } else if framework.TestContext.Provider == "gke" { + initNodeCmd = gkeNodeInitCmd } else { + framework.Logf("Unknown provider %q, skipped", framework.TestContext.Provider) return nil } return ssh.IssueSSHCommand(initNodeCmd, framework.TestContext.Provider, node)
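
Note (not part of the patch): the new utilnode.InitNode helper selects the provider-specific init command (the docker OPTIONS tweak on aws, the bind-mount loop above on gke) and runs it on the node over SSH via the e2e framework. On aws the deleted node comes back under a new name, so the test re-runs InitNode on every node it has not seen before; on gke the node keeps its name and the test instead waits for the container.googleapis.com/instance_id annotation to change, as shown in the stability.go hunk. Below is a minimal sketch of the aws-style re-initialization loop; the clientset c, the known set of pre-existing node names, and the utilnode import path are assumptions for illustration, not code from this patch.

package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"

	// assumed module path for tests/e2e/util/node in this repository
	utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node"
)

// reinitNewNodes runs InitNode on every node that was not in `known`,
// i.e. nodes recreated by the cloud provider after the old VM was deleted.
func reinitNewNodes(c kubernetes.Interface, known map[string]struct{}) error {
	nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return err
	}
	for i := range nodeList.Items {
		node := &nodeList.Items[i]
		if _, ok := known[node.Name]; ok {
			continue // node existed before the deletion, leave it alone
		}
		// InitNode logs and returns nil for providers other than aws/gke.
		if err := utilnode.InitNode(node); err != nil {
			return err
		}
	}
	return nil
}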
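
Note (not part of the patch): tests/actions.go changes AutoFailover from bool to *bool so that the helm value controllerManager.autoFailover is only pinned when a test explicitly asks for it; a nil pointer leaves the chart default in place, and the stability test passes pointer.BoolPtr(false). The following is a small self-contained sketch of that pattern; the type and function names are illustrative, not the ones in tests/actions.go.

package main

import (
	"fmt"
	"strconv"
)

// boolPtr stands in for k8s.io/utils/pointer.BoolPtr used by the tests.
func boolPtr(b bool) *bool { return &b }

type operatorConfig struct {
	// nil means "unset": the helm chart default is kept.
	AutoFailover *bool
}

// helmSet mirrors the conditional in OperatorHelmSetString: the value is
// only emitted when AutoFailover was explicitly set.
func helmSet(cfg operatorConfig) map[string]string {
	set := map[string]string{}
	if cfg.AutoFailover != nil {
		set["controllerManager.autoFailover"] = strconv.FormatBool(*cfg.AutoFailover)
	}
	return set
}

func main() {
	fmt.Println(helmSet(operatorConfig{}))                             // map[] -> chart default applies
	fmt.Println(helmSet(operatorConfig{AutoFailover: boolPtr(false)})) // autoFailover pinned to "false"
}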