support gke
cofyc committed Apr 3, 2020
1 parent 2725bfb commit e5dc491
Showing 4 changed files with 119 additions and 71 deletions.
28 changes: 5 additions & 23 deletions hack/run-e2e.sh
@@ -29,7 +29,7 @@ GCP_REGION=${GCP_REGION:-}
GCP_ZONE=${GCP_ZONE:-}
GCP_CREDENTIALS=${GCP_CREDENTIALS:-}
GCP_SDK=${GCP_SDK:-/google-cloud-sdk}
KUBE_SSH_USER=${KUBE_SSH_USER:-}
KUBE_SSH_USER=${KUBE_SSH_USER:-vagrant}
IMAGE_TAG=${IMAGE_TAG:-}
SKIP_IMAGE_LOAD=${SKIP_IMAGE_LOAD:-}
TIDB_OPERATOR_IMAGE=${TIDB_OPERATOR_IMAGE:-localhost:5000/pingcap/tidb-operator:latest}
@@ -125,27 +125,7 @@ for ((i = 1; i <= 32; i++)) {
EOF
done
elif [ "$PROVIDER" == "gke" ]; then
# disks are created under /mnt/stateful_partition directory
# https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem
for n in $($KUBECTL_BIN --context "$KUBECONTEXT" get nodes -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do
gcloud compute ssh e2e@$n --command 'sudo bash -c '"'"'
test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks
df -h /mnt/stateful_partition/disks
test -d /mnt/disks || mkdir -p /mnt/disks
cd /mnt/disks
for ((i = 1; i <= 32; i++)) {
if [ ! -d vol$i ]; then
mkdir vol$i
fi
if ! mountpoint vol$i &>/dev/null; then
if [ ! -d /mnt/stateful_partition/disks/vol$i ]; then
mkdir /mnt/stateful_partition/disks/vol$i
fi
mount --bind /mnt/stateful_partition/disks/vol$i vol$i
fi
}
'"'"
done
echo "info: provider is $PROVIDER, skipped"
elif [ "$PROVIDER" == "eks" ]; then
echo "info: provider is $PROVIDER, skipped"
fi
@@ -352,7 +332,7 @@ if [ "$PROVIDER" == "eks" ]; then
# aws credential is required to get token for EKS
-v $HOME/.aws:/root/.aws
# ~/.ssh/kube_aws_rsa must be mounted into e2e container to run ssh
-v $HOME/.ssh:/root/.ssh
-v $HOME/.ssh/kube_aws_rsa:/root/.ssh/kube_aws_rsa
)
elif [ "$PROVIDER" == "gke" ]; then
e2e_args+=(
@@ -373,6 +353,8 @@ elif [ "$PROVIDER" == "gke" ]; then
fi
docker_args+=(
-v ${GCP_SDK}:/google-cloud-sdk
# ~/.ssh/google_compute_engine must be mounted into e2e container to run ssh
-v $HOME/.ssh/google_compute_engine:/root/.ssh/google_compute_engine
)
else
e2e_args+=(
6 changes: 4 additions & 2 deletions tests/actions.go
@@ -294,7 +294,7 @@ type OperatorConfig struct {
ValidatingEnabled bool
Cabundle string
BackupImage string
AutoFailover bool
AutoFailover *bool
}

type TidbClusterConfig struct {
@@ -409,7 +409,6 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
set := map[string]string{
"operatorImage": oi.Image,
"tidbBackupManagerImage": oi.BackupImage,
"controllerManager.autoFailover": strconv.FormatBool(oi.AutoFailover),
"scheduler.logLevel": "4",
"testMode": strconv.FormatBool(oi.TestMode),
"admissionWebhook.cabundle": oi.Cabundle,
@@ -443,6 +442,9 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
if oi.Enabled(features.AdvancedStatefulSet) {
set["advancedStatefulset.create"] = "true"
}
if oi.AutoFailover != nil {
set["controllerManager.autoFailover"] = strconv.FormatBool(*oi.AutoFailover)
}

arr := make([]string, 0, len(set))
for k, v := range set {
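Note on the change above: OperatorConfig.AutoFailover moves from bool to *bool so that a nil value means "keep the chart default", while a non-nil value forces an explicit helm setting. A minimal Go sketch of that three-state behavior, using a hypothetical buildSet helper in place of OperatorHelmSetString (only pointer.BoolPtr and the controllerManager.autoFailover key come from the diff; everything else is illustrative):

package main

import (
	"fmt"
	"strconv"

	"k8s.io/utils/pointer"
)

// buildSet is a hypothetical stand-in for OperatorHelmSetString: the
// controllerManager.autoFailover key is only emitted when AutoFailover
// was explicitly set by the caller.
func buildSet(autoFailover *bool) map[string]string {
	set := map[string]string{}
	if autoFailover != nil {
		set["controllerManager.autoFailover"] = strconv.FormatBool(*autoFailover)
	}
	return set
}

func main() {
	fmt.Println(buildSet(nil))                    // map[] -- the chart default applies
	fmt.Println(buildSet(pointer.BoolPtr(false))) // explicit override, as stability.go does below
}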
132 changes: 86 additions & 46 deletions tests/e2e/tidbcluster/stability.go
@@ -48,6 +48,7 @@ import (
aggregatorclient "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
"k8s.io/utils/pointer"
"sigs.k8s.io/controller-runtime/pkg/client"
)

@@ -203,7 +204,7 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() {
Tag: cfg.OperatorTag,
LogLevel: "4",
TestMode: true,
AutoFailover: false,
AutoFailover: pointer.BoolPtr(false),
}
oa = tests.NewOperatorActions(cli, c, asCli, aggrCli, apiExtCli, tests.DefaultPollInterval, ocfg, e2econfig.TestConfig, nil, fw, f)
ginkgo.By("Installing CRDs")
@@ -243,7 +244,7 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() {
// - in EKS, failed pods on deleted node will be recreated because
// the node object is gone too (old pods is recycled by pod gc). But
// the newly created pods will be stuck at Pending state because
// assocaited PVCs reference nonexistent PVs. Pods will be recreated
// associated PVCs reference nonexistent PVs. Pods will be recreated
// by tidb-operator again when we delete associated PVCs. New PVCs
// will be created by statefulset controller and pods will be
// scheduled to feasible nodes.
@@ -257,9 +258,8 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() {
// volumes are mounted at the same paths, PD/TiKV pods will be
// running soon when underlying instance is recreated and running.
// Otherwise, we need to delete failed pods and associated PVCs/PVs.
// - PVs must be cleaned because they are reference nonexistent
// storage now. (See
// https://github.com/kubernetes-sigs/sig-storage-local-static-provisioner/issues/6#issuecomment-484062771)
// PVs must be deleted because they reference nonexistent storage
// now.
//
// Note that:
// - We assume local storage is used, otherwise PV can be re-attached
@@ -284,8 +284,7 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() {
// - https://github.com/pingcap/tidb-operator/issues/1546
// - https://github.com/pingcap/tidb-operator/issues/408
ginkgo.It("recover tidb cluster from node deletion", func() {
// TODO support GKE
supportedProviders := sets.NewString("aws")
supportedProviders := sets.NewString("aws", "gke")
if !supportedProviders.Has(framework.TestContext.Provider) {
framework.Skipf("current provider is not supported list %v, skipping", supportedProviders.List())
}
@@ -322,7 +321,7 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() {
framework.ExpectNoError(err)
for _, pod := range podList.Items {
if v, ok := pod.Labels[label.ComponentLabelKey]; !ok {
framework.Failf("pod %s/%s does not have componet label key %q", pod.Namespace, pod.Name, label.ComponentLabelKey)
framework.Failf("pod %s/%s does not have component label key %q", pod.Namespace, pod.Name, label.ComponentLabelKey)
} else if v == label.PDLabelVal {
allPDNodes[pod.Name] = allNodes[pod.Spec.NodeName]
} else if v == label.TiKVLabelVal {
@@ -384,54 +383,88 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() {
framework.ExpectNoError(err)

ginkgo.By("[AWS/EKS] Initialize newly created node")
nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
nodeList, err = c.CoreV1().Nodes().List(metav1.ListOptions{})
framework.ExpectNoError(err)
initialized := 0
for _, node := range nodeList.Items {
if _, ok := allNodes[node.Name]; !ok {
utilnode.InitNode(&node)
framework.ExpectNoError(utilnode.InitNode(&node))
initialized++
}
}
gomega.Expect(initialized).To(gomega.BeNumerically("==", 1), "must have a node initialized")
} else if framework.TestContext.Provider == "gke" {
instanceIDAnn := "container.googleapis.com/instance_id"
oldInstanceID, ok := nodeToDelete.Annotations[instanceIDAnn]
if !ok {
framework.Failf("instance label %q not found on node object %q", instanceIDAnn, nodeToDelete.Name)
}

ginkgo.By("[AWS/EKS] Mark stores of failed tikv pods as tombstone")
pdClient, cancel, err := proxiedpdclient.NewProxiedPDClient(c, fw, ns, clusterName, false, nil)
framework.ExpectNoError(err)
defer func() {
if cancel != nil {
cancel()
ginkgo.By("[GCP/GKE] Wait for instance ID to be updated")
err = wait.PollImmediate(time.Second*5, time.Minute*10, func() (bool, error) {
node, err := c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{})
if err != nil {
return false, nil
}
}()
for _, pod := range tikvPodsOnDeletedNode {
framework.Logf("Mark tikv store of pod %s/%s as Tombstone", ns, pod.Name)
err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) {
storeID, err := utiltikv.GetStoreIDByPodName(cli, ns, clusterName, pod.Name)
if err != nil {
return false, nil
}
err = pdClient.SetStoreState(storeID, pdapi.StoreStateTombstone)
if err != nil {
return false, nil
}
return true, nil
})
framework.ExpectNoError(err)
}
for _, pod := range pdPodsOnDeletedNode {
framework.Logf("Delete member of pod %s/%s", ns, pod.Name)
err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) {
err = pdClient.DeleteMember(pod.Name)
if err != nil {
return false, nil
}
return true, nil
})
framework.ExpectNoError(err)
instanceID, ok := node.Annotations[instanceIDAnn]
if !ok {
return false, nil
}
if instanceID == oldInstanceID {
return false, nil
}
framework.Logf("instance ID of node %q changed from %q to %q", nodeToDelete.Name, oldInstanceID, instanceID)
return true, nil
})
framework.ExpectNoError(err)

ginkgo.By("[GCP/GKE] Wait for the node to be ready")
e2enode.WaitForNodeToBeReady(c, nodeToDelete.Name, time.Minute*5)

ginkgo.By(fmt.Sprintf("[GCP/GKE] Initialize underlying machine of node %s", nodeToDelete.Name))
node, err := c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{})
framework.ExpectNoError(err)
framework.ExpectNoError(utilnode.InitNode(node))
}

ginkgo.By("Mark stores of failed tikv pods as tombstone")
pdClient, cancel, err := proxiedpdclient.NewProxiedPDClient(c, fw, ns, clusterName, false, nil)
framework.ExpectNoError(err)
defer func() {
if cancel != nil {
cancel()
}
cancel()
cancel = nil
}()
for _, pod := range tikvPodsOnDeletedNode {
framework.Logf("Mark tikv store of pod %s/%s as Tombstone", ns, pod.Name)
err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) {
storeID, err := utiltikv.GetStoreIDByPodName(cli, ns, clusterName, pod.Name)
if err != nil {
return false, nil
}
err = pdClient.SetStoreState(storeID, pdapi.StoreStateTombstone)
if err != nil {
return false, nil
}
return true, nil
})
framework.ExpectNoError(err)
}
for _, pod := range pdPodsOnDeletedNode {
framework.Logf("Delete member of pod %s/%s", ns, pod.Name)
err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) {
err = pdClient.DeleteMember(pod.Name)
if err != nil {
return false, nil
}
return true, nil
})
framework.ExpectNoError(err)
}
cancel()
cancel = nil

if framework.TestContext.Provider == "aws" {
// Local storage is gone with the node and local PVs on deleted
// node will be unusable.
// If `setPVOwnerRef` is enabled in local-volume-provisioner,
@@ -455,11 +488,18 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() {
}
}
} else if framework.TestContext.Provider == "gke" {
ginkgo.By("TODO support GKE")
framework.Logf("We are using fixed paths in local PVs in our e2e. PVs of the deleted node are usable though the underlying storage is empty now")
// Because of pod exponential crash loop back off, we can
// delete the failed pods to make it start soon.
// Note that this is optional.
ginkgo.By("Deleting the failed pods")
for _, pod := range append(tikvPodsOnDeletedNode, pdPodsOnDeletedNode...) {
framework.ExpectNoError(c.CoreV1().Pods(ns).Delete(pod.Name, &metav1.DeleteOptions{}))
}
}

ginkgo.By("Waiting for tidb cluster to be fully ready")
err = oa.WaitForTidbClusterReady(tc, 30*time.Minute, 15*time.Second)
err = oa.WaitForTidbClusterReady(tc, 5*time.Minute, 15*time.Second)
framework.ExpectNoError(err)
})

24 changes: 24 additions & 0 deletions tests/e2e/util/node/node.go
@@ -44,14 +44,38 @@ if ! grep -qF "OPTIONS" /etc/sysconfig/docker; then
fi
systemctl restart docker
'
`
// disks are created under /mnt/stateful_partition directory
// https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem
gkeNodeInitCmd = `
sudo bash -c '
test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks
df -h /mnt/stateful_partition/disks
test -d /mnt/disks || mkdir -p /mnt/disks
cd /mnt/disks
for ((i = 1; i <= 32; i++)) {
if [ ! -d vol$i ]; then
mkdir vol$i
fi
if ! mountpoint vol$i &>/dev/null; then
if [ ! -d /mnt/stateful_partition/disks/vol$i ]; then
mkdir /mnt/stateful_partition/disks/vol$i
fi
mount --bind /mnt/stateful_partition/disks/vol$i vol$i
fi
}
'
`
)

func InitNode(node *v1.Node) error {
var initNodeCmd string
if framework.TestContext.Provider == "aws" {
initNodeCmd = awsNodeInitCmd
} else if framework.TestContext.Provider == "gke" {
initNodeCmd = gkeNodeInitCmd
} else {
framework.Logf("Unknown provider %q, skipped", framework.TestContext.Provider)
return nil
}
return ssh.IssueSSHCommand(initNodeCmd, framework.TestContext.Provider, node)
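Usage note: the GKE branch above reuses the bind-mount script removed from hack/run-e2e.sh, now run over SSH per node. A short sketch of how a test might drive InitNode after listing nodes, mirroring the stability test; the helper name initAllNodes and the util package import path are assumptions based on the repository layout:

package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"

	utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node"
)

// initAllNodes is an illustrative helper: it re-runs the provider-specific
// init command on every node, e.g. after a node was recreated. On GKE this
// executes gkeNodeInitCmd over SSH to restore the /mnt/disks/vol* bind
// mounts; on unknown providers InitNode just logs and returns nil.
func initAllNodes(c kubernetes.Interface) {
	nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err)
	for i := range nodeList.Items {
		framework.ExpectNoError(utilnode.InitNode(&nodeList.Items[i]))
	}
}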
