Issue 60: Data protection / High availability #92

Merged · 34 commits · Dec 20, 2018

Commits (34)
dfe8fd9
Configure pod anti-affinity policy for Ctrlr/SS/Bookie pods
adrianmo Nov 13, 2018
03a91d6
Create PDB for segmentstore
adrianmo Nov 14, 2018
f5dedba
Create Pod Disruption Budgets for Controller, SS, and BK
adrianmo Nov 20, 2018
62b41d6
Merge branch 'master' into issue-60-data-protection-ha
adrianmo Dec 7, 2018
2eb0ffe
Bring PR up to date with master
adrianmo Dec 7, 2018
6d74a06
Merge branch 'master' into issue-60-data-protection-ha
adrianmo Dec 12, 2018
72bebfa
Allow only one Bookie disruption at a time
adrianmo Dec 12, 2018
d563a31
Bring BK quorum down to 2
adrianmo Dec 12, 2018
641c6dd
Change Controller PDB to minAvailable=1
adrianmo Dec 12, 2018
8330e6c
Add test case that validates the creation of a default pravega cluster
adrianmo Dec 14, 2018
7a3076c
Add license headers to files
adrianmo Dec 14, 2018
c5bdeca
Fail fast travis job
adrianmo Dec 14, 2018
be07e84
Revert travis fail fast as it is causing weird job executions
adrianmo Dec 14, 2018
bb125b7
Hard code expected number of pods
adrianmo Dec 17, 2018
a0bf459
Install Pravega dependencies in Travis
adrianmo Dec 17, 2018
1a4abf4
Fix tar command
adrianmo Dec 17, 2018
7c0bac9
Update Helm install process
adrianmo Dec 17, 2018
516dbba
Update Travis
adrianmo Dec 17, 2018
9308bb3
Update Travis
adrianmo Dec 17, 2018
cbe2bd1
Update Travis
adrianmo Dec 17, 2018
5898411
Update Travis
adrianmo Dec 17, 2018
f6e0caf
Reduce tier 2 storage
adrianmo Dec 17, 2018
01a0bac
Obtain pod logs if test fails
adrianmo Dec 17, 2018
504bfbb
Update Travis
adrianmo Dec 17, 2018
23f459e
Update Travis
adrianmo Dec 17, 2018
b552b95
Install nfs-common on host machine
adrianmo Dec 17, 2018
3b00b0d
Write and read data from cluster
adrianmo Dec 17, 2018
75dbf31
Force Travis to wait until tests have finished
adrianmo Dec 17, 2018
6f33b83
Run test job to validate cluster
adrianmo Dec 18, 2018
3941faf
Fix e2e test
adrianmo Dec 18, 2018
022d224
Increase test job timeout
adrianmo Dec 18, 2018
7b5e535
Fix pod readiness check
adrianmo Dec 18, 2018
181112b
Merge branch 'master' into issue-60-data-protection-ha
adrianmo Dec 20, 2018
34b0edc
Remove hardcoded bookkeeper ensemble
adrianmo Dec 20, 2018
17 changes: 14 additions & 3 deletions deploy/role.yaml
@@ -30,6 +30,12 @@ rules:
- statefulsets
verbs:
- "*"
- apiGroups:
- policy
resources:
- poddisruptionbudgets
verbs:
- "*"

---

@@ -38,6 +44,11 @@ apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pravega-operator
rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "watch", "list"]
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- watch
- list
23 changes: 23 additions & 0 deletions pkg/controller/pravega/bookie.go
@@ -15,7 +15,9 @@ import (
"github.com/pravega/pravega-operator/pkg/util"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
policyv1beta1 "k8s.io/api/policy/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

const (
@@ -144,6 +146,7 @@ func makeBookiePodSpec(clusterName string, bookkeeperSpec *v1alpha1.BookkeeperSp
},
},
},
Affinity: util.PodAntiAffinity("bookie", clusterName),
}

if bookkeeperSpec.ServiceAccountName != "" {
@@ -195,3 +198,23 @@ func MakeBookieConfigMap(pravegaCluster *v1alpha1.PravegaCluster) *corev1.Config
Data: configData,
}
}

func MakeBookiePodDisruptionBudget(pravegaCluster *v1alpha1.PravegaCluster) *policyv1beta1.PodDisruptionBudget {
maxUnavailable := intstr.FromInt(1)
return &policyv1beta1.PodDisruptionBudget{
TypeMeta: metav1.TypeMeta{
Kind: "PodDisruptionBudget",
APIVersion: "policy/v1beta1",
},
ObjectMeta: metav1.ObjectMeta{
Name: util.PdbNameForBookie(pravegaCluster.Name),
Namespace: pravegaCluster.Namespace,
},
Spec: policyv1beta1.PodDisruptionBudgetSpec{
MaxUnavailable: &maxUnavailable,
Selector: &metav1.LabelSelector{
MatchLabels: util.LabelsForBookie(pravegaCluster),
},
},
}
}
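
Side note for reviewers: util.PodAntiAffinity, which this file (and the controller and segment store specs below) now attaches to the pod spec, is not part of this diff. A minimal sketch of the kind of spec such a helper could return, assuming a soft anti-affinity rule keyed on the node hostname and on labels similar to the ones used by the PDB selectors (both assumptions):

  # Hypothetical shape of the anti-affinity attached to bookie pods of a cluster
  # named "example"; the real util.PodAntiAffinity output may differ, e.g. it
  # could use a hard "required" rule instead of a soft "preferred" one.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          topologyKey: kubernetes.io/hostname   # spread replicas across nodes
          labelSelector:
            matchLabels:
              app: bookie                       # label keys are assumptions
              pravega_cluster: example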
23 changes: 23 additions & 0 deletions pkg/controller/pravega/pravega_controller.go
@@ -19,7 +19,9 @@ import (
"github.com/pravega/pravega-operator/pkg/util"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
policyv1beta1 "k8s.io/api/policy/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

func MakeControllerDeployment(p *api.PravegaCluster) *appsv1.Deployment {
@@ -102,6 +104,7 @@ func makeControllerPodSpec(name string, pravegaSpec *api.PravegaSpec) *corev1.Po
},
},
},
Affinity: util.PodAntiAffinity("pravega-controller", name),
}

if pravegaSpec.ControllerServiceAccountName != "" {
@@ -184,3 +187,23 @@ func MakeControllerService(p *api.PravegaCluster) *corev1.Service {
},
}
}

func MakeControllerPodDisruptionBudget(pravegaCluster *api.PravegaCluster) *policyv1beta1.PodDisruptionBudget {
minAvailable := intstr.FromInt(1)
return &policyv1beta1.PodDisruptionBudget{
TypeMeta: metav1.TypeMeta{
Kind: "PodDisruptionBudget",
APIVersion: "policy/v1beta1",
},
ObjectMeta: metav1.ObjectMeta{
Name: util.PdbNameForController(pravegaCluster.Name),
Namespace: pravegaCluster.Namespace,
},
Spec: policyv1beta1.PodDisruptionBudgetSpec{
MinAvailable: &minAvailable,
Selector: &metav1.LabelSelector{
MatchLabels: util.LabelsForController(pravegaCluster),
},
},
}
}
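
For comparison with the Bookie budget above, the controller budget uses minAvailable rather than maxUnavailable. Roughly what this builder renders for a cluster named "example" in the "default" namespace; the object name and label keys come from util helpers that are not shown in this diff, so treat them as assumptions:

  # Approximate PodDisruptionBudget produced by MakeControllerPodDisruptionBudget
  # for a hypothetical cluster "example"; name and labels are assumptions.
  apiVersion: policy/v1beta1
  kind: PodDisruptionBudget
  metadata:
    name: example-pravega-controller
    namespace: default
  spec:
    minAvailable: 1   # keep at least one controller up during voluntary disruptions
    selector:
      matchLabels:
        component: pravega-controller
        pravega_cluster: example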
32 changes: 32 additions & 0 deletions pkg/controller/pravega/pravega_segmentstore.go
@@ -19,6 +19,7 @@ import (
"github.com/pravega/pravega-operator/pkg/util"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
policyv1beta1 "k8s.io/api/policy/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)
@@ -127,6 +128,7 @@ func makeSegmentstorePodSpec(pravegaCluster *api.PravegaCluster) corev1.PodSpec
},
},
},
Affinity: util.PodAntiAffinity("pravega-segmentstore", pravegaCluster.Name),
}

if pravegaSpec.SegmentStoreServiceAccountName != "" {
@@ -141,6 +143,9 @@ func makeSegmentstorePodSpec(pravegaCluster *api.PravegaCluster) corev1.PodSpec
func MakeSegmentstoreConfigMap(pravegaCluster *api.PravegaCluster) *corev1.ConfigMap {
javaOpts := []string{
"-Dpravegaservice.clusterName=" + pravegaCluster.Name,
"-Dbookkeeper.bkEnsembleSize=2",
A reviewer commented on these lines:

  We should remove these lines, and probably give the user a way to set these values.

adrianmo (Contributor, Author) replied on Dec 20, 2018:

  Good catch. I'll remove those lines, since the default value is already set to 2; that way users can override the defaults through the options section of the CR:

  ...
    pravega:
      options:
        bookkeeper.bkEnsembleSize=2
        bookkeeper.bkAckQuorumSize=2
        bookkeeper.bkWriteQuorumSize=2
  ...

"-Dbookkeeper.bkAckQuorumSize=2",
"-Dbookkeeper.bkWriteQuorumSize=2",
}

for name, value := range pravegaCluster.Spec.Pravega.Options {
@@ -326,3 +331,30 @@ func MakeSegmentStoreExternalServices(pravegaCluster *api.PravegaCluster) []*cor
}
return services
}

func MakeSegmentstorePodDisruptionBudget(pravegaCluster *api.PravegaCluster) *policyv1beta1.PodDisruptionBudget {
var maxUnavailable intstr.IntOrString

if pravegaCluster.Spec.Pravega.SegmentStoreReplicas == int32(1) {
maxUnavailable = intstr.FromInt(0)
A reviewer commented on this line:

  I'm wondering if we should remove this if clause. If there is a single instance, then in principle we never want it down. At the same time, it is not possible to guarantee that. Since the PDB policy is a soft policy, the pod will be brought down anyway if Kubernetes needs to do it, so we might as well remove the if and leave it at:

    maxUnavailable = intstr.FromInt(1)

adrianmo (Contributor, Author) replied:

  A PDB protects pods from planned disruptions, such as a node eviction. In that case, if we have 1 Segment Store replica and maxUnavailable=1, Kubernetes will first kill the pod and then start a new one on a different node, causing a temporary disruption. Whereas if maxUnavailable=0, Kubernetes will put the node eviction on hold, giving the user time to increase the replica count and create a new pod on a different node, and then delete the affected pod to resume the node eviction, resulting in zero downtime.

The reviewer responded:

  I don't understand this comment:

    "if maxUnavailable=0, Kubernetes will put the node eviction on hold, giving the user time to increase the replica count and create a new pod on a different node, and then delete the affected pod to resume the node eviction, resulting in zero downtime."

  How does the user know it is being given time to increase the replica count? Also, is it saying that we can have multiple instances temporarily? That might be worse than having none temporarily and then one back, because having multiple instances temporarily will induce two rebalances of the segment containers.

adrianmo (Contributor, Author) replied:

  "How does the user know it is being given time to increase the replica count?"

  Due to the PDB, Kubernetes will put the eviction on hold until the affected pod is deleted. It's up to the user to decide what to do: they can increase the replica count to avoid downtime, or they can just delete the pod altogether and let Kubernetes reschedule it.

  "Also, is it saying that we can have multiple instances temporarily?"

  Yes, but not necessarily: the user can just delete the pod and avoid having multiple instances temporarily.

The reviewer responded:

  I'm confused about how the user knows that a pod needs to be deleted. I'm assuming that the eviction is part of some automated process and that Kubernetes is applying a policy to decide how to bring specific pods down.

} else {
maxUnavailable = intstr.FromInt(1)
}

return &policyv1beta1.PodDisruptionBudget{
TypeMeta: metav1.TypeMeta{
Kind: "PodDisruptionBudget",
APIVersion: "policy/v1beta1",
},
ObjectMeta: metav1.ObjectMeta{
Name: util.PdbNameForSegmentstore(pravegaCluster.Name),
Namespace: pravegaCluster.Namespace,
},
Spec: policyv1beta1.PodDisruptionBudgetSpec{
MaxUnavailable: &maxUnavailable,
Selector: &metav1.LabelSelector{
MatchLabels: util.LabelsForSegmentStore(pravegaCluster),
},
},
}
}
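
Following up on the bkEnsembleSize thread above (and the final "Remove hardcoded bookkeeper ensemble" commit): once the hardcoded flags are gone, users who still need these values can set them through the CR's free-form options map, which this ConfigMap builder already iterates over. A sketch of what that override could look like; the map syntax with quoted values is an assumption, while the thread above uses key=value shorthand:

  # Hypothetical PravegaCluster spec fragment overriding the BookKeeper quorum
  # settings via the options map instead of hardcoded JVM flags.
  spec:
    pravega:
      options:
        bookkeeper.bkEnsembleSize: "2"
        bookkeeper.bkAckQuorumSize: "2"
        bookkeeper.bkWriteQuorumSize: "2"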
21 changes: 21 additions & 0 deletions pkg/controller/pravegacluster/pravegacluster_controller.go
@@ -137,6 +137,13 @@ func (r *ReconcilePravegaCluster) Reconcile(request reconcile.Request) (reconcil
}

func (r *ReconcilePravegaCluster) deployController(p *pravegav1alpha1.PravegaCluster) (err error) {
pdb := pravega.MakeControllerPodDisruptionBudget(p)
controllerutil.SetControllerReference(p, pdb, r.scheme)
err = r.client.Create(context.TODO(), pdb)
if err != nil && !errors.IsAlreadyExists(err) {
return err
}

configMap := pravega.MakeControllerConfigMap(p)
controllerutil.SetControllerReference(p, configMap, r.scheme)
err = r.client.Create(context.TODO(), configMap)
@@ -181,6 +188,13 @@ func (r *ReconcilePravegaCluster) deploySegmentStore(p *pravegav1alpha1.PravegaC
}
}

pdb := pravega.MakeSegmentstorePodDisruptionBudget(p)
controllerutil.SetControllerReference(p, pdb, r.scheme)
err = r.client.Create(context.TODO(), pdb)
if err != nil && !errors.IsAlreadyExists(err) {
return err
}

configMap := pravega.MakeSegmentstoreConfigMap(p)
controllerutil.SetControllerReference(p, configMap, r.scheme)
err = r.client.Create(context.TODO(), configMap)
@@ -209,6 +223,13 @@ func (r *ReconcilePravegaCluster) deployBookie(p *pravegav1alpha1.PravegaCluster
return err
}

pdb := pravega.MakeBookiePodDisruptionBudget(p)
controllerutil.SetControllerReference(p, pdb, r.scheme)
err = r.client.Create(context.TODO(), pdb)
if err != nil && !errors.IsAlreadyExists(err) {
return err
}

configMap := pravega.MakeBookieConfigMap(p)
controllerutil.SetControllerReference(p, configMap, r.scheme)
err = r.client.Create(context.TODO(), configMap)
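
To close the loop on the single-replica discussion above: when SegmentStoreReplicas is 1, the reconciler creates a budget that tolerates zero voluntary disruptions, which is what puts a node drain on hold until the user acts. A sketch of the resulting object for a hypothetical cluster "example"; the object name and label keys are assumptions taken from util helpers not shown in this diff:

  # Approximate PodDisruptionBudget produced by MakeSegmentstorePodDisruptionBudget
  # when SegmentStoreReplicas == 1; name and labels are assumptions.
  apiVersion: policy/v1beta1
  kind: PodDisruptionBudget
  metadata:
    name: example-segmentstore
    namespace: default
  spec:
    # With maxUnavailable set to 0, voluntary evictions (e.g. a node drain) are
    # held until the pod is deleted or the replica count is scaled up.
    maxUnavailable: 0
    selector:
      matchLabels:
        component: pravega-segmentstore
        pravega_cluster: example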