Skip to content

Commit

Permalink
feat(flagD): support zero downtime during upgrades (#731)
Browse files Browse the repository at this point in the history
<!-- Please use this template for your pull request. -->
<!-- Please use the sections that you need and delete other sections -->

## This PR
<!-- add the description of the PR here -->

- implements graceful shutdown of flagd, enabling zero downtime: on
shutdown, the readiness probe is disabled and a shutdown event is sent
to all connected SDKs
- create example manifests for deploying flagD as a standalone
Deployment
- create Makefile entry to deploy flagD to cluster
- create ZD test with README description how to run
- create Makefile entry to run ZD test

### Related Issues
<!-- add here the GitHub issue that this PR resolves if applicable -->

Fixes #728 

### Follow-up Tasks
- running ZD test as part of CI
#732

---------

Signed-off-by: odubajDT <[email protected]>
  • Loading branch information
odubajDT authored Jul 13, 2023
1 parent 46ac4a3 commit 7df8d39
Show file tree
Hide file tree
Showing 10 changed files with 258 additions and 2 deletions.
20 changes: 19 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
IMG ?= flagd:latest
PHONY: .docker-build .build .run .mockgen
PREFIX=/usr/local
ALL_GO_MOD_DIRS := $(shell find . -type f -name 'go.mod' -exec dirname {} \; | sort)

FLAGD_DEV_NAMESPACE ?= flagd-dev
ZD_TEST_NAMESPACE ?= flagd-zd-test

workspace-init: workspace-clean
go work init
$(foreach module, $(ALL_GO_MOD_DIRS), go work use $(module);)
Expand Down Expand Up @@ -67,6 +69,22 @@ mockgen: install-mockgen
generate-docs:
cd flagd; go run ./cmd/doc/main.go

.PHONY: deploy-dev-env
export IMG?= ghcr.io/open-feature/flagd:latest
# Recreate the dev namespace and deploy a standalone flagd (deployment +
# service), waiting until the rollout is available. ${IMG} is substituted
# into the manifest via envsubst.
deploy-dev-env: undeploy-dev-env
	kubectl create ns "$(FLAGD_DEV_NAMESPACE)"
	envsubst '$${IMG}' < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n "$(FLAGD_DEV_NAMESPACE)"
	kubectl apply -f config/deployments/flagd/service.yaml -n "$(FLAGD_DEV_NAMESPACE)"
	kubectl wait --for=condition=available deployment/flagd -n "$(FLAGD_DEV_NAMESPACE)" --timeout=300s

.PHONY: undeploy-dev-env
# Delete the dev namespace and everything in it; no-op when it does not exist.
undeploy-dev-env:
	kubectl delete ns "$(FLAGD_DEV_NAMESPACE)" --ignore-not-found=true

.PHONY: run-zd-test
# Run the zero-downtime test in a fresh namespace. Requires IMG and IMG_ZD
# to reference two differently tagged flagd images (see test/zero-downtime/README.md).
run-zd-test:
	kubectl delete ns "$(ZD_TEST_NAMESPACE)" --ignore-not-found=true
	kubectl create ns "$(ZD_TEST_NAMESPACE)"
	ZD_TEST_NAMESPACE="$(ZD_TEST_NAMESPACE)" FLAGD_DEV_NAMESPACE="$(FLAGD_DEV_NAMESPACE)" IMG="$(IMG)" IMG_ZD="$(IMG_ZD)" ./test/zero-downtime/zd_test.sh

# Markdown lint configuration
#
# - .markdownlintignore holds the configuration for files to be ignored
Expand Down
74 changes: 74 additions & 0 deletions config/deployments/flagd/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Standalone flagd Deployment used by the dev environment and the
# zero-downtime test. ${IMG} is substituted by envsubst (see Makefile).
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: flagd
  name: flagd
spec:
  replicas: 1
  # Surge a new pod before removing the old one so a rollout never drops
  # below one serving replica (maxUnavailable: 0).
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app: flagd
  template:
    metadata:
      labels:
        app.kubernetes.io/name: flagd
        app: flagd
    spec:
      containers:
        - name: flagd
          image: ${IMG}
          volumeMounts:
            - name: config-volume
              mountPath: /etc/flagd
          # /readyz is served on port 8014; flagd disables readiness during
          # graceful shutdown so traffic is drained before the pod stops.
          readinessProbe:
            httpGet:
              path: /readyz
              port: 8014
            initialDelaySeconds: 5
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8014
            initialDelaySeconds: 5
            periodSeconds: 60
          ports:
            # flag evaluation endpoint
            - containerPort: 8013
          args:
            - start
            - --uri
            - file:/etc/flagd/config.json
            - --debug
      volumes:
        # Flag configuration mounted from the ConfigMap below.
        - name: config-volume
          configMap:
            name: open-feature-flags
            items:
              - key: flags
                path: config.json
---
# ConfigMap holding the flag configuration for the flagd OpenFeature provider
apiVersion: v1
kind: ConfigMap
metadata:
  name: open-feature-flags
data:
  flags: |
    {
      "flags": {
        "myStringFlag": {
          "state": "ENABLED",
          "variants": {
            "key1": "val1",
            "key2": "val2"
          },
          "defaultVariant": "key1"
        }
      }
    }
10 changes: 10 additions & 0 deletions config/deployments/flagd/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Service exposing the standalone flagd deployment's evaluation API on 8013.
apiVersion: v1
kind: Service
metadata:
  name: flagd-svc
spec:
  # Matches the pod-template label set in config/deployments/flagd/deployment.yaml.
  selector:
    app.kubernetes.io/name: flagd
  ports:
    - port: 8013
      # flagd flag-evaluation port
      targetPort: 8013
7 changes: 7 additions & 0 deletions core/pkg/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ func (r *Runtime) Start() error {
return nil
})
}

defer func() {
r.Logger.Info("Shutting down server...")
r.Service.Shutdown()
r.Logger.Info("Server successfully shutdown.")
}()

g.Go(func() error {
// Readiness probe rely on the runtime
r.ServiceConfig.ReadinessProbe = r.isReady
Expand Down
13 changes: 12 additions & 1 deletion core/pkg/service/flag-evaluation/connect_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ type ConnectService struct {

serverMtx sync.RWMutex
metricsServerMtx sync.RWMutex

readinessEnabled bool
}

// NewConnectService creates a ConnectService with provided parameters
Expand All @@ -57,6 +59,7 @@ func NewConnectService(
// Serve serves services with provided configuration options
func (s *ConnectService) Serve(ctx context.Context, svcConf service.Configuration) error {
g, gCtx := errgroup.WithContext(ctx)
s.readinessEnabled = true

g.Go(func() error {
return s.startServer(svcConf)
Expand Down Expand Up @@ -152,6 +155,14 @@ func (s *ConnectService) AddMiddleware(mw middleware.IMiddleware) {
s.server.Handler = mw.Handler(s.server.Handler)
}

// Shutdown prepares the service for termination: it disables the readiness
// probe (so orchestrators stop routing new traffic to this instance) and
// broadcasts a Shutdown notification to every connected event subscriber.
//
// NOTE(review): readinessEnabled is also written in Serve and read by the
// metrics server's /readyz handler without synchronization — this looks
// like a data race; consider sync/atomic. TODO confirm.
func (s *ConnectService) Shutdown() {
	s.readinessEnabled = false
	s.eventingConfiguration.emitToAll(service.Notification{
		Type: service.Shutdown,
		Data: map[string]interface{}{},
	})
}

func (s *ConnectService) startServer(svcConf service.Configuration) error {
lis, err := s.setupServer(svcConf)
if err != nil {
Expand Down Expand Up @@ -189,7 +200,7 @@ func (s *ConnectService) startMetricsServer(svcConf service.Configuration) error
case "/healthz":
w.WriteHeader(http.StatusOK)
case "/readyz":
if svcConf.ReadinessProbe() {
if s.readinessEnabled && svcConf.ReadinessProbe() {
w.WriteHeader(http.StatusOK)
} else {
w.WriteHeader(http.StatusPreconditionFailed)
Expand Down
40 changes: 40 additions & 0 deletions core/pkg/service/flag-evaluation/connect_service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,43 @@ func TestConnectServiceNotify(t *testing.T) {
t.Error("timeout while waiting for notifications")
}
}

// TestConnectServiceShutdown verifies that Shutdown disables the readiness
// flag and emits a Shutdown notification to subscribed channels.
//
// Fixes two defects in the original test: it called Notify instead of
// Shutdown (so Shutdown itself was never exercised), and it asserted
// readinessEnabled before the goroutine ran, which passed vacuously because
// the field defaults to false when Serve was never called.
func TestConnectServiceShutdown(t *testing.T) {
	// given
	ctrl := gomock.NewController(t)
	eval := mock.NewMockIEvaluator(ctrl)

	exp := metric.NewManualReader()
	rs := resource.NewWithAttributes("testSchema")
	metricRecorder := telemetry.NewOTelRecorder(exp, rs, "my-exporter")

	service := NewConnectService(logger.NewLogger(nil, false), eval, metricRecorder)

	sChan := make(chan iservice.Notification, 1)
	eventing := service.eventingConfiguration
	eventing.subs["key"] = sChan

	// expected notification type
	ofType := iservice.Shutdown

	// when: shut the service down in a routine
	go func() {
		service.Shutdown()
	}()

	// then: wait for the shutdown notification
	timeout, cancelFunc := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancelFunc()

	select {
	case n := <-sChan:
		require.Equal(t, ofType, n.Type, "expected notification type: %s, but received %s", ofType, n.Type)
	case <-timeout.Done():
		t.Error("timeout while waiting for notifications")
	}

	// readiness must be disabled once Shutdown has emitted the notification
	require.False(t, service.readinessEnabled)
}
2 changes: 2 additions & 0 deletions core/pkg/service/iservice.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ type NotificationType string

const (
	// ConfigurationChange signals a change in the flag configuration.
	ConfigurationChange NotificationType = "configuration_change"
	// Shutdown signals connected clients that the service is going down,
	// so SDKs can react before the server becomes unavailable.
	Shutdown NotificationType = "provider_shutdown"
	// ProviderReady signals that the provider is ready.
	ProviderReady NotificationType = "provider_ready"
	// KeepAlive is a periodic heartbeat notification.
	KeepAlive NotificationType = "keep_alive"
)
Expand Down Expand Up @@ -40,6 +41,7 @@ which call the IEvaluator implementation.
type IFlagEvaluationService interface {
	// Serve serves the flag evaluation service with the provided
	// configuration options.
	Serve(ctx context.Context, svcConf Configuration) error
	// Notify delivers the given notification to subscribers.
	Notify(n Notification)
	// Shutdown disables readiness and emits a Shutdown notification to
	// connected clients, supporting zero-downtime rollouts.
	Shutdown()
}

/*
Expand Down
25 changes: 25 additions & 0 deletions test/zero-downtime/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# FlagD Zero downtime test

## How to run

Clone this repository and run the following command to deploy a standalone flagD:

```shell
IMG=your-flagd-image make deploy-dev-env
```

This will create a flagd deployment in the `flagd-dev` namespace.

To run the test, execute:

```shell
IMG=your-flagd-image IMG_ZD=your-flagd-image2 make run-zd-test
```

Please be aware that you first need to build two custom flagd images with different tags.

To build your images using Docker execute:

```shell
docker build . -t image-name:tag -f flagd/build.Dockerfile
```
25 changes: 25 additions & 0 deletions test/zero-downtime/test-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Client pod for the zero-downtime test: polls flagd's ResolveString endpoint
# roughly once per second for up to 3000 iterations and exits non-zero on the
# first response that does not contain the expected variant value ("val1").
# $FLAGD_DEV_NAMESPACE is substituted by envsubst before apply (see zd_test.sh).
apiVersion: v1
kind: Pod
metadata:
  name: test-zd
spec:
  containers:
    - name: test-zd
      image: curlimages/curl:8.1.2
      # NOTE(review): the `echo "\n\n..."` below may print the \n escapes
      # literally under the image's sh — consider printf. TODO confirm.
      # yamllint disable rule:line-length
      command:
        - 'sh'
        - '-c'
        - |
          for i in $(seq 1 3000); do
            curl -H 'Cache-Control: no-cache, no-store' -X POST flagd-svc.$FLAGD_DEV_NAMESPACE.svc.cluster.local:8013/schema.v1.Service/ResolveString?$RANDOM -d '{"flagKey":"myStringFlag","context":{}}' -H "Content-Type: application/json" > ~/out.txt
            if ! grep -q "val1" ~/out.txt
            then
              cat ~/out.txt
              echo "\n\nCannot fetch data from flagD, exiting...\n\n"
              exit 1
            fi
            sleep 1
          done
          exit 0
      # yamllint enable rule:line-length
44 changes: 44 additions & 0 deletions test/zero-downtime/zd_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/sh

# Zero-downtime test driver: repeatedly flips the flagd deployment between
# two images (IMG and IMG_ZD) while a client pod continuously polls flagd.
# Required environment variables: IMG, IMG_ZD, FLAGD_DEV_NAMESPACE,
# ZD_TEST_NAMESPACE (all set by the `run-zd-test` Makefile target).

set -eu

# Store the original flagD image so each round can roll back to it.
IMG_ORIGINAL=$IMG

# Create the pod requesting flag values from flagD.
envsubst < test/zero-downtime/test-pod.yaml | kubectl apply -f - -n "$ZD_TEST_NAMESPACE"

for count in 1 2 3; do
	# Update the flagD deployment with the second image. Export IMG so
	# envsubst (which reads the environment) sees the new value.
	IMG=$IMG_ZD
	export IMG
	envsubst < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n "$FLAGD_DEV_NAMESPACE"
	kubectl wait --for=condition=available deployment/flagd -n "$FLAGD_DEV_NAMESPACE" --timeout=30s

	# Give the client pod time to execute curl requests against flagD.
	sleep 20

	# Roll the flagD deployment back to the original image.
	IMG=$IMG_ORIGINAL
	export IMG
	envsubst < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n "$FLAGD_DEV_NAMESPACE"
	kubectl wait --for=condition=available deployment/flagd -n "$FLAGD_DEV_NAMESPACE" --timeout=30s

	# Give the client pod time to execute curl requests against flagD.
	sleep 20
done

# The pod fails only when it cannot get a proper response from curl (meaning
# we did NOT have zero downtime). If it is still running, the last curl
# request was successful.
kubectl wait --for=condition=ready pod/test-zd -n "$ZD_TEST_NAMESPACE" --timeout=30s

# A pod that failed once and recovered could be Ready again, so additionally
# require a zero restart count: every request must have returned valid data.
restart_count=$(kubectl get pods test-zd -o=jsonpath='{.status.containerStatuses[0].restartCount}' -n "$ZD_TEST_NAMESPACE")
if [ "$restart_count" -ne 0 ]; then
	echo "Restart count of the test-zd pod is not equal to zero."
	exit 1
fi

# Cleanup only when the test passed.
kubectl delete ns "$ZD_TEST_NAMESPACE" --ignore-not-found=true

1 comment on commit 7df8d39

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'Go Benchmark'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 1.30.

Benchmark suite Current: 7df8d39 Previous: 46ac4a3 Ratio
BenchmarkResolveBooleanValue/test_staticBoolFlag - ns/op 2687 ns/op 1579 ns/op 1.70
BenchmarkResolveBooleanValue/test_targetingBoolFlag - ns/op 14814 ns/op 10661 ns/op 1.39
BenchmarkResolveStringValue/test_staticStringFlag - ns/op 2152 ns/op 1640 ns/op 1.31
BenchmarkResolveStringValue/test_targetingStringFlag - ns/op 15188 ns/op 10955 ns/op 1.39
BenchmarkResolveFloatValue/test:_targetingFloatFlag - ns/op 15707 ns/op 11127 ns/op 1.41
BenchmarkResolveFloatValue/test:_staticObjectFlag - ns/op 1940 ns/op 1485 ns/op 1.31
BenchmarkResolveFloatValue/test:_disabledFlag - ns/op 2176 ns/op 1663 ns/op 1.31
BenchmarkResolveIntValue/test_staticIntFlag - ns/op 2055 ns/op 1579 ns/op 1.30
BenchmarkResolveIntValue/test_targetingNumberFlag - ns/op 13777 ns/op 9888 ns/op 1.39
BenchmarkResolveObjectValue/test_staticObjectFlag - ns/op 6936 ns/op 5196 ns/op 1.33
BenchmarkResolveObjectValue/test_targetingObjectFlag - ns/op 20114 ns/op 14467 ns/op 1.39
BenchmarkFlag_Evaluation_ResolveString/happy_path - ns/op 12687 ns/op 9517 ns/op 1.33
BenchmarkFlag_Evaluation_ResolveFloat/happy_path - ns/op 12921 ns/op 9787 ns/op 1.32
BenchmarkFlag_Evaluation_ResolveInt/happy_path - ns/op 12838 ns/op 9662 ns/op 1.33
BenchmarkFlag_Evaluation_ResolveObject/happy_path - ns/op 15750 ns/op 11423 ns/op 1.38

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.