Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add prestop hook #665

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ Changelog
Unreleased
----------

* Add preStop hook to the CrateDB pods to ensure that the CrateDB process is
stopped gracefully.

2.42.0 (2024-10-02)
-------------------

Expand Down
3 changes: 3 additions & 0 deletions crate/operator/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@
GRAND_CENTRAL_BACKEND_API_PORT = 5050
GRAND_CENTRAL_PROMETHEUS_PORT = 8000

# Pod ``terminationGracePeriodSeconds`` set on the CrateDB StatefulSet pods.
# Keep this larger than DECOMMISSION_TIMEOUT: kubelet watches this timer and
# would otherwise terminate the containers while the preStop hook is still
# waiting for the decommission to finish.
TERMINATION_GRACE_PERIOD_SECONDS = 900
# Value handed to ``dc_util`` via ``-timeout`` in the preStop hook.
DECOMMISSION_TIMEOUT = "720s"


class CloudProvider(str, enum.Enum):
AWS = "aws"
Expand Down
82 changes: 82 additions & 0 deletions crate/operator/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
AppsV1Api,
CoreV1Api,
PolicyV1Api,
RbacAuthorizationV1Api,
RbacV1Subject,
V1Affinity,
V1Capabilities,
V1ConfigMap,
Expand All @@ -41,10 +43,13 @@
V1EmptyDirVolumeSource,
V1EnvVar,
V1EnvVarSource,
V1ExecAction,
V1HTTPGetAction,
V1KeyToPath,
V1LabelSelector,
V1LabelSelectorRequirement,
V1Lifecycle,
V1LifecycleHandler,
V1LocalObjectReference,
V1NodeAffinity,
V1NodeSelector,
Expand All @@ -60,8 +65,12 @@
V1PodDisruptionBudgetSpec,
V1PodSpec,
V1PodTemplateSpec,
V1PolicyRule,
V1Probe,
V1ResourceRequirements,
V1Role,
V1RoleBinding,
V1RoleRef,
V1Secret,
V1SecretKeySelector,
V1SecretVolumeSource,
Expand All @@ -82,6 +91,7 @@
from crate.operator.constants import (
API_GROUP,
DATA_PVC_NAME_PREFIX,
DECOMMISSION_TIMEOUT,
LABEL_COMPONENT,
LABEL_MANAGED_BY,
LABEL_NAME,
Expand All @@ -92,6 +102,7 @@
SHARED_NODE_TOLERATION_EFFECT,
SHARED_NODE_TOLERATION_KEY,
SHARED_NODE_TOLERATION_VALUE,
TERMINATION_GRACE_PERIOD_SECONDS,
CloudProvider,
Nodepool,
Port,
Expand Down Expand Up @@ -369,6 +380,26 @@ def get_statefulset_containers(
security_context=V1SecurityContext(
capabilities=V1Capabilities(add=["SYS_CHROOT"])
),
lifecycle=V1Lifecycle(
pre_stop=(
V1LifecycleHandler(
_exec=V1ExecAction(
command=[
    "/bin/sh",
    "-c",
    # Download the binary AND its checksum file: `sha256sum -c`
    # below reads dc_util.sha256, so fetching dc_util twice (as
    # before) made the verification - and thus the whole hook -
    # fail every time.
    # NOTE(review): the binary is committed under utils/dc_util/;
    # confirm these raw URLs actually resolve on master.
    "curl -sLO "
    "https://raw.githubusercontent.com/crate/crate-operator/master/dc_util && "  # noqa
    "curl -sLO "
    "https://raw.githubusercontent.com/crate/crate-operator/master/dc_util.sha256 && "  # noqa
    "sha256sum -c dc_util.sha256 && "
    "chmod u+x ./dc_util && \n"
    "./dc_util -min-availability PRIMARIES "
    f"-timeout {DECOMMISSION_TIMEOUT}",
]
)
)
)
),
),
]

Expand Down Expand Up @@ -822,6 +853,7 @@ def get_statefulset(
init_containers=get_statefulset_init_containers(crate_image),
volumes=get_statefulset_volumes(name, ssl),
tolerations=get_tolerations(name, logger, node_spec),
termination_grace_period_seconds=TERMINATION_GRACE_PERIOD_SECONDS,
),
),
update_strategy=V1StatefulSetUpdateStrategy(type="OnDelete"),
Expand Down Expand Up @@ -917,6 +949,56 @@ async def create_statefulset(
namespace=namespace,
body=pdb,
)
"""
A Role is required to allow the POD to access the
number of replicas in the StatefulSet. This is required for the
pre-stop lifecycle hook to work correctly and detect a scale to 0.
"""
rule = RbacAuthorizationV1Api(api_client)
role = V1Role(
metadata=V1ObjectMeta(
name=f"crate-{name}",
owner_references=owner_references,
),
rules=[
V1PolicyRule(
api_groups=["apps"],
resources=["statefulsets"],
verbs=["get", "list", "watch"],
)
],
)
await call_kubeapi(
rule.create_namespaced_role,
logger,
continue_on_conflict=True,
namespace=namespace,
body=role,
)
role_binding = V1RoleBinding(
metadata=V1ObjectMeta(
name=f"crate-{name}",
owner_references=owner_references,
),
role_ref=V1RoleRef(
api_group="rbac.authorization.k8s.io",
kind="Role",
name=f"crate-{name}",
),
subjects=[
RbacV1Subject(
kind="ServiceAccount",
name="default",
)
],
)
await call_kubeapi(
rule.create_namespaced_role_binding,
logger,
continue_on_conflict=True,
namespace=namespace,
body=role_binding,
)


def get_data_service(
Expand Down
3 changes: 3 additions & 0 deletions deploy/charts/crate-operator/templates/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ rules:
- batch
- policy
- networking.k8s.io
- rbac.authorization.k8s.io
resources:
- configmaps
- cronjobs
Expand All @@ -46,6 +47,8 @@ rules:
- services
- statefulsets
- poddisruptionbudgets
- roles
- rolebindings
verbs:
- create
- delete
Expand Down
3 changes: 3 additions & 0 deletions deploy/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ rules:
- batch
- policy
- networking.k8s.io
- rbac.authorization.k8s.io
resources:
- configmaps
- cronjobs
Expand All @@ -71,6 +72,8 @@ rules:
- services
- statefulsets
- poddisruptionbudgets
- rolebindings
- roles
verbs:
- create
- delete
Expand Down
59 changes: 59 additions & 0 deletions tests/test_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@
from typing import Set
from unittest import mock

import aiohttp
import pytest
from kubernetes_asyncio.client import (
AppsV1Api,
CoreV1Api,
CustomObjectsApi,
NetworkingV1Api,
RbacAuthorizationV1Api,
)

from crate.operator.config import config
Expand All @@ -44,6 +46,7 @@
LABEL_NAME,
LABEL_PART_OF,
RESOURCE_CRATEDB,
TERMINATION_GRACE_PERIOD_SECONDS,
CloudProvider,
)
from crate.operator.create import (
Expand Down Expand Up @@ -1212,6 +1215,54 @@ async def test_create_minimal(self, faker, namespace, kopf_runner, api_client):
{f"crate-data-hot-{name}-0"},
)

async def test_decommission_settings(
    self, faker, namespace, kopf_runner, api_client
):
    """
    Check the decommission-related resources of a freshly created cluster.

    Asserts that the StatefulSet pod template carries
    ``TERMINATION_GRACE_PERIOD_SECONDS`` and that a Role allowing ``list``
    on ``statefulsets`` plus a RoleBinding for the ``default``
    ServiceAccount exist under the name ``crate-<name>``.
    """
    rbac_api = RbacAuthorizationV1Api(api_client)
    core_api = CoreV1Api(api_client)
    custom_api = CustomObjectsApi(api_client)
    apps_api = AppsV1Api(api_client)
    name = faker.domain_word()
    ns = namespace.metadata.name
    sts_name = f"crate-data-hot-{name}"

    await start_cluster(
        name, namespace, core_api, custom_api, 1, wait_for_healthy=False
    )
    await assert_wait_for(
        True,
        self.does_statefulset_exist,
        apps_api,
        ns,
        sts_name,
    )
    await assert_wait_for(
        True,
        do_pods_exist,
        core_api,
        ns,
        {f"crate-data-hot-{name}-0"},
    )

    statefulset = await apps_api.read_namespaced_stateful_set(sts_name, ns)
    pod_spec = statefulset.spec.template.spec
    assert (
        pod_spec.termination_grace_period_seconds
        == TERMINATION_GRACE_PERIOD_SECONDS
    )

    role = await rbac_api.read_namespaced_role(f"crate-{name}", ns)
    assert any(
        "statefulsets" in rule.resources and "list" in rule.verbs
        for rule in role.rules
    ), "Role does not contain the 'list' verb for 'statefulsets'"

    rolebinding = await rbac_api.read_namespaced_role_binding(
        f"crate-{name}", ns
    )
    assert any(
        subject.kind == "ServiceAccount" and subject.name == "default"
        for subject in rolebinding.subjects
    ), "RoleBinding does not contain the expected ServiceAccount subject"

async def test_create_with_svc_annotations(
self, faker, namespace, kopf_runner, api_client
):
Expand Down Expand Up @@ -1586,3 +1637,11 @@ def test_get_cluster_resource_limits(node_spec, expected_limits_cpu):
get_cluster_resource_limits(node_spec, resource_type="cpu", fallback_key="cpus")
== expected_limits_cpu
)


@pytest.mark.asyncio
async def test_download_dc_util():
    """
    Check that every artifact the preStop hook downloads is reachable.

    The hook fetches the ``dc_util`` binary and verifies it with
    ``sha256sum -c dc_util.sha256``, so both files must be downloadable;
    if either is missing the hook aborts before decommissioning.
    """
    base_url = "https://raw.githubusercontent.com/crate/crate-operator/master"
    async with aiohttp.ClientSession() as session:
        for artifact in ("dc_util", "dc_util.sha256"):
            url = f"{base_url}/{artifact}"
            async with session.get(url) as response:
                assert (
                    response.status == 200
                ), f"Expected status 200 for {url}, got {response.status}"
87 changes: 87 additions & 0 deletions utils/dc_util/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Rolling restart with `alter cluster decommission`

While working on a cloud issue, a small tool was created so that a POD is not
simply _terminated_ by `kubelet` sending a SIGTERM; instead, a preStop hook can
first issue an `alter cluster decommission` for that node.

# What does the tool do?

First, the decommission settings are configured for the cluster. We assume that
we always want to _force_ the decommission: without _force_, CrateDB would roll the
decommission back if it came to the decision that it had failed. When the
POD/process is being terminated in Kubernetes, the shutdown cannot be canceled - therefore
_force_ is set on the CrateDB side.

Before doing that, the STS is checked for the number of replicas configured. This is done
to figure out whether a FULL stop of all PODS in the cratedb Cluster is _scheduled_. In
case of a FULL restart there is **NO** decommission sent to the cluster and the k8s shutdown
continues by sending `SIGTERM`.

To have access to the number of replicas on the STS, additional permissions need to be granted
to the ServiceAccount:

```yaml
- apiGroups: ["apps"]
resources: ["statefulsets"]
verbs: ["get", "list", "watch"]
```

This needs to be created/setup manually, or by the crate-operator.

When the decommission is sent to the cluster, the command returns almost immediately. Nevertheless,
CrateDB has started the decommissioning in the background and we need to wait until CrateDB
exits.
After that, control is _returned_ to `kubelet`, which continues by sending SIGTERM.
`Termination Grace Period` needs to be set longer than the decommission timeout, as
kubelet is monitoring this timer and would eventually assume the preStop process _hangs_
and continue with _TERMINATING_ the containers/POD.

# How to configure it?
The preStop Hook needs to be configured on the _Statefulset_ by adding something like this
to the cratedb containers configuration:

```yaml

image: crate:5.8.3
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- |
curl -sLO https://raw.githubusercontent.com/crate/crate-operator/master/dc_util && \
curl -sLO https://raw.githubusercontent.com/crate/crate-operator/master/dc_util.sha256 && \
sha256sum -c dc_util.sha256 && \
chmod u+x ./dc_util && \
./dc_util -min-availability PRIMARIES

terminationGracePeriodSeconds: 7230

```

In this example the binary is loaded from the GH repo as part of the POD's termination process. If one of these commands fails, `kubelet` continues with the SIGTERM immediately.

Upload the binary to the CDN: `scp ./dc_util root@web:/mnt/data/www/cdn.crate.io/downloads`. It seems the STDOUT messages written in the `prestop` hook do not
end up in the POD's log, so the CrateDB logs need to be checked instead.

# How to build it?

```shell
GOOS=linux go build -o dc_util dc_util.go
shasum -a 256 dc_util
```

This builds the binary for `Linux` in case you are building it on macOS. For convenience and to test locally - without running CrateDB - a tiny `http_server` is available.

# Command Line
There are a bunch of CLI parameters that can be set to fine-tune the behavior. Most
are used for testing purpose:

| Parameter | setting |
| --------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `--crate-node-prefix` | allows to customize the cratedb node names in the statefulset in case it is not the default `data-hot`. This is not to be confused with the _hostname_! |
| `--timeout` | CrateDB's default timeout is 7200s - this needs to be correlated to `TerminationGracePeriod` |
| `--pid` | For testing locally only |
| `--hostname` | Is used to derive the name of the kubernetes statefulset, the _replica number_ of the pod is _stripped_ from it, which returns the sts name. eg. `crate-data-hot-eadf76b5-c634-4f0f-abcc-7442d01cb7dd-0 -> crate-data-hot-eadf76b5-c634-4f0f-abcc-7442d01cb7dd` |
| `--min-availability` | Either `PRIMARIES` or `FULL`. Please refer to the CrateDB documentation. |
Binary file added utils/dc_util/dc_util
Binary file not shown.
Loading