From 8684d41e387ae40cc64cd513bbc3f7ac319360f4 Mon Sep 17 00:00:00 2001
From: Valentin Rothberg <vrothberg@redhat.com>
Date: Wed, 11 May 2022 15:02:06 +0200
Subject: [PATCH] k8systemd: run k8s workloads in systemd

Support running `podman play kube` in systemd by exploiting the
previously added "service containers".  During `play kube`, a service
container is started before all the pods and containers, and is stopped
last.  The service container communicates its conmon PID via sdnotify.

Add a new systemd template to dispatch such k8s workloads.  The argument
of the template is the path to the k8s file.  Note that the path must be
escaped for systemd not to bark:

Let's assume we have a `top.yaml` file in the home directory:
```
$ escaped=$(systemd-escape ~/top.yaml)
$ systemctl --user start podman-play-kube@$escaped.service
```

Closes: https://issues.redhat.com/browse/RUN-1287
Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
---
 .gitignore                                    |  1 +
 Makefile                                      |  5 +-
 .../system/podman-play-kube@.service.in       | 18 +++++
 libpod/container_inspect.go                   |  2 +-
 libpod/container_validate.go                  |  2 +-
 libpod/runtime_ctr.go                         | 20 +++++
 libpod/service.go                             | 15 +++-
 pkg/domain/infra/abi/containers.go            |  8 +-
 pkg/domain/infra/abi/play.go                  | 34 +++++++--
 podman.spec.rpkg                              |  2 +
 test/system/250-systemd.bats                  | 76 +++++++++++++++++++
 test/system/260-sdnotify.bats                 | 48 ++++++++++++
 test/system/700-play.bats                     | 34 +++++----
 test/system/helpers.bash                      | 13 ++++
 14 files changed, 248 insertions(+), 30 deletions(-)
 create mode 100644 contrib/systemd/system/podman-play-kube@.service.in

diff --git a/.gitignore b/.gitignore
index f6eee2fe0c..7fd55a663e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 /build/
 /conmon/
 contrib/spec/podman.spec
+contrib/systemd/*/*.service
 *.coverprofile
 coverprofile
 /.coverage
diff --git a/Makefile b/Makefile
index 3f9d56025d..de49451de3 100644
--- a/Makefile
+++ b/Makefile
@@ -832,7 +832,8 @@ install.docker-full: install.docker install.docker-docs
 ifneq (,$(findstring systemd,$(BUILDTAGS)))
 PODMAN_UNIT_FILES = contrib/systemd/auto-update/podman-auto-update.service \
 		    contrib/systemd/system/podman.service \
-		    contrib/systemd/system/podman-restart.service
+		    contrib/systemd/system/podman-restart.service \
+		    contrib/systemd/system/podman-play-kube@.service
 
 %.service: %.service.in
 	sed -e 's;@@PODMAN@@;$(BINDIR)/podman;g' $< >$@.tmp.$$ \
@@ -846,12 +847,14 @@ install.systemd: $(PODMAN_UNIT_FILES)
 	install ${SELINUXOPT} -m 644 contrib/systemd/system/podman.socket ${DESTDIR}${USERSYSTEMDDIR}/podman.socket
 	install ${SELINUXOPT} -m 644 contrib/systemd/system/podman.service ${DESTDIR}${USERSYSTEMDDIR}/podman.service
 	install ${SELINUXOPT} -m 644 contrib/systemd/system/podman-restart.service ${DESTDIR}${USERSYSTEMDDIR}/podman-restart.service
+	install ${SELINUXOPT} -m 644 contrib/systemd/system/podman-play-kube@.service ${DESTDIR}${USERSYSTEMDDIR}/podman-play-kube@.service
 # System services
 	install ${SELINUXOPT} -m 644 contrib/systemd/auto-update/podman-auto-update.service ${DESTDIR}${SYSTEMDDIR}/podman-auto-update.service
 	install ${SELINUXOPT} -m 644 contrib/systemd/auto-update/podman-auto-update.timer ${DESTDIR}${SYSTEMDDIR}/podman-auto-update.timer
 	install ${SELINUXOPT} -m 644 contrib/systemd/system/podman.socket ${DESTDIR}${SYSTEMDDIR}/podman.socket
 	install ${SELINUXOPT} -m 644 contrib/systemd/system/podman.service ${DESTDIR}${SYSTEMDDIR}/podman.service
 	install ${SELINUXOPT} -m 644 contrib/systemd/system/podman-restart.service ${DESTDIR}${SYSTEMDDIR}/podman-restart.service
+	install ${SELINUXOPT} -m 644 contrib/systemd/system/podman-play-kube@.service ${DESTDIR}${SYSTEMDDIR}/podman-play-kube@.service
 	rm -f $(PODMAN_UNIT_FILES)
 else
 install.systemd:
diff --git a/contrib/systemd/system/podman-play-kube@.service.in b/contrib/systemd/system/podman-play-kube@.service.in
new file mode 100644
index 0000000000..824f71eb09
--- /dev/null
+++ b/contrib/systemd/system/podman-play-kube@.service.in
@@ -0,0 +1,18 @@
+[Unit]
+Description=A template for running K8s workloads via podman-play-kube
+Documentation=man:podman-play-kube(1)
+Wants=network-online.target
+After=network-online.target
+RequiresMountsFor=%t/containers
+
+[Service]
+Environment=PODMAN_SYSTEMD_UNIT=%n
+Restart=no
+TimeoutStopSec=70
+ExecStart=@@PODMAN@@ play kube --replace --service-container=true %I
+ExecStop=@@PODMAN@@ play kube --down %I
+Type=notify
+NotifyAccess=all
+
+[Install]
+WantedBy=default.target
diff --git a/libpod/container_inspect.go b/libpod/container_inspect.go
index 5d809644d9..93240812da 100644
--- a/libpod/container_inspect.go
+++ b/libpod/container_inspect.go
@@ -171,7 +171,7 @@ func (c *Container) getContainerInspectData(size bool, driverData *define.Driver
 		Mounts:       inspectMounts,
 		Dependencies: c.Dependencies(),
 		IsInfra:      c.IsInfra(),
-		IsService:    c.isService(),
+		IsService:    c.IsService(),
 	}
 
 	if c.state.ConfigPath != "" {
diff --git a/libpod/container_validate.go b/libpod/container_validate.go
index d939c94e6f..cfbdd2b1ea 100644
--- a/libpod/container_validate.go
+++ b/libpod/container_validate.go
@@ -31,7 +31,7 @@ func (c *Container) validate() error {
 
 	// A container cannot be marked as an infra and service container at
 	// the same time.
-	if c.IsInfra() && c.isService() {
+	if c.IsInfra() && c.IsService() {
 		return fmt.Errorf("cannot be infra and service container at the same time: %w", define.ErrInvalidArg)
 	}
 
diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go
index df7174ac6e..0119cb2e59 100644
--- a/libpod/runtime_ctr.go
+++ b/libpod/runtime_ctr.go
@@ -644,6 +644,16 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo
 		return err
 	}
 
+	if c.IsService() {
+		canStop, err := c.canStopServiceContainer()
+		if err != nil {
+			return err
+		}
+		if !canStop {
+			return fmt.Errorf("container %s is the service container of pod(s) %s and cannot be removed without removing the pod(s)", c.ID(), strings.Join(c.state.Service.Pods, ","))
+		}
+	}
+
 	// If we're not force-removing, we need to check if we're in a good
 	// state to remove.
 	if !force {
@@ -903,6 +913,16 @@ func (r *Runtime) evictContainer(ctx context.Context, idOrName string, removeVol
 		}
 	}
 
+	if c.IsService() {
+		canStop, err := c.canStopServiceContainer()
+		if err != nil {
+			return id, err
+		}
+		if !canStop {
+			return id, fmt.Errorf("container %s is the service container of pod(s) %s and cannot be removed without removing the pod(s)", c.ID(), strings.Join(c.state.Service.Pods, ","))
+		}
+	}
+
 	var cleanupErr error
 	// Remove the container from the state
 	if c.config.Pod != "" {
diff --git a/libpod/service.go b/libpod/service.go
index ad147e87b3..c14f5e51d2 100644
--- a/libpod/service.go
+++ b/libpod/service.go
@@ -54,11 +54,12 @@ func (c *Container) addServicePodLocked(id string) error {
 	return c.save()
 }
 
-func (c *Container) isService() bool {
+// IsService returns true when the container is a "service container".
+func (c *Container) IsService() bool {
 	return c.config.IsService
 }
 
-// canStopServiceContainer returns true if all pods of the service are stopped.
+// canStopServiceContainerLocked returns true if all pods of the service are stopped.
 // Note that the method acquires the container lock.
 func (c *Container) canStopServiceContainerLocked() (bool, error) {
 	c.lock.Lock()
@@ -67,10 +68,16 @@ func (c *Container) canStopServiceContainerLocked() (bool, error) {
 		return false, err
 	}
 
-	if !c.isService() {
+	if !c.IsService() {
 		return false, fmt.Errorf("internal error: checking service: container %s is not a service container", c.ID())
 	}
 
+	return c.canStopServiceContainer()
+}
+
+// canStopServiceContainer returns true if all pods of the service are stopped.
+// Note that the method expects the container to be locked.
+func (c *Container) canStopServiceContainer() (bool, error) {
 	for _, id := range c.state.Service.Pods {
 		pod, err := c.runtime.LookupPod(id)
 		if err != nil {
@@ -163,7 +170,7 @@ func (c *Container) canRemoveServiceContainerLocked() (bool, error) {
 		return false, err
 	}
 
-	if !c.isService() {
+	if !c.IsService() {
 		return false, fmt.Errorf("internal error: checking service: container %s is not a service container", c.ID())
 	}
 
diff --git a/pkg/domain/infra/abi/containers.go b/pkg/domain/infra/abi/containers.go
index 5ca678d6f0..4e9f38b95d 100644
--- a/pkg/domain/infra/abi/containers.go
+++ b/pkg/domain/infra/abi/containers.go
@@ -292,7 +292,13 @@ func (ic *ContainerEngine) removeContainer(ctx context.Context, ctr *libpod.Cont
 		logrus.Debugf("Failed to remove container %s: %s", ctr.ID(), err.Error())
 		switch errors.Cause(err) {
 		case define.ErrNoSuchCtr:
-			if options.Ignore {
+			// Ignore if the container does not exist (anymore) when either
+			// requested by the user or when the container is a service
+			// container.  Service containers are removed along with their
+			// pods, which in turn are removed along with their infra
+			// container.  Hence, there is an inherent race when removing
+			// infra containers and service containers in parallel.
+			if options.Ignore || ctr.IsService() {
 				logrus.Debugf("Ignoring error (--allow-missing): %v", err)
 				return nil
 			}
diff --git a/pkg/domain/infra/abi/play.go b/pkg/domain/infra/abi/play.go
index 420d51483c..e04ab3a1ad 100644
--- a/pkg/domain/infra/abi/play.go
+++ b/pkg/domain/infra/abi/play.go
@@ -37,7 +37,15 @@ import (
 // createServiceContainer creates a container that can later on
 // be associated with the pods of a K8s yaml.  It will be started along with
 // the first pod.
-func (ic *ContainerEngine) createServiceContainer(ctx context.Context, name string) (*libpod.Container, error) {
+func (ic *ContainerEngine) createServiceContainer(ctx context.Context, name string, options entities.PlayKubeOptions) (*libpod.Container, error) {
+	// Make sure to replace the service container as well if requested by
+	// the user.
+	if options.Replace {
+		if _, err := ic.ContainerRm(ctx, []string{name}, entities.RmOptions{Force: true, Ignore: true}); err != nil {
+			return nil, fmt.Errorf("replacing service container: %w", err)
+		}
+	}
+
 	// Similar to infra containers, a service container uses the pause image.
 	image, err := generate.PullOrBuildInfraImage(ic.Libpod, "")
 	if err != nil {
@@ -65,6 +73,7 @@ func (ic *ContainerEngine) createServiceContainer(ctx context.Context, name stri
 		return nil, fmt.Errorf("creating runtime spec for service container: %w", err)
 	}
 	opts = append(opts, libpod.WithIsService())
+	opts = append(opts, libpod.WithSdNotifyMode(define.SdNotifyModeConmon))
 
 	// Create a new libpod container based on the spec.
 	ctr, err := ic.Libpod.NewContainer(ctx, runtimeSpec, spec, false, opts...)
@@ -75,6 +84,17 @@ func (ic *ContainerEngine) createServiceContainer(ctx context.Context, name stri
 	return ctr, nil
 }
 
+// Creates the name for a service container based on the provided content of a
+// K8s yaml file.
+func serviceContainerName(content []byte) string {
+	// The name of the service container is the first 12
+	// characters of the yaml file's hash followed by the
+	// '-service' suffix to guarantee a predictable and
+	// discoverable name.
+	hash := digest.FromBytes(content).Encoded()
+	return hash[0:12] + "-service"
+}
+
 func (ic *ContainerEngine) PlayKube(ctx context.Context, body io.Reader, options entities.PlayKubeOptions) (_ *entities.PlayKubeReport, finalErr error) {
 	report := &entities.PlayKubeReport{}
 	validKinds := 0
@@ -112,12 +132,7 @@ func (ic *ContainerEngine) PlayKube(ctx context.Context, body io.Reader, options
 	// TODO: create constants for the various "kinds" of yaml files.
 	var serviceContainer *libpod.Container
 	if options.ServiceContainer && (kind == "Pod" || kind == "Deployment") {
-		// The name of the service container is the first 12
-		// characters of the yaml file's hash followed by the
-		// '-service' suffix to guarantee a predictable and
-		// discoverable name.
-		hash := digest.FromBytes(content).Encoded()
-		ctr, err := ic.createServiceContainer(ctx, hash[0:12]+"-service")
+		ctr, err := ic.createServiceContainer(ctx, serviceContainerName(content), options)
 		if err != nil {
 			return nil, err
 		}
@@ -433,6 +448,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		podSpec.PodSpecGen.NoInfra = false
 		podSpec.PodSpecGen.InfraContainerSpec = specgen.NewSpecGenerator(infraImage, false)
 		podSpec.PodSpecGen.InfraContainerSpec.NetworkOptions = p.NetworkOptions
+		podSpec.PodSpecGen.InfraContainerSpec.SdNotifyMode = define.SdNotifyModeIgnore
 
 		err = specgenutil.FillOutSpecGen(podSpec.PodSpecGen.InfraContainerSpec, &infraOptions, []string{})
 		if err != nil {
@@ -516,10 +532,12 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		if err != nil {
 			return nil, err
 		}
+		specGen.SdNotifyMode = define.SdNotifyModeIgnore
 		rtSpec, spec, opts, err := generate.MakeContainer(ctx, ic.Libpod, specGen, false, nil)
 		if err != nil {
 			return nil, err
 		}
+		opts = append(opts, libpod.WithSdNotifyMode(define.SdNotifyModeIgnore))
 		ctr, err := generate.ExecuteCreate(ctx, ic.Libpod, rtSpec, spec, false, opts...)
 		if err != nil {
 			return nil, err
 		}
@@ -570,6 +588,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		if err != nil {
 			return nil, err
 		}
+		opts = append(opts, libpod.WithSdNotifyMode(define.SdNotifyModeIgnore))
 		ctr, err := generate.ExecuteCreate(ctx, ic.Libpod, rtSpec, spec, false, opts...)
 		if err != nil {
 			return nil, err
@@ -942,5 +961,6 @@ func (ic *ContainerEngine) PlayKubeDown(ctx context.Context, body io.Reader, _ e
 	if err != nil {
 		return nil, err
 	}
+
 	return reports, nil
 }
diff --git a/podman.spec.rpkg b/podman.spec.rpkg
index 9372539187..12d1584af8 100644
--- a/podman.spec.rpkg
+++ b/podman.spec.rpkg
@@ -242,11 +242,13 @@ done
 %{_unitdir}/%{name}.service
 %{_unitdir}/%{name}.socket
 %{_unitdir}/%{name}-restart.service
+%{_unitdir}/%{name}-play-kube@.service
 %{_userunitdir}/%{name}-auto-update.service
 %{_userunitdir}/%{name}-auto-update.timer
 %{_userunitdir}/%{name}.service
 %{_userunitdir}/%{name}.socket
 %{_userunitdir}/%{name}-restart.service
+%{_userunitdir}/%{name}-play-kube@.service
 %{_tmpfilesdir}/%{name}.conf
 %if 0%{?fedora} >= 36
 %{_modulesloaddir}/%{name}-iptables.conf
diff --git a/test/system/250-systemd.bats b/test/system/250-systemd.bats
index d0da654ad7..567fa89c19 100644
--- a/test/system/250-systemd.bats
+++ b/test/system/250-systemd.bats
@@ -292,4 +292,80 @@ LISTEN_FDNAMES=listen_fdnames" | sort)
     run_podman network rm -f $netname
 }
 
+@test "podman-play-kube@.service template" {
+    skip_if_remote "systemd units do not work with remote clients"
+
+    # If running from a podman source directory, build and use the source
+    # version of the podman-play-kube@ unit file
+    unit_name="podman-play-kube@.service"
+    unit_file="contrib/systemd/system/${unit_name}"
+    if [[ -e ${unit_file}.in ]]; then
+        echo "# [Building & using $unit_name from source]" >&3
+        BINDIR=$(dirname $PODMAN) make $unit_file
+        cp $unit_file $UNIT_DIR/$unit_name
+    fi
+
+    # Create the YAML file
+    yaml_source="$PODMAN_TMPDIR/test.yaml"
+    cat >$yaml_source <<EOF
[... the heredoc body and the remainder of this test, the entire test/system/260-sdnotify.bats diff, and the header and first lines of the test/system/700-play.bats hunk were lost in extraction ...]
-    run_podman play kube --service-container=true $PODMAN_TMPDIR/test.yaml
+    # The name of the service container is predictable: the first 12 characters
+    # of the hash of the YAML file followed by the "-service" suffix
+    yaml_sha=$(sha256sum $yaml_source)
+    service_container="${yaml_sha:0:12}-service"
 
     # Make sure that the service container exists and runs.
-    run_podman container inspect "352a88685060-service" --format "{{.State.Running}}"
+    run_podman container inspect $service_container --format "{{.State.Running}}"
     is "$output" "true"
 
     # Stop the *main* container and make sure that
@@ -135,24 +137,26 @@ spec:
     # #) The service container is marked as an service container
     run_podman stop test_pod-test
     _ensure_pod_state test_pod Exited
-    run_podman container inspect "352a88685060-service" --format "{{.State.Running}}"
-    is "$output" "false"
-    run_podman container inspect "352a88685060-service" --format "{{.IsService}}"
+    _ensure_container_running $service_container false
+    run_podman container inspect $service_container --format "{{.IsService}}"
     is "$output" "true"
 
     # Restart the pod, make sure the service is running again
     run_podman pod restart test_pod
-    run_podman container inspect "352a88685060-service" --format "{{.State.Running}}"
+    run_podman container inspect $service_container --format "{{.State.Running}}"
     is "$output" "true"
 
+    # Check for an error when trying to remove the service container
+    run_podman 125 container rm $service_container
+    is "$output" "Error: container .* is the service container of pod(s) .* and cannot be removed without removing the pod(s)"
+
     # Kill the pod and make sure the service is not running
     run_podman pod kill test_pod
-    run_podman container inspect "352a88685060-service" --format "{{.State.Running}}"
-    is "$output" "false"
+    _ensure_container_running $service_container false
 
     # Remove the pod and make sure the service is removed along with it
    run_podman pod rm test_pod
-    run_podman 1 container exists "352a88685060-service"
+    run_podman 1 container exists $service_container
 }
 
 @test "podman play --network" {
diff --git a/test/system/helpers.bash b/test/system/helpers.bash
index 0721312022..6868f26911 100644
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@@ -405,6 +405,19 @@ function _ensure_pod_state() {
     is "$output" "$2" "unexpected pod state"
 }
 
+# Wait for the container's (1st arg) running state (2nd arg)
+function _ensure_container_running() {
+    for i in {0..5}; do
+        run_podman container inspect $1 --format "{{.State.Running}}"
+        if [[ $output == "$2" ]]; then
+            break
+        fi
+        sleep 0.5
+    done
+
+    is "$output" "$2" "unexpected container running state"
+}
+
 ###########################
 # _add_label_if_missing #  make sure skip messages include rootless/remote
 ###########################
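
A minimal end-to-end smoke test for the new template, for anyone reviewing this patch. It is a sketch, not part of the change itself: it assumes the unit file has been installed (e.g. via `make install.systemd`) and reuses the `~/top.yaml` example from the commit message.

```
# systemd instance names must be escaped; %I in the unit un-escapes the
# instance string back into the YAML path.
escaped=$(systemd-escape ~/top.yaml)

# ExecStart dispatches the workload via
# `podman play kube --replace --service-container=true <path>`.
systemctl --user start podman-play-kube@$escaped.service

# Type=notify + NotifyAccess=all: the unit only turns active once the
# service container's conmon PID has been passed along via sdnotify.
systemctl --user is-active podman-play-kube@$escaped.service

# ExecStop tears the workload down again via `podman play kube --down`.
systemctl --user stop podman-play-kube@$escaped.service
```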
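Because `serviceContainerName()` hashes the YAML with `digest.FromBytes()`, whose canonical algorithm is SHA-256, the service container's name can be recomputed with `sha256sum`, exactly as the system tests do. The following sketch of the resulting CLI behavior mirrors those tests; `test_pod` stands in for the pod name from the test YAML, and the quoted error is the one added in `runtime_ctr.go`.

```
# First 12 hex digits of the file's SHA-256 digest plus a "-service" suffix.
yaml_sha=$(sha256sum ~/top.yaml)
service_container="${yaml_sha:0:12}-service"

# The new IsService field is surfaced by podman-inspect.
podman container inspect $service_container --format "{{.IsService}}"
# true

# removeContainer()/evictContainer() refuse to remove a service container
# while its pods still exist; podman exits 125 with:
podman container rm $service_container
# Error: container <id> is the service container of pod(s) <ids> and cannot
# be removed without removing the pod(s)

# Removing the pod(s) (once stopped) removes the service container with them.
podman pod rm test_pod
```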