Skip to content

Commit

Permalink
Data injection stability improvements (#644)
Browse files Browse the repository at this point in the history
BREAKING: Removes the ability to specify data injections into generic pods vs pod + container
Adds a compress flag for data injections to enable gzip compression on transfer
Adds file copy activity logging while running the data injection
Adds new Helm Chart config option NoWait to allow helm to apply manifests and keep going (needed to address data injection race condition)
Adds final data injection pod wait loop after data transfer
Changes from kubectl cp exec to kubectl exec
Adds a dynamic sync completion marker so that data injections leveraging an initContainer don't stall on their second run
Removes the arbitrary timeout conditions breaking large data syncs on slow networks
  • Loading branch information
jeff-mccoy authored Jul 27, 2022
1 parent 08a7b0d commit 68f519e
Show file tree
Hide file tree
Showing 13 changed files with 211 additions and 185 deletions.
29 changes: 27 additions & 2 deletions docs/4-user-guide/3-zarf-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -1161,6 +1161,8 @@ must respect the following conditions

![Required](https://img.shields.io/badge/Required-red)

**Description:** The namespace to target for data injection

| Type | `string` |
| ---- | -------- |

Expand All @@ -1176,19 +1178,25 @@ must respect the following conditions

![Required](https://img.shields.io/badge/Required-red)

**Description:** The K8s selector to target for data injection

| Type | `string` |
| ---- | -------- |

</blockquote>
</details>

<details>
<summary><strong> <a name="components_items_dataInjections_items_target_container"></a>container</strong>
<summary><strong> <a name="components_items_dataInjections_items_target_container"></a>container *</strong>

</summary>
&nbsp;
<blockquote>

![Required](https://img.shields.io/badge/Required-red)

**Description:** The container to target for data injection

| Type | `string` |
| ---- | -------- |

Expand All @@ -1204,6 +1212,8 @@ must respect the following conditions

![Required](https://img.shields.io/badge/Required-red)

**Description:** The path to copy the data to in the container

| Type | `string` |
| ---- | -------- |

Expand All @@ -1213,6 +1223,21 @@ must respect the following conditions
</blockquote>
</details>

<details>
<summary><strong> <a name="components_items_dataInjections_items_compress"></a>compress</strong>

</summary>
&nbsp;
<blockquote>

**Description:** Compress the data before transmitting using gzip. Note: this requires support for tar/gzip locally and in the target image.

| Type | `boolean` |
| ---- | --------- |

</blockquote>
</details>

</blockquote>
</details>

Expand All @@ -1235,4 +1260,4 @@ must respect the following conditions
</details>

----------------------------------------------------------------------------------------------------------------------------
Generated from [zarf.schema.json](https://github.com/defenseunicorns/zarf/blob/master/zarf.schema.json) on 2022-07-12 at 22:24:30 +0000
Generated from [zarf.schema.json](https://github.com/defenseunicorns/zarf/blob/master/zarf.schema.json) on 2022-07-26 at 23:34:41 +0000
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ spec:
[
"sh",
"-c",
'while [ ! -f /test/.zarf-sync-complete ]; do echo "waiting for zarf data sync" && sleep 5; done; echo "we are done waiting!"',
'while [ ! -f /test/###ZARF_DATA_INJECTON_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"',
]
resources:
requests:
Expand All @@ -37,11 +37,7 @@ spec:
- name: data-injection
image: alpine:3.15
command:
[
"/bin/sh",
"-ec",
"while :; do ls -lah /test ; sleep 2 ; done",
]
["/bin/sh", "-ec", "while :; do ls -lah /test ; sleep 2 ; done"]
resources:
requests:
memory: "16Mi"
Expand All @@ -55,8 +51,8 @@ spec:
readinessProbe:
exec:
command:
- cat
- /test/.zarf-sync-complete
- "cat"
- "/test/###ZARF_DATA_INJECTON_MARKER###"
initialDelaySeconds: 1
periodSeconds: 1

Expand Down
16 changes: 3 additions & 13 deletions examples/data-injection/zarf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ components:
- name: with-init-container
required: true
manifests:
- name: example-data-injection-pod
- name: example-data-injection
namespace: demo
files:
- manifests/deployment.yaml
- manifest.yaml
images:
- alpine:3.15
# Add new data into the cluster, these will keep trying up until their timeout
Expand All @@ -24,14 +24,4 @@ components:
selector: app=data-injection
container: data-loader
path: /test

- name: pod-only
required: true
# Add new data into the cluster, these will keep trying up until their timeout
dataInjections:
# Injection in a sub-directory with an already running pod
- source: sample-data
target:
namespace: demo
selector: app=data-injection
path: /test/subdirectory-test
compress: true
51 changes: 1 addition & 50 deletions go.sum

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions src/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,21 @@ var (
state types.ZarfState

SGetPublicKey string

// Timestamp of when the CLI was started
operationStartTime = time.Now().Unix()
dataInjectionMarker = ".zarf-injection-%d"
)

// Timestamp of when the CLI was started
func GetStartTime() int64 {
return operationStartTime
}

func GetDataInjectionMarker() string {
return fmt.Sprintf(dataInjectionMarker, operationStartTime)
}

func IsZarfInitConfig() bool {
message.Debug("config.IsZarfInitConfig")
return strings.ToLower(active.Kind) == "zarfinitconfig"
Expand Down
26 changes: 17 additions & 9 deletions src/internal/helm/chart.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"io/ioutil"
"time"

"github.com/defenseunicorns/zarf/src/config"
"github.com/defenseunicorns/zarf/src/types"

"github.com/defenseunicorns/zarf/src/internal/message"
Expand All @@ -24,6 +25,7 @@ type ChartOptions struct {
ChartOverride *chart.Chart
ValueOverride map[string]any
Component types.ZarfComponent
NoWait bool
}

// InstallOrUpgradeChart performs a helm install of the given chart
Expand All @@ -45,6 +47,13 @@ func InstallOrUpgradeChart(options ChartOptions) types.ConnectStrings {
} else {
options.ReleaseName = fmt.Sprintf("zarf-%s", options.Chart.Name)
}

// Do not wait for the chart to be ready if data injections are present
if len(options.Component.DataInjections) > 0 {
spinner.Updatef("Data injections detected, not waiting for chart to be ready")
options.NoWait = true
}

actionConfig, err := createActionConfig(options.Chart.Namespace, spinner)
postRender := NewRenderer(options, actionConfig)

Expand Down Expand Up @@ -161,15 +170,12 @@ func GenerateChart(basePath string, manifest types.ZarfManifest, component types
spinner := message.NewProgressSpinner("Starting helm chart generation %s", manifest.Name)
defer spinner.Stop()

// Use timestamp to help make a valid semver
now := time.Now()

// Generate a new chart
tmpChart := new(chart.Chart)
tmpChart.Metadata = new(chart.Metadata)
tmpChart.Metadata.Name = fmt.Sprintf("raw-%s", manifest.Name)
// This is fun, increment forward in a semver-way using epoch so helm doesn't cry
tmpChart.Metadata.Version = fmt.Sprintf("0.1.%d", now.Unix())
tmpChart.Metadata.Version = fmt.Sprintf("0.1.%d", config.GetStartTime())
tmpChart.Metadata.APIVersion = chart.APIVersionV1

// Add the manifest files so helm does its thing
Expand Down Expand Up @@ -213,10 +219,11 @@ func installChart(actionConfig *action.Configuration, options ChartOptions, post
// Bind the helm action
client := action.NewInstall(actionConfig)

// Let each chart run for 5 minutes
// Let each chart run for 15 minutes
client.Timeout = 15 * time.Minute

client.Wait = true
// Default helm behavior for Zarf is to wait for the resources to deploy, NoWait overrides that for special cases (such as data-injection)
client.Wait = !options.NoWait

// We need to include CRDs or operator installations will fail spectacularly
client.SkipCRDs = false
Expand All @@ -243,10 +250,11 @@ func upgradeChart(actionConfig *action.Configuration, options ChartOptions, post
message.Debugf("helm.upgradeChart(%#v, %#v, %#v)", actionConfig, options, postRender)
client := action.NewUpgrade(actionConfig)

// Let each chart run for 5 minutes
client.Timeout = 10 * time.Minute
// Let each chart run for 15 minutes
client.Timeout = 15 * time.Minute

client.Wait = true
// Default helm behavior for Zarf is to wait for the resources to deploy, NoWait overrides that for special cases (such as data-injection)k3
client.Wait = !options.NoWait

client.SkipCRDs = true

Expand Down
4 changes: 2 additions & 2 deletions src/internal/k8s/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ func WaitForPodsAndContainers(target types.ZarfContainerTarget, waitForAllPods b
continue
}

// Check the status of regular containers for a runing match
// Check the status of regular containers for a running match
for _, container := range pod.Status.ContainerStatuses {
isRunning := container.State.Running != nil
if isRunning && container.Name == target.Container {
Expand Down Expand Up @@ -166,7 +166,7 @@ func WaitForPodsAndContainers(target types.ZarfContainerTarget, waitForAllPods b
time.Sleep(3 * time.Second)
}

message.Warn("Pod lookup timeout exceeded")
message.Debug("Pod lookup timeout exceeded")

return []string{}
}
103 changes: 103 additions & 0 deletions src/internal/packager/data.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package packager

import (
"context"
"fmt"
"os"
"path/filepath"
"sync"

"github.com/defenseunicorns/zarf/src/config"
"github.com/defenseunicorns/zarf/src/internal/k8s"
"github.com/defenseunicorns/zarf/src/internal/message"
"github.com/defenseunicorns/zarf/src/internal/utils"
"github.com/defenseunicorns/zarf/src/types"
)

// Wait for the target pod(s) to come up and inject the data into them
// todo: this currently requires kubectl but we should have enough k8s work to make this native now
func handleDataInjection(wg *sync.WaitGroup, data types.ZarfDataInjection, componentPath componentPaths) {
message.Debugf("packager.handleDataInjections(%#v, %#v, %#v)", wg, data, componentPath)
defer wg.Done()

injectionCompletionMarker := filepath.Join(componentPath.dataInjections, config.GetDataInjectionMarker())
if err := utils.WriteFile(injectionCompletionMarker, []byte("🦄")); err != nil {
message.Errorf(err, "Unable to create the data injection completion marker")
return
}

tarCompressFlag := ""
if data.Compress {
tarCompressFlag = "z"
}

iterator:
// The eternal loop because some data injections can take a very long time
for {
message.Debugf("Attempting to inject data into %s", data.Target)
source := filepath.Join(componentPath.dataInjections, filepath.Base(data.Target.Path))

// Wait until the pod we are injecting data into becomes available
pods := k8s.WaitForPodsAndContainers(data.Target, true)
if len(pods) < 1 {
continue
}

// Inject into all the pods
for _, pod := range pods {
kubectlExec := fmt.Sprintf("kubectl exec -i -n %s %s -c %s ", data.Target.Namespace, pod, data.Target.Container)
tarExec := fmt.Sprintf("tar c%s", tarCompressFlag)
untarExec := fmt.Sprintf("tar x%svf - -C %s", tarCompressFlag, data.Target.Path)

// Must create the target directory before trying to change to it for untar
mkdirExec := fmt.Sprintf("%s -- mkdir -p %s", kubectlExec, data.Target.Path)
_, _, err := utils.ExecCommandWithContext(context.TODO(), true, "sh", "-c", mkdirExec)
if err != nil {
message.Warnf("Unable to create the data injection target directory %s in pod %s", data.Target.Path, pod)
break iterator
}

cpPodExec := fmt.Sprintf("%s -C %s . | %s -- %s",
tarExec,
source,
kubectlExec,
untarExec,
)

// Do the actual data injection
_, _, err = utils.ExecCommandWithContext(context.TODO(), true, "sh", "-c", cpPodExec)
if err != nil {
message.Warnf("Error copying data into the pod %#v: %#v\n", pod, err)
break iterator
} else {
// Leave a marker in the target container for pods to track the sync action
cpPodExec := fmt.Sprintf("%s -C %s %s | %s -- %s",
tarExec,
componentPath.dataInjections,
config.GetDataInjectionMarker(),
kubectlExec,
untarExec,
)
_, _, err = utils.ExecCommandWithContext(context.TODO(), true, "sh", "-c", cpPodExec)
if err != nil {
message.Warnf("Error saving the zarf sync completion file after injection into pod %#v\n", pod)
}
}
}

// Do not look for a specific container after injection in case they are running an init container
podOnlyTarget := types.ZarfContainerTarget{
Namespace: data.Target.Namespace,
Selector: data.Target.Selector,
}

// Block one final time to make sure at least one pod has come up and injected the data
_ = k8s.WaitForPodsAndContainers(podOnlyTarget, false)

// Cleanup now to reduce disk pressure
_ = os.RemoveAll(source)

// Return to stop the loop
return
}
}
Loading

0 comments on commit 68f519e

Please sign in to comment.