Skip to content

Commit

Permalink
Support for collector readinessProbe (#2944)
Browse files Browse the repository at this point in the history
* enable readiness Probe for otel operator

Signed-off-by: Janario Oliveira <[email protected]>

* generate CRD and controller changes

Signed-off-by: Janario Oliveira <[email protected]>

* Adjusted code to be similar to Liveness logic

Signed-off-by: Janario Oliveira <[email protected]>

* Generated manifests

Signed-off-by: Janario Oliveira <[email protected]>

* Add changelog

Signed-off-by: Janario Oliveira <[email protected]>

* Fix lint

Signed-off-by: Janario Oliveira <[email protected]>

* Removed readinessProbe from alpha CRD

Signed-off-by: Janario Oliveira <[email protected]>

* Generated manifests

Signed-off-by: Janario Oliveira <[email protected]>

* Fix lint

Signed-off-by: Janario Oliveira <[email protected]>

* Centralized probe validation

Signed-off-by: Janario Oliveira <[email protected]>

---------

Signed-off-by: Janario Oliveira <[email protected]>
Co-authored-by: hesam.hamdarsi <[email protected]>
  • Loading branch information
janario and hesamhamdarsi authored May 13, 2024
1 parent c55a97a commit 3169efd
Show file tree
Hide file tree
Showing 10 changed files with 308 additions and 23 deletions.
17 changes: 17 additions & 0 deletions .chloggen/collector-readiness-support.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: enhancement

# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
component: collector

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Add support for readinessProbe on OpenTelemetryCollector CRD.

# One or more tracking issues related to the change
issues: [2943]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
Adds support for configuring readinessProbe on `OpenTelemetryCollector`, with defaults similar to those of the already supported livenessProbe.
51 changes: 32 additions & 19 deletions apis/v1beta1/collector_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,25 +289,14 @@ func (c CollectorWebhook) validate(ctx context.Context, r *OpenTelemetryCollecto
return warnings, fmt.Errorf("a valid Ingress hostname has to be defined for subdomain ruleType")
}

if r.Spec.LivenessProbe != nil {
if r.Spec.LivenessProbe.InitialDelaySeconds != nil && *r.Spec.LivenessProbe.InitialDelaySeconds < 0 {
return warnings, fmt.Errorf("the OpenTelemetry Spec LivenessProbe InitialDelaySeconds configuration is incorrect. InitialDelaySeconds should be greater than or equal to 0")
}
if r.Spec.LivenessProbe.PeriodSeconds != nil && *r.Spec.LivenessProbe.PeriodSeconds < 1 {
return warnings, fmt.Errorf("the OpenTelemetry Spec LivenessProbe PeriodSeconds configuration is incorrect. PeriodSeconds should be greater than or equal to 1")
}
if r.Spec.LivenessProbe.TimeoutSeconds != nil && *r.Spec.LivenessProbe.TimeoutSeconds < 1 {
return warnings, fmt.Errorf("the OpenTelemetry Spec LivenessProbe TimeoutSeconds configuration is incorrect. TimeoutSeconds should be greater than or equal to 1")
}
if r.Spec.LivenessProbe.SuccessThreshold != nil && *r.Spec.LivenessProbe.SuccessThreshold < 1 {
return warnings, fmt.Errorf("the OpenTelemetry Spec LivenessProbe SuccessThreshold configuration is incorrect. SuccessThreshold should be greater than or equal to 1")
}
if r.Spec.LivenessProbe.FailureThreshold != nil && *r.Spec.LivenessProbe.FailureThreshold < 1 {
return warnings, fmt.Errorf("the OpenTelemetry Spec LivenessProbe FailureThreshold configuration is incorrect. FailureThreshold should be greater than or equal to 1")
}
if r.Spec.LivenessProbe.TerminationGracePeriodSeconds != nil && *r.Spec.LivenessProbe.TerminationGracePeriodSeconds < 1 {
return warnings, fmt.Errorf("the OpenTelemetry Spec LivenessProbe TerminationGracePeriodSeconds configuration is incorrect. TerminationGracePeriodSeconds should be greater than or equal to 1")
}
// validate probes Liveness/Readiness
err := validateProbe("LivenessProbe", r.Spec.LivenessProbe)
if err != nil {
return warnings, err
}
err = validateProbe("ReadinessProbe", r.Spec.ReadinessProbe)
if err != nil {
return warnings, err
}

// validate updateStrategy for DaemonSet
Expand Down Expand Up @@ -365,6 +354,30 @@ func (c CollectorWebhook) validateTargetAllocatorConfig(ctx context.Context, r *
return nil, nil
}

// validateProbe checks the numeric settings of a single probe configuration.
//
// probeName is interpolated into the error text (the callers visible in this
// file pass "LivenessProbe" and "ReadinessProbe"). A nil probe, or a probe
// whose fields are all nil, is accepted. Fields are checked in declaration
// order and only the first violation is reported.
func validateProbe(probeName string, probe *Probe) error {
	// Nothing configured, nothing to validate.
	if probe == nil {
		return nil
	}

	// Cases are evaluated top to bottom, so the first offending field wins.
	switch {
	case probe.InitialDelaySeconds != nil && *probe.InitialDelaySeconds < 0:
		return fmt.Errorf("the OpenTelemetry Spec %s InitialDelaySeconds configuration is incorrect. InitialDelaySeconds should be greater than or equal to 0", probeName)
	case probe.PeriodSeconds != nil && *probe.PeriodSeconds < 1:
		return fmt.Errorf("the OpenTelemetry Spec %s PeriodSeconds configuration is incorrect. PeriodSeconds should be greater than or equal to 1", probeName)
	case probe.TimeoutSeconds != nil && *probe.TimeoutSeconds < 1:
		return fmt.Errorf("the OpenTelemetry Spec %s TimeoutSeconds configuration is incorrect. TimeoutSeconds should be greater than or equal to 1", probeName)
	case probe.SuccessThreshold != nil && *probe.SuccessThreshold < 1:
		return fmt.Errorf("the OpenTelemetry Spec %s SuccessThreshold configuration is incorrect. SuccessThreshold should be greater than or equal to 1", probeName)
	case probe.FailureThreshold != nil && *probe.FailureThreshold < 1:
		return fmt.Errorf("the OpenTelemetry Spec %s FailureThreshold configuration is incorrect. FailureThreshold should be greater than or equal to 1", probeName)
	case probe.TerminationGracePeriodSeconds != nil && *probe.TerminationGracePeriodSeconds < 1:
		return fmt.Errorf("the OpenTelemetry Spec %s TerminationGracePeriodSeconds configuration is incorrect. TerminationGracePeriodSeconds should be greater than or equal to 1", probeName)
	}

	return nil
}

func checkAutoscalerSpec(autoscaler *AutoscalerSpec) error {
if autoscaler.Behavior != nil {
if autoscaler.Behavior.ScaleDown != nil && autoscaler.Behavior.ScaleDown.StabilizationWindowSeconds != nil &&
Expand Down
66 changes: 66 additions & 0 deletions apis/v1beta1/collector_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,17 @@ func TestOTELColValidatingWebhook(t *testing.T) {
},
expectedErr: "the OpenTelemetry Spec LivenessProbe InitialDelaySeconds configuration is incorrect",
},
{
name: "invalid InitialDelaySeconds readiness",
otelcol: OpenTelemetryCollector{
Spec: OpenTelemetryCollectorSpec{
ReadinessProbe: &Probe{
InitialDelaySeconds: &minusOne,
},
},
},
expectedErr: "the OpenTelemetry Spec ReadinessProbe InitialDelaySeconds configuration is incorrect",
},
{
name: "invalid PeriodSeconds",
otelcol: OpenTelemetryCollector{
Expand All @@ -1037,6 +1048,17 @@ func TestOTELColValidatingWebhook(t *testing.T) {
},
expectedErr: "the OpenTelemetry Spec LivenessProbe PeriodSeconds configuration is incorrect",
},
{
name: "invalid PeriodSeconds readiness",
otelcol: OpenTelemetryCollector{
Spec: OpenTelemetryCollectorSpec{
ReadinessProbe: &Probe{
PeriodSeconds: &zero,
},
},
},
expectedErr: "the OpenTelemetry Spec ReadinessProbe PeriodSeconds configuration is incorrect",
},
{
name: "invalid TimeoutSeconds",
otelcol: OpenTelemetryCollector{
Expand All @@ -1048,6 +1070,17 @@ func TestOTELColValidatingWebhook(t *testing.T) {
},
expectedErr: "the OpenTelemetry Spec LivenessProbe TimeoutSeconds configuration is incorrect",
},
{
name: "invalid TimeoutSeconds readiness",
otelcol: OpenTelemetryCollector{
Spec: OpenTelemetryCollectorSpec{
ReadinessProbe: &Probe{
TimeoutSeconds: &zero,
},
},
},
expectedErr: "the OpenTelemetry Spec ReadinessProbe TimeoutSeconds configuration is incorrect",
},
{
name: "invalid SuccessThreshold",
otelcol: OpenTelemetryCollector{
Expand All @@ -1059,6 +1092,17 @@ func TestOTELColValidatingWebhook(t *testing.T) {
},
expectedErr: "the OpenTelemetry Spec LivenessProbe SuccessThreshold configuration is incorrect",
},
{
name: "invalid SuccessThreshold readiness",
otelcol: OpenTelemetryCollector{
Spec: OpenTelemetryCollectorSpec{
ReadinessProbe: &Probe{
SuccessThreshold: &zero,
},
},
},
expectedErr: "the OpenTelemetry Spec ReadinessProbe SuccessThreshold configuration is incorrect",
},
{
name: "invalid FailureThreshold",
otelcol: OpenTelemetryCollector{
Expand All @@ -1070,6 +1114,17 @@ func TestOTELColValidatingWebhook(t *testing.T) {
},
expectedErr: "the OpenTelemetry Spec LivenessProbe FailureThreshold configuration is incorrect",
},
{
name: "invalid FailureThreshold readiness",
otelcol: OpenTelemetryCollector{
Spec: OpenTelemetryCollectorSpec{
ReadinessProbe: &Probe{
FailureThreshold: &zero,
},
},
},
expectedErr: "the OpenTelemetry Spec ReadinessProbe FailureThreshold configuration is incorrect",
},
{
name: "invalid TerminationGracePeriodSeconds",
otelcol: OpenTelemetryCollector{
Expand All @@ -1081,6 +1136,17 @@ func TestOTELColValidatingWebhook(t *testing.T) {
},
expectedErr: "the OpenTelemetry Spec LivenessProbe TerminationGracePeriodSeconds configuration is incorrect",
},
{
name: "invalid TerminationGracePeriodSeconds readiness",
otelcol: OpenTelemetryCollector{
Spec: OpenTelemetryCollectorSpec{
ReadinessProbe: &Probe{
TerminationGracePeriodSeconds: &zero64,
},
},
},
expectedErr: "the OpenTelemetry Spec ReadinessProbe TerminationGracePeriodSeconds configuration is incorrect",
},
{
name: "invalid AdditionalContainers",
otelcol: OpenTelemetryCollector{
Expand Down
6 changes: 5 additions & 1 deletion apis/v1beta1/opentelemetrycollector_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ type OpenTelemetryCollectorSpec struct {
// It is only effective when healthcheckextension is configured in the OpenTelemetry Collector pipeline.
// +optional
LivenessProbe *Probe `json:"livenessProbe,omitempty"`
// Readiness config for the OpenTelemetry Collector except the probe handler which is auto generated from the health extension of the collector.
// It is only effective when healthcheckextension is configured in the OpenTelemetry Collector pipeline.
// +optional
ReadinessProbe *Probe `json:"readinessProbe,omitempty"`

// ObservabilitySpec defines how telemetry data gets handled.
//
Expand Down Expand Up @@ -206,7 +210,7 @@ type TargetAllocatorEmbedded struct {
PodDisruptionBudget *PodDisruptionBudgetSpec `json:"podDisruptionBudget,omitempty"`
}

// Probe defines the OpenTelemetry's pod probe config. Only Liveness probe is supported currently.
// Probe defines the OpenTelemetry's pod probe config.
type Probe struct {
// Number of seconds after the container has started before the probe is initiated.
// Defaults to 0 seconds. Minimum value is 0.
Expand Down
5 changes: 5 additions & 0 deletions apis/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6654,6 +6654,27 @@ spec:
x-kubernetes-list-type: atomic
priorityClassName:
type: string
readinessProbe:
properties:
failureThreshold:
format: int32
type: integer
initialDelaySeconds:
format: int32
type: integer
periodSeconds:
format: int32
type: integer
successThreshold:
format: int32
type: integer
terminationGracePeriodSeconds:
format: int64
type: integer
timeoutSeconds:
format: int32
type: integer
type: object
replicas:
format: int32
type: integer
Expand Down
21 changes: 21 additions & 0 deletions config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6640,6 +6640,27 @@ spec:
x-kubernetes-list-type: atomic
priorityClassName:
type: string
readinessProbe:
properties:
failureThreshold:
format: int32
type: integer
initialDelaySeconds:
format: int32
type: integer
periodSeconds:
format: int32
type: integer
successThreshold:
format: int32
type: integer
terminationGracePeriodSeconds:
format: int64
type: integer
timeoutSeconds:
format: int32
type: integer
type: object
replicas:
format: int32
type: integer
Expand Down
99 changes: 99 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -30050,6 +30050,14 @@ If not specified, the pod priority will be default or zero if there is no
default.<br/>
</td>
<td>false</td>
</tr><tr>
<td><b><a href="#opentelemetrycollectorspecreadinessprobe">readinessProbe</a></b></td>
<td>object</td>
<td>
Readiness config for the OpenTelemetry Collector except the probe handler which is auto generated from the health extension of the collector.
It is only effective when healthcheckextension is configured in the OpenTelemetry Collector pipeline.<br/>
</td>
<td>false</td>
</tr><tr>
<td><b>replicas</b></td>
<td>integer</td>
Expand Down Expand Up @@ -40043,6 +40051,97 @@ More info: https://kubernetes.io/docs/concepts/services-networking/service/#defi
</table>


### OpenTelemetryCollector.spec.readinessProbe
<sup><sup>[↩ Parent](#opentelemetrycollectorspec-1)</sup></sup>



Readiness config for the OpenTelemetry Collector except the probe handler which is auto generated from the health extension of the collector.
It is only effective when healthcheckextension is configured in the OpenTelemetry Collector pipeline.

<table>
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Description</th>
<th>Required</th>
</tr>
</thead>
<tbody><tr>
<td><b>failureThreshold</b></td>
<td>integer</td>
<td>
Minimum consecutive failures for the probe to be considered failed after having succeeded.
Defaults to 3. Minimum value is 1.<br/>
<br/>
<i>Format</i>: int32<br/>
</td>
<td>false</td>
</tr><tr>
<td><b>initialDelaySeconds</b></td>
<td>integer</td>
<td>
Number of seconds after the container has started before readiness probes are initiated.
Defaults to 0 seconds. Minimum value is 0.
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes<br/>
<br/>
<i>Format</i>: int32<br/>
</td>
<td>false</td>
</tr><tr>
<td><b>periodSeconds</b></td>
<td>integer</td>
<td>
How often (in seconds) to perform the probe.
Defaults to 10 seconds. Minimum value is 1.<br/>
<br/>
<i>Format</i>: int32<br/>
</td>
<td>false</td>
</tr><tr>
<td><b>successThreshold</b></td>
<td>integer</td>
<td>
Minimum consecutive successes for the probe to be considered successful after having failed.
Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1.<br/>
<br/>
<i>Format</i>: int32<br/>
</td>
<td>false</td>
</tr><tr>
<td><b>terminationGracePeriodSeconds</b></td>
<td>integer</td>
<td>
Optional duration in seconds the pod needs to terminate gracefully upon probe failure.
The grace period is the duration in seconds after the processes running in the pod are sent
a termination signal and the time when the processes are forcibly halted with a kill signal.
Set this value longer than the expected cleanup time for your process.
If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this
value overrides the value provided by the pod spec.
Value must be non-negative integer. The value zero indicates stop immediately via
the kill signal (no opportunity to shut down).
This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate.
Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset.<br/>
<br/>
<i>Format</i>: int64<br/>
</td>
<td>false</td>
</tr><tr>
<td><b>timeoutSeconds</b></td>
<td>integer</td>
<td>
Number of seconds after which the probe times out.
Defaults to 1 second. Minimum value is 1.
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes<br/>
<br/>
<i>Format</i>: int32<br/>
</td>
<td>false</td>
</tr></tbody>
</table>


### OpenTelemetryCollector.spec.resources
<sup><sup>[↩ Parent](#opentelemetrycollectorspec-1)</sup></sup>

Expand Down
Loading

0 comments on commit 3169efd

Please sign in to comment.