Skip to content

Commit

Permalink
health: PoC for health checks via otel metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Hank Donnay <[email protected]>
  • Loading branch information
hdonnay committed Feb 15, 2024
1 parent 1573239 commit efe68fc
Show file tree
Hide file tree
Showing 6 changed files with 422 additions and 7 deletions.
7 changes: 4 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@ require (
github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80
github.com/ugorji/go/codec v1.2.12
github.com/urfave/cli/v2 v2.27.1
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.47.0
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.48.0
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.48.0
go.opentelemetry.io/otel v1.23.1
go.opentelemetry.io/otel/exporters/jaeger v1.17.0
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.22.0
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.23.1
go.opentelemetry.io/otel/metric v1.23.1
go.opentelemetry.io/otel/sdk v1.23.1
go.opentelemetry.io/otel/sdk/metric v1.23.1
go.opentelemetry.io/otel/trace v1.23.1
golang.org/x/net v0.21.0
golang.org/x/sync v0.6.0
Expand Down Expand Up @@ -82,7 +84,6 @@ require (
github.com/ulikunitz/xz v0.5.11 // indirect
github.com/vbatts/tar-split v0.11.3 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
go.opentelemetry.io/otel/metric v1.23.1 // indirect
golang.org/x/crypto v0.19.0 // indirect
golang.org/x/mod v0.14.0 // indirect
golang.org/x/sys v0.17.0 // indirect
Expand Down
10 changes: 6 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -255,20 +255,22 @@ github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsr
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q=
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.47.0 h1:rw+yB4sMhufNzbVHGG9SDMSrw1CKSnRqfjJnMpAH4dE=
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.47.0/go.mod h1:2NonlJyJNVbDK/hCwiLsu5gsD2bVtmIzQ/tGzWq58us=
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.48.0 h1:ZeW4++xt1VrFSdnd0pFXz0PkrjLb89/VPOUOyPDJG/g=
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.48.0/go.mod h1:ZdzuQW6m/OEtOLWWJs+k5ddZXsUq9xs2vx+ZT9G9eJ0=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.48.0 h1:doUP+ExOpH3spVTLS0FcWGLnQrPct/hD/bCPbDRUEAU=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.48.0/go.mod h1:rdENBZMT2OE6Ne/KLwpiXudnAsbdrdBaqBvTN8M8BgA=
go.opentelemetry.io/otel v1.23.1 h1:Za4UzOqJYS+MUczKI320AtqZHZb7EqxO00jAHE0jmQY=
go.opentelemetry.io/otel v1.23.1/go.mod h1:Td0134eafDLcTS4y+zQ26GE8u3dEuRBiBCTUIRHaikA=
go.opentelemetry.io/otel/exporters/jaeger v1.17.0 h1:D7UpUy2Xc2wsi1Ras6V40q806WM07rqoCWzXu7Sqy+4=
go.opentelemetry.io/otel/exporters/jaeger v1.17.0/go.mod h1:nPCqOnEH9rNLKqH/+rrUjiMzHJdV1BlpKcTwRTyKkKI=
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.22.0 h1:zr8ymM5OWWjjiWRzwTfZ67c905+2TMHYp2lMJ52QTyM=
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.22.0/go.mod h1:sQs7FT2iLVJ+67vYngGJkPe1qr39IzaBzaj9IDNNY8k=
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.23.1 h1:IqmsDcJnxQSs6W+1TMSqpYO7VY4ZuEKJGYlSBPUlT1s=
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.23.1/go.mod h1:VMZ84RYOd4Lrp0+09mckDvqBj2PXWDwOFaxb1P5uO8g=
go.opentelemetry.io/otel/metric v1.23.1 h1:PQJmqJ9u2QaJLBOELl1cxIdPcpbwzbkjfEyelTl2rlo=
go.opentelemetry.io/otel/metric v1.23.1/go.mod h1:mpG2QPlAfnK8yNhNJAxDZruU9Y1/HubbC+KyH8FaCWI=
go.opentelemetry.io/otel/sdk v1.23.1 h1:O7JmZw0h76if63LQdsBMKQDWNb5oEcOThG9IrxscV+E=
go.opentelemetry.io/otel/sdk v1.23.1/go.mod h1:LzdEVR5am1uKOOwfBWFef2DCi1nu3SA8XQxx2IerWFk=
go.opentelemetry.io/otel/sdk/metric v1.23.1 h1:T9/8WsYg+ZqIpMWwdISVVrlGb/N0Jr1OHjR/alpKwzg=
go.opentelemetry.io/otel/sdk/metric v1.23.1/go.mod h1:8WX6WnNtHCgUruJ4TJ+UssQjMtpxkpX0zveQC8JG/E0=
go.opentelemetry.io/otel/trace v1.23.1 h1:4LrmmEd8AU2rFvU1zegmvqW7+kWarxtNOPyeL6HmYY8=
go.opentelemetry.io/otel/trace v1.23.1/go.mod h1:4IpnpJFwr1mo/6HL8XIPJaE9y0+u1KcVmuW7dwFSVrI=
go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
Expand Down
3 changes: 3 additions & 0 deletions health/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Package health provides HTTP handlers and adapters for health and readiness
// probes.
package health
25 changes: 25 additions & 0 deletions health/main_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package health

import (
"os"
"testing"

"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/sdk/metric"
)

// TestMain wires the package's metrics hook into the global OpenTelemetry
// meter provider and then runs the test suite.
func TestMain(m *testing.M) {
	var code int
	// Exit from a defer so it is the last thing to happen; a zero result
	// simply returns normally.
	defer func() {
		if code == 0 {
			return
		}
		os.Exit(code)
	}()

	reader, hook := NewMetricsHook()
	handler = hook // Declared in otel_test.go
	otel.SetMeterProvider(metric.NewMeterProvider(metric.WithReader(reader)))

	code = m.Run()
}
231 changes: 231 additions & 0 deletions health/otel.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
package health

import (
"bytes"
"fmt"
"io"
"net/http"
"sync"
"text/tabwriter"
"time"

"github.com/quay/zlog"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/sdk/instrumentation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric/metricdata"
)

// HealthUnit is the unit that float64 gauges must use to be hooked into this package's reporting.
//
// The instruments should be implemented as callbacks to avoid missing data after a cold start.
//
// A value of zero is reported as "ok"; any other value is reported as "bad".
// Individual instruments may assign meaning to nonzero values, but should assume the values will
// not be machine parsed.
const HealthUnit = "{health}"

// FallibleKey is an [attribute.Key] that instruments can use as a boolean [attribute.KeyValue] to
// indicate that a metric shouldn't fail the check.
// Requests can override this by using the "strict" query parameter.
//
// Only boolean-typed values are honored; an attribute of any other type is treated as if it were
// absent.
//
// Package authors can use this to indicate that there may be something wrong in a downstream service.
// Tripping a process' health check in the event of a transient event may cause restart storms
// or needless load balancer evictions, causing even worse service degradation.
var FallibleKey = attribute.Key("github.com/quay/clair/v4/health.fallible")

// This is modeled on the prometheus exporter: https://github.com/open-telemetry/opentelemetry-go/blob/exporters/prometheus/v0.45.2/exporters/prometheus/exporter.go

// NewMetricsHook returns a metrics reader and an [http.Handler] that reports
// health based on what that reader collects. The reader is a manual reader, so
// collection only happens when the handler serves a request.
//
// The returned [http.Handler] currently does not care about the request path, but may in the
// future. Users should remove any prefixes for forward compatibility.
//
// Three query parameters are used:
//   - meter: Select a single meter name.
//   - instrument: Select a single instrument name.
//   - strict: Disregard the "fallible" attribute.
//
// GET and HEAD methods are supported and return the same status code.
// Returned status codes are:
//   - 200 OK: All checks reported ok (modified by the "strict" parameter).
//   - 204 No Content: No health check instruments match the supplied filters.
//   - 425 Too Early: Instruments exist, but have no data.
//   - 503 Service Unavailable: At least one check reported not-ok (modified by the "strict" parameter).
//
// GET requests return a body containing details. The contents are intended for humans and not considered
// API. The current format is space-separated columns containing:
//   - Instrument name
//   - Status
//   - Value
//   - Timestamp
//   - Description
func NewMetricsHook() (sdkmetric.Reader, http.Handler) {
	// The handler and the returned reader share the same ManualReader.
	c := &collector{reader: sdkmetric.NewManualReader()}
	return c.reader, c
}

// Collector is the [http.Handler] behind [NewMetricsHook]: every request
// triggers a collection on the enclosed ManualReader.
//
// There's no provision to prevent a user from DoS-ing the process by making requests in a tight loop.
type collector struct {
	// Reader is collected from on every request.
	reader *sdkmetric.ManualReader
	// BufPool holds *bytes.Buffer values used to stage response bodies.
	// RmPool is not referenced by the code visible in this file.
	rmPool, bufPool sync.Pool
}

// ServeHTTP implements [http.Handler].
//
// The API is described in the [NewMetricsHook] documentation.
//
// Besides the status codes documented there, a request can fail with:
//   - 400 Bad Request: the request's query could not be parsed.
//   - 405 Method Not Allowed: the method is neither GET nor HEAD.
//   - 500 Internal Server Error: collecting from the reader failed.
func (c *collector) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	nowrite := r.Method == http.MethodHead
	switch r.Method {
	case http.MethodGet, http.MethodHead:
	default:
		// A 405 response is required to advertise the supported methods.
		w.Header().Set("Allow", "GET, HEAD")
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	ctx := r.Context()
	if err := r.ParseForm(); err != nil {
		zlog.Warn(ctx).
			Err(err).
			Msg("unable to parse healthcheck request")
		// Returning without writing a status makes net/http send an implicit
		// 200 OK, which would report a malformed probe as healthy.
		w.WriteHeader(http.StatusBadRequest)
		return
	}
	meter := r.Form.Get(`meter`)
	instrument := r.Form.Get(`instrument`)
	strict := r.Form.Has(`strict`)

	var rm metricdata.ResourceMetrics
	if err := c.reader.Collect(ctx, &rm); err != nil {
		zlog.Warn(ctx).
			Err(err).
			Msg("unable to collect healthcheck")
		// As above: never let a failed collection look like a passing check.
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	// Metric writing hook: by default, do nothing.
	writeMetric := func(s instrumentation.Scope, m metricdata.Metrics, pt metricdata.DataPoint[float64]) {}
	if !nowrite {
		buf := c.getBuf()
		tw := tabwriter.NewWriter(buf, 4, 4, 1, ' ', 0)
		// The actual writing is handled in this defer, which runs after the
		// status line and headers have been written at the end of the function.
		defer func() {
			// Writes into the in-memory buffer can't fail, and a failure to
			// write to the client can't be reported to anyone: the status has
			// already been sent. These are deliberately best-effort.
			_ = tw.Flush()
			_, _ = io.Copy(w, buf)
			c.putBuf(buf)
			_ = http.NewResponseController(w).Flush()
		}()
		writeMetric = func(s instrumentation.Scope, m metricdata.Metrics, pt metricdata.DataPoint[float64]) {
			fmt.Fprintf(tw, "%s.%s\t%s\t%g\t%s\t# %s\n",
				s.Name, m.Name,
				checkStatus(pt.Value).String(),
				pt.Value,
				pt.Time.UTC().Format(time.RFC3339),
				m.Description,
			)
		}
	}
	status := http.StatusOK
	var haveData bool

Metrics:
	for _, sm := range rm.ScopeMetrics {
		// Tempting to break out of this loop when not writing a body, but we want to return the
		// same status code no matter what. Consider a case where the first instrument has no data
		// and the last one is failing.

		s := sm.Scope
		// Filter if needed.
		if meter != "" && meter != s.Name {
			continue
		}

		for _, m := range sm.Metrics {
			// Only float64 gauges using the health unit participate.
			if m.Unit != HealthUnit {
				continue
			}
			g, ok := m.Data.(metricdata.Gauge[float64])
			if !ok {
				continue
			}
			// Filter if needed.
			if instrument != "" && instrument != m.Name {
				continue
			}

			if len(g.DataPoints) == 0 {
				// Missing data must not mask a failure that was already seen.
				if status < http.StatusTooEarly {
					status = http.StatusTooEarly
				}
				w.Header().Add(`health-data-missing`, s.Name+"."+m.Name)
				continue
			}

			for _, pt := range g.DataPoints {
				haveData = true

				var fallible bool
				if fv, ok := pt.Attributes.Value(FallibleKey); ok && fv.Type() == attribute.BOOL {
					fallible = fv.AsBool()
				}
				switch ok := pt.Value == 0; {
				case ok:
				case fallible && !strict:
					// A fallible check only fails the request in strict mode.
				default:
					status = http.StatusServiceUnavailable
				}

				writeMetric(s, m, pt)
			}
		}

		// A meter filter selects a single scope, so there's nothing left to look at.
		if meter != "" {
			break Metrics
		}
	}
	// With no data points seen, the status is either still OK (nothing matched
	// the filters) or Too Early (instruments matched but were empty). Only the
	// former is "No Content"; the latter must be preserved.
	if !haveData && status == http.StatusOK {
		status = http.StatusNoContent
	}

	h := w.Header()
	h.Set("Content-Type", "text/plain; charset=utf-8")
	h.Set("Cache-Control", "no-store")
	h.Set("X-Content-Type-Options", "nosniff")
	w.WriteHeader(status)
}

// CheckStatus is a health gauge value that knows how to render itself as a
// fixed-width status word.
type checkStatus float64

// String implements [fmt.Stringer].
//
// Zero renders as " ok" (padded to three characters); everything else renders
// as "bad".
func (s checkStatus) String() string {
	switch s {
	case 0:
		return " ok"
	default:
		return "bad"
	}
}

// GetBuf hands out a buffer from the pool, allocating a fresh one with 1KiB of
// capacity when the pool is empty.
func (c *collector) getBuf() *bytes.Buffer {
	if pooled := c.bufPool.Get(); pooled != nil {
		return pooled.(*bytes.Buffer)
	}
	fresh := new(bytes.Buffer)
	fresh.Grow(1024)
	return fresh
}

// PutBuf resets the buffer and hands it back to the pool, unless it has grown
// past 4KiB.
//
// Oversized buffers are dropped instead of pooled; this is a trick from
// log/slog to reduce steady-state memory usage.
func (c *collector) putBuf(buf *bytes.Buffer) {
	if buf.Cap() <= 4096 {
		buf.Reset()
		c.bufPool.Put(buf)
	}
}
Loading

0 comments on commit efe68fc

Please sign in to comment.