Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Telemetry Metrics Configuration. #18186

Merged
merged 22 commits into from
Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
805c6ad
Telemetry Metrics Configuration.
kitography Dec 1, 2022
50b4809
Err Shadowing Fix (woah, semgrep is cool).
kitography Dec 1, 2022
0f5fe89
Fix TestBackend_RevokePlusTidy_Intermediate
kitography Dec 1, 2022
644291b
Add Changelog.
kitography Dec 1, 2022
e124d17
Fix memory leak. Code cleanup as suggested by Steve.
kitography Dec 5, 2022
173143b
Turn off metrics by default, breaking-change.
kitography Dec 5, 2022
39e7113
Merge branch 'main' into VAULT-8784-telemetry-turn-off-on-metrics-count
kitography Dec 12, 2022
50e02af
Show on tidy-status before start-up.
kitography Dec 13, 2022
38421eb
Fix tests
kitography Dec 13, 2022
617f8e3
make fmt
kitography Dec 13, 2022
dd21615
Merge branch 'main' into VAULT-8784-telemetry-turn-off-on-metrics-count
kitography Feb 10, 2023
27e5ca8
Add emit metrics to periodicFunc
kitography Feb 10, 2023
f4473ba
Test not delivering unavailable metrics + fix.
kitography Feb 10, 2023
bff0245
Better error message.
kitography Feb 10, 2023
eb646af
Fixing the false-error bug.
kitography Feb 10, 2023
945cf93
make fmt.
kitography Feb 10, 2023
32fb8f1
Try to fix race issue, remove confusing comments.
kitography Feb 10, 2023
f2a546d
Switch metric counter variables to an atomic.Uint32
stevendpclark Feb 10, 2023
82ca965
Fix race-issue better by trying until the metric is sunk.
kitography Feb 10, 2023
3cb6793
make fmt.
kitography Feb 10, 2023
ffc680a
empty commit to retrigger non-race tests that all pass locally
kitography Feb 10, 2023
eeeb864
Extend metrics, remove parallel run because I have no idea what else …
kitography Feb 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 117 additions & 42 deletions builtin/logical/pki/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ import (
"sync/atomic"
"time"

atomic2 "go.uber.org/atomic"

"github.com/hashicorp/vault/helper/constants"

"github.com/hashicorp/go-multierror"

atomic2 "go.uber.org/atomic"

"github.com/hashicorp/vault/sdk/helper/consts"

"github.com/armon/go-metrics"
Expand Down Expand Up @@ -244,9 +244,12 @@ func Backend(conf *logical.BackendConfig) *backend {
b.lastTidy = time.Now()

// Metrics initialization for count of certificates in storage
b.certCountEnabled = atomic2.NewBool(false)
b.publishCertCountMetrics = atomic2.NewBool(false)
b.certsCounted = atomic2.NewBool(false)
b.certCount = new(uint32)
b.revokedCertCount = new(uint32)
b.certCountError = "Initialize Not Yet Run, Cert Counts Unavailable"
b.certCount = &atomic.Uint32{}
b.revokedCertCount = &atomic.Uint32{}
b.possibleDoubleCountedSerials = make([]string, 0, 250)
b.possibleDoubleCountedRevokedSerials = make([]string, 0, 250)

Expand All @@ -270,9 +273,12 @@ type backend struct {

unifiedTransferStatus *unifiedTransferStatus

certCount *uint32
revokedCertCount *uint32
certCountEnabled *atomic2.Bool
publishCertCountMetrics *atomic2.Bool
certCount *atomic.Uint32
revokedCertCount *atomic.Uint32
certsCounted *atomic2.Bool
certCountError string
possibleDoubleCountedSerials []string
possibleDoubleCountedRevokedSerials []string

Expand Down Expand Up @@ -364,9 +370,10 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque
// Initialize also needs to populate our certificate and revoked certificate count
err = b.initializeStoredCertificateCounts(ctx)
if err != nil {
return err
// Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates.
b.Logger().Error("Could not initialize stored certificate counts", err)
b.certCountError = err.Error()
}

return nil
}

Expand Down Expand Up @@ -601,6 +608,13 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
crlErr := doCRL()
tidyErr := doAutoTidy()

// Periodically re-emit gauges so that they don't disappear/go stale
tidyConfig, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
b.emitCertStoreMetrics(tidyConfig)

var errors error
if crlErr != nil {
errors = multierror.Append(errors, fmt.Errorf("Error building CRLs:\n - %w\n", crlErr))
Expand All @@ -625,24 +639,51 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
}

func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
// For performance reasons, we can't lock on issuance/storage of certs until a list operation completes,
// but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which
// prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in
// tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint)
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
sc := b.makeStorageContext(ctx, b.storage)
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}

b.certCountEnabled.Store(config.MaintainCount)
b.publishCertCountMetrics.Store(config.PublishMetrics)

if config.MaintainCount == false {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
b.certsCounted.Store(true)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts"
return nil
}

// Ideally these three things would be set in one transaction, since that isn't possible, set the counts to "0",
// first, so count will over-count (and miss putting things in deduplicate queue), rather than under-count.
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
// A cert issued or revoked here will be double-counted. That's okay, this is "best effort" metrics.
b.certsCounted.Store(false)

entries, err := b.storage.List(ctx, "certs/")
if err != nil {
return err
}
atomic.AddUint32(b.certCount, uint32(len(entries)))
b.certCount.Add(uint32(len(entries)))

revokedEntries, err := b.storage.List(ctx, "revoked/")
if err != nil {
return err
}
atomic.AddUint32(b.revokedCertCount, uint32(len(revokedEntries)))
b.revokedCertCount.Add(uint32(len(revokedEntries)))

b.certsCounted.Store(true)
// Now that the metrics are set, we can switch from appending newly-stored certificates to the possible double-count
Expand Down Expand Up @@ -723,64 +764,98 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil

certCount := atomic.LoadUint32(b.certCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
revokedCertCount := atomic.LoadUint32(b.revokedCertCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
b.emitCertStoreMetrics(config)

b.certCountError = ""

return nil
}

func (b *backend) emitCertStoreMetrics(config *tidyConfig) {
if config.PublishMetrics == true {
certCount := b.certCount.Load()
b.emitTotalCertCountMetric(certCount)
revokedCertCount := b.revokedCertCount.Load()
b.emitTotalRevokedCountMetric(revokedCertCount)
}
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) {
certCount := atomic.AddUint32(b.certCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
certCount := b.certCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}
}

func (b *backend) decrementTotalCertificatesCountReport() {
certCount := b.decrementTotalCertificatesCountNoReport()
func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() {
if b.certCountEnabled.Load() {
certCount := b.decrementTotalCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
}

func (b *backend) emitTotalCertCountMetric(certCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalCertificatesCountNoReport() uint32 {
newCount := atomic.AddUint32(b.certCount, ^uint32(0))
newCount := b.certCount.Add(^uint32(0))
return newCount
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
newRevokedCertCount := b.revokedCertCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(newRevokedCertCount)
}
}
}
}

func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() {
if b.certCountEnabled.Load() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(revokedCertCount)
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount))
}
}

func (b *backend) decrementTotalRevokedCertificatesCountReport() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
func (b *backend) emitTotalRevokedCountMetric(revokedCertCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0))
newRevokedCertCount := b.revokedCertCount.Add(^uint32(0))
return newRevokedCertCount
}
49 changes: 36 additions & 13 deletions builtin/logical/pki/backend_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"testing"
"time"

Expand Down Expand Up @@ -3797,6 +3796,15 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
t.Fatal(err)
}

// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki/",
})

// Check the metrics initialized in order to calculate backendUUID for /pki
// BackendUUID not consistent during tests with UUID from /sys/mounts/pki
metricsSuffix := "total_certificates_stored"
Expand Down Expand Up @@ -3833,6 +3841,14 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
if err != nil {
t.Fatal(err)
}
// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki2/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki2/",
})

// Create a CSR for the intermediate CA
secret, err := client.Logical().Write("pki2/intermediate/generate/internal", nil)
Expand Down Expand Up @@ -3961,6 +3977,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
"current_revoked_cert_count": json.Number("0"),
"revocation_queue_deleted_count": json.Number("0"),
"cross_revoked_cert_deleted_count": json.Number("0"),
"internal_backend_uuid": backendUUID,
}
// Let's copy the times from the response so that we can use deep.Equal()
timeStarted, ok := tidyStatus.Data["time_started"]
Expand Down Expand Up @@ -5590,6 +5607,14 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
serials[i] = resp.Data["serial_number"].(string)
}

// Turn on certificate counting:
CBWrite(b, s, "config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": false,
})
// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)

// Revoke certificates A + B
revocations := serials[0:2]
for _, key := range revocations {
Expand All @@ -5601,18 +5626,16 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
}
}

// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)
if atomic.LoadUint32(b.certCount) != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", atomic.LoadUint32(b.certCount))
if b.certCount.Load() != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", b.certCount.Load())
}
if atomic.LoadUint32(b.revokedCertCount) != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", atomic.LoadUint32(b.revokedCertCount))
if b.revokedCertCount.Load() != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", b.revokedCertCount.Load())
}

// Simulates listing while initialize in progress, by "restarting it"
atomic.StoreUint32(b.certCount, 0)
atomic.StoreUint32(b.revokedCertCount, 0)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certsCounted.Store(false)

// Revoke certificates C, D
Expand Down Expand Up @@ -5641,12 +5664,12 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
b.initializeStoredCertificateCounts(ctx)

// Test certificate count
if atomic.LoadUint32(b.certCount) != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", *(b.certCount))
if b.certCount.Load() != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", b.certCount.Load())
}

if atomic.LoadUint32(b.revokedCertCount) != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", *(b.revokedCertCount))
if b.revokedCertCount.Load() != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", b.revokedCertCount.Load())
}

return
Expand Down
4 changes: 2 additions & 2 deletions builtin/logical/pki/cert_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ func fetchCertBySerial(sc *storageContext, prefix, serial string) (*logical.Stor
// If we fail here, we have an extra (copy) of a cert in storage, add to metrics:
switch {
case strings.HasPrefix(prefix, "revoked/"):
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, path)
default:
sc.Backend.incrementTotalCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, path)
}
return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)}
}
Expand Down
2 changes: 1 addition & 1 deletion builtin/logical/pki/crl_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -978,7 +978,7 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) (
if err != nil {
return nil, fmt.Errorf("error saving revoked certificate to new location")
}
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)

// If this flag is enabled after the fact, existing local entries will be published to
// the unified storage space through a periodic function.
Expand Down
Loading