Skip to content

Commit

Permalink
Backport of Telemetry Metrics Configuration. into release/1.13.x (#21070
Browse files Browse the repository at this point in the history
)

* backport of commit 2dd4528

* Add missing documentation on cert metrics (#21073)

Signed-off-by: Alexander Scheel <[email protected]>

---------

Signed-off-by: Alexander Scheel <[email protected]>
Co-authored-by: Kit Haines <[email protected]>
Co-authored-by: Alexander Scheel <[email protected]>
  • Loading branch information
3 people authored Jun 13, 2023
1 parent 8bde6d7 commit 5f96380
Show file tree
Hide file tree
Showing 12 changed files with 612 additions and 85 deletions.
159 changes: 117 additions & 42 deletions builtin/logical/pki/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ import (
"sync/atomic"
"time"

atomic2 "go.uber.org/atomic"

"github.com/hashicorp/vault/helper/constants"

"github.com/hashicorp/go-multierror"

atomic2 "go.uber.org/atomic"

"github.com/hashicorp/vault/sdk/helper/consts"

"github.com/armon/go-metrics"
Expand Down Expand Up @@ -251,9 +251,12 @@ func Backend(conf *logical.BackendConfig) *backend {
b.lastTidy = time.Now()

// Metrics initialization for count of certificates in storage
b.certCountEnabled = atomic2.NewBool(false)
b.publishCertCountMetrics = atomic2.NewBool(false)
b.certsCounted = atomic2.NewBool(false)
b.certCount = new(uint32)
b.revokedCertCount = new(uint32)
b.certCountError = "Initialize Not Yet Run, Cert Counts Unavailable"
b.certCount = &atomic.Uint32{}
b.revokedCertCount = &atomic.Uint32{}
b.possibleDoubleCountedSerials = make([]string, 0, 250)
b.possibleDoubleCountedRevokedSerials = make([]string, 0, 250)

Expand All @@ -277,9 +280,12 @@ type backend struct {

unifiedTransferStatus *unifiedTransferStatus

certCount *uint32
revokedCertCount *uint32
certCountEnabled *atomic2.Bool
publishCertCountMetrics *atomic2.Bool
certCount *atomic.Uint32
revokedCertCount *atomic.Uint32
certsCounted *atomic2.Bool
certCountError string
possibleDoubleCountedSerials []string
possibleDoubleCountedRevokedSerials []string

Expand Down Expand Up @@ -371,9 +377,10 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque
// Initialize also needs to populate our certificate and revoked certificate count
err = b.initializeStoredCertificateCounts(ctx)
if err != nil {
return err
// Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates.
b.Logger().Error("Could not initialize stored certificate counts", err)
b.certCountError = err.Error()
}

return nil
}

Expand Down Expand Up @@ -608,6 +615,13 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
crlErr := doCRL()
tidyErr := doAutoTidy()

// Periodically re-emit gauges so that they don't disappear/go stale
tidyConfig, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
b.emitCertStoreMetrics(tidyConfig)

var errors error
if crlErr != nil {
errors = multierror.Append(errors, fmt.Errorf("Error building CRLs:\n - %w\n", crlErr))
Expand All @@ -632,24 +646,51 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
}

func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
// For performance reasons, we can't lock on issuance/storage of certs until a list operation completes,
// but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which
// prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in
// tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint)
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
sc := b.makeStorageContext(ctx, b.storage)
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}

b.certCountEnabled.Store(config.MaintainCount)
b.publishCertCountMetrics.Store(config.PublishMetrics)

if config.MaintainCount == false {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
b.certsCounted.Store(true)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts"
return nil
}

// Ideally these three things would be set in one transaction, since that isn't possible, set the counts to "0",
// first, so count will over-count (and miss putting things in deduplicate queue), rather than under-count.
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
// A cert issued or revoked here will be double-counted. That's okay, this is "best effort" metrics.
b.certsCounted.Store(false)

entries, err := b.storage.List(ctx, "certs/")
if err != nil {
return err
}
atomic.AddUint32(b.certCount, uint32(len(entries)))
b.certCount.Add(uint32(len(entries)))

revokedEntries, err := b.storage.List(ctx, "revoked/")
if err != nil {
return err
}
atomic.AddUint32(b.revokedCertCount, uint32(len(revokedEntries)))
b.revokedCertCount.Add(uint32(len(revokedEntries)))

b.certsCounted.Store(true)
// Now that the metrics are set, we can switch from appending newly-stored certificates to the possible double-count
Expand Down Expand Up @@ -730,64 +771,98 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil

certCount := atomic.LoadUint32(b.certCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
revokedCertCount := atomic.LoadUint32(b.revokedCertCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
b.emitCertStoreMetrics(config)

b.certCountError = ""

return nil
}

func (b *backend) emitCertStoreMetrics(config *tidyConfig) {
if config.PublishMetrics == true {
certCount := b.certCount.Load()
b.emitTotalCertCountMetric(certCount)
revokedCertCount := b.revokedCertCount.Load()
b.emitTotalRevokedCountMetric(revokedCertCount)
}
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) {
certCount := atomic.AddUint32(b.certCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
certCount := b.certCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}
}

func (b *backend) decrementTotalCertificatesCountReport() {
certCount := b.decrementTotalCertificatesCountNoReport()
func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() {
if b.certCountEnabled.Load() {
certCount := b.decrementTotalCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
}

func (b *backend) emitTotalCertCountMetric(certCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalCertificatesCountNoReport() uint32 {
newCount := atomic.AddUint32(b.certCount, ^uint32(0))
newCount := b.certCount.Add(^uint32(0))
return newCount
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
newRevokedCertCount := b.revokedCertCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(newRevokedCertCount)
}
}
}
}

func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() {
if b.certCountEnabled.Load() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(revokedCertCount)
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount))
}
}

func (b *backend) decrementTotalRevokedCertificatesCountReport() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
func (b *backend) emitTotalRevokedCountMetric(revokedCertCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0))
newRevokedCertCount := b.revokedCertCount.Add(^uint32(0))
return newRevokedCertCount
}
49 changes: 36 additions & 13 deletions builtin/logical/pki/backend_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"testing"
"time"

Expand Down Expand Up @@ -3828,6 +3827,15 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
t.Fatal(err)
}

// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki/",
})

// Check the metrics initialized in order to calculate backendUUID for /pki
// BackendUUID not consistent during tests with UUID from /sys/mounts/pki
metricsSuffix := "total_certificates_stored"
Expand Down Expand Up @@ -3864,6 +3872,14 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
if err != nil {
t.Fatal(err)
}
// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki2/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki2/",
})

// Create a CSR for the intermediate CA
secret, err := client.Logical().Write("pki2/intermediate/generate/internal", nil)
Expand Down Expand Up @@ -3992,6 +4008,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
"current_revoked_cert_count": json.Number("0"),
"revocation_queue_deleted_count": json.Number("0"),
"cross_revoked_cert_deleted_count": json.Number("0"),
"internal_backend_uuid": backendUUID,
}
// Let's copy the times from the response so that we can use deep.Equal()
timeStarted, ok := tidyStatus.Data["time_started"]
Expand Down Expand Up @@ -5624,6 +5641,14 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
serials[i] = resp.Data["serial_number"].(string)
}

// Turn on certificate counting:
CBWrite(b, s, "config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": false,
})
// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)

// Revoke certificates A + B
revocations := serials[0:2]
for _, key := range revocations {
Expand All @@ -5635,18 +5660,16 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
}
}

// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)
if atomic.LoadUint32(b.certCount) != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", atomic.LoadUint32(b.certCount))
if b.certCount.Load() != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", b.certCount.Load())
}
if atomic.LoadUint32(b.revokedCertCount) != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", atomic.LoadUint32(b.revokedCertCount))
if b.revokedCertCount.Load() != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", b.revokedCertCount.Load())
}

// Simulates listing while initialize in progress, by "restarting it"
atomic.StoreUint32(b.certCount, 0)
atomic.StoreUint32(b.revokedCertCount, 0)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certsCounted.Store(false)

// Revoke certificates C, D
Expand Down Expand Up @@ -5675,12 +5698,12 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
b.initializeStoredCertificateCounts(ctx)

// Test certificate count
if atomic.LoadUint32(b.certCount) != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", *(b.certCount))
if b.certCount.Load() != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", b.certCount.Load())
}

if atomic.LoadUint32(b.revokedCertCount) != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", *(b.revokedCertCount))
if b.revokedCertCount.Load() != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", b.revokedCertCount.Load())
}

return
Expand Down
4 changes: 2 additions & 2 deletions builtin/logical/pki/cert_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ func fetchCertBySerial(sc *storageContext, prefix, serial string) (*logical.Stor
// If we fail here, we have an extra (copy) of a cert in storage, add to metrics:
switch {
case strings.HasPrefix(prefix, "revoked/"):
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, path)
default:
sc.Backend.incrementTotalCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, path)
}
return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)}
}
Expand Down
2 changes: 1 addition & 1 deletion builtin/logical/pki/crl_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -1019,7 +1019,7 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) (
if err != nil {
return nil, fmt.Errorf("error saving revoked certificate to new location: %w", err)
}
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)

// From here on out, the certificate has been revoked locally. Any other
// persistence issues might still err, but any other failure messages
Expand Down
Loading

0 comments on commit 5f96380

Please sign in to comment.