Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Telemetry Metrics Configuration. #18186

Merged
merged 22 commits into from
Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
805c6ad
Telemetry Metrics Configuration.
kitography Dec 1, 2022
50b4809
Err Shadowing Fix (woah, semgrep is cool).
kitography Dec 1, 2022
0f5fe89
Fix TestBackend_RevokePlusTidy_Intermediate
kitography Dec 1, 2022
644291b
Add Changelog.
kitography Dec 1, 2022
e124d17
Fix memory leak. Code cleanup as suggested by Steve.
kitography Dec 5, 2022
173143b
Turn off metrics by default, breaking-change.
kitography Dec 5, 2022
39e7113
Merge branch 'main' into VAULT-8784-telemetry-turn-off-on-metrics-count
kitography Dec 12, 2022
50e02af
Show on tidy-status before start-up.
kitography Dec 13, 2022
38421eb
Fix tests
kitography Dec 13, 2022
617f8e3
make fmt
kitography Dec 13, 2022
dd21615
Merge branch 'main' into VAULT-8784-telemetry-turn-off-on-metrics-count
kitography Feb 10, 2023
27e5ca8
Add emit metrics to periodicFunc
kitography Feb 10, 2023
f4473ba
Test not delivering unavailable metrics + fix.
kitography Feb 10, 2023
bff0245
Better error message.
kitography Feb 10, 2023
eb646af
Fixing the false-error bug.
kitography Feb 10, 2023
945cf93
make fmt.
kitography Feb 10, 2023
32fb8f1
Try to fix race issue, remove confusing comments.
kitography Feb 10, 2023
f2a546d
Switch metric counter variables to an atomic.Uint32
stevendpclark Feb 10, 2023
82ca965
Fix race-issue better by trying until the metric is sunk.
kitography Feb 10, 2023
3cb6793
make fmt.
kitography Feb 10, 2023
ffc680a
empty commit to retrigger non-race tests that all pass locally
kitography Feb 10, 2023
eeeb864
Extend metrics, remove parallel run because I have no idea what else …
kitography Feb 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 79 additions & 34 deletions builtin/logical/pki/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,10 @@ func Backend(conf *logical.BackendConfig) *backend {
b.lastTidy = time.Now()

// Metrics initialization for count of certificates in storage
b.certCountEnabled = atomic2.NewBool(true) // Overwritten during initialize, don't break count until then
b.publishCertCountMetrics = atomic2.NewBool(true)
b.certsCounted = atomic2.NewBool(false)
b.certCountError = ""
b.certCount = new(uint32)
b.revokedCertCount = new(uint32)
b.possibleDoubleCountedSerials = make([]string, 0, 250)
Expand All @@ -234,9 +237,12 @@ type backend struct {
tidyStatus *tidyStatus
lastTidy time.Time

certCountEnabled *atomic2.Bool
publishCertCountMetrics *atomic2.Bool
certCount *uint32
revokedCertCount *uint32
certsCounted *atomic2.Bool
certCountError string
possibleDoubleCountedSerials []string
possibleDoubleCountedRevokedSerials []string

Expand Down Expand Up @@ -361,9 +367,10 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque
// Initialize also needs to populate our certificate and revoked certificate count
err = b.initializeStoredCertificateCounts(ctx)
if err != nil {
return err
// Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates.
b.Logger().Error("Could not initialize stored certificate counts, run tidy to initialize", err)
b.certCountError = err.Error()
}

return nil
}

Expand Down Expand Up @@ -572,12 +579,30 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
}

func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
// For performance reasons, we can't lock on issuance/storage of certs until a list operation completes,
// but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which
// prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in
// tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint)
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
sc := b.makeStorageContext(ctx, b.storage)
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}

b.certCountEnabled.Store(config.MaintainCount)
b.publishCertCountMetrics.Store(config.PublishMetrics)

if config.MaintainCount == false {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
b.certsCounted.Store(false)
atomic.StoreUint32(b.certCount, 0)
atomic.StoreUint32(b.revokedCertCount, 0)
b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts"
return nil
}

entries, err := b.storage.List(ctx, "certs/")
if err != nil {
Expand Down Expand Up @@ -670,63 +695,83 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil

certCount := atomic.LoadUint32(b.certCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
revokedCertCount := atomic.LoadUint32(b.revokedCertCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
if config.PublishMetrics == true {
certCount := atomic.LoadUint32(b.certCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
revokedCertCount := atomic.LoadUint32(b.revokedCertCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
}

return nil
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) {
certCount := atomic.AddUint32(b.certCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
certCount := atomic.AddUint32(b.certCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}
}

func (b *backend) decrementTotalCertificatesCountReport() {
certCount := b.decrementTotalCertificatesCountNoReport()
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() {
if b.certCountEnabled.Load() {
certCount := b.decrementTotalCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}
}
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalCertificatesCountNoReport() uint32 {
newCount := atomic.AddUint32(b.certCount, ^uint32(0))
return newCount
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount))
}
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount))
}
}

func (b *backend) decrementTotalRevokedCertificatesCountReport() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() {
if b.certCountEnabled.Load() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
}
}
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0))
return newRevokedCertCount
Expand Down
1 change: 1 addition & 0 deletions builtin/logical/pki/backend_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3952,6 +3952,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
"missing_issuer_cert_count": json.Number("0"),
"current_cert_store_count": json.Number("0"),
"current_revoked_cert_count": json.Number("0"),
"internal_backend_uuid": backendUUID,
}
// Let's copy the times from the response so that we can use deep.Equal()
timeStarted, ok := tidyStatus.Data["time_started"]
Expand Down
4 changes: 2 additions & 2 deletions builtin/logical/pki/cert_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ func fetchCertBySerial(ctx context.Context, b *backend, req *logical.Request, pr
// If we fail here, we have an extra (copy) of a cert in storage, add to metrics:
switch {
case strings.HasPrefix(prefix, "revoked/"):
b.incrementTotalRevokedCertificatesCount(certsCounted, path)
b.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, path)
default:
b.incrementTotalCertificatesCount(certsCounted, path)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, path)
}
return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)}
}
Expand Down
2 changes: 1 addition & 1 deletion builtin/logical/pki/crl_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ func revokeCert(ctx context.Context, b *backend, req *logical.Request, serial st
if err != nil {
return nil, fmt.Errorf("error saving revoked certificate to new location")
}
b.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
b.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
}

// Fetch the config and see if we need to rebuild the CRL. If we have
Expand Down
17 changes: 17 additions & 0 deletions builtin/logical/pki/fields.go
Original file line number Diff line number Diff line change
Expand Up @@ -491,5 +491,22 @@ greater period of time. By default this is zero seconds.`,
Default: "0s",
}

fields["maintain_stored_certificate_counts"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `This configures whether stored certificates
are counted upon initialization of the backend, and whether during
normal operation, a running count of certificates stored is maintained.`,
Default: true,
}

fields["publish_stored_certificate_count_metrics"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `This configures whether the stored certificate
count is published to the metrics consumer. It does not affect if the
stored certificate count is maintained, and if maintained, it will be
available on the tidy-status endpoint.`,
Default: false,
}

return fields
}
2 changes: 1 addition & 1 deletion builtin/logical/pki/path_issue_sign.go
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ func (b *backend) pathIssueSignCert(ctx context.Context, req *logical.Request, d
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)
}

if useCSR {
Expand Down
4 changes: 2 additions & 2 deletions builtin/logical/pki/path_root.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ func (b *backend) pathCAGenerateRoot(ctx context.Context, req *logical.Request,
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)

// Build a fresh CRL
err = b.crlBuilder.rebuild(ctx, b, req, true)
Expand Down Expand Up @@ -464,7 +464,7 @@ func (b *backend) pathIssuerSignIntermediate(ctx context.Context, req *logical.R
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)

if parsedBundle.Certificate.MaxPathLen == 0 {
resp.AddWarning("Max path length of the signed certificate is zero. This certificate cannot be used to issue intermediate CA certificates.")
Expand Down
56 changes: 39 additions & 17 deletions builtin/logical/pki/path_tidy.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ type tidyConfig struct {
SafetyBuffer time.Duration `json:"safety_buffer"`
IssuerSafetyBuffer time.Duration `json:"issuer_safety_buffer"`
PauseDuration time.Duration `json:"pause_duration"`
MaintainCount bool `json:"maintain_stored_certificate_counts"`
PublishMetrics bool `json:"publish_stored_certificate_count_metrics"`
}

var defaultTidyConfig = tidyConfig{
Expand All @@ -41,6 +43,8 @@ var defaultTidyConfig = tidyConfig{
SafetyBuffer: 72 * time.Hour,
IssuerSafetyBuffer: 365 * 24 * time.Hour,
PauseDuration: 0 * time.Second,
MaintainCount: true,
kitography marked this conversation as resolved.
Show resolved Hide resolved
PublishMetrics: true,
}

func pathTidy(b *backend) *framework.Path {
Expand Down Expand Up @@ -630,6 +634,7 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
resp.Data["cert_store_deleted_count"] = b.tidyStatus.certStoreDeletedCount
resp.Data["revoked_cert_deleted_count"] = b.tidyStatus.revokedCertDeletedCount
resp.Data["missing_issuer_cert_count"] = b.tidyStatus.missingIssuerCertCount
resp.Data["internal_backend_uuid"] = b.backendUUID

switch b.tidyStatus.state {
case tidyStatusStarted:
Expand All @@ -651,12 +656,16 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
resp.Data["time_finished"] = b.tidyStatus.timeFinished
}

resp.Data["current_cert_store_count"] = b.certCount
resp.Data["current_revoked_cert_count"] = b.revokedCertCount

if !b.certsCounted.Load() {
resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " +
"inaccurate")
if b.certCountEnabled.Load() {
resp.Data["current_cert_store_count"] = b.certCount
resp.Data["current_revoked_cert_count"] = b.revokedCertCount
if !b.certsCounted.Load() {
resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " +
"inaccurate")
}
if b.certCountError != "" {
resp.Data["certificate_counting_error"] = b.certCountError
}
}

return resp, nil
Expand All @@ -671,15 +680,17 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque

return &logical.Response{
Data: map[string]interface{}{
"enabled": config.Enabled,
"interval_duration": int(config.Interval / time.Second),
"tidy_cert_store": config.CertStore,
"tidy_revoked_certs": config.RevokedCerts,
"tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
"tidy_expired_issuers": config.ExpiredIssuers,
"safety_buffer": int(config.SafetyBuffer / time.Second),
"issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second),
"pause_duration": config.PauseDuration.String(),
"enabled": config.Enabled,
"interval_duration": int(config.Interval / time.Second),
"tidy_cert_store": config.CertStore,
"tidy_revoked_certs": config.RevokedCerts,
"tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
"tidy_expired_issuers": config.ExpiredIssuers,
"safety_buffer": int(config.SafetyBuffer / time.Second),
"issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second),
"pause_duration": config.PauseDuration.String(),
"publish_stored_certificate_count_metrics": config.PublishMetrics,
"maintain_stored_certificate_counts": config.MaintainCount,
},
}, nil
}
Expand Down Expand Up @@ -736,6 +747,17 @@ func (b *backend) pathConfigAutoTidyWrite(ctx context.Context, req *logical.Requ
return logical.ErrorResponse("Auto-tidy enabled but no tidy operations were requested. Enable at least one tidy operation to be run (tidy_cert_store / tidy_revoked_certs / tidy_revoked_cert_issuer_associations)."), nil
}

if maintainCountEnabledRaw, ok := d.GetOk("maintain_stored_certificate_counts"); ok {
config.MaintainCount = maintainCountEnabledRaw.(bool)
}

if runningStorageMetricsEnabledRaw, ok := d.GetOk("publish_stored_certificate_count_metrics"); ok {
if config.MaintainCount == false {
return logical.ErrorResponse("Can not publish a running storage metrics count to metrics without first maintaining that count. Enable `maintain_stored_certificate_counts` to enable `publish_stored_certificate_count_metrics."), nil
}
config.PublishMetrics = runningStorageMetricsEnabledRaw.(bool)
}

return nil, sc.writeAutoTidyConfig(config)
}

Expand Down Expand Up @@ -798,7 +820,7 @@ func (b *backend) tidyStatusIncCertStoreCount() {

b.tidyStatus.certStoreDeletedCount++

b.decrementTotalCertificatesCountReport()
b.ifCountEnabledDecrementTotalCertificatesCountReport()
}

func (b *backend) tidyStatusIncRevokedCertCount() {
Expand All @@ -807,7 +829,7 @@ func (b *backend) tidyStatusIncRevokedCertCount() {

b.tidyStatus.revokedCertDeletedCount++

b.decrementTotalRevokedCertificatesCountReport()
b.ifCountEnabledDecrementTotalRevokedCertificatesCountReport()
}

func (b *backend) tidyStatusIncMissingIssuerCertCount() {
Expand Down
Loading