From 805c6ad609d99f1e1a9088f55b1e98e8bece8665 Mon Sep 17 00:00:00 2001 From: kitography Date: Thu, 1 Dec 2022 09:52:49 -0500 Subject: [PATCH 01/20] Telemetry Metrics Configuration. --- builtin/logical/pki/backend.go | 110 +++++++++++++++++-------- builtin/logical/pki/cert_util.go | 4 +- builtin/logical/pki/crl_util.go | 2 +- builtin/logical/pki/fields.go | 17 ++++ builtin/logical/pki/path_issue_sign.go | 2 +- builtin/logical/pki/path_root.go | 4 +- builtin/logical/pki/path_tidy.go | 56 +++++++++---- builtin/logical/pki/storage.go | 28 ++++++- 8 files changed, 165 insertions(+), 58 deletions(-) diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index 5dbb94c77729..6dcb1b76ba7d 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -212,7 +212,10 @@ func Backend(conf *logical.BackendConfig) *backend { b.lastTidy = time.Now() // Metrics initialization for count of certificates in storage + b.certCountEnabled = atomic2.NewBool(true) // Overwritten during initialize, don't break count until then + b.publishCertCountMetrics = atomic2.NewBool(true) b.certsCounted = atomic2.NewBool(false) + b.certCountError = "" b.certCount = new(uint32) b.revokedCertCount = new(uint32) b.possibleDoubleCountedSerials = make([]string, 0, 250) @@ -234,9 +237,12 @@ type backend struct { tidyStatus *tidyStatus lastTidy time.Time + certCountEnabled *atomic2.Bool + publishCertCountMetrics *atomic2.Bool certCount *uint32 revokedCertCount *uint32 certsCounted *atomic2.Bool + certCountError string possibleDoubleCountedSerials []string possibleDoubleCountedRevokedSerials []string @@ -361,9 +367,10 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque // Initialize also needs to populate our certificate and revoked certificate count err = b.initializeStoredCertificateCounts(ctx) if err != nil { - return err + // Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates. + b.Logger().Error("Could not initialize stored certificate counts, run tidy to initialize", err) + b.certCountError = err.Error() } - return nil } @@ -572,12 +579,27 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er } func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { - b.tidyStatusLock.RLock() - defer b.tidyStatusLock.RUnlock() // For performance reasons, we can't lock on issuance/storage of certs until a list operation completes, // but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which // prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in // tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint) + b.tidyStatusLock.RLock() + defer b.tidyStatusLock.RUnlock() + sc := b.makeStorageContext(ctx, b.storage) + config, err := sc.getAutoTidyConfig() + + b.certCountEnabled.Store(config.MaintainCount) + b.publishCertCountMetrics.Store(config.PublishMetrics) + + if config.MaintainCount == false { + b.possibleDoubleCountedRevokedSerials = nil + b.possibleDoubleCountedSerials = nil + b.certsCounted.Store(false) + atomic.StoreUint32(b.certCount, 0) + atomic.StoreUint32(b.revokedCertCount, 0) + b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" + return nil + } entries, err := b.storage.List(ctx, "certs/") if err != nil { @@ -670,36 +692,47 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { b.possibleDoubleCountedRevokedSerials = nil b.possibleDoubleCountedSerials = nil - certCount := atomic.LoadUint32(b.certCount) - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) - revokedCertCount := atomic.LoadUint32(b.revokedCertCount) - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) + if config.PublishMetrics == true { + certCount := atomic.LoadUint32(b.certCount) + metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) + revokedCertCount := atomic.LoadUint32(b.revokedCertCount) + metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) + } return nil } // The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call: // eg. certsCounted := b.certsCounted.Load() -func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) { - certCount := atomic.AddUint32(b.certCount, 1) - switch { - case !certsCounted: - // This is unsafe, but a good best-attempt - if strings.HasPrefix(newSerial, "certs/") { - newSerial = newSerial[6:] +func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) { + if b.certCountEnabled.Load() { + certCount := atomic.AddUint32(b.certCount, 1) + switch { + case !certsCounted: + // This is unsafe, but a good best-attempt + if strings.HasPrefix(newSerial, "certs/") { + newSerial = newSerial[6:] + } + b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial) + default: + if b.publishCertCountMetrics.Load() { + metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) + } } - b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial) - default: - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) } } -func (b *backend) decrementTotalCertificatesCountReport() { - certCount := b.decrementTotalCertificatesCountNoReport() - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) +func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() { + if b.certCountEnabled.Load() { + certCount := b.decrementTotalCertificatesCountNoReport() + if b.publishCertCountMetrics.Load() { + metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) + } + } } // Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet +// Does not respect whether-we-are-counting backend information. func (b *backend) decrementTotalCertificatesCountNoReport() uint32 { newCount := atomic.AddUint32(b.certCount, ^uint32(0)) return newCount @@ -707,26 +740,35 @@ func (b *backend) decrementTotalCertificatesCountNoReport() uint32 { // The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call: // eg. certsCounted := b.certsCounted.Load() -func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) { - newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1) - switch { - case !certsCounted: - // This is unsafe, but a good best-attempt - if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial - newSerial = newSerial[8:] +func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) { + if b.certCountEnabled.Load() { + newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1) + switch { + case !certsCounted: + // This is unsafe, but a good best-attempt + if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial + newSerial = newSerial[8:] + } + b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial) + default: + if b.publishCertCountMetrics.Load() { + metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount)) + } } - b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial) - default: - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount)) } } -func (b *backend) decrementTotalRevokedCertificatesCountReport() { - revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport() - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) +func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() { + if b.certCountEnabled.Load() { + revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport() + if b.publishCertCountMetrics.Load() { + metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) + } + } } // Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet +// Does not respect whether-we-are-counting backend information. func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 { newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0)) return newRevokedCertCount diff --git a/builtin/logical/pki/cert_util.go b/builtin/logical/pki/cert_util.go index 560831063756..70d990235029 100644 --- a/builtin/logical/pki/cert_util.go +++ b/builtin/logical/pki/cert_util.go @@ -234,9 +234,9 @@ func fetchCertBySerial(ctx context.Context, b *backend, req *logical.Request, pr // If we fail here, we have an extra (copy) of a cert in storage, add to metrics: switch { case strings.HasPrefix(prefix, "revoked/"): - b.incrementTotalRevokedCertificatesCount(certsCounted, path) + b.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, path) default: - b.incrementTotalCertificatesCount(certsCounted, path) + b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, path) } return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)} } diff --git a/builtin/logical/pki/crl_util.go b/builtin/logical/pki/crl_util.go index 8b4bf6fd570e..b0cf1610f7a9 100644 --- a/builtin/logical/pki/crl_util.go +++ b/builtin/logical/pki/crl_util.go @@ -582,7 +582,7 @@ func revokeCert(ctx context.Context, b *backend, req *logical.Request, serial st if err != nil { return nil, fmt.Errorf("error saving revoked certificate to new location") } - b.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key) + b.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key) } // Fetch the config and see if we need to rebuild the CRL. If we have diff --git a/builtin/logical/pki/fields.go b/builtin/logical/pki/fields.go index 78cdd67e8c93..3b6eb2df167f 100644 --- a/builtin/logical/pki/fields.go +++ b/builtin/logical/pki/fields.go @@ -491,5 +491,22 @@ greater period of time. By default this is zero seconds.`, Default: "0s", } + fields["maintain_stored_certificate_counts"] = &framework.FieldSchema{ + Type: framework.TypeBool, + Description: `This configures whether stored certificates +are counted upon initialization of the backend, and whether during +normal operation, a running count of certificates stored is maintained.`, + Default: true, + } + + fields["publish_stored_certificate_count_metrics"] = &framework.FieldSchema{ + Type: framework.TypeBool, + Description: `This configures whether the stored certificate +count is published to the metrics consumer. It does not affect if the +stored certificate count is maintained, and if maintained, it will be +available on the tidy-status endpoint.`, + Default: false, + } + return fields } diff --git a/builtin/logical/pki/path_issue_sign.go b/builtin/logical/pki/path_issue_sign.go index 7203d56c73cf..3645f3020aa8 100644 --- a/builtin/logical/pki/path_issue_sign.go +++ b/builtin/logical/pki/path_issue_sign.go @@ -418,7 +418,7 @@ func (b *backend) pathIssueSignCert(ctx context.Context, req *logical.Request, d if err != nil { return nil, fmt.Errorf("unable to store certificate locally: %w", err) } - b.incrementTotalCertificatesCount(certsCounted, key) + b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key) } if useCSR { diff --git a/builtin/logical/pki/path_root.go b/builtin/logical/pki/path_root.go index c5ef76fb13c7..26cde40ccb1a 100644 --- a/builtin/logical/pki/path_root.go +++ b/builtin/logical/pki/path_root.go @@ -267,7 +267,7 @@ func (b *backend) pathCAGenerateRoot(ctx context.Context, req *logical.Request, if err != nil { return nil, fmt.Errorf("unable to store certificate locally: %w", err) } - b.incrementTotalCertificatesCount(certsCounted, key) + b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key) // Build a fresh CRL err = b.crlBuilder.rebuild(ctx, b, req, true) @@ -464,7 +464,7 @@ func (b *backend) pathIssuerSignIntermediate(ctx context.Context, req *logical.R if err != nil { return nil, fmt.Errorf("unable to store certificate locally: %w", err) } - b.incrementTotalCertificatesCount(certsCounted, key) + b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key) if parsedBundle.Certificate.MaxPathLen == 0 { resp.AddWarning("Max path length of the signed certificate is zero. This certificate cannot be used to issue intermediate CA certificates.") diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index ae5f160ec1c4..e28aadc23cf7 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -29,6 +29,8 @@ type tidyConfig struct { SafetyBuffer time.Duration `json:"safety_buffer"` IssuerSafetyBuffer time.Duration `json:"issuer_safety_buffer"` PauseDuration time.Duration `json:"pause_duration"` + MaintainCount bool `json:"maintain_stored_certificate_counts"` + PublishMetrics bool `json:"publish_stored_certificate_count_metrics"` } var defaultTidyConfig = tidyConfig{ @@ -41,6 +43,8 @@ var defaultTidyConfig = tidyConfig{ SafetyBuffer: 72 * time.Hour, IssuerSafetyBuffer: 365 * 24 * time.Hour, PauseDuration: 0 * time.Second, + MaintainCount: true, + PublishMetrics: true, } func pathTidy(b *backend) *framework.Path { @@ -630,6 +634,7 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["cert_store_deleted_count"] = b.tidyStatus.certStoreDeletedCount resp.Data["revoked_cert_deleted_count"] = b.tidyStatus.revokedCertDeletedCount resp.Data["missing_issuer_cert_count"] = b.tidyStatus.missingIssuerCertCount + resp.Data["internal_backend_uuid"] = b.backendUUID switch b.tidyStatus.state { case tidyStatusStarted: @@ -651,12 +656,16 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["time_finished"] = b.tidyStatus.timeFinished } - resp.Data["current_cert_store_count"] = b.certCount - resp.Data["current_revoked_cert_count"] = b.revokedCertCount - - if !b.certsCounted.Load() { - resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + - "inaccurate") + if b.certCountEnabled.Load() { + resp.Data["current_cert_store_count"] = b.certCount + resp.Data["current_revoked_cert_count"] = b.revokedCertCount + if !b.certsCounted.Load() { + resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + + "inaccurate") + } + if b.certCountError != "" { + resp.Data["certificate_counting_error"] = b.certCountError + } } return resp, nil @@ -671,15 +680,17 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque return &logical.Response{ Data: map[string]interface{}{ - "enabled": config.Enabled, - "interval_duration": int(config.Interval / time.Second), - "tidy_cert_store": config.CertStore, - "tidy_revoked_certs": config.RevokedCerts, - "tidy_revoked_cert_issuer_associations": config.IssuerAssocs, - "tidy_expired_issuers": config.ExpiredIssuers, - "safety_buffer": int(config.SafetyBuffer / time.Second), - "issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second), - "pause_duration": config.PauseDuration.String(), + "enabled": config.Enabled, + "interval_duration": int(config.Interval / time.Second), + "tidy_cert_store": config.CertStore, + "tidy_revoked_certs": config.RevokedCerts, + "tidy_revoked_cert_issuer_associations": config.IssuerAssocs, + "tidy_expired_issuers": config.ExpiredIssuers, + "safety_buffer": int(config.SafetyBuffer / time.Second), + "issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second), + "pause_duration": config.PauseDuration.String(), + "publish_stored_certificate_count_metrics": config.PublishMetrics, + "maintain_stored_certificate_counts": config.MaintainCount, }, }, nil } @@ -736,6 +747,17 @@ func (b *backend) pathConfigAutoTidyWrite(ctx context.Context, req *logical.Requ return logical.ErrorResponse("Auto-tidy enabled but no tidy operations were requested. Enable at least one tidy operation to be run (tidy_cert_store / tidy_revoked_certs / tidy_revoked_cert_issuer_associations)."), nil } + if maintainCountEnabledRaw, ok := d.GetOk("maintain_stored_certificate_counts"); ok { + config.MaintainCount = maintainCountEnabledRaw.(bool) + } + + if runningStorageMetricsEnabledRaw, ok := d.GetOk("publish_stored_certificate_count_metrics"); ok { + if config.MaintainCount == false { + return logical.ErrorResponse("Can not publish a running storage metrics count to metrics without first maintaining that count. Enable `maintain_stored_certificate_counts` to enable `publish_stored_certificate_count_metrics."), nil + } + config.PublishMetrics = runningStorageMetricsEnabledRaw.(bool) + } + return nil, sc.writeAutoTidyConfig(config) } @@ -798,7 +820,7 @@ func (b *backend) tidyStatusIncCertStoreCount() { b.tidyStatus.certStoreDeletedCount++ - b.decrementTotalCertificatesCountReport() + b.ifCountEnabledDecrementTotalCertificatesCountReport() } func (b *backend) tidyStatusIncRevokedCertCount() { @@ -807,7 +829,7 @@ func (b *backend) tidyStatusIncRevokedCertCount() { b.tidyStatus.revokedCertDeletedCount++ - b.decrementTotalRevokedCertificatesCountReport() + b.ifCountEnabledDecrementTotalRevokedCertificatesCountReport() } func (b *backend) tidyStatusIncMissingIssuerCertCount() { diff --git a/builtin/logical/pki/storage.go b/builtin/logical/pki/storage.go index e55ea412150e..cd78ab68ed6e 100644 --- a/builtin/logical/pki/storage.go +++ b/builtin/logical/pki/storage.go @@ -8,6 +8,7 @@ import ( "fmt" "sort" "strings" + "sync/atomic" "time" "github.com/hashicorp/go-uuid" @@ -1235,7 +1236,32 @@ func (sc *storageContext) writeAutoTidyConfig(config *tidyConfig) error { return err } - return sc.Storage.Put(sc.Context, entry) + err = sc.Storage.Put(sc.Context, entry) + if err != nil { + return err + } + + sc.Backend.publishCertCountMetrics.Store(config.PublishMetrics) + + // To Potentially Disable Certificate Counting + if config.MaintainCount == false { + oldValue := sc.Backend.certCountEnabled.Swap(config.MaintainCount) + if oldValue != false { + if config.MaintainCount == false { + sc.Backend.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" + sc.Backend.possibleDoubleCountedSerials = nil // This won't stop a list operation, but will stop an expensive clean up during initialize + sc.Backend.possibleDoubleCountedRevokedSerials = nil // This won't stop a list operation, but will stop an expensive clean up during initialize + atomic.StoreUint32(sc.Backend.certCount, 0) + atomic.StoreUint32(sc.Backend.revokedCertCount, 0) + } + } + } else { // To Potentially Enable Certificate Counting + // We haven't written "re-enable certificate counts" outside the initialize function + // Any call derived call to do so is likely to time out on ~2 million certs + sc.Backend.certCountError = "Certificate Counting Has Not Been Initialized, re-initialize this mount" + } + + return nil } func (sc *storageContext) listRevokedCerts() ([]string, error) { From 50b48096d232bf6762179d236cd1ccc91c84288c Mon Sep 17 00:00:00 2001 From: kitography Date: Thu, 1 Dec 2022 09:59:52 -0500 Subject: [PATCH 02/20] Err Shadowing Fix (woah, semgrep is cool). --- builtin/logical/pki/backend.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index 6dcb1b76ba7d..31f191089d65 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -587,6 +587,9 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { defer b.tidyStatusLock.RUnlock() sc := b.makeStorageContext(ctx, b.storage) config, err := sc.getAutoTidyConfig() + if err != nil { + return err + } b.certCountEnabled.Store(config.MaintainCount) b.publishCertCountMetrics.Store(config.PublishMetrics) From 0f5fe89c03fb1dfabac9519ce4349aa2a0801f3d Mon Sep 17 00:00:00 2001 From: kitography Date: Thu, 1 Dec 2022 13:03:29 -0500 Subject: [PATCH 03/20] Fix TestBackend_RevokePlusTidy_Intermediate --- builtin/logical/pki/backend_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/builtin/logical/pki/backend_test.go b/builtin/logical/pki/backend_test.go index 386126a86f16..20fd77d63ee0 100644 --- a/builtin/logical/pki/backend_test.go +++ b/builtin/logical/pki/backend_test.go @@ -3952,6 +3952,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { "missing_issuer_cert_count": json.Number("0"), "current_cert_store_count": json.Number("0"), "current_revoked_cert_count": json.Number("0"), + "internal_backend_uuid": backendUUID, } // Let's copy the times from the response so that we can use deep.Equal() timeStarted, ok := tidyStatus.Data["time_started"] From 644291bb3fa1dc1852e15aac1331af3840fe43fd Mon Sep 17 00:00:00 2001 From: kitography Date: Thu, 1 Dec 2022 13:07:42 -0500 Subject: [PATCH 04/20] Add Changelog. --- changelog/18186.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changelog/18186.txt diff --git a/changelog/18186.txt b/changelog/18186.txt new file mode 100644 index 000000000000..135da494189d --- /dev/null +++ b/changelog/18186.txt @@ -0,0 +1,3 @@ +```release-note:bug +secrets/pki: Allow Turning Off Storage Metrics (to reduce time on initialize with 5000 backends) +``` \ No newline at end of file From e124d17c42fa473c270eaee40c5aa07f81d7df77 Mon Sep 17 00:00:00 2001 From: kitography Date: Mon, 5 Dec 2022 12:26:44 -0500 Subject: [PATCH 05/20] Fix memory leak. Code cleanup as suggested by Steve. --- builtin/logical/pki/storage.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/builtin/logical/pki/storage.go b/builtin/logical/pki/storage.go index cd78ab68ed6e..2ce9a1250499 100644 --- a/builtin/logical/pki/storage.go +++ b/builtin/logical/pki/storage.go @@ -1245,15 +1245,14 @@ func (sc *storageContext) writeAutoTidyConfig(config *tidyConfig) error { // To Potentially Disable Certificate Counting if config.MaintainCount == false { - oldValue := sc.Backend.certCountEnabled.Swap(config.MaintainCount) - if oldValue != false { - if config.MaintainCount == false { - sc.Backend.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" - sc.Backend.possibleDoubleCountedSerials = nil // This won't stop a list operation, but will stop an expensive clean up during initialize - sc.Backend.possibleDoubleCountedRevokedSerials = nil // This won't stop a list operation, but will stop an expensive clean up during initialize - atomic.StoreUint32(sc.Backend.certCount, 0) - atomic.StoreUint32(sc.Backend.revokedCertCount, 0) - } + certCountWasEnabled := sc.Backend.certCountEnabled.Swap(config.MaintainCount) + if certCountWasEnabled { + sc.Backend.certsCounted.Store(true) + sc.Backend.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" + sc.Backend.possibleDoubleCountedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize + sc.Backend.possibleDoubleCountedRevokedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize + atomic.StoreUint32(sc.Backend.certCount, 0) + atomic.StoreUint32(sc.Backend.revokedCertCount, 0) } } else { // To Potentially Enable Certificate Counting // We haven't written "re-enable certificate counts" outside the initialize function From 173143bae481300f2f170fe7af859004a279573e Mon Sep 17 00:00:00 2001 From: kitography Date: Mon, 5 Dec 2022 14:52:17 -0500 Subject: [PATCH 06/20] Turn off metrics by default, breaking-change. --- builtin/logical/pki/backend.go | 19 +++++++++++++++---- builtin/logical/pki/fields.go | 2 +- builtin/logical/pki/path_tidy.go | 4 ++-- changelog/18186.txt | 7 +++++-- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index 31f191089d65..19dfccf9cd64 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -212,10 +212,10 @@ func Backend(conf *logical.BackendConfig) *backend { b.lastTidy = time.Now() // Metrics initialization for count of certificates in storage - b.certCountEnabled = atomic2.NewBool(true) // Overwritten during initialize, don't break count until then - b.publishCertCountMetrics = atomic2.NewBool(true) + b.certCountEnabled = atomic2.NewBool(false) + b.publishCertCountMetrics = atomic2.NewBool(false) b.certsCounted = atomic2.NewBool(false) - b.certCountError = "" + b.certCountError = "Initialize Not Yet Run, Cert Counts Unavailable" b.certCount = new(uint32) b.revokedCertCount = new(uint32) b.possibleDoubleCountedSerials = make([]string, 0, 250) @@ -597,13 +597,22 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { if config.MaintainCount == false { b.possibleDoubleCountedRevokedSerials = nil b.possibleDoubleCountedSerials = nil - b.certsCounted.Store(false) + b.certsCounted.Store(true) atomic.StoreUint32(b.certCount, 0) atomic.StoreUint32(b.revokedCertCount, 0) b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" return nil } + // Ideally these three things would be set in one transaction, since that isn't possible, set the counts to "0", + // first, so count will over-count (and miss putting things in deduplicate queue), rather than under-count. + atomic.StoreUint32(b.certCount, 0) + atomic.StoreUint32(b.revokedCertCount, 0) + b.possibleDoubleCountedRevokedSerials = nil + b.possibleDoubleCountedSerials = nil + // A cert issued or revoked here will be double-counted. That's okay, this is "best effort" metrics. + b.certsCounted.Store(false) + entries, err := b.storage.List(ctx, "certs/") if err != nil { return err @@ -702,6 +711,8 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) } + b.certCountError = "" + return nil } diff --git a/builtin/logical/pki/fields.go b/builtin/logical/pki/fields.go index 3b6eb2df167f..224053ef35f5 100644 --- a/builtin/logical/pki/fields.go +++ b/builtin/logical/pki/fields.go @@ -496,7 +496,7 @@ greater period of time. By default this is zero seconds.`, Description: `This configures whether stored certificates are counted upon initialization of the backend, and whether during normal operation, a running count of certificates stored is maintained.`, - Default: true, + Default: false, } fields["publish_stored_certificate_count_metrics"] = &framework.FieldSchema{ diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index e28aadc23cf7..7c2a17114790 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -43,8 +43,8 @@ var defaultTidyConfig = tidyConfig{ SafetyBuffer: 72 * time.Hour, IssuerSafetyBuffer: 365 * 24 * time.Hour, PauseDuration: 0 * time.Second, - MaintainCount: true, - PublishMetrics: true, + MaintainCount: false, + PublishMetrics: false, } func pathTidy(b *backend) *framework.Path { diff --git a/changelog/18186.txt b/changelog/18186.txt index 135da494189d..13710826284e 100644 --- a/changelog/18186.txt +++ b/changelog/18186.txt @@ -1,3 +1,6 @@ -```release-note:bug -secrets/pki: Allow Turning Off Storage Metrics (to reduce time on initialize with 5000 backends) +```release-note:breaking-change +secrets/pki: Maintaining running count of certificates will be turned off by default. +To re-enable keeping these metrics available on the tidy status endpoint, enable +maintain_stored_certificate_counts on tidy-config, to also publish them to the +metrics consumer, enable publish_stored_certificate_count_metrics . ``` \ No newline at end of file From 50e02afee086dcefb152107e8bb8ba9de9b966c3 Mon Sep 17 00:00:00 2001 From: kitography Date: Tue, 13 Dec 2022 09:32:57 -0500 Subject: [PATCH 07/20] Show on tidy-status before start-up. --- builtin/logical/pki/path_tidy.go | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index 60bea1de4554..86c8d01795a2 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -614,9 +614,24 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f "missing_issuer_cert_count": nil, "current_cert_store_count": nil, "current_revoked_cert_count": nil, + "internal_backend_uuid": nil, }, } + resp.Data["internal_backend_uuid"] = b.backendUUID + + if b.certCountEnabled.Load() { + resp.Data["current_cert_store_count"] = b.certCount + resp.Data["current_revoked_cert_count"] = b.revokedCertCount + if !b.certsCounted.Load() { + resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + + "inaccurate") + } + if b.certCountError != "" { + resp.Data["certificate_counting_error"] = b.certCountError + } + } + if b.tidyStatus.state == tidyStatusInactive { return resp, nil } @@ -633,7 +648,6 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["cert_store_deleted_count"] = b.tidyStatus.certStoreDeletedCount resp.Data["revoked_cert_deleted_count"] = b.tidyStatus.revokedCertDeletedCount resp.Data["missing_issuer_cert_count"] = b.tidyStatus.missingIssuerCertCount - resp.Data["internal_backend_uuid"] = b.backendUUID switch b.tidyStatus.state { case tidyStatusStarted: @@ -655,18 +669,6 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["time_finished"] = b.tidyStatus.timeFinished } - if b.certCountEnabled.Load() { - resp.Data["current_cert_store_count"] = b.certCount - resp.Data["current_revoked_cert_count"] = b.revokedCertCount - if !b.certsCounted.Load() { - resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + - "inaccurate") - } - if b.certCountError != "" { - resp.Data["certificate_counting_error"] = b.certCountError - } - } - return resp, nil } From 38421ebea2e8a6fbc5c328b45a08210726e54f1b Mon Sep 17 00:00:00 2001 From: kitography Date: Tue, 13 Dec 2022 10:09:46 -0500 Subject: [PATCH 08/20] Fix tests --- builtin/logical/pki/backend_test.go | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/builtin/logical/pki/backend_test.go b/builtin/logical/pki/backend_test.go index 2ffc20143ef7..e3bdb32304eb 100644 --- a/builtin/logical/pki/backend_test.go +++ b/builtin/logical/pki/backend_test.go @@ -3792,6 +3792,15 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { t.Fatal(err) } + // Set up Metric Configuration, then restart to enable it + _, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{ + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": true, + }) + _, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{ + "mounts": "pki/", + }) + // Check the metrics initialized in order to calculate backendUUID for /pki // BackendUUID not consistent during tests with UUID from /sys/mounts/pki metricsSuffix := "total_certificates_stored" @@ -3828,6 +3837,14 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { if err != nil { t.Fatal(err) } + // Set up Metric Configuration, then restart to enable it + _, err = client.Logical().Write("pki2/config/auto-tidy", map[string]interface{}{ + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": true, + }) + _, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{ + "mounts": "pki2/", + }) // Create a CSR for the intermediate CA secret, err := client.Logical().Write("pki2/intermediate/generate/internal", nil) @@ -5571,6 +5588,14 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) { serials[i] = resp.Data["serial_number"].(string) } + // Turn on certificate counting: + CBWrite(b, s, "config/auto-tidy", map[string]interface{}{ + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": false, + }) + // Assert initialize from clean is correct: + b.initializeStoredCertificateCounts(ctx) + // Revoke certificates A + B revocations := serials[0:2] for _, key := range revocations { @@ -5582,8 +5607,6 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) { } } - // Assert initialize from clean is correct: - b.initializeStoredCertificateCounts(ctx) if atomic.LoadUint32(b.certCount) != 6 { t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", atomic.LoadUint32(b.certCount)) } From 617f8e3c62177c7c5a720695cabb6e1fdea7cf07 Mon Sep 17 00:00:00 2001 From: kitography Date: Tue, 13 Dec 2022 10:24:26 -0500 Subject: [PATCH 09/20] make fmt --- builtin/logical/pki/path_tidy.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index 86c8d01795a2..226a392ede4a 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -5,11 +5,12 @@ import ( "crypto/x509" "errors" "fmt" - "github.com/hashicorp/go-hclog" "net/http" "sync/atomic" "time" + "github.com/hashicorp/go-hclog" + "github.com/armon/go-metrics" "github.com/hashicorp/vault/sdk/framework" "github.com/hashicorp/vault/sdk/helper/consts" From 27e5ca80abcc54cd0e609248ecb7536ddc63ac23 Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 10:38:32 -0500 Subject: [PATCH 10/20] Add emit metrics to periodicFunc --- builtin/logical/pki/backend.go | 39 +++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index 69b1c956a78f..faf68e6a0de2 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -608,6 +608,13 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er crlErr := doCRL() tidyErr := doAutoTidy() + // Periodically re-emit gauges so that they don't disappear/go stale + tidyConfig, err := sc.getAutoTidyConfig() + if err != nil { + return err + } + b.emitCertStoreMetrics(tidyConfig) + var errors error if crlErr != nil { errors = multierror.Append(errors, fmt.Errorf("Error building CRLs:\n - %w\n", crlErr)) @@ -757,18 +764,22 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { b.possibleDoubleCountedRevokedSerials = nil b.possibleDoubleCountedSerials = nil - if config.PublishMetrics == true { - certCount := atomic.LoadUint32(b.certCount) - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) - revokedCertCount := atomic.LoadUint32(b.revokedCertCount) - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) - } + b.emitCertStoreMetrics(config) b.certCountError = "" return nil } +func (b *backend) emitCertStoreMetrics(config *tidyConfig) { + if config.PublishMetrics == true { + certCount := atomic.LoadUint32(b.certCount) + b.emitTotalCertCountMetric(certCount) + revokedCertCount := atomic.LoadUint32(b.revokedCertCount) + b.emitTotalRevokedCountMetric(revokedCertCount) + } +} + // The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call: // eg. certsCounted := b.certsCounted.Load() func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) { @@ -783,7 +794,7 @@ func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted boo b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial) default: if b.publishCertCountMetrics.Load() { - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) + b.emitTotalCertCountMetric(certCount) } } } @@ -793,11 +804,15 @@ func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() { if b.certCountEnabled.Load() { certCount := b.decrementTotalCertificatesCountNoReport() if b.publishCertCountMetrics.Load() { - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) + b.emitTotalCertCountMetric(certCount) } } } +func (b *backend) emitTotalCertCountMetric(certCount uint32) { + metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) +} + // Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet // Does not respect whether-we-are-counting backend information. func (b *backend) decrementTotalCertificatesCountNoReport() uint32 { @@ -819,7 +834,7 @@ func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCoun b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial) default: if b.publishCertCountMetrics.Load() { - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount)) + b.emitTotalRevokedCountMetric(newRevokedCertCount) } } } @@ -829,11 +844,15 @@ func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() { if b.certCountEnabled.Load() { revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport() if b.publishCertCountMetrics.Load() { - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) + b.emitTotalRevokedCountMetric(revokedCertCount) } } } +func (b *backend) emitTotalRevokedCountMetric(revokedCertCount uint32) { + metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) +} + // Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet // Does not respect whether-we-are-counting backend information. func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 { From f4473ba04f6c809ed472ce30dd3575a229ca8013 Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 13:17:59 -0500 Subject: [PATCH 11/20] Test not delivering unavailable metrics + fix. --- builtin/logical/pki/path_tidy.go | 12 +- builtin/logical/pki/path_tidy_test.go | 322 ++++++++++++++++++++++++++ 2 files changed, 324 insertions(+), 10 deletions(-) diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index f36f7fd6887f..492276a1dfd5 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -986,8 +986,8 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["internal_backend_uuid"] = b.backendUUID if b.certCountEnabled.Load() { - resp.Data["current_cert_store_count"] = b.certCount - resp.Data["current_revoked_cert_count"] = b.revokedCertCount + resp.Data["current_cert_store_count"] = atomic.LoadUint32(b.certCount) + resp.Data["current_revoked_cert_count"] = atomic.LoadUint32(b.revokedCertCount) if !b.certsCounted.Load() { resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + "inaccurate") @@ -1039,14 +1039,6 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["time_finished"] = b.tidyStatus.timeFinished } - resp.Data["current_cert_store_count"] = atomic.LoadUint32(b.certCount) - resp.Data["current_revoked_cert_count"] = atomic.LoadUint32(b.revokedCertCount) - - if !b.certsCounted.Load() { - resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + - "inaccurate") - } - return resp, nil } diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index 45d3d3a6a5e7..28e90533cf06 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -2,6 +2,7 @@ package pki import ( "encoding/json" + "github.com/armon/go-metrics" "testing" "time" @@ -405,3 +406,324 @@ func TestTidyIssuerConfig(t *testing.T) { require.Equal(t, true, resp.Data["tidy_expired_issuers"]) require.Equal(t, 5, resp.Data["issuer_safety_buffer"]) } + +// TestCertStorageMetrics ensures that when enabled, metrics are able to count the number of certificates in storage and +// number of revoked certificates in storage. Moreover, this test ensures that the gauge is emitted periodically, so +// that the metric does not disappear or go stale. +func TestCertStorageMetrics(t *testing.T) { + t.Parallel() + + // This tests uses the same setup as TestAutoTidy + newPeriod := 1 * time.Second + + // We set up a metrics accumulator + inmemSink := metrics.NewInmemSink( + newPeriod, // We use a short period here to test metrics are emitted every periodicFunc + 2000000*time.Hour) + + metricsConf := metrics.DefaultConfig("") + metricsConf.EnableHostname = false + metricsConf.EnableHostnameLabel = false + metricsConf.EnableServiceLabel = false + metricsConf.EnableTypePrefix = false + + _, err := metrics.NewGlobal(metricsConf, inmemSink) + if err != nil { + t.Fatal(err) + } + + // This test requires the periodicFunc to trigger, which requires we stand + // up a full test cluster. + coreConfig := &vault.CoreConfig{ + LogicalBackends: map[string]logical.Factory{ + "pki": Factory, + }, + // See notes below about usage of /sys/raw for reading cluster + // storage without barrier encryption. + EnableRaw: true, + RollbackPeriod: newPeriod, + } + cluster := vault.NewTestCluster(t, coreConfig, &vault.TestClusterOptions{ + HandlerFunc: vaulthttp.Handler, + }) + cluster.Start() + defer cluster.Cleanup() + client := cluster.Cores[0].Client + + // Mount PKI + err = client.Sys().Mount("pki", &api.MountInput{ + Type: "pki", + Config: api.MountConfigInput{ + DefaultLeaseTTL: "10m", + MaxLeaseTTL: "60m", + }, + }) + require.NoError(t, err) + + // Generate root. + resp, err := client.Logical().Write("pki/root/generate/internal", map[string]interface{}{ + "ttl": "40h", + "common_name": "Root X1", + "key_type": "ec", + }) + require.NoError(t, err) + require.NotNil(t, resp) + require.NotEmpty(t, resp.Data) + require.NotEmpty(t, resp.Data["issuer_id"]) + + // Set up a testing role. + _, err = client.Logical().Write("pki/roles/local-testing", map[string]interface{}{ + "allow_any_name": true, + "enforce_hostnames": false, + "key_type": "ec", + }) + require.NoError(t, err) + + // Run tidy so that tidy-status is not empty + _, err = client.Logical().Write("pki/tidy", map[string]interface{}{ + "tidy_revoked_certs": true, + }) + require.NoError(t, err) + + // Since certificate counts are off by default, we shouldn't see counts in the tidy status + tidyStatus, err := client.Logical().Read("pki/tidy-status") + if err != nil { + t.Fatal(err) + } + // backendUUID should exist, we need this for metrics + backendUUID := tidyStatus.Data["internal_backend_uuid"].(string) + // "current_cert_store_count", "current_revoked_cert_count" + countData, ok := tidyStatus.Data["current_cert_store_count"] + if ok && countData != nil { + t.Fatalf("Certificate counting should be off by default, but current cert store count %v appeared in tidy status in unconfigured mount", countData) + } + revokedCountData, ok := tidyStatus.Data["current_revoked_cert_count"] + if ok && revokedCountData != nil { + t.Fatalf("Certificate counting should be off by default, but revoked cert count %v appeared in tidy status in unconfigured mount", revokedCountData) + } + + // Since certificate counts are off by default, those metrics should not exist + mostRecentInterval := inmemSink.Data()[len(inmemSink.Data())-1] + _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if ok { + t.Fatalf("Certificate counting should be off by default, but revoked cert count was emitted as a metric in an unconfigured mount") + } + _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if ok { + t.Fatalf("Certificate counting should be off by default, but total certificate count was emitted as a metric in an unconfigured mount") + } + + // Write the auto-tidy config. + _, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{ + "enabled": true, + "interval_duration": "1s", + "tidy_cert_store": true, + "tidy_revoked_certs": true, + "safety_buffer": "1s", + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": false, + }) + require.NoError(t, err) + + // Reload the Mount - Otherwise Stored Certificate Counts Will Not Be Populated + _, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{ + "plugin": "pki", + }) + + // Since publish_stored_certificate_count_metrics is false, those metrics should not exist + mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] + _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if ok { + t.Fatalf("Certificate counting should be off by default, but revoked cert count was emitted as a metric in an unconfigured mount") + } + _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if ok { + t.Fatalf("Certificate counting should be off by default, but total certificate count was emitted as a metric in an unconfigured mount") + } + + // But since certificate counting is on, the metrics should exist on tidyStatus endpoint: + tidyStatus, err = client.Logical().Read("pki/tidy-status") + if err != nil { + t.Fatal(err) + } + // backendUUID should exist, we need this for metrics + backendUUID = tidyStatus.Data["internal_backend_uuid"].(string) + // "current_cert_store_count", "current_revoked_cert_count" + certStoreCount, ok := tidyStatus.Data["current_cert_store_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status") + } + if certStoreCount != json.Number("1") { + t.Fatalf("Only created one certificate, but a got a certificate count of %v", certStoreCount) + } + revokedCertCount, ok := tidyStatus.Data["current_revoked_cert_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status") + } + if revokedCertCount != json.Number("0") { + t.Fatalf("Have not yet revoked a certificate, but got a revoked cert store count of %v", revokedCertCount) + } + + // Write the auto-tidy config, again, this time turning on metrics + _, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{ + "enabled": true, + "interval_duration": "1s", + "tidy_cert_store": true, + "tidy_revoked_certs": true, + "safety_buffer": "1s", + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": true, + }) + require.NoError(t, err) + + // Issue a cert and revoke it. + resp, err = client.Logical().Write("pki/issue/local-testing", map[string]interface{}{ + "common_name": "example.com", + "ttl": "10s", + }) + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.NotEmpty(t, resp.Data["serial_number"]) + require.NotEmpty(t, resp.Data["certificate"]) + leafSerial := resp.Data["serial_number"].(string) + leafCert := parseCert(t, resp.Data["certificate"].(string)) + + // Read cert before revoking + resp, err = client.Logical().Read("pki/cert/" + leafSerial) + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.NotEmpty(t, resp.Data["certificate"]) + revocationTime, err := (resp.Data["revocation_time"].(json.Number)).Int64() + require.Equal(t, int64(0), revocationTime, "revocation time was not zero") + require.Empty(t, resp.Data["revocation_time_rfc3339"], "revocation_time_rfc3339 was not empty") + require.Empty(t, resp.Data["issuer_id"], "issuer_id was not empty") + + _, err = client.Logical().Write("pki/revoke", map[string]interface{}{ + "serial_number": leafSerial, + }) + require.NoError(t, err) + + // Check Metrics After Cert Has Be Created and Revoked + tidyStatus, err = client.Logical().Read("pki/tidy-status") + if err != nil { + t.Fatal(err) + } + backendUUID = tidyStatus.Data["internal_backend_uuid"].(string) + certStoreCount, ok = tidyStatus.Data["current_cert_store_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status") + } + if certStoreCount != json.Number("2") { + t.Fatalf("Created root and leaf certificate, but a got a certificate count of %v", certStoreCount) + } + revokedCertCount, ok = tidyStatus.Data["current_revoked_cert_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status") + } + if revokedCertCount != json.Number("1") { + t.Fatalf("Revoked one certificate, but got a revoked cert store count of %v", revokedCertCount) + } + // This should now be initialized + certCountError, ok := tidyStatus.Data["certificate_counting_error"] + if ok && certCountError.(string) != "" { + t.Fatalf("Expected certificate count error to disappear after initialization, but got error %v", certCountError) + } + + // Since publish_stored_certificate_count_metrics is false, those metrics should not exist + mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] + revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if !ok { + t.Fatalf("Turned on metrics, but revoked cert count was not emitted") + } + if revokedCertCountGaugeValue.Value != 1 { + t.Fatalf("Revoked one certificate, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue) + } + certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if !ok { + t.Fatalf("Turned on metrics, but total certificate count was not emitted") + } + if certStoreCountGaugeValue.Value != 2 { + t.Fatalf("Stored two certificiates, but total certificate count emitted was %v", certStoreCountGaugeValue.Value) + } + + // Wait for cert to expire and the safety buffer to elapse. + time.Sleep(time.Until(leafCert.NotAfter) + 3*time.Second) + + // Wait for auto-tidy to run afterwards. + var foundTidyRunning string + var foundTidyFinished bool + timeoutChan := time.After(120 * time.Second) + for { + if foundTidyRunning != "" && foundTidyFinished { + break + } + + select { + case <-timeoutChan: + t.Fatalf("expected auto-tidy to run (%v) and finish (%v) before 120 seconds elapsed", foundTidyRunning, foundTidyFinished) + default: + time.Sleep(250 * time.Millisecond) + + resp, err = client.Logical().Read("pki/tidy-status") + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.NotEmpty(t, resp.Data["state"]) + require.NotEmpty(t, resp.Data["time_started"]) + state := resp.Data["state"].(string) + started := resp.Data["time_started"].(string) + t.Logf("Resp: %v", resp.Data) + + // We want the _next_ tidy run after the cert expires. This + // means if we're currently finished when we hit this the + // first time, we want to wait for the next run. + if foundTidyRunning == "" { + foundTidyRunning = started + } else if foundTidyRunning != started && !foundTidyFinished && state == "Finished" { + foundTidyFinished = true + } + } + } + + // After Tidy, Cert Store Count Should Still Be Available, and Be Updated: + // Check Metrics After Cert Has Be Created and Revoked + tidyStatus, err = client.Logical().Read("pki/tidy-status") + if err != nil { + t.Fatal(err) + } + backendUUID = tidyStatus.Data["internal_backend_uuid"].(string) + // "current_cert_store_count", "current_revoked_cert_count" + certStoreCount, ok = tidyStatus.Data["current_cert_store_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status") + } + if certStoreCount != json.Number("1") { + t.Fatalf("Created root and leaf certificate, deleted leaf, but a got a certificate count of %v", certStoreCount) + } + revokedCertCount, ok = tidyStatus.Data["current_revoked_cert_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status") + } + if revokedCertCount != json.Number("0") { + t.Fatalf("Revoked certificate has been tidied, but got a revoked cert store count of %v", revokedCertCount) + } + + // Since publish_stored_certificate_count_metrics is false, those metrics should not exist + mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] + revokedCertCountGaugeValue, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if !ok { + t.Fatalf("Turned on metrics, but revoked cert count was not emitted") + } + if revokedCertCountGaugeValue.Value != 0 { + t.Fatalf("Revoked certificate has been tidied, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue) + } + certStoreCountGaugeValue, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if !ok { + t.Fatalf("Turned on metrics, but total certificate count was not emitted") + } + if certStoreCountGaugeValue.Value != 1 { + t.Fatalf("Only one of two certificates left after tidy, but total certificate count emitted was %v", certStoreCountGaugeValue.Value) + } +} From bff02458e5a6d34bcf2e9a6f49dba4b7207aa108 Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 13:18:39 -0500 Subject: [PATCH 12/20] Better error message. --- builtin/logical/pki/backend.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index faf68e6a0de2..2a031e28b0d5 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -371,7 +371,7 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque err = b.initializeStoredCertificateCounts(ctx) if err != nil { // Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates. - b.Logger().Error("Could not initialize stored certificate counts, run tidy to initialize", err) + b.Logger().Error("Could not initialize stored certificate counts", err) b.certCountError = err.Error() } return nil From eb646af1072fad06e96df5de67cebe90e8d7d9ee Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 13:26:40 -0500 Subject: [PATCH 13/20] Fixing the false-error bug. --- builtin/logical/pki/storage.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/builtin/logical/pki/storage.go b/builtin/logical/pki/storage.go index eb61b3ced69c..955381d95fcb 100644 --- a/builtin/logical/pki/storage.go +++ b/builtin/logical/pki/storage.go @@ -1371,9 +1371,11 @@ func (sc *storageContext) writeAutoTidyConfig(config *tidyConfig) error { atomic.StoreUint32(sc.Backend.revokedCertCount, 0) } } else { // To Potentially Enable Certificate Counting - // We haven't written "re-enable certificate counts" outside the initialize function - // Any call derived call to do so is likely to time out on ~2 million certs - sc.Backend.certCountError = "Certificate Counting Has Not Been Initialized, re-initialize this mount" + if sc.Backend.certCountEnabled.Load() == false { + // We haven't written "re-enable certificate counts" outside the initialize function + // Any call derived call to do so is likely to time out on ~2 million certs + sc.Backend.certCountError = "Certificate Counting Has Not Been Initialized, re-initialize this mount" + } } return nil From 945cf9330ce6aa701c1c5945443addd74d97f35d Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 13:31:03 -0500 Subject: [PATCH 14/20] make fmt. --- builtin/logical/pki/path_tidy_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index 28e90533cf06..0d05e80b3a5e 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -2,10 +2,11 @@ package pki import ( "encoding/json" - "github.com/armon/go-metrics" "testing" "time" + "github.com/armon/go-metrics" + "github.com/hashicorp/vault/api" vaulthttp "github.com/hashicorp/vault/http" "github.com/hashicorp/vault/sdk/logical" From 32fb8f16d7496cba751e1ab083f208458bde00a0 Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 14:38:14 -0500 Subject: [PATCH 15/20] Try to fix race issue, remove confusing comments. --- builtin/logical/pki/path_tidy_test.go | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index 0d05e80b3a5e..63722bb41298 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -419,7 +419,8 @@ func TestCertStorageMetrics(t *testing.T) { // We set up a metrics accumulator inmemSink := metrics.NewInmemSink( - newPeriod, // We use a short period here to test metrics are emitted every periodicFunc + 10000*time.Second, // A short time period would be ideal here to test metrics are emitted every periodic Func, + // but this has race-issues (if the metrics aren't emitted yet) 2000000*time.Hour) metricsConf := metrics.DefaultConfig("") @@ -503,7 +504,7 @@ func TestCertStorageMetrics(t *testing.T) { t.Fatalf("Certificate counting should be off by default, but revoked cert count %v appeared in tidy status in unconfigured mount", revokedCountData) } - // Since certificate counts are off by default, those metrics should not exist + // Since certificate counts are off by default, those metrics should not exist yet mostRecentInterval := inmemSink.Data()[len(inmemSink.Data())-1] _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] if ok { @@ -531,7 +532,13 @@ func TestCertStorageMetrics(t *testing.T) { "plugin": "pki", }) - // Since publish_stored_certificate_count_metrics is false, those metrics should not exist + // By reading the auto-tidy endpoint, we ensure that initialize has completed (which has a write lock on auto-tidy) + _, err = client.Logical().Read("/pki/config/auto-tidy") + if err != nil { + t.Fatal(err) + } + + // Since publish_stored_certificate_count_metrics is still false, these metrics should still not exist yet mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] if ok { @@ -606,6 +613,12 @@ func TestCertStorageMetrics(t *testing.T) { }) require.NoError(t, err) + // We read the auto-tidy endpoint again, to ensure any metrics logic has completed (lock on config) + _, err = client.Logical().Read("/pki/config/auto-tidy") + if err != nil { + t.Fatal(err) + } + // Check Metrics After Cert Has Be Created and Revoked tidyStatus, err = client.Logical().Read("pki/tidy-status") if err != nil { @@ -632,7 +645,6 @@ func TestCertStorageMetrics(t *testing.T) { t.Fatalf("Expected certificate count error to disappear after initialization, but got error %v", certCountError) } - // Since publish_stored_certificate_count_metrics is false, those metrics should not exist mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] if !ok { @@ -711,7 +723,6 @@ func TestCertStorageMetrics(t *testing.T) { t.Fatalf("Revoked certificate has been tidied, but got a revoked cert store count of %v", revokedCertCount) } - // Since publish_stored_certificate_count_metrics is false, those metrics should not exist mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] revokedCertCountGaugeValue, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] if !ok { From f2a546d50752d83e4d49a26ac76bde8fe192656b Mon Sep 17 00:00:00 2001 From: Steve Clark Date: Fri, 10 Feb 2023 14:48:49 -0500 Subject: [PATCH 16/20] Switch metric counter variables to an atomic.Uint32 - Switch the metric counter variables to an atomic variable type so that we are forced to properly load/store values to it --- builtin/logical/pki/backend.go | 36 ++++++++++++++--------------- builtin/logical/pki/backend_test.go | 21 ++++++++--------- builtin/logical/pki/path_tidy.go | 4 ++-- builtin/logical/pki/storage.go | 5 ++-- 4 files changed, 32 insertions(+), 34 deletions(-) diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index 2a031e28b0d5..1b093d03ce55 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -9,12 +9,12 @@ import ( "sync/atomic" "time" + atomic2 "go.uber.org/atomic" + "github.com/hashicorp/vault/helper/constants" "github.com/hashicorp/go-multierror" - atomic2 "go.uber.org/atomic" - "github.com/hashicorp/vault/sdk/helper/consts" "github.com/armon/go-metrics" @@ -248,8 +248,8 @@ func Backend(conf *logical.BackendConfig) *backend { b.publishCertCountMetrics = atomic2.NewBool(false) b.certsCounted = atomic2.NewBool(false) b.certCountError = "Initialize Not Yet Run, Cert Counts Unavailable" - b.certCount = new(uint32) - b.revokedCertCount = new(uint32) + b.certCount = &atomic.Uint32{} + b.revokedCertCount = &atomic.Uint32{} b.possibleDoubleCountedSerials = make([]string, 0, 250) b.possibleDoubleCountedRevokedSerials = make([]string, 0, 250) @@ -275,8 +275,8 @@ type backend struct { certCountEnabled *atomic2.Bool publishCertCountMetrics *atomic2.Bool - certCount *uint32 - revokedCertCount *uint32 + certCount *atomic.Uint32 + revokedCertCount *atomic.Uint32 certsCounted *atomic2.Bool certCountError string possibleDoubleCountedSerials []string @@ -658,16 +658,16 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { b.possibleDoubleCountedRevokedSerials = nil b.possibleDoubleCountedSerials = nil b.certsCounted.Store(true) - atomic.StoreUint32(b.certCount, 0) - atomic.StoreUint32(b.revokedCertCount, 0) + b.certCount.Store(0) + b.revokedCertCount.Store(0) b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" return nil } // Ideally these three things would be set in one transaction, since that isn't possible, set the counts to "0", // first, so count will over-count (and miss putting things in deduplicate queue), rather than under-count. - atomic.StoreUint32(b.certCount, 0) - atomic.StoreUint32(b.revokedCertCount, 0) + b.certCount.Store(0) + b.revokedCertCount.Store(0) b.possibleDoubleCountedRevokedSerials = nil b.possibleDoubleCountedSerials = nil // A cert issued or revoked here will be double-counted. That's okay, this is "best effort" metrics. @@ -677,13 +677,13 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { if err != nil { return err } - atomic.AddUint32(b.certCount, uint32(len(entries))) + b.certCount.Add(uint32(len(entries))) revokedEntries, err := b.storage.List(ctx, "revoked/") if err != nil { return err } - atomic.AddUint32(b.revokedCertCount, uint32(len(revokedEntries))) + b.revokedCertCount.Add(uint32(len(revokedEntries))) b.certsCounted.Store(true) // Now that the metrics are set, we can switch from appending newly-stored certificates to the possible double-count @@ -773,9 +773,9 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { func (b *backend) emitCertStoreMetrics(config *tidyConfig) { if config.PublishMetrics == true { - certCount := atomic.LoadUint32(b.certCount) + certCount := b.certCount.Load() b.emitTotalCertCountMetric(certCount) - revokedCertCount := atomic.LoadUint32(b.revokedCertCount) + revokedCertCount := b.revokedCertCount.Load() b.emitTotalRevokedCountMetric(revokedCertCount) } } @@ -784,7 +784,7 @@ func (b *backend) emitCertStoreMetrics(config *tidyConfig) { // eg. certsCounted := b.certsCounted.Load() func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) { if b.certCountEnabled.Load() { - certCount := atomic.AddUint32(b.certCount, 1) + certCount := b.certCount.Add(1) switch { case !certsCounted: // This is unsafe, but a good best-attempt @@ -816,7 +816,7 @@ func (b *backend) emitTotalCertCountMetric(certCount uint32) { // Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet // Does not respect whether-we-are-counting backend information. func (b *backend) decrementTotalCertificatesCountNoReport() uint32 { - newCount := atomic.AddUint32(b.certCount, ^uint32(0)) + newCount := b.certCount.Add(^uint32(0)) return newCount } @@ -824,7 +824,7 @@ func (b *backend) decrementTotalCertificatesCountNoReport() uint32 { // eg. certsCounted := b.certsCounted.Load() func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) { if b.certCountEnabled.Load() { - newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1) + newRevokedCertCount := b.revokedCertCount.Add(1) switch { case !certsCounted: // This is unsafe, but a good best-attempt @@ -856,6 +856,6 @@ func (b *backend) emitTotalRevokedCountMetric(revokedCertCount uint32) { // Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet // Does not respect whether-we-are-counting backend information. func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 { - newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0)) + newRevokedCertCount := b.revokedCertCount.Add(^uint32(0)) return newRevokedCertCount } diff --git a/builtin/logical/pki/backend_test.go b/builtin/logical/pki/backend_test.go index 28d26429904d..6e07ccfeeee1 100644 --- a/builtin/logical/pki/backend_test.go +++ b/builtin/logical/pki/backend_test.go @@ -27,7 +27,6 @@ import ( "strconv" "strings" "sync" - "sync/atomic" "testing" "time" @@ -5627,16 +5626,16 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) { } } - if atomic.LoadUint32(b.certCount) != 6 { - t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", atomic.LoadUint32(b.certCount)) + if b.certCount.Load() != 6 { + t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", b.certCount.Load()) } - if atomic.LoadUint32(b.revokedCertCount) != 2 { - t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", atomic.LoadUint32(b.revokedCertCount)) + if b.revokedCertCount.Load() != 2 { + t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", b.revokedCertCount.Load()) } // Simulates listing while initialize in progress, by "restarting it" - atomic.StoreUint32(b.certCount, 0) - atomic.StoreUint32(b.revokedCertCount, 0) + b.certCount.Store(0) + b.revokedCertCount.Store(0) b.certsCounted.Store(false) // Revoke certificates C, D @@ -5665,12 +5664,12 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) { b.initializeStoredCertificateCounts(ctx) // Test certificate count - if atomic.LoadUint32(b.certCount) != 8 { - t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", *(b.certCount)) + if b.certCount.Load() != 8 { + t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", b.certCount.Load()) } - if atomic.LoadUint32(b.revokedCertCount) != 4 { - t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", *(b.revokedCertCount)) + if b.revokedCertCount.Load() != 4 { + t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", b.revokedCertCount.Load()) } return diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index 492276a1dfd5..4c0519c00982 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -986,8 +986,8 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["internal_backend_uuid"] = b.backendUUID if b.certCountEnabled.Load() { - resp.Data["current_cert_store_count"] = atomic.LoadUint32(b.certCount) - resp.Data["current_revoked_cert_count"] = atomic.LoadUint32(b.revokedCertCount) + resp.Data["current_cert_store_count"] = b.certCount.Load() + resp.Data["current_revoked_cert_count"] = b.revokedCertCount.Load() if !b.certsCounted.Load() { resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + "inaccurate") diff --git a/builtin/logical/pki/storage.go b/builtin/logical/pki/storage.go index 955381d95fcb..aee1ce7f1623 100644 --- a/builtin/logical/pki/storage.go +++ b/builtin/logical/pki/storage.go @@ -8,7 +8,6 @@ import ( "fmt" "sort" "strings" - "sync/atomic" "time" "github.com/hashicorp/go-uuid" @@ -1367,8 +1366,8 @@ func (sc *storageContext) writeAutoTidyConfig(config *tidyConfig) error { sc.Backend.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" sc.Backend.possibleDoubleCountedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize sc.Backend.possibleDoubleCountedRevokedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize - atomic.StoreUint32(sc.Backend.certCount, 0) - atomic.StoreUint32(sc.Backend.revokedCertCount, 0) + sc.Backend.certCount.Store(0) + sc.Backend.revokedCertCount.Store(0) } } else { // To Potentially Enable Certificate Counting if sc.Backend.certCountEnabled.Load() == false { From 82ca96555a2fbcbafdbf001b61265178c1d2ab35 Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 15:10:42 -0500 Subject: [PATCH 17/20] Fix race-issue better by trying until the metric is sunk. --- builtin/logical/pki/path_tidy_test.go | 72 +++++++++++++++------------ 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index 63722bb41298..a06f87956489 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -2,6 +2,9 @@ package pki import ( "encoding/json" + "errors" + "fmt" + "github.com/hashicorp/vault/helper/testhelpers" "testing" "time" @@ -419,8 +422,7 @@ func TestCertStorageMetrics(t *testing.T) { // We set up a metrics accumulator inmemSink := metrics.NewInmemSink( - 10000*time.Second, // A short time period would be ideal here to test metrics are emitted every periodic Func, - // but this has race-issues (if the metrics aren't emitted yet) + newPeriod, // A short time period is ideal here to test metrics are emitted every periodic func 2000000*time.Hour) metricsConf := metrics.DefaultConfig("") @@ -645,21 +647,24 @@ func TestCertStorageMetrics(t *testing.T) { t.Fatalf("Expected certificate count error to disappear after initialization, but got error %v", certCountError) } - mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] - revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] - if !ok { - t.Fatalf("Turned on metrics, but revoked cert count was not emitted") - } - if revokedCertCountGaugeValue.Value != 1 { - t.Fatalf("Revoked one certificate, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue) - } - certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] - if !ok { - t.Fatalf("Turned on metrics, but total certificate count was not emitted") - } - if certStoreCountGaugeValue.Value != 2 { - t.Fatalf("Stored two certificiates, but total certificate count emitted was %v", certStoreCountGaugeValue.Value) - } + testhelpers.RetryUntil(t, newPeriod*5, func() error { + mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] + revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if !ok { + return errors.New("turned on metrics, but revoked cert count was not emitted") + } + if revokedCertCountGaugeValue.Value != 1 { + return fmt.Errorf("revoked one certificate, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue) + } + certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if !ok { + return errors.New("turned on metrics, but total certificate count was not emitted") + } + if certStoreCountGaugeValue.Value != 2 { + return fmt.Errorf("stored two certificiates, but total certificate count emitted was %v", certStoreCountGaugeValue.Value) + } + return nil + }) // Wait for cert to expire and the safety buffer to elapse. time.Sleep(time.Until(leafCert.NotAfter) + 3*time.Second) @@ -723,19 +728,22 @@ func TestCertStorageMetrics(t *testing.T) { t.Fatalf("Revoked certificate has been tidied, but got a revoked cert store count of %v", revokedCertCount) } - mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] - revokedCertCountGaugeValue, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] - if !ok { - t.Fatalf("Turned on metrics, but revoked cert count was not emitted") - } - if revokedCertCountGaugeValue.Value != 0 { - t.Fatalf("Revoked certificate has been tidied, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue) - } - certStoreCountGaugeValue, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] - if !ok { - t.Fatalf("Turned on metrics, but total certificate count was not emitted") - } - if certStoreCountGaugeValue.Value != 1 { - t.Fatalf("Only one of two certificates left after tidy, but total certificate count emitted was %v", certStoreCountGaugeValue.Value) - } + testhelpers.RetryUntil(t, newPeriod*5, func() error { + mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] + revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if !ok { + return errors.New("turned on metrics, but revoked cert count was not emitted") + } + if revokedCertCountGaugeValue.Value != 0 { + return fmt.Errorf("revoked certificate has been tidied, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue) + } + certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if !ok { + return errors.New("turned on metrics, but total certificate count was not emitted") + } + if certStoreCountGaugeValue.Value != 1 { + return fmt.Errorf("only one of two certificates left after tidy, but total certificate count emitted was %v", certStoreCountGaugeValue.Value) + } + return nil + }) } From 3cb67930049a1c1d5b79077657ad5f2839b1ca36 Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 15:11:34 -0500 Subject: [PATCH 18/20] make fmt. --- builtin/logical/pki/path_tidy_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index a06f87956489..401a2c00ffa6 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -4,10 +4,11 @@ import ( "encoding/json" "errors" "fmt" - "github.com/hashicorp/vault/helper/testhelpers" "testing" "time" + "github.com/hashicorp/vault/helper/testhelpers" + "github.com/armon/go-metrics" "github.com/hashicorp/vault/api" From ffc680ab94897b84abcaa050539b931143984055 Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 15:40:31 -0500 Subject: [PATCH 19/20] empty commit to retrigger non-race tests that all pass locally From eeeb8645c693e87f7837b16e2ae07e608bc3850a Mon Sep 17 00:00:00 2001 From: kitography Date: Fri, 10 Feb 2023 16:00:15 -0500 Subject: [PATCH 20/20] Extend metrics, remove parallel run because I have no idea what else could be causing flakiness of other tests. --- builtin/logical/pki/path_tidy_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index 401a2c00ffa6..29292fbd329b 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -416,14 +416,12 @@ func TestTidyIssuerConfig(t *testing.T) { // number of revoked certificates in storage. Moreover, this test ensures that the gauge is emitted periodically, so // that the metric does not disappear or go stale. func TestCertStorageMetrics(t *testing.T) { - t.Parallel() - // This tests uses the same setup as TestAutoTidy newPeriod := 1 * time.Second // We set up a metrics accumulator inmemSink := metrics.NewInmemSink( - newPeriod, // A short time period is ideal here to test metrics are emitted every periodic func + 2*newPeriod, // A short time period is ideal here to test metrics are emitted every periodic func 2000000*time.Hour) metricsConf := metrics.DefaultConfig("")