Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basics of Cert-Count Non-Locking Telemetry #16676

Merged
merged 26 commits into from
Sep 20, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
71587d3
Basics of Cert-Count Telemetry, including locks.
kitography Aug 10, 2022
9065949
Missing cert-writing counter locations.
kitography Aug 12, 2022
cb009a6
Add changelog.
kitography Aug 12, 2022
eb427f2
Remove issuance/cert-storage lock. Replace with "best attempt" slice…
kitography Aug 17, 2022
64a16eb
make fmt.
kitography Aug 17, 2022
8771730
Make the certsCounted non-lock an atomic boolean.
kitography Aug 22, 2022
78acc72
Move sorting of possibleDoubleCountedRevokedSerials to after compare …
kitography Aug 22, 2022
5355dd2
Add values to counter when still initializing.
kitography Aug 22, 2022
485044e
Remove true.
kitography Aug 22, 2022
3c10e8e
Merge branch 'main' into VAULT-7543-telemetry-stored-cert-count
kitography Aug 22, 2022
0e09502
Merge branch 'main' into VAULT-7543-telemetry-stored-cert-count
kitography Aug 23, 2022
635bce9
Merge branch 'main' into VAULT-7543-telemetry-stored-cert-count
kitography Aug 24, 2022
c86daae
make fmt.
kitography Aug 24, 2022
b269ab2
Fix atomic2 import (removed on main).
kitography Aug 24, 2022
8d6fa59
make fmt
kitography Sep 1, 2022
09b5130
Merge branch 'main' into VAULT-7543-telemetry-stored-cert-count
kitography Sep 1, 2022
f74928c
Delay reporting metrics until after deduplication has completed.
kitography Sep 1, 2022
15acb79
Fix lock, allow tidy to report status (and report our status).
kitography Sep 1, 2022
1fdf9ce
make fmt
kitography Sep 1, 2022
f8ad83e
The test works now.
kitography Sep 3, 2022
eb441f5
Move string slice to helper function; make fmt.
kitography Sep 8, 2022
29e01e6
Add the extra pki-tidy field expectations (test-fix, oops)
kitography Sep 8, 2022
8bb37d3
Add backendUUID to gauge name.
kitography Sep 9, 2022
8912bc3
Fix test - backendUUID drawn from earlier in the test.
kitography Sep 19, 2022
0cfc3a4
Merge branch 'main' into VAULT-7543-telemetry-stored-cert-count
kitography Sep 20, 2022
ccff788
make fmt
kitography Sep 20, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 160 additions & 0 deletions builtin/logical/pki/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package pki
import (
"context"
"fmt"
"sort"
"strings"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -177,6 +178,13 @@ func Backend(conf *logical.BackendConfig) *backend {
b.pkiStorageVersion.Store(0)

b.crlBuilder = &crlBuilder{}

b.certsCounted = false
b.certCount = new(uint32)
b.revokedCertCount = new(uint32)
b.possibleDoubleCountedSerials = make([]string, 0, 250)
b.possibleDoubleCountedRevokedSerials = make([]string, 0, 250)

return &b
}

Expand All @@ -192,6 +200,12 @@ type backend struct {
tidyStatusLock sync.RWMutex
tidyStatus *tidyStatus

certCount *uint32
revokedCertCount *uint32
certsCounted bool
kitography marked this conversation as resolved.
Show resolved Hide resolved
possibleDoubleCountedSerials []string
possibleDoubleCountedRevokedSerials []string

pkiStorageVersion atomic.Value
crlBuilder *crlBuilder

Expand Down Expand Up @@ -293,6 +307,21 @@ func (b *backend) metricsWrap(callType string, roleMode int, ofunc roleOperation

// initialize is used to perform a possible PKI storage migration if needed
func (b *backend) initialize(ctx context.Context, _ *logical.InitializationRequest) error {
err := b.initializePKIIssuersStorage(ctx)
if err != nil {
return err
}

// Initialize also needs to populate our certificate and revoked certificate count
err = b.initializeStoredCertificateCounts(ctx)
if err != nil {
return err
}

return nil
}

func (b *backend) initializePKIIssuersStorage(ctx context.Context) error {
// Grab the lock prior to the updating of the storage lock preventing us flipping
// the storage flag midway through the request stream of other requests.
b.issuersLock.Lock()
Expand Down Expand Up @@ -377,3 +406,134 @@ func (b *backend) invalidate(ctx context.Context, key string) {
func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error {
return b.crlBuilder.rebuildIfForced(ctx, b, request)
}

func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.tidyStatusLock.Lock()
defer b.tidyStatusLock.Unlock()
// For performance reasons, we can't lock on issuance/storage of certs until a list operation completes,
// but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which
// prevents (most) deletions

entries, err := b.storage.List(ctx, "certs/")
if err != nil {
return err
}
atomic.StoreUint32(b.certCount, uint32(len(entries)))
metrics.SetGauge([]string{"secrets", "pki", "total_certificates_stored"}, float32(*b.certCount))
kitography marked this conversation as resolved.
Show resolved Hide resolved
revokedEntries, err := b.storage.List(ctx, "revoked/")
if err != nil {
return err
}
atomic.StoreUint32(b.revokedCertCount, uint32(len(revokedEntries)))
metrics.SetGauge([]string{"secrets", "pki", "total_revoked_certificates_stored"}, float32(*b.revokedCertCount))

b.certsCounted = true
// Now that the metrics are set, we can switch from appending newly-stored certificates to the possible double-count
// list, and instead have them update the counter directly. We need to do this so that we are looking at a static
// slice of possibly double counted serials. Note that certsCounted is computed before the storage operation, so
// there may be some delay here.

// Sort the listed-entries first, to accommodate that delay.
sort.Slice(entries, func(i, j int) bool {
return entries[i] < entries[j]
})

sort.Slice(revokedEntries, func(i, j int) bool {
return revokedEntries[i] < revokedEntries[j]
})

// We assume here that these lists are now complete.
sort.Slice(b.possibleDoubleCountedSerials, func(i, j int) bool {
return b.possibleDoubleCountedSerials[i] < b.possibleDoubleCountedSerials[j]
})

sort.Slice(b.possibleDoubleCountedRevokedSerials, func(i, j int) bool {
return b.possibleDoubleCountedRevokedSerials[i] < b.possibleDoubleCountedRevokedSerials[j]
})

listEntriesIndex := 0
possibleDoubleCountIndex := 0
for true {
kitography marked this conversation as resolved.
Show resolved Hide resolved
if listEntriesIndex >= len(entries) {
break
}
if possibleDoubleCountIndex >= len(b.possibleDoubleCountedSerials) {
break
}
if entries[listEntriesIndex] == b.possibleDoubleCountedSerials[possibleDoubleCountIndex] {
kitography marked this conversation as resolved.
Show resolved Hide resolved
// This represents a double-counted entry
b.decrementTotalCertificatesCount()
listEntriesIndex = listEntriesIndex + 1
possibleDoubleCountIndex = possibleDoubleCountIndex + 1
continue
}
if entries[listEntriesIndex] < b.possibleDoubleCountedSerials[possibleDoubleCountIndex] {
listEntriesIndex = listEntriesIndex + 1
continue
}
if entries[listEntriesIndex] > b.possibleDoubleCountedSerials[possibleDoubleCountIndex] {
possibleDoubleCountIndex = possibleDoubleCountIndex + 1
continue
}
}

listRevokedEntriesIndex := 0
possibleRevokedDoubleCountIndex := 0
for true {
if listRevokedEntriesIndex >= len(revokedEntries) {
break
}
if possibleRevokedDoubleCountIndex >= len(b.possibleDoubleCountedRevokedSerials) {
break
}
if revokedEntries[listRevokedEntriesIndex] == b.possibleDoubleCountedRevokedSerials[possibleRevokedDoubleCountIndex] {
// This represents a double-counted revoked entry
b.decrementTotalRevokedCertificatesCount()
listRevokedEntriesIndex = listRevokedEntriesIndex + 1
possibleRevokedDoubleCountIndex = possibleRevokedDoubleCountIndex + 1
continue
}
if revokedEntries[listRevokedEntriesIndex] < b.possibleDoubleCountedRevokedSerials[possibleRevokedDoubleCountIndex] {
listRevokedEntriesIndex = listRevokedEntriesIndex + 1
continue
}
if revokedEntries[listRevokedEntriesIndex] > b.possibleDoubleCountedRevokedSerials[possibleRevokedDoubleCountIndex] {
possibleRevokedDoubleCountIndex = possibleRevokedDoubleCountIndex + 1
continue
}
}

return nil
kitography marked this conversation as resolved.
Show resolved Hide resolved
}

func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) {
switch {
case certsCounted:
atomic.AddUint32(b.certCount, 1)
metrics.SetGauge([]string{"secrets", "pki", "total_certificates_stored"}, float32(*b.certCount))
default:
// This is unsafe, but a good best-attempt
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
}
}

func (b *backend) decrementTotalCertificatesCount() {
atomic.AddUint32(b.certCount, ^uint32(0))
metrics.SetGauge([]string{"secrets", "pki", "total_certificates_stored"}, float32(*b.certCount))
}

func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
kitography marked this conversation as resolved.
Show resolved Hide resolved
switch {
case certsCounted:
atomic.AddUint32(b.revokedCertCount, 1)
metrics.SetGauge([]string{"secrets", "pki", "total_revoked_certificates_stored"}, float32(*b.revokedCertCount))
default:
// This is unsafe, but a good best-attempt
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
}
}

func (b *backend) decrementTotalRevokedCertificatesCount() {
atomic.AddUint32(b.revokedCertCount, ^uint32(0))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure this can't happen in the current code/usage I'm seeing with the two decrements, but should we add a guard here for b.revokedCertCount being 0 and we under-flowing the unit32? The only safe way I can think of this is this ugly code block

testInt := uint32(1)

for i := 0; i <= 5; i++ {
	oldVal := atomic.LoadUint32(&testInt)
	if oldVal == 0 {
		break
	}
	newVal := oldVal - 1
	if atomic.CompareAndSwapUint32(&testInt, oldVal, newVal) {
		break
	}
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I admit I don't like this. If someone ever saw an underflow it would be solvable by "turning off and turning on" and would be a really really helpful report that we aren't honouring the promise of "always overcount".

If someone else agrees with this solution, I'll go ahead and add it, but in general I don't think we can protect against people writing bad code in the future with uglier code now.

metrics.SetGauge([]string{"secrets", "pki", "total_revoked_certificates_stored"}, float32(*b.revokedCertCount))
}
40 changes: 29 additions & 11 deletions builtin/logical/pki/backend_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3456,7 +3456,7 @@ func TestBackend_AllowedDomainsTemplate(t *testing.T) {
t.Fatal(err)
}

// Issue certificate for foobar.com to verify allowed_domain_templae doesnt break plain domains.
// Issue certificate for foobar.com to verify allowed_domain_template doesn't break plain domains.
_, err = client.Logical().Write("pki/issue/test", map[string]interface{}{"common_name": "foobar.com"})
if err != nil {
t.Fatal(err)
Expand Down Expand Up @@ -3731,7 +3731,10 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
metricsConf.EnableServiceLabel = false
metricsConf.EnableTypePrefix = false

metrics.NewGlobal(metricsConf, inmemSink)
_, err := metrics.NewGlobal(metricsConf, inmemSink)
if err != nil {
t.Fatal(err)
}

// Enable PKI secret engine
coreConfig := &vault.CoreConfig{
Expand All @@ -3748,8 +3751,6 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
vault.TestWaitActive(t, cores[0].Core)
client := cores[0].Client

var err error

// Mount /pki as a root CA
err = client.Sys().Mount("pki", &api.MountInput{
Type: "pki",
Expand Down Expand Up @@ -3819,6 +3820,21 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
t.Fatal(err)
}

// Check the cert-count metrics
expectedCertCountGaugeMetrics := map[string]float32{
"secrets.pki.total_revoked_certificates_stored": 1,
"secrets.pki.total_certificates_stored": 1,
}
mostRecentInterval := inmemSink.Data()[len(inmemSink.Data())-1]
for gauge, value := range expectedCertCountGaugeMetrics {
if _, ok := mostRecentInterval.Gauges[gauge]; !ok {
t.Fatalf("Expected metrics to include a value for gauge %s", gauge)
}
if value != mostRecentInterval.Gauges[gauge].Value {
t.Fatalf("Expected value metric %s to be %f but got %f", gauge, value, mostRecentInterval.Gauges[gauge].Value)
}
}

// Revoke adds a fixed 2s buffer, so we sleep for a bit longer to ensure
// the revocation time is past the current time.
time.Sleep(3 * time.Second)
Expand Down Expand Up @@ -3902,13 +3918,15 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
}
// Check the tidy metrics
{
// Map of gagues to expected value
// Map of gauges to expected value
expectedGauges := map[string]float32{
"secrets.pki.tidy.cert_store_current_entry": 0,
"secrets.pki.tidy.cert_store_total_entries": 1,
"secrets.pki.tidy.revoked_cert_current_entry": 0,
"secrets.pki.tidy.revoked_cert_total_entries": 1,
"secrets.pki.tidy.start_time_epoch": 0,
"secrets.pki.tidy.cert_store_current_entry": 0,
"secrets.pki.tidy.cert_store_total_entries": 1,
"secrets.pki.tidy.revoked_cert_current_entry": 0,
"secrets.pki.tidy.revoked_cert_total_entries": 1,
"secrets.pki.tidy.start_time_epoch": 0,
"secrets.pki.total_certificates_stored": 0,
"secrets.pki.total_revoked_certificates_stored": 0,
}
// Map of counters to the sum of the metrics for that counter
expectedCounters := map[string]float64{
Expand All @@ -3918,7 +3936,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
// Note that "secrets.pki.tidy.failure" won't be in the captured metrics
}

// If the metrics span mnore than one interval, skip the checks
// If the metrics span more than one interval, skip the checks
intervals := inmemSink.Data()
if len(intervals) == 1 {
interval := inmemSink.Data()[0]
Expand Down
8 changes: 8 additions & 0 deletions builtin/logical/pki/cert_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,18 @@ func fetchCertBySerial(ctx context.Context, b *backend, req *logical.Request, pr

// Update old-style paths to new-style paths
certEntry.Key = path
certsCounted := b.certsCounted
if err = req.Storage.Put(ctx, certEntry); err != nil {
return nil, errutil.InternalError{Err: fmt.Sprintf("error saving certificate with serial %s to new location", serial)}
}
if err = req.Storage.Delete(ctx, legacyPath); err != nil {
// If we fail here, we have an extra (copy) of a cert in storage, add to metrics:
switch {
case strings.HasPrefix(prefix, "revoked/"):
b.incrementTotalRevokedCertificatesCount(certsCounted, path)
kitography marked this conversation as resolved.
Show resolved Hide resolved
default:
b.incrementTotalCertificatesCount(certsCounted, path)
}
return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)}
}

Expand Down
2 changes: 2 additions & 0 deletions builtin/logical/pki/crl_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,12 @@ func revokeCert(ctx context.Context, b *backend, req *logical.Request, serial st
return nil, fmt.Errorf("error creating revocation entry")
}

certsCounted := b.certsCounted
err = req.Storage.Put(ctx, revEntry)
if err != nil {
return nil, fmt.Errorf("error saving revoked certificate to new location")
}
b.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
}

crlErr := b.crlBuilder.rebuild(ctx, b, req, false)
Expand Down
5 changes: 4 additions & 1 deletion builtin/logical/pki/path_issue_sign.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,13 +408,16 @@ func (b *backend) pathIssueSignCert(ctx context.Context, req *logical.Request, d
}

if !role.NoStore {
key := "certs/" + normalizeSerial(cb.SerialNumber)
certsCounted := b.certsCounted
err = req.Storage.Put(ctx, &logical.StorageEntry{
Key: "certs/" + normalizeSerial(cb.SerialNumber),
Key: key,
Value: parsedBundle.CertificateBytes,
})
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
}

if useCSR {
Expand Down
10 changes: 8 additions & 2 deletions builtin/logical/pki/path_root.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,13 +257,16 @@ func (b *backend) pathCAGenerateRoot(ctx context.Context, req *logical.Request,

// Also store it as just the certificate identified by serial number, so it
// can be revoked
key := "certs/" + normalizeSerial(cb.SerialNumber)
certsCounted := b.certsCounted
err = req.Storage.Put(ctx, &logical.StorageEntry{
Key: "certs/" + normalizeSerial(cb.SerialNumber),
Key: key,
Value: parsedBundle.CertificateBytes,
})
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)

// Build a fresh CRL
err = b.crlBuilder.rebuild(ctx, b, req, true)
Expand Down Expand Up @@ -439,13 +442,16 @@ func (b *backend) pathIssuerSignIntermediate(ctx context.Context, req *logical.R
return nil, fmt.Errorf("unsupported format argument: %s", format)
}

key := "certs/" + normalizeSerial(cb.SerialNumber)
certsCounted := b.certsCounted
err = req.Storage.Put(ctx, &logical.StorageEntry{
Key: "certs/" + normalizeSerial(cb.SerialNumber),
Key: key,
Value: parsedBundle.CertificateBytes,
})
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)

if parsedBundle.Certificate.MaxPathLen == 0 {
resp.AddWarning("Max path length of the signed certificate is zero. This certificate cannot be used to issue intermediate CA certificates.")
Expand Down
6 changes: 5 additions & 1 deletion builtin/logical/pki/path_tidy.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
resp.Data["time_finished"] = b.tidyStatus.timeFinished
resp.Data["error"] = b.tidyStatus.err.Error()
// Don't clear the message so that it serves as a hint about when
// the error ocurred.
// the error occurred.
}

return resp, nil
Expand Down Expand Up @@ -358,13 +358,17 @@ func (b *backend) tidyStatusIncCertStoreCount() {
defer b.tidyStatusLock.Unlock()

b.tidyStatus.certStoreDeletedCount++

b.decrementTotalCertificatesCount()
}

func (b *backend) tidyStatusIncRevokedCertCount() {
b.tidyStatusLock.Lock()
defer b.tidyStatusLock.Unlock()

b.tidyStatus.revokedCertDeletedCount++

b.decrementTotalRevokedCertificatesCount()
}

const pathTidyHelpSyn = `
Expand Down
3 changes: 3 additions & 0 deletions changelog/16676.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
secrets/pki: Added gauge metrics "secrets.pki.total_revoked_certificates_stored" and "secrets.pki.total_certificates_stored" to track the number of certificates in storage.
```