Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to cancel PKI tidy operations, pause between tidying certs #16958

Merged
merged 8 commits into from
Aug 31, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions builtin/logical/pki/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ func Backend(conf *logical.BackendConfig) *backend {
pathRevoke(&b),
pathRevokeWithKey(&b),
pathTidy(&b),
pathTidyCancel(&b),
pathTidyStatus(&b),
pathConfigAutoTidy(&b),

Expand Down Expand Up @@ -184,6 +185,7 @@ func Backend(conf *logical.BackendConfig) *backend {
}

b.tidyCASGuard = new(uint32)
b.tidyCancelCAS = new(uint32)
b.tidyStatus = &tidyStatus{state: tidyStatusInactive}
b.storage = conf.StorageView
b.backendUUID = conf.BackendUUID
Expand All @@ -205,6 +207,7 @@ type backend struct {
storage logical.Storage
revokeStorageLock sync.RWMutex
tidyCASGuard *uint32
tidyCancelCAS *uint32

tidyStatusLock sync.RWMutex
tidyStatus *tidyStatus
Expand All @@ -223,10 +226,12 @@ type (
)

const (
tidyStatusInactive tidyStatusState = iota
tidyStatusStarted
tidyStatusFinished
tidyStatusError
tidyStatusInactive tidyStatusState = iota
tidyStatusStarted = iota
tidyStatusFinished = iota
tidyStatusError = iota
tidyStatusCancelling = iota
tidyStatusCancelled = iota
)

type tidyStatus struct {
Expand All @@ -235,6 +240,7 @@ type tidyStatus struct {
tidyCertStore bool
tidyRevokedCerts bool
tidyRevokedAssocs bool
pauseDuration string

// Status
state tidyStatusState
Expand Down
1 change: 1 addition & 0 deletions builtin/logical/pki/backend_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3877,6 +3877,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
"tidy_cert_store": true,
"tidy_revoked_certs": true,
"tidy_revoked_cert_issuer_associations": false,
"pause_duration": "0s",
"state": "Finished",
"error": nil,
"time_started": nil,
Expand Down
12 changes: 12 additions & 0 deletions builtin/logical/pki/fields.go
Original file line number Diff line number Diff line change
Expand Up @@ -463,5 +463,17 @@ Defaults to 72 hours.`,
Default: int(defaultTidyConfig.SafetyBuffer / time.Second), // TypeDurationSecond currently requires defaults to be int
}

fields["pause_duration"] = &framework.FieldSchema{
Type: framework.TypeString,
Description: `The amount of time to wait between processing
certificates. This allows operators to change the execution profile
of tidy to take consume less resources by slowing down how long it
takes to run. Note that the entire list of certificates will be
stored in memory during the entire tidy operation, but resources to
read/process/update existing entries will be spread out over a
greater period of time. By default this is zero seconds.`,
Default: "0s",
}

return fields
}
192 changes: 159 additions & 33 deletions builtin/logical/pki/path_tidy.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package pki
import (
"context"
"crypto/x509"
"errors"
"fmt"
"net/http"
"sync/atomic"
Expand All @@ -16,22 +17,26 @@ import (
"github.com/hashicorp/vault/sdk/logical"
)

var tidyCancelledError = errors.New("tidy operation cancelled")

type tidyConfig struct {
Enabled bool `json:"enabled"`
Interval time.Duration `json:"interval_duration"`
CertStore bool `json:"tidy_cert_store"`
RevokedCerts bool `json:"tidy_revoked_certs"`
IssuerAssocs bool `json:"tidy_revoked_cert_issuer_associations"`
SafetyBuffer time.Duration `json:"safety_buffer"`
Enabled bool `json:"enabled"`
Interval time.Duration `json:"interval_duration"`
CertStore bool `json:"tidy_cert_store"`
RevokedCerts bool `json:"tidy_revoked_certs"`
IssuerAssocs bool `json:"tidy_revoked_cert_issuer_associations"`
SafetyBuffer time.Duration `json:"safety_buffer"`
PauseDuration time.Duration `json:"pause_duration"`
}

var defaultTidyConfig = tidyConfig{
Enabled: false,
Interval: 12 * time.Hour,
CertStore: false,
RevokedCerts: false,
IssuerAssocs: false,
SafetyBuffer: 72 * time.Hour,
Enabled: false,
Interval: 12 * time.Hour,
CertStore: false,
RevokedCerts: false,
IssuerAssocs: false,
SafetyBuffer: 72 * time.Hour,
PauseDuration: 0 * time.Second,
}

func pathTidy(b *backend) *framework.Path {
Expand All @@ -49,6 +54,20 @@ func pathTidy(b *backend) *framework.Path {
}
}

func pathTidyCancel(b *backend) *framework.Path {
return &framework.Path{
Pattern: "tidy-cancel$",
Operations: map[logical.Operation]framework.OperationHandler{
logical.UpdateOperation: &framework.PathOperation{
Callback: b.pathTidyCancelWrite,
ForwardPerformanceStandby: true,
},
},
HelpSynopsis: pathTidyCancelHelpSyn,
HelpDescription: pathTidyCancelHelpDesc,
}
}

func pathTidyStatus(b *backend) *framework.Path {
return &framework.Path{
Pattern: "tidy-status$",
Expand Down Expand Up @@ -98,21 +117,32 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
tidyCertStore := d.Get("tidy_cert_store").(bool)
tidyRevokedCerts := d.Get("tidy_revoked_certs").(bool) || d.Get("tidy_revocation_list").(bool)
tidyRevokedAssocs := d.Get("tidy_revoked_cert_issuer_associations").(bool)
pauseDurationStr := d.Get("pause_duration").(string)
pauseDuration := 0 * time.Second

if safetyBuffer < 1 {
return logical.ErrorResponse("safety_buffer must be greater than zero"), nil
}

if pauseDurationStr != "" {
var err error
pauseDuration, err = time.ParseDuration(pauseDurationStr)
if err != nil {
return logical.ErrorResponse(fmt.Sprintf("Error parsing pause_duration: %v", err)), nil
}
}

bufferDuration := time.Duration(safetyBuffer) * time.Second

// Manual run with constructed configuration.
config := &tidyConfig{
Enabled: true,
Interval: 0 * time.Second,
CertStore: tidyCertStore,
RevokedCerts: tidyRevokedCerts,
IssuerAssocs: tidyRevokedAssocs,
SafetyBuffer: bufferDuration,
Enabled: true,
Interval: 0 * time.Second,
CertStore: tidyCertStore,
RevokedCerts: tidyRevokedCerts,
IssuerAssocs: tidyRevokedAssocs,
SafetyBuffer: bufferDuration,
PauseDuration: pauseDuration,
}

if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
Expand Down Expand Up @@ -164,6 +194,11 @@ func (b *backend) startTidyOperation(req *logical.Request, config *tidyConfig) {
}
}

// Check for cancel before continuing.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also schedule if case a cancel comes in after the doTidy func but before we release tidyCASGuard? -> defer atomic.StoreUint32(b.tidyCancelCAS, 0)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If anything, I think we just need a atomic.CompareAndSwapUint32(b.tidyCancelCAS, 1, 0) at the top of this function before getting the tidyCASGuard. otherwise, I don't think it really matters, IMO?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure that would work as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I think that's correct because it ensures that if a cancel somehow came in after we finished, we'd want any new ones to start fresh).

if atomic.CompareAndSwapUint32(b.tidyCancelCAS, 1, 0) {
return tidyCancelledError
}

if config.RevokedCerts || config.IssuerAssocs {
if err := b.doTidyRevocationStore(ctx, req, logger, config); err != nil {
return nil
cipherboy marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -201,6 +236,16 @@ func (b *backend) doTidyCertStore(ctx context.Context, req *logical.Request, log
b.tidyStatusMessage(fmt.Sprintf("Tidying certificate store: checking entry %d of %d", i, serialCount))
metrics.SetGauge([]string{"secrets", "pki", "tidy", "cert_store_current_entry"}, float32(i))

// Check for cancel before continuing.
if atomic.CompareAndSwapUint32(b.tidyCancelCAS, 1, 0) {
return tidyCancelledError
}

// Check for pause duration to reduce resource consumption.
if config.PauseDuration > (0 * time.Second) {
time.Sleep(config.PauseDuration)
}

certEntry, err := req.Storage.Get(ctx, "certs/"+serial)
if err != nil {
return fmt.Errorf("error fetching certificate %q: %w", serial, err)
Expand Down Expand Up @@ -272,6 +317,18 @@ func (b *backend) doTidyRevocationStore(ctx context.Context, req *logical.Reques
b.tidyStatusMessage(fmt.Sprintf("Tidying revoked certificates: checking certificate %d of %d", i, len(revokedSerials)))
metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_current_entry"}, float32(i))

// Check for cancel before continuing.
if atomic.CompareAndSwapUint32(b.tidyCancelCAS, 1, 0) {
return tidyCancelledError
}

// Check for pause duration to reduce resource consumption.
if config.PauseDuration > (0 * time.Second) {
b.revokeStorageLock.Unlock()
time.Sleep(config.PauseDuration)
b.revokeStorageLock.Lock()
}

revokedEntry, err := req.Storage.Get(ctx, "revoked/"+serial)
if err != nil {
return fmt.Errorf("unable to fetch revoked cert with serial %q: %w", serial, err)
Expand Down Expand Up @@ -379,6 +436,33 @@ func (b *backend) doTidyRevocationStore(ctx context.Context, req *logical.Reques
return nil
}

func (b *backend) pathTidyCancelWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary) && !b.System().LocalMount() {
return nil, logical.ErrReadOnly
}

if atomic.LoadUint32(b.tidyCASGuard) == 0 {
resp := &logical.Response{}
resp.AddWarning("Tidy operation cannot be cancelled as none is currently running.")
return resp, nil
}

// Grab the status lock before writing the cancel atomic. This lets us
// update the status correctly as well, avoiding writing it if we're not
// presently running.
//
// Unlock needs to occur prior to calling read.
b.tidyStatusLock.Lock()
if b.tidyStatus.state == tidyStatusStarted || atomic.LoadUint32(b.tidyCASGuard) == 1 {
if atomic.CompareAndSwapUint32(b.tidyCancelCAS, 0, 1) {
b.tidyStatus.state = tidyStatusCancelling
}
}
b.tidyStatusLock.Unlock()

return b.pathTidyStatusRead(ctx, req, d)
}

func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *framework.FieldData) (*logical.Response, error) {
// If this node is a performance secondary return an ErrReadOnly so that the request gets forwarded,
// but only if the PKI backend is not a local mount.
Expand All @@ -391,17 +475,19 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f

resp := &logical.Response{
Data: map[string]interface{}{
"safety_buffer": nil,
"tidy_cert_store": nil,
"tidy_revoked_certs": nil,
"state": "Inactive",
"error": nil,
"time_started": nil,
"time_finished": nil,
"message": nil,
"cert_store_deleted_count": nil,
"revoked_cert_deleted_count": nil,
"missing_issuer_cert_count": nil,
"safety_buffer": nil,
"tidy_cert_store": nil,
"tidy_revoked_certs": nil,
"tidy_revoked_cert_issuer_associations": nil,
"pause_duration": nil,
"state": "Inactive",
"error": nil,
"time_started": nil,
"time_finished": nil,
"message": nil,
"cert_store_deleted_count": nil,
"revoked_cert_deleted_count": nil,
"missing_issuer_cert_count": nil,
},
}

Expand All @@ -413,6 +499,7 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
resp.Data["tidy_cert_store"] = b.tidyStatus.tidyCertStore
resp.Data["tidy_revoked_certs"] = b.tidyStatus.tidyRevokedCerts
resp.Data["tidy_revoked_cert_issuer_associations"] = b.tidyStatus.tidyRevokedAssocs
resp.Data["pause_duration"] = b.tidyStatus.pauseDuration
resp.Data["time_started"] = b.tidyStatus.timeStarted
resp.Data["message"] = b.tidyStatus.message
resp.Data["cert_store_deleted_count"] = b.tidyStatus.certStoreDeletedCount
Expand All @@ -432,6 +519,11 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
resp.Data["error"] = b.tidyStatus.err.Error()
// Don't clear the message so that it serves as a hint about when
// the error occurred.
case tidyStatusCancelling:
resp.Data["state"] = "Cancelling"
case tidyStatusCancelled:
resp.Data["state"] = "Cancelled"
resp.Data["time_finished"] = b.tidyStatus.timeFinished
}

return resp, nil
Expand All @@ -452,6 +544,7 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque
"tidy_revoked_certs": config.RevokedCerts,
"tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
"safety_buffer": int(config.SafetyBuffer / time.Second),
"pause_duration": config.PauseDuration.String(),
},
}, nil
}
Expand Down Expand Up @@ -493,6 +586,13 @@ func (b *backend) pathConfigAutoTidyWrite(ctx context.Context, req *logical.Requ
}
}

if pauseDurationRaw, ok := d.GetOk("pause_duration"); ok {
config.PauseDuration, err = time.ParseDuration(pauseDurationRaw.(string))
if err != nil {
return logical.ErrorResponse(fmt.Sprintf("unable to parse given pause_duration: %v", err)), nil
}
}
cipherboy marked this conversation as resolved.
Show resolved Hide resolved

if config.Enabled && !(config.CertStore || config.RevokedCerts || config.IssuerAssocs) {
return logical.ErrorResponse(fmt.Sprintf("Auto-tidy enabled but no tidy operations were requested. Enable at least one tidy operation to be run (tidy_cert_store / tidy_revoked_certs / tidy_revoked_cert_issuer_associations).")), nil
}
Expand All @@ -509,8 +609,10 @@ func (b *backend) tidyStatusStart(config *tidyConfig) {
tidyCertStore: config.CertStore,
tidyRevokedCerts: config.RevokedCerts,
tidyRevokedAssocs: config.IssuerAssocs,
state: tidyStatusStarted,
timeStarted: time.Now(),
pauseDuration: config.PauseDuration.String(),

state: tidyStatusStarted,
timeStarted: time.Now(),
}

metrics.SetGauge([]string{"secrets", "pki", "tidy", "start_time_epoch"}, float32(b.tidyStatus.timeStarted.Unix()))
Expand All @@ -524,6 +626,8 @@ func (b *backend) tidyStatusStop(err error) {
b.tidyStatus.err = err
if err == nil {
b.tidyStatus.state = tidyStatusFinished
} else if err == tidyCancelledError {
b.tidyStatus.state = tidyStatusCancelled
} else {
b.tidyStatus.state = tidyStatusError
}
Expand Down Expand Up @@ -595,6 +699,18 @@ current time, minus the value of 'safety_buffer', is greater than the
expiration, it will be removed.
`

const pathTidyCancelHelpSyn = `
Cancels a currently running tidy operation.
`

const pathTidyCancelHelpDesc = `
This endpoint allows cancelling a currently running tidy operation.

Periodically throughout the invocation of tidy, we'll check if the operation
has been requested to be cancelled. If so, we'll stop the currently running
tidy operation.
`

const pathTidyStatusHelpSyn = `
Returns the status of the tidy operation.
`
Expand All @@ -619,6 +735,16 @@ The result includes the following fields:
* 'missing_issuer_cert_count': The number of revoked certificates which were missing a valid issuer reference
`

const pathConfigAutoTidySyn = ``
const pathConfigAutoTidySyn = `
Modifies the current configuration for automatic tidy execution.
`

const pathConfigAutoTidyDesc = `
This endpoint accepts parameters to a tidy operation (see /tidy) that
will be used for automatic tidy execution. This takes two extra parameters,
enabled (to enable or disable auto-tidy) and interval_duration (which
controls the frequency of auto-tidy execution).

const pathConfigAutoTidyDesc = ``
Once enabled, a tidy operation will be kicked off automatically, as if it
were executed with the posted configuration.
`
Loading