Skip to content

Commit

Permalink
CBG-3742: Allow registry rollbacks based on db config doc rollbacks (#…
Browse files Browse the repository at this point in the history
…6709)

* Extend startBootstrapServerWithoutConfigPolling polling interval

* tidy return

* Use config from bucket to roll back if no previous version is found (config vbucket rollback/config restore)

* wip

* Mark conflicting rollbacks as invalid in db registry and prevent loading by SG. Allows for manual repair of database config.

* Improve log message for invalid database configurations - handle multiple scenarios

* Handle unknown reasons why a db could be invalid

* Do database repair via the normal Admin REST API instead, to ensure it is externally recoverable

* Equal(len())->Len()

* Rename stat

* UD->MD fix

* lower retry timeout for testing

* Require 3 datastores

* Replace registry entry with one built from config

* Reword unexpected InvalidDatabase case

* Push multiDatabaseRollback into subtest

* fix ineffassign

* Fix non-deterministic slice ordering from RequireInvalidDatabaseConfigNames helper
  • Loading branch information
bbrks authored Mar 5, 2024
1 parent 172ac73 commit fe58465
Show file tree
Hide file tree
Showing 8 changed files with 322 additions and 42 deletions.
6 changes: 6 additions & 0 deletions base/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ func (g *GlobalStat) initConfigStats() error {
if err != nil {
return err
}
configStat.DatabaseRollbackCollectionCollisions, err = NewIntStat(ConfigSubsystem, "database_config_collection_conflicts", StatUnitBytes, DatabaseCollectionConflictDesc, StatAddedVersion3dot1dot4, StatDeprecatedVersionNotDeprecated, StatStabilityCommitted, nil, nil, prometheus.CounterValue, 0)
if err != nil {
return err
}
g.ConfigStat = configStat
return nil
}
Expand Down Expand Up @@ -354,6 +358,8 @@ type ResourceUtilization struct {
type ConfigStat struct {
// The number of times the bucket specified in a database config doesn't match the bucket it's found in.
DatabaseBucketMismatches *SgwIntStat `json:"database_config_bucket_mismatches"`
// The number of times the config was rolled back to an invalid state (conflicting collections)
DatabaseRollbackCollectionCollisions *SgwIntStat `json:"database_config_rollback_collection_collisions"`
}

type DbStats struct {
Expand Down
3 changes: 2 additions & 1 deletion base/stats_descriptions.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ const (

// error stat
const (
DatabaseBucketMismatchesDesc = "The total number of times a database config is polled from a bucket that doesn't match the bucket specified in the database config."
DatabaseBucketMismatchesDesc = "The total number of times a database config is polled from a bucket that doesn't match the bucket specified in the database config."
DatabaseCollectionConflictDesc = "The total number of times a database config is rolled back to an invalid state (collection conflicts)."
)

// cache stats descriptions
Expand Down
50 changes: 33 additions & 17 deletions rest/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ type invalidConfigInfo struct {
logged bool
configBucketName string
persistedBucketName string
collectionConflicts bool
}

type invalidDatabaseConfigs struct {
Expand All @@ -303,27 +304,42 @@ type invalidDatabaseConfigs struct {
// addInvalidDatabase adds a db to invalid dbconfig map if it doesn't exist in there yet and will log for it at warning level
// if the db already exists there we will calculate if we need to log again according to the config update interval
func (d *invalidDatabaseConfigs) addInvalidDatabase(ctx context.Context, dbname string, cnf DatabaseConfig, bucket string) {
configInfo := invalidConfigInfo{
configBucketName: *cnf.Bucket,
persistedBucketName: bucket,
}
d.m.Lock()
defer d.m.Unlock()
if d.dbNames[dbname] == nil {
// db hasn't been tracked as invalid config yet so add it
d.dbNames[dbname] = &configInfo
d.dbNames[dbname] = &invalidConfigInfo{
configBucketName: *cnf.Bucket,
persistedBucketName: bucket,
collectionConflicts: cnf.Version == invalidDatabaseConflictingCollectionsVersion,
}
}

logMessage := "Must repair invalid database config for %q for it to be usable!"
logArgs := []interface{}{base.MD(dbname)}

// build log message
if isBucketMismatch := *cnf.Bucket != bucket; isBucketMismatch {
base.SyncGatewayStats.GlobalStats.ConfigStat.DatabaseBucketMismatches.Add(1)
logMessage += " Mismatched buckets (config bucket: %q, actual bucket: %q)"
logArgs = append(logArgs, base.MD(d.dbNames[dbname].configBucketName), base.MD(d.dbNames[dbname].persistedBucketName))
} else if cnf.Version == invalidDatabaseConflictingCollectionsVersion {
base.SyncGatewayStats.GlobalStats.ConfigStat.DatabaseRollbackCollectionCollisions.Add(1)
logMessage += " Conflicting collections detected"
} else {
// Nothing is expected to hit this case, but we might add more invalid sentinel values and forget to update this code.
logMessage += " Database was marked invalid. See logs for details."
}
logMessage := fmt.Sprintf("Mismatch in database config for database %q bucket name: %q and backend bucket: %q You must update database config immediately", base.MD(dbname), base.MD(d.dbNames[dbname].configBucketName), base.MD(d.dbNames[dbname].persistedBucketName))

// if we get here we already have the db tracked as an invalid config, so now we need to work out if we should log for it now
if !d.dbNames[dbname].logged {
// we need to log at warning if we haven't already logged for this particular corrupt db config
base.WarnfCtx(ctx, logMessage)
base.WarnfCtx(ctx, logMessage, logArgs...)
d.dbNames[dbname].logged = true
} else {
// already logged this entry at warning so need to log at info now
base.InfofCtx(ctx, base.KeyConfig, logMessage)
base.InfofCtx(ctx, base.KeyConfig, logMessage, logArgs...)
}
base.SyncGatewayStats.GlobalStats.ConfigStat.DatabaseBucketMismatches.Add(1)
}

func (d *invalidDatabaseConfigs) exists(dbname string) (*invalidConfigInfo, bool) {
Expand Down Expand Up @@ -1762,21 +1778,21 @@ func (sc *ServerContext) FetchConfigs(ctx context.Context, isInitialStartup bool
continue
}
for _, cnf := range configs {

// inherit properties from the bootstrap config
cnf.CACertPath = sc.Config.Bootstrap.CACertPath

// We need to check for corruption in the database config (CC. CBG-3292). If the fetched config doesn't match the
// bucket name we got the config from we need to mark this db context as corrupt. Then remove the context and
// in memory representation on the server context.
if bucket != cnf.GetBucketName() {
// Handle invalid database registry entries. Either:
// - CBG-3292: Bucket in config doesn't match the actual bucket
// - CBG-3742: Registry entry marked invalid (due to rollback causing collection conflict)
if isRegistryDbConfigVersionInvalid(cnf.Version) || bucket != cnf.GetBucketName() {
sc.handleInvalidDatabaseConfig(ctx, bucket, *cnf)
continue
}

bucketCopy := bucket
// no corruption detected carry on as usual
cnf.Bucket = &bucketCopy

// inherit properties from the bootstrap config
cnf.CACertPath = sc.Config.Bootstrap.CACertPath

// stamp per-database credentials if set
if dbCredentials, ok := sc.Config.DatabaseCredentials[cnf.Name]; ok && dbCredentials != nil {
cnf.setDatabaseCredentials(*dbCredentials)
Expand Down
25 changes: 16 additions & 9 deletions rest/config_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ func (b *bootstrapContext) GetDatabaseConfigs(ctx context.Context, bucketName, g
reloadRequired := false
for dbName, registryDb := range configGroup.Databases {
// Ignore databases with deleted version - represents an in-progress delete
if registryDb.Version == deletedDatabaseVersion {
if registryDb.IsDeleted() {
continue
}
dbConfig, err := b.getDatabaseConfig(ctx, bucketName, groupID, dbName, registryDb.Version, registry)
Expand Down Expand Up @@ -405,6 +405,13 @@ func (b *bootstrapContext) getConfigVersionWithRetry(ctx context.Context, bucket
}

config.cfgCas = cas

if version == invalidDatabaseConflictingCollectionsVersion {
// special case - return the invalid config to use in updates (repairs). Configs with this version will not be loaded by SG.
config.Version = invalidDatabaseConflictingCollectionsVersion
return false, nil, config
}

// If version matches, success!
if config.Version == version {
return false, nil, config
Expand All @@ -417,10 +424,10 @@ func (b *bootstrapContext) getConfigVersionWithRetry(ctx context.Context, bucket
// If the config has a newer version than requested, return the config but alert caller that they have
// requested a stale version.
return false, base.ErrConfigVersionMismatch, config
} else {
base.InfofCtx(ctx, base.KeyConfig, "getConfigVersionWithRetry for key %s found version mismatch, retrying. Requested: %s, Found: %s", metadataKey, version, config.Version)
return true, base.ErrConfigRegistryRollback, config
}

base.InfofCtx(ctx, base.KeyConfig, "getConfigVersionWithRetry for key %s found version mismatch, retrying. Requested: %s, Found: %s", metadataKey, version, config.Version)
return true, base.ErrConfigRegistryRollback, config
}

// Kick off the retry loop
Expand Down Expand Up @@ -558,11 +565,11 @@ func (b *bootstrapContext) rollbackRegistry(ctx context.Context, bucketName, gro

// non-nil config indicates database version in registry should be updated to match config
base.InfofCtx(ctx, base.KeyConfig, "Rolling back config registry to align with db config version %s for db: %s, bucket:%s configGroup:%s", config.Version, base.MD(dbName), base.MD(bucketName), base.MD(groupID))
registryErr := registry.rollbackDatabaseConfig(ctx, groupID, dbName)
registryErr := registry.rollbackDatabaseConfig(ctx, groupID, dbName, config)
if registryErr != nil {
// There shouldn't be a case where rollback introduces a collection conflict - it
// shouldn't be possible to add a conflicting collection to the registry while a previous
// config persistence is in-flight
// There is one case where the registry rollback can introduce a collection conflict.
// If there's no PreviousVersion present (i.e. we're handling a db config doc rollback, not a registry update)
// then it's possible for the db config to contain a collection that is now present on another database in the registry.
return fmt.Errorf("Unable to roll back registry to match existing config for database %s(%s): %w", base.MD(dbName), base.MD(groupID), registryErr)
}
}
Expand Down Expand Up @@ -677,7 +684,7 @@ func (b *bootstrapContext) getRegistryAndDatabase(ctx context.Context, bucketNam
}
return registry, nil, err
} else {
if registryDb.Version != "" && registryDb.Version != deletedDatabaseVersion {
if registryDb.Version != "" && !registryDb.IsDeleted() {
// Database exists in registry, go fetch the config
config, err = b.getDatabaseConfig(ctx, bucketName, groupID, dbName, registryDb.Version, registry)
if err == base.ErrConfigRegistryReloadRequired {
Expand Down
46 changes: 38 additions & 8 deletions rest/config_registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,14 @@ type GatewayRegistry struct {

const GatewayRegistryVersion = "1.0"

const deletedDatabaseVersion = "0-0"
// A set of special sentinel values to store as database versions.
// NOTE: Update addInvalidDatabase and isRegistryDbConfigVersionInvalid if you add more.
const (
// deletedDatabaseVersion represents an entry for an in-progress delete
deletedDatabaseVersion = "0-0"
// invalidDatabaseConflictingCollectionsVersion represents an entry for a rollback-induced invalid state
invalidDatabaseConflictingCollectionsVersion = "0-1"
)

// RegistryConfigGroup stores the set of databases for a given config group
type RegistryConfigGroup struct {
Expand All @@ -60,18 +67,33 @@ type RegistryConfigGroup struct {

// RegistryDatabase stores the version and set of RegistryScopes for a database
type RegistryDatabase struct {
RegistryDatabaseVersion // current version
MetadataID string `json:"metadata_id"` // Metadata ID
UUID string `json:"uuid,omitempty"` // Database UUID
RegistryDatabaseVersion // current version
// PreviousVersion stores the previous database version while an update is in progress, in case update of the config
// fails and rollback is required. Required to avoid cross-database collection conflicts during rollback.
PreviousVersion *RegistryDatabaseVersion `json:"previous_version,omitempty"`
MetadataID string `json:"metadata_id"` // Metadata ID
UUID string `json:"uuid,omitempty"` // Database UUID
}

// RegistryDatabaseVersion stores the version and collection set for a database. Used for storing current or previous version.
type RegistryDatabaseVersion struct {
Version string `json:"version,omitempty"` // Database Version
Scopes RegistryScopes `json:"scopes,omitempty"` // Scopes and collections for this version
Version string `json:"version,omitempty"` // Database Version
}

// IsDeleted reports whether the database's current version is the
// deletedDatabaseVersion sentinel.
func (r *RegistryDatabase) IsDeleted() bool {
	// Version is promoted from the embedded RegistryDatabaseVersion.
	return r.RegistryDatabaseVersion.Version == deletedDatabaseVersion
}

// IsInvalid reports whether the database's current version is one of the
// sentinel values recognised by isRegistryDbConfigVersionInvalid.
func (r *RegistryDatabase) IsInvalid() bool {
	currentVersion := r.Version
	return isRegistryDbConfigVersionInvalid(currentVersion)
}

// isRegistryDbConfigVersionInvalid reports whether version is a sentinel value
// that marks a registry database entry as invalid.
func isRegistryDbConfigVersionInvalid(version string) bool {
	switch version {
	case invalidDatabaseConflictingCollectionsVersion:
		return true
	default:
		return false
	}
}

type RegistryScopes map[string]RegistryScope
Expand Down Expand Up @@ -195,7 +217,7 @@ func (r *GatewayRegistry) upsertDatabaseConfig(ctx context.Context, configGroupI
}

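// rollbackDatabaseConfig reverts the registry entry to the previous version, and removes the previous version.
// If the registry entry has no previous version, the entry is instead replaced with one built from the supplied
// config; if that config's scopes conflict with collections elsewhere in the registry, the entry is marked with
// invalidDatabaseConflictingCollectionsVersion.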
func (r *GatewayRegistry) rollbackDatabaseConfig(ctx context.Context, configGroupID string, dbName string) (err error) {
func (r *GatewayRegistry) rollbackDatabaseConfig(ctx context.Context, configGroupID string, dbName string, config *DatabaseConfig) (err error) {

configGroup, ok := r.ConfigGroups[configGroupID]
if !ok {
Expand All @@ -206,8 +228,16 @@ func (r *GatewayRegistry) rollbackDatabaseConfig(ctx context.Context, configGrou
return base.ErrNotFound
}

// handle dbconfig doc rollback without PreviousVersion
if registryDatabase.PreviousVersion == nil {
return fmt.Errorf("Rollback requested but registry did not include previous version for db %s", base.MD(dbName))
base.InfofCtx(ctx, base.KeyConfig, "Rollback requested but registry did not include previous version for db %s - using config doc as previous version", base.MD(dbName))
newRegistryDatabase := registryDatabaseFromConfig(config)
configGroup.Databases[dbName] = newRegistryDatabase
if conflicts := r.getCollectionConflicts(ctx, dbName, config.Scopes); len(conflicts) > 0 {
base.WarnfCtx(ctx, "db %s config rollback would cause collection conflicts (%v) - marking database as invalid to allow for manual repair", base.MD(dbName), base.MD(conflicts))
newRegistryDatabase.Version = invalidDatabaseConflictingCollectionsVersion
}
return nil
}

registryDatabase.Version = registryDatabase.PreviousVersion.Version
Expand Down Expand Up @@ -354,7 +384,7 @@ func registryDatabaseFromConfig(config *DatabaseConfig) *RegistryDatabase {
registryScope := RegistryScope{
Collections: make([]string, 0),
}
for collectionName, _ := range scope.Collections {
for collectionName := range scope.Collections {
registryScope.Collections = append(registryScope.Collections, collectionName)
}
rdb.Scopes[scopeName] = registryScope
Expand Down
Loading

0 comments on commit fe58465

Please sign in to comment.