Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CBG-3742: Allow registry rollbacks based on db config doc rollbacks #6709

Merged
merged 18 commits into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions base/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ func (g *GlobalStat) initConfigStats() error {
if err != nil {
return err
}
configStat.DatabaseRollbackCollectionCollisions, err = NewIntStat(ConfigSubsystem, "database_config_collection_conflicts", StatUnitBytes, DatabaseCollectionConflictDesc, StatAddedVersion3dot1dot4, StatDeprecatedVersionNotDeprecated, StatStabilityCommitted, nil, nil, prometheus.CounterValue, 0)
if err != nil {
return err
}
g.ConfigStat = configStat
return nil
}
Expand Down Expand Up @@ -354,6 +358,8 @@ type ResourceUtilization struct {
// ConfigStat holds the counters for problems detected in database configs.
// Each field is serialized under the JSON key given in its struct tag.
// NOTE(review): initConfigStats registers DatabaseRollbackCollectionCollisions under the stat name
// "database_config_collection_conflicts" with StatUnitBytes - confirm that the name difference from
// the JSON key below and the byte unit for a counter are both intended.
type ConfigStat struct {
// The number of times the bucket specified in a database config doesn't match the bucket it's found in.
DatabaseBucketMismatches *SgwIntStat `json:"database_config_bucket_mismatches"`
// The number of times the config was rolled back to an invalid state (conflicting collections)
DatabaseRollbackCollectionCollisions *SgwIntStat `json:"database_config_rollback_collection_collisions"`
}

type DbStats struct {
Expand Down
3 changes: 2 additions & 1 deletion base/stats_descriptions.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ const (

// error stat
const (
DatabaseBucketMismatchesDesc = "The total number of times a database config is polled from a bucket that doesn't match the bucket specified in the database config."
DatabaseBucketMismatchesDesc = "The total number of times a database config is polled from a bucket that doesn't match the bucket specified in the database config."
DatabaseCollectionConflictDesc = "The total number of times a database config is rolled back to an invalid state (collection conflicts)."
)

// cache stats descriptions
Expand Down
50 changes: 33 additions & 17 deletions rest/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ type invalidConfigInfo struct {
logged bool
configBucketName string
persistedBucketName string
collectionConflicts bool
}

type invalidDatabaseConfigs struct {
Expand All @@ -303,27 +304,42 @@ type invalidDatabaseConfigs struct {
// addInvalidDatabase adds a db to invalid dbconfig map if it doesn't exist in there yet and will log for it at warning level
// if the db already exists there we will calculate if we need to log again according to the config update interval
func (d *invalidDatabaseConfigs) addInvalidDatabase(ctx context.Context, dbname string, cnf DatabaseConfig, bucket string) {
configInfo := invalidConfigInfo{
configBucketName: *cnf.Bucket,
persistedBucketName: bucket,
}
d.m.Lock()
defer d.m.Unlock()
if d.dbNames[dbname] == nil {
// db hasn't been tracked as invalid config yet so add it
d.dbNames[dbname] = &configInfo
d.dbNames[dbname] = &invalidConfigInfo{
configBucketName: *cnf.Bucket,
persistedBucketName: bucket,
collectionConflicts: cnf.Version == invalidDatabaseConflictingCollectionsVersion,
}
}

logMessage := "Must repair invalid database config for %q for it to be usable!"
logArgs := []interface{}{base.MD(dbname)}

// build log message
if isBucketMismatch := *cnf.Bucket != bucket; isBucketMismatch {
base.SyncGatewayStats.GlobalStats.ConfigStat.DatabaseBucketMismatches.Add(1)
logMessage += " Mismatched buckets (config bucket: %q, actual bucket: %q)"
logArgs = append(logArgs, base.MD(d.dbNames[dbname].configBucketName), base.MD(d.dbNames[dbname].persistedBucketName))
} else if cnf.Version == invalidDatabaseConflictingCollectionsVersion {
base.SyncGatewayStats.GlobalStats.ConfigStat.DatabaseRollbackCollectionCollisions.Add(1)
logMessage += " Conflicting collections detected"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will be logged upstream, but I think it would be great to print somewhere the name of the registry / config docs and which collections are conflicting

Copy link
Member Author

@bbrks bbrks Mar 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not feasible to do that here on load, but the warning log that occurred when the rollback set this invalid flag logs what collection(s) caused it.

2024-02-29T15:39:54.284Z [WRN] db:c1_db1 db <ud>c1_db1</ud> config rollback would cause collection conflicts (<ud>map[sg_test_0.sg_test_1:c1_db2]</ud>) - marking database as invalid to allow for manual repair -- rest.(*GatewayRegistry).rollbackDatabaseConfig() at config_registry.go:234
2024-02-29T15:39:54.285Z [WRN] Must repair invalid database config for "c1_db1" for it to be usable! Conflicting collections detected -- rest.(*invalidDatabaseConfigs).addInvalidDatabase() at config.go:334

Given the rollback that just happened, the customer probably doesn't want to change the non-rolled back database, so I'm not sure there's value in knowing which database this one conflicts with. The rollback is the problem that needs correcting, and we log the collection that needs removing.

} else {
// Nothing is expected to hit this case, but we might add more invalid sentinel values and forget to update this code.
logMessage += " Database was marked invalid. See logs for details."
}
logMessage := fmt.Sprintf("Mismatch in database config for database %q bucket name: %q and backend bucket: %q You must update database config immediately", base.MD(dbname), base.MD(d.dbNames[dbname].configBucketName), base.MD(d.dbNames[dbname].persistedBucketName))

// if we get here we already have the db logged as an invalid config, so now we need to work out if we should log for it now
if !d.dbNames[dbname].logged {
// we need to log at warning if we haven't already logged for this particular corrupt db config
base.WarnfCtx(ctx, logMessage)
base.WarnfCtx(ctx, logMessage, logArgs...)
d.dbNames[dbname].logged = true
} else {
// already logged this entry at warning so need to log at info now
base.InfofCtx(ctx, base.KeyConfig, logMessage)
base.InfofCtx(ctx, base.KeyConfig, logMessage, logArgs...)
}
base.SyncGatewayStats.GlobalStats.ConfigStat.DatabaseBucketMismatches.Add(1)
}

func (d *invalidDatabaseConfigs) exists(dbname string) (*invalidConfigInfo, bool) {
Expand Down Expand Up @@ -1746,21 +1762,21 @@ func (sc *ServerContext) FetchConfigs(ctx context.Context, isInitialStartup bool
continue
}
for _, cnf := range configs {

// inherit properties from the bootstrap config
cnf.CACertPath = sc.Config.Bootstrap.CACertPath

// We need to check for corruption in the database config (CC. CBG-3292). If the fetched config doesn't match the
// bucket name we got the config from we need to mark this db context as corrupt. Then remove the context and
// in memory representation on the server context.
if bucket != cnf.GetBucketName() {
// Handle invalid database registry entries. Either:
// - CBG-3292: Bucket in config doesn't match the actual bucket
// - CBG-3742: Registry entry marked invalid (due to rollback causing collection conflict)
if isRegistryDbConfigVersionInvalid(cnf.Version) || bucket != cnf.GetBucketName() {
sc.handleInvalidDatabaseConfig(ctx, bucket, *cnf)
continue
}

bucketCopy := bucket
// no corruption detected carry on as usual
cnf.Bucket = &bucketCopy

// inherit properties from the bootstrap config
cnf.CACertPath = sc.Config.Bootstrap.CACertPath

// stamp per-database credentials if set
if dbCredentials, ok := sc.Config.DatabaseCredentials[cnf.Name]; ok && dbCredentials != nil {
cnf.setDatabaseCredentials(*dbCredentials)
Expand Down
25 changes: 16 additions & 9 deletions rest/config_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ func (b *bootstrapContext) GetDatabaseConfigs(ctx context.Context, bucketName, g
reloadRequired := false
for dbName, registryDb := range configGroup.Databases {
// Ignore databases with deleted version - represents an in-progress delete
if registryDb.Version == deletedDatabaseVersion {
if registryDb.IsDeleted() {
continue
}
dbConfig, err := b.getDatabaseConfig(ctx, bucketName, groupID, dbName, registryDb.Version, registry)
Expand Down Expand Up @@ -405,6 +405,13 @@ func (b *bootstrapContext) getConfigVersionWithRetry(ctx context.Context, bucket
}

config.cfgCas = cas

if version == invalidDatabaseConflictingCollectionsVersion {
// special case - return the invalid config to use in updates (repairs). Configs with this version will not be loaded by SG.
config.Version = invalidDatabaseConflictingCollectionsVersion
return false, nil, config
}

// If version matches, success!
if config.Version == version {
return false, nil, config
Expand All @@ -417,10 +424,10 @@ func (b *bootstrapContext) getConfigVersionWithRetry(ctx context.Context, bucket
// If the config has a newer version than requested, return the config but alert caller that they have
// requested a stale version.
return false, base.ErrConfigVersionMismatch, config
} else {
base.InfofCtx(ctx, base.KeyConfig, "getConfigVersionWithRetry for key %s found version mismatch, retrying. Requested: %s, Found: %s", metadataKey, version, config.Version)
return true, base.ErrConfigRegistryRollback, config
}

base.InfofCtx(ctx, base.KeyConfig, "getConfigVersionWithRetry for key %s found version mismatch, retrying. Requested: %s, Found: %s", metadataKey, version, config.Version)
return true, base.ErrConfigRegistryRollback, config
}

// Kick off the retry loop
Expand Down Expand Up @@ -558,11 +565,11 @@ func (b *bootstrapContext) rollbackRegistry(ctx context.Context, bucketName, gro

// non-nil config indicates database version in registry should be updated to match config
base.InfofCtx(ctx, base.KeyConfig, "Rolling back config registry to align with db config version %s for db: %s, bucket:%s configGroup:%s", config.Version, base.MD(dbName), base.MD(bucketName), base.MD(groupID))
registryErr := registry.rollbackDatabaseConfig(ctx, groupID, dbName)
registryErr := registry.rollbackDatabaseConfig(ctx, groupID, dbName, config)
if registryErr != nil {
// There shouldn't be a case where rollback introduces a collection conflict - it
// shouldn't be possible to add a conflicting collection to the registry while a previous
// config persistence is in-flight
// There is one case where the registry rollback can introduce a collection conflict.
// If there's no PreviousVersion present (i.e. we're handling a db config doc rollback, not a registry update)
// then it's possible for the db config to contain a collection that is now present on another database in the registry.
return fmt.Errorf("Unable to roll back registry to match existing config for database %s(%s): %w", base.MD(dbName), base.MD(groupID), registryErr)
}
}
Expand Down Expand Up @@ -677,7 +684,7 @@ func (b *bootstrapContext) getRegistryAndDatabase(ctx context.Context, bucketNam
}
return registry, nil, err
} else {
if registryDb.Version != "" && registryDb.Version != deletedDatabaseVersion {
if registryDb.Version != "" && !registryDb.IsDeleted() {
// Database exists in registry, go fetch the config
config, err = b.getDatabaseConfig(ctx, bucketName, groupID, dbName, registryDb.Version, registry)
if err == base.ErrConfigRegistryReloadRequired {
Expand Down
46 changes: 38 additions & 8 deletions rest/config_registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,14 @@ type GatewayRegistry struct {

const GatewayRegistryVersion = "1.0"

const deletedDatabaseVersion = "0-0"
// A set of special sentinel values to store as database versions.
// NOTE: Update addInvalidDatabase and isRegistryDbConfigVersionInvalid if you add more.
const (
// deletedDatabaseVersion represents an entry for an in-progress delete
deletedDatabaseVersion = "0-0"
// invalidDatabaseConflictingCollectionsVersion represents an entry for a rollback-induced invalid state
invalidDatabaseConflictingCollectionsVersion = "0-1"
)

// RegistryConfigGroup stores the set of databases for a given config group
type RegistryConfigGroup struct {
Expand All @@ -60,18 +67,33 @@ type RegistryConfigGroup struct {

// RegistryDatabase stores the version and set of RegistryScopes for a database
type RegistryDatabase struct {
RegistryDatabaseVersion // current version
MetadataID string `json:"metadata_id"` // Metadata ID
UUID string `json:"uuid,omitempty"` // Database UUID
RegistryDatabaseVersion // current version
// PreviousVersion stores the previous database version while an update is in progress, in case update of the config
// fails and rollback is required. Required to avoid cross-database collection conflicts during rollback.
PreviousVersion *RegistryDatabaseVersion `json:"previous_version,omitempty"`
MetadataID string `json:"metadata_id"` // Metadata ID
UUID string `json:"uuid,omitempty"` // Database UUID
}

// RegistryDatabaseVersion stores the version and collection set for a database. Used for storing current or previous version.
type RegistryDatabaseVersion struct {
Version string `json:"version,omitempty"` // Database Version
Scopes RegistryScopes `json:"scopes,omitempty"` // Scopes and collections for this version
Version string `json:"version,omitempty"` // Database Version
}

// IsDeleted reports whether this registry entry carries the sentinel version
// that marks an in-progress delete.
func (r *RegistryDatabase) IsDeleted() bool {
	if r.Version != deletedDatabaseVersion {
		return false
	}
	return true
}

// IsInvalid reports whether this registry entry's version is one of the
// sentinel values recognised by isRegistryDbConfigVersionInvalid.
func (r *RegistryDatabase) IsInvalid() bool {
	currentVersion := r.Version
	return isRegistryDbConfigVersionInvalid(currentVersion)
}

// isRegistryDbConfigVersionInvalid reports whether version is a sentinel value
// that marks a registry entry as invalid. Add a case here for each new invalid
// sentinel.
func isRegistryDbConfigVersionInvalid(version string) bool {
	switch version {
	case invalidDatabaseConflictingCollectionsVersion:
		return true
	default:
		return false
	}
}

type RegistryScopes map[string]RegistryScope
Expand Down Expand Up @@ -195,7 +217,7 @@ func (r *GatewayRegistry) upsertDatabaseConfig(ctx context.Context, configGroupI
}

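// rollbackDatabaseConfig reverts the registry entry to the previous version, and removes the previous version.
// If the registry entry has no previous version, the entry is rebuilt from the supplied config instead, and is
// marked invalid when that would introduce collection conflicts.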
func (r *GatewayRegistry) rollbackDatabaseConfig(ctx context.Context, configGroupID string, dbName string) (err error) {
func (r *GatewayRegistry) rollbackDatabaseConfig(ctx context.Context, configGroupID string, dbName string, config *DatabaseConfig) (err error) {

configGroup, ok := r.ConfigGroups[configGroupID]
if !ok {
Expand All @@ -206,8 +228,16 @@ func (r *GatewayRegistry) rollbackDatabaseConfig(ctx context.Context, configGrou
return base.ErrNotFound
}

// handle dbconfig doc rollback without PreviousVersion
if registryDatabase.PreviousVersion == nil {
return fmt.Errorf("Rollback requested but registry did not include previous version for db %s", base.MD(dbName))
base.InfofCtx(ctx, base.KeyConfig, "Rollback requested but registry did not include previous version for db %s - using config doc as previous version", base.MD(dbName))
newRegistryDatabase := registryDatabaseFromConfig(config)
configGroup.Databases[dbName] = newRegistryDatabase
if conflicts := r.getCollectionConflicts(ctx, dbName, config.Scopes); len(conflicts) > 0 {
base.WarnfCtx(ctx, "db %s config rollback would cause collection conflicts (%v) - marking database as invalid to allow for manual repair", base.MD(dbName), base.MD(conflicts))
newRegistryDatabase.Version = invalidDatabaseConflictingCollectionsVersion
}
return nil
}

registryDatabase.Version = registryDatabase.PreviousVersion.Version
Expand Down Expand Up @@ -354,7 +384,7 @@ func registryDatabaseFromConfig(config *DatabaseConfig) *RegistryDatabase {
registryScope := RegistryScope{
Collections: make([]string, 0),
}
for collectionName, _ := range scope.Collections {
for collectionName := range scope.Collections {
registryScope.Collections = append(registryScope.Collections, collectionName)
}
rdb.Scopes[scopeName] = registryScope
Expand Down
Loading
Loading