From dd75195f30a749f7e9220b94c214f73154a28de2 Mon Sep 17 00:00:00 2001 From: Alex Barganier Date: Tue, 31 Oct 2023 15:12:13 -0400 Subject: [PATCH 1/3] pkg/util/log: remove stderr config code for deprecated log format Code used to validate the stderr config parsing previously checked to see if the `crdb-v1-tty` format was being used when `auditable: true` was set. If it was, it changed the format to `crdb-v1-tty-count`. However, the `crdb-v1` log formats have long been deprecated, and the official logging documentation states: https://www.cockroachlabs.com/docs/stable/configure-logs#output-to-stderr > Note: The `format` parameter for `stderr` is set to `crdb-v2-tty` and cannot be changed. Given this, it not longer makes sense to have code that accounts for a stderr log format that's no longer valid. Instead, we should *always* assert the format to be `crdb-v2-tty` to align with what our documentation says. A following patch will add this assertion. Release note: none --- pkg/util/log/logconfig/validate.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/util/log/logconfig/validate.go b/pkg/util/log/logconfig/validate.go index 636907478a7e..2213f7cdf30d 100644 --- a/pkg/util/log/logconfig/validate.go +++ b/pkg/util/log/logconfig/validate.go @@ -172,10 +172,6 @@ func (c *Config) Validate(defaultLogDir *string) (resErr error) { } propagateCommonDefaults(&c.Sinks.Stderr.CommonSinkConfig, c.FileDefaults.CommonSinkConfig) if c.Sinks.Stderr.Auditable != nil && *c.Sinks.Stderr.Auditable { - if *c.Sinks.Stderr.Format == "crdb-v1-tty" { - f := "crdb-v1-tty-count" - c.Sinks.Stderr.Format = &f - } c.Sinks.Stderr.Criticality = &bt } c.Sinks.Stderr.Auditable = nil From c0addf4631062ad2914fd8e7cdeb9d26939f0dd0 Mon Sep 17 00:00:00 2001 From: Alex Barganier Date: Tue, 31 Oct 2023 16:19:10 -0400 Subject: [PATCH 2/3] pkg/util/log: selectively apply file-defaults.format-options to stderr Fixes: https://github.com/cockroachdb/cockroach/issues/113321 The stderr sink in the log config is only allowed to use the `crdb-v2-tty` format, and therefore, only `format-options` supported by the `crdb-v2-tty` format should be applied to the stderr sink. Unfortunately, a bug in the log config parse & validation code didn't follow this rule. The configuration that's part of the `file-defaults` is propagated to the stderr sink, and this included the `format-options`, even if they were only relevant to another log format type (e.g. `json`). This caused an error when trying to apply the options to the stderr log sink on startup, e.g.: ``` ERROR: unknown format option: "datetime-format" ``` To solve this problem, we should only propagate the `format-options` used in `file-defaults` to the stderr sink's config IFF the `file-defaults` format is of a `crdb-v2` variety. Since the stderr sink also uses the `crdb-v2-tty` format, we can only be sure that the `format-options` used in `file-defaults` is supported by the stderr sink if the `format` used in `file-defaults` is also part of `crdb-v2`. However, if `format-options` is explicitly defined within the `sinks.stderr` config, we need to be careful not to overwrite them with those defined in `file-defaults`. This patch accomplishes fixes for all these issues, and adds new tests to cover all these scenarios. Release note: none --- pkg/cli/interactive_tests/test_log_flags.tcl | 1 - pkg/util/log/logconfig/config.go | 6 +- pkg/util/log/logconfig/testdata/validate | 87 ++++++++++++++++++++ pkg/util/log/logconfig/validate.go | 17 ++++ 4 files changed, 108 insertions(+), 3 deletions(-) diff --git a/pkg/cli/interactive_tests/test_log_flags.tcl b/pkg/cli/interactive_tests/test_log_flags.tcl index 49fcac53342e..cfb9c44e20ab 100644 --- a/pkg/cli/interactive_tests/test_log_flags.tcl +++ b/pkg/cli/interactive_tests/test_log_flags.tcl @@ -77,7 +77,6 @@ end_test start_test "Check that the log flag is properly recognized for non-server commands" send "$argv debug reset-quorum 123 --log='sinks: {stderr: {format: json }}'\r" -eexpect "\"severity\":\"ERROR\"" eexpect "connection to server failed" eexpect ":/# " end_test diff --git a/pkg/util/log/logconfig/config.go b/pkg/util/log/logconfig/config.go index 0a2a6fbd242d..874f4e9a6906 100644 --- a/pkg/util/log/logconfig/config.go +++ b/pkg/util/log/logconfig/config.go @@ -29,8 +29,10 @@ import ( // specified in a configuration. const DefaultFileFormat = `crdb-v2` -// DefaultStderrFormat is the entry format for stderr sinks -// when not specified in a configuration. +// DefaultStderrFormat is the entry format for stderr sinks. +// NB: The format for stderr is always set to `crdb-v2-tty`, +// and cannot be changed. We enforce this in the validation step. +// See: https://www.cockroachlabs.com/docs/stable/configure-logs#output-to-stderr const DefaultStderrFormat = `crdb-v2-tty` // DefaultFluentFormat is the entry format for fluent sinks diff --git a/pkg/util/log/logconfig/testdata/validate b/pkg/util/log/logconfig/testdata/validate index b03266962ed5..c5b8c9c8f68b 100644 --- a/pkg/util/log/logconfig/testdata/validate +++ b/pkg/util/log/logconfig/testdata/validate @@ -254,6 +254,93 @@ capture-stray-errors: dir: /default-dir max-group-size: 100MiB +# Check that file-defaults format options are transferred to stderr if using a crdb-v2 format. +yaml +file-defaults: + format: crdb-v2 + format-options: {timezone: america/new_york} +---- +sinks: + file-groups: + default: + channels: {INFO: all} + filter: INFO + format-options: + timezone: america/new_york + stderr: + filter: NONE + format-options: + timezone: america/new_york +capture-stray-errors: + enable: true + dir: /default-dir + max-group-size: 100MiB + +# Check that file-defaults format options are NOT transferred to stderr if NOT using a crdb-v2 format. +yaml +file-defaults: + format: json + format-options: {datetime-format: rfc3339, datetime-timezone: America/New_York} +---- +sinks: + file-groups: + default: + channels: {INFO: all} + filter: INFO + format: json + format-options: + datetime-format: rfc3339 + datetime-timezone: America/New_York + stderr: + filter: NONE +capture-stray-errors: + enable: true + dir: /default-dir + max-group-size: 100MiB + +# Check that file-defaults format options do NOT overwrite format-options if explicitly defined in stderr. +yaml +file-defaults: + format: crdb-v2 + format-options: {timezone: america/new_york} +sinks: + stderr: + format-options: {timezone: america/chicago} +---- +sinks: + file-groups: + default: + channels: {INFO: all} + filter: INFO + format-options: + timezone: america/new_york + stderr: + filter: NONE + format-options: + timezone: america/chicago +capture-stray-errors: + enable: true + dir: /default-dir + max-group-size: 100MiB + +# Check that stderr always uses crdb-v2-tty format, even if we try to set it to an invalid format +yaml +sinks: + stderr: + format: crdb-v1-tty +---- +sinks: + file-groups: + default: + channels: {INFO: all} + filter: INFO + stderr: + filter: NONE +capture-stray-errors: + enable: true + dir: /default-dir + max-group-size: 100MiB + # Check that NONE filter elides files. yaml file-defaults: {filter: NONE} diff --git a/pkg/util/log/logconfig/validate.go b/pkg/util/log/logconfig/validate.go index 2213f7cdf30d..68bf016c884f 100644 --- a/pkg/util/log/logconfig/validate.go +++ b/pkg/util/log/logconfig/validate.go @@ -170,11 +170,28 @@ func (c *Config) Validate(defaultLogDir *string) (resErr error) { if c.Sinks.Stderr.Filter == logpb.Severity_UNKNOWN { c.Sinks.Stderr.Filter = logpb.Severity_NONE } + // We need to know if format-options were specifically defined on the stderr sink later on, + // since this information is lost once propagateCommonDefaults is called. + stdErrFormatOptionsOriginallySet := len(c.Sinks.Stderr.FormatOptions) > 0 propagateCommonDefaults(&c.Sinks.Stderr.CommonSinkConfig, c.FileDefaults.CommonSinkConfig) if c.Sinks.Stderr.Auditable != nil && *c.Sinks.Stderr.Auditable { c.Sinks.Stderr.Criticality = &bt } c.Sinks.Stderr.Auditable = nil + // The format parameter for stderr is set to `crdb-v2-tty` and cannot be changed. + // See docs: https://www.cockroachlabs.com/docs/stable/configure-logs#output-to-stderr + if *c.Sinks.Stderr.Format != DefaultStderrFormat { + f := DefaultStderrFormat + c.Sinks.Stderr.Format = &f + } + // FormatOptions are format-specific. We should only copy them over to StdErr from + // FileDefaults if FileDefaults is also making use of a crdb-v2 format. Otherwise, + // we are likely to error when trying to apply an unsupported format option. + if c.FileDefaults.CommonSinkConfig.Format != nil && + !strings.Contains(*c.FileDefaults.CommonSinkConfig.Format, "v2") && + !stdErrFormatOptionsOriginallySet { + c.Sinks.Stderr.CommonSinkConfig.FormatOptions = map[string]string{} + } if err := c.ValidateCommonSinkConfig(c.Sinks.Stderr.CommonSinkConfig); err != nil { fmt.Fprintf(&errBuf, "stderr sink: %v\n", err) } From acae46bf1ba70e1408a00ad947fa43a314de062c Mon Sep 17 00:00:00 2001 From: Michael Butler Date: Mon, 30 Oct 2023 14:34:58 -0400 Subject: [PATCH 3/3] roachtest: test AOST restore in backup-restore/* roachtests This patch allows the backup-restore driver to run and validate AOST restores from revision history backups. If the driver created a revision history backup, there's a 50% chance it will restore from an AOST between the full backup end time and the last incremental start time. A future patch will allow for AOST restores from non-revision history backups. Epic: none Release note: none --- .../roachtest/tests/mixed_version_backup.go | 93 +++++++++++++++++-- 1 file changed, 87 insertions(+), 6 deletions(-) diff --git a/pkg/cmd/roachtest/tests/mixed_version_backup.go b/pkg/cmd/roachtest/tests/mixed_version_backup.go index 1923682367f2..70a3371b7db1 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_backup.go +++ b/pkg/cmd/roachtest/tests/mixed_version_backup.go @@ -26,6 +26,7 @@ import ( "sync/atomic" "time" + "github.com/cockroachdb/apd/v3" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" @@ -41,6 +42,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/jobutils" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/randutil" @@ -65,6 +67,9 @@ const ( // mixed-version state. mixedVersionRestoreProbability = 0.5 + // probability that we will attempt to run an AOST restore. + restoreFromAOSTProbability = 0.5 + // string label added to the names of backups taken while the cluster is // upgrading. finalizingLabel = "_finalizing" @@ -377,6 +382,10 @@ type ( // in the `contents` field. tables []string contents []tableContents + + // restoreAOST contains the AOST used in restore, if non-empty. It also + // determines the system time used to grab fingerprints. + restoreAOST string } fullBackup struct { @@ -984,6 +993,68 @@ func (bc *backupCollection) encryptionOption() *encryptionPassphrase { return nil } +func (bc *backupCollection) withRevisionHistory() bool { + for _, option := range bc.options { + if _, ok := option.(revisionHistory); ok { + return true + } + } + return false +} + +// maybeUseRestoreAOST potentially picks a restore AOST between the +// full backup end time and the last incremental backup end time. +// +// We don't bother choosing an AOST before the full backup endtime since the +// restore may fail. We lack good observability for choosing a valid AOST within +// the revision history full backup. +func (bc *backupCollection) maybeUseRestoreAOST( + l *logger.Logger, rng *rand.Rand, fullBackupEndTime, lastBackupEndTime string, +) error { + // TODO(msbutler): pick AOST restore for non revision history backups by + // randomly choosing a backup end time. + if !bc.withRevisionHistory() || rng.Float64() > restoreFromAOSTProbability { + return nil + } + + parseAOST := func(aost string) (hlc.Timestamp, error) { + d, _, err := apd.NewFromString(aost) + if err != nil { + return hlc.Timestamp{}, err + } + ts, err := hlc.DecimalToHLC(d) + if err != nil { + return hlc.Timestamp{}, err + } + return ts, nil + } + + min, err := parseAOST(fullBackupEndTime) + if err != nil { + return err + } + max, err := parseAOST(lastBackupEndTime) + if err != nil { + return err + } + + // Choose a random AOST between min and max with the following approach: + // divide the interval between min and max over 100 bins and randomly choose a + // bin. Randomly choosing a bin is more reproducible than randomly picking a + // time between min and max. + interval := max.WallTime - min.WallTime + binCount := int64(100) + bin := rng.Int63n(binCount) + + restoreAOST := hlc.Timestamp{ + WallTime: (bin*interval)/binCount + min.WallTime, + } + + l.Printf("preparing for an AOST restore at %s, between full backup end time %s and last incremental backup end time %s", restoreAOST.GoTime(), min.GoTime(), max.GoTime()) + bc.restoreAOST = restoreAOST.AsOfSystemTime() + return nil +} + // backupCollectionDesc builds a string that describes how a backup // collection comprised of a full backup and a follow-up incremental // backup was generated (in terms of which versions planned vs @@ -1846,12 +1917,13 @@ func (d *BackupRestoreTestDriver) createBackupCollection( internalSystemJobs bool, ) (*backupCollection, error) { var collection backupCollection - var timestamp string + var latestIncBackupEndTime string + var fullBackupEndTime string // Create full backup. if err := d.testUtils.runJobOnOneOf(ctx, l, fullBackupSpec.Execute.Nodes, func() error { var err error - collection, _, err = d.runBackup( + collection, fullBackupEndTime, err = d.runBackup( ctx, l, rng, fullBackupSpec.Plan.Nodes, fullBackupSpec.PauseProbability, fullBackup{backupNamePrefix}, internalSystemJobs, ) return err @@ -1866,7 +1938,7 @@ func (d *BackupRestoreTestDriver) createBackupCollection( d.randomWait(l, rng) if err := d.testUtils.runJobOnOneOf(ctx, l, incBackupSpec.Execute.Nodes, func() error { var err error - collection, timestamp, err = d.runBackup( + collection, latestIncBackupEndTime, err = d.runBackup( ctx, l, rng, incBackupSpec.Plan.Nodes, incBackupSpec.PauseProbability, incrementalBackup{collection: collection, incNum: i + 1}, internalSystemJobs, ) return err @@ -1875,7 +1947,15 @@ func (d *BackupRestoreTestDriver) createBackupCollection( } } - return d.saveContents(ctx, l, rng, &collection, timestamp) + if err := collection.maybeUseRestoreAOST(l, rng, fullBackupEndTime, latestIncBackupEndTime); err != nil { + return nil, err + } + + fingerprintAOST := latestIncBackupEndTime + if collection.restoreAOST != "" { + fingerprintAOST = collection.restoreAOST + } + return d.saveContents(ctx, l, rng, &collection, fingerprintAOST) } // sentinelFilePath returns the path to the file that prevents job @@ -2173,9 +2253,10 @@ func (bc *backupCollection) verifyBackupCollection( optionsStr = fmt.Sprintf(" WITH %s", strings.Join(restoreOptions, ", ")) } restoreStmt := fmt.Sprintf( - "%s FROM LATEST IN '%s'%s", - restoreCmd, bc.uri(), optionsStr, + "%s FROM LATEST IN '%s'%s %s", + restoreCmd, bc.uri(), aostFor(bc.restoreAOST), optionsStr, ) + l.Printf("Running restore: %s", restoreStmt) var jobID int if err := d.testUtils.QueryRow(ctx, rng, restoreStmt).Scan(&jobID); err != nil { return fmt.Errorf("backup %s: error in restore statement: %w", bc.name, err)