Skip to content

Commit

Permalink
storage: support WAL failover to an explicit path
Browse files Browse the repository at this point in the history
This commit expands on cockroachdb#120509, introducing a WAL failover mode that allows an
operator of a node with a single store to configure WAL failover to failover to
a particular path (rather than another store's directory).

This is configured via the --wal-failover flag:

    --wal-failover=path=/mnt/data2

When disabling or changing the path, the operator is required to pass the
previous path. Eg,

    --wal-failover=path=/mnt/data3,prev_path=/mnt/data2

or

    --wal-failover=disabled,prev_path=/mnt/data2

Informs cockroachdb#119418.
Informs cockroachdb/pebble#3230
Epic: CRDB-35401
Release note (ops change): Adds an additional option to the new (in 24.1)
--wal-failover CLI flag allowing an operator to specify an explicit path for
WAL failover for single-store nodes.
  • Loading branch information
jbowens committed Mar 22, 2024
1 parent 3a5f364 commit fa7cf49
Show file tree
Hide file tree
Showing 7 changed files with 289 additions and 48 deletions.
1 change: 1 addition & 0 deletions pkg/base/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ go_test(
"//pkg/util/leaktest",
"//pkg/util/log",
"//pkg/util/uuid",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
"@com_github_davecgh_go_spew//spew",
"@com_github_stretchr_testify//require",
Expand Down
111 changes: 99 additions & 12 deletions pkg/base/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"net"
"net/url"
"os"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/roachpb"
Expand Down Expand Up @@ -838,8 +839,7 @@ type TempStorageConfig struct {
Settings *cluster.Settings
}

// WALFailoverMode configures a node's stores behavior under high write latency
// to their write-ahead logs.
// WALFailoverMode specifies the mode of WAL failover.
type WALFailoverMode int8

const (
Expand All @@ -858,11 +858,11 @@ const (
// volume, the batch commit latency is insulated from the effects of momentary
// disk stalls.
WALFailoverAmongStores
// WALFailoverExplicitPath enables WAL failover for a single-store node to an
// explicitly specified path.
WALFailoverExplicitPath
)

// Type implements the pflag.Value interface.
func (m *WALFailoverMode) Type() string { return "string" }

// String implements fmt.Stringer.
func (m *WALFailoverMode) String() string {
return redact.StringWithoutMarkers(m)
Expand All @@ -877,25 +877,112 @@ func (m *WALFailoverMode) SafeFormat(p redact.SafePrinter, _ rune) {
p.SafeString("disabled")
case WALFailoverAmongStores:
p.SafeString("among-stores")
case WALFailoverExplicitPath:
p.SafeString("path")
default:
p.Printf("<unknown WALFailoverMode %d>", int8(*m))
}
}

// WALFailoverConfig configures a node's stores behavior under high write
// latency to their write-ahead logs.
type WALFailoverConfig struct {
Mode WALFailoverMode
// Path is the non-store path to which WALs should be written when failing
// over. It must be nonempty if and only if Mode == WALFailoverExplicitPath.
Path string
// PrevPath is the previously used non-store path. It may be set with Mode ==
// WALFailoverExplicitPath (when changing the secondary path) or
// WALFailoverDisabled (when disabling WAL failover after it was previously
// enabled with WALFailoverExplicitPath). It must be empty for other modes.
// If Mode is WALFailoverDisabled and previously WAL failover was enabled
// using WALFailoverAmongStores, then PrevPath must not be set.
PrevPath string
}

// Type implements the pflag.Value interface.
func (c *WALFailoverConfig) Type() string { return "string" }

// String implements fmt.Stringer.
func (c *WALFailoverConfig) String() string {
return redact.StringWithoutMarkers(c)
}

// SafeFormat implements the refact.SafeFormatter interface.
func (c *WALFailoverConfig) SafeFormat(p redact.SafePrinter, _ rune) {
switch c.Mode {
case WALFailoverDefault:
// Empty
case WALFailoverDisabled:
p.SafeString("disabled")
if c.PrevPath != "" {
p.SafeString(",prev_path=")
p.SafeString(redact.SafeString(c.PrevPath))
}
case WALFailoverAmongStores:
p.SafeString("among-stores")
case WALFailoverExplicitPath:
p.SafeString("path=")
p.SafeString(redact.SafeString(c.Path))
if c.PrevPath != "" {
p.SafeString(",prev_path=")
p.SafeString(redact.SafeString(c.PrevPath))
}
default:
p.Printf("<unknown WALFailoverMode %d>", int8(c.Mode))
}
}

// Set implements the pflag.Value interface.
func (m *WALFailoverMode) Set(s string) error {
switch s {
case "disabled":
*m = WALFailoverDisabled
case "among-stores":
*m = WALFailoverAmongStores
func (c *WALFailoverConfig) Set(s string) error {
switch {
case strings.HasPrefix(s, "disabled"):
c.Mode = WALFailoverDisabled
var ok bool
c.Path, c.PrevPath, ok = parseWALFailoverPathFields(strings.TrimPrefix(s, "disabled"))
if !ok || c.Path != "" {
return errors.Newf("invalid disabled --wal-failover setting: %s "+
"expect disabled[,prev_path=<prev_path>]", s)
}
case s == "among-stores":
c.Mode = WALFailoverAmongStores
case strings.HasPrefix(s, "path="):
c.Mode = WALFailoverExplicitPath
var ok bool
c.Path, c.PrevPath, ok = parseWALFailoverPathFields(s)
if !ok || c.Path == "" {
return errors.Newf("invalid path --wal-failover setting: %s "+
"expect path=<path>[,prev_path=<prev_path>]", s)
}
default:
return errors.Newf("invalid --wal-failover setting: %s "+
"(possible values: disabled, among-stores)", s)
"(possible values: disabled, among-stores, path=<path>)", s)
}
return nil
}

func parseWALFailoverPathFields(s string) (path, prevPath string, ok bool) {
if s == "" {
return "", "", true
}
if s2 := strings.TrimPrefix(s, "path="); len(s2) < len(s) {
s = s2
if i := strings.IndexByte(s, ','); i == -1 {
return s, "", true
} else {
path = s[:i]
s = s[i:]
}
}

// Any remainder must be a prev_path= field.
if !strings.HasPrefix(s, ",prev_path=") {
return "", "", false
}
prevPath = strings.TrimPrefix(s, ",prev_path=")
return path, prevPath, true
}

// ExternalIODirConfig describes various configuration options pertaining
// to external storage implementations.
// TODO(adityamaru): Rename ExternalIODirConfig to ExternalIOConfig because it
Expand Down
20 changes: 20 additions & 0 deletions pkg/base/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@
package base_test

import (
"bytes"
"fmt"
"math"
"strings"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
"github.com/cockroachdb/cockroach/pkg/testutils/echotest"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/datadriven"
"github.com/davecgh/go-spew/spew"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -154,3 +157,20 @@ func TestRaftMaxInflightBytes(t *testing.T) {
})
}
}

func TestWALFailoverConfigRoundtrip(t *testing.T) {
defer leaktest.AfterTest(t)()

datadriven.RunTest(t, datapathutils.TestDataPath(t, "wal-failover-config"), func(t *testing.T, d *datadriven.TestData) string {
var buf bytes.Buffer
for _, l := range strings.Split(d.Input, "\n") {
var cfg base.WALFailoverConfig
if err := cfg.Set(l); err != nil {
fmt.Fprintf(&buf, "err: %s\n", err)
continue
}
fmt.Fprintln(&buf, cfg.String())
}
return buf.String()
})
}
29 changes: 29 additions & 0 deletions pkg/base/testdata/wal-failover-config
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
parse
among-stores
----
among-stores

parse
disabled
disabled,prev_path=foo
----
disabled
disabled,prev_path=foo

parse
path=/foo
path=/foo,prev_path=/bar
----
path=/foo
path=/foo,prev_path=/bar

parse
disabled,path=foo
among-stores,path=foo
among-stores,prev_path=foo
garbage
----
err: invalid disabled --wal-failover setting: disabled,path=foo expect disabled[,prev_path=<prev_path>]
err: invalid --wal-failover setting: among-stores,path=foo (possible values: disabled, among-stores, path=<path>)
err: invalid --wal-failover setting: among-stores,prev_path=foo (possible values: disabled, among-stores, path=<path>)
err: invalid --wal-failover setting: garbage (possible values: disabled, among-stores, path=<path>)
27 changes: 23 additions & 4 deletions pkg/cli/cliflags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -1005,15 +1005,34 @@ which use 'cockroach-data-tenant-X' for tenant 'X')
Name: "wal-failover",
EnvVar: "COCKROACH_WAL_FAILOVER",
Description: `
Configures the use and behavior of WAL failover. Defaults to "disabled".
The value "among-stores" enables automatic failover to another store's
data directory if a WAL write does not complete within the configured
threshold. For example:
Configures the use and behavior of WAL failover. WAL failover enables
automatic failover to another directory if a WAL write does not complete
within the configured threshold. Defaults to "disabled". Possible values
depend on the number of stores a node is configured to use.
If a node has multiple stores, the value "among-stores" enables automatic
failover to another store's data directory. CockroachDB will automatically
assign each store a secondary to serve as its WAL failover destination.
For example:
<PRE>
--wal-failover=among-stores
</PRE>
If a node has a single store, the value "path=<path>" enables automatic
failover to the provided path. After this setting is used, changing the
configuration to a new path or disabling requires providing the previous
path as ",prev_path=<path>". For example:
<PRE>
--wal-failover=path=/mnt/data2
--wal-failover=path=/mnt/data3,prev_path=/mnt/data2
--wal-failover=disabled,prev_path=/mnt/data3
</PRE>
See the storage.wal_failover.unhealthy_op_threshold cluster setting.
`,
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/server/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ type BaseConfig struct {

// WALFailover enables and configures automatic WAL failover when latency to
// a store's primary WAL increases.
WALFailover base.WALFailoverMode
WALFailover base.WALFailoverConfig

// SharedStorage is specified to enable disaggregated shared storage.
SharedStorage string
Expand Down Expand Up @@ -316,7 +316,7 @@ func (cfg *BaseConfig) SetDefaults(
cfg.DisableMaxOffsetCheck = false
cfg.DefaultZoneConfig = zonepb.DefaultZoneConfig()
cfg.StorageEngine = storage.DefaultStorageEngine
cfg.WALFailover = base.WALFailoverDefault
cfg.WALFailover = base.WALFailoverConfig{Mode: base.WALFailoverDefault}
cfg.TestingInsecureWebAccess = disableWebLogin
cfg.Stores = base.StoreSpecList{
Specs: []base.StoreSpec{storeSpec},
Expand Down
Loading

0 comments on commit fa7cf49

Please sign in to comment.