From 0ff47567819ab199633b57891ef7f88022c1e6c4 Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Wed, 7 Aug 2019 17:20:48 +0200 Subject: [PATCH] cli: use zone configs to disable replication As established in #39382 it is not safe to modify the global default zone config objects to non-standard values. This commit changes `start-single-node` and `demo` to use zone configs via SQL instead. Also as requested by @piyush-singh it now informs the user the replication was disabled, for example: ``` * * INFO: Replication was disabled for this cluster. * When/if adding nodes in the future, update zone configurations to increase the replication factor. * ``` Release note: None --- pkg/cli/demo.go | 28 ++++------ pkg/cli/disable_replication.go | 52 +++++++++++++++++++ pkg/cli/flags.go | 8 --- .../test_disable_replication.tcl | 51 ++++++++++++++++++ pkg/cli/interactive_tests/test_flags.tcl | 9 ---- pkg/cli/start.go | 27 +++++++--- pkg/server/server.go | 16 ++++++ 7 files changed, 149 insertions(+), 42 deletions(-) create mode 100644 pkg/cli/disable_replication.go create mode 100644 pkg/cli/interactive_tests/test_disable_replication.tcl diff --git a/pkg/cli/demo.go b/pkg/cli/demo.go index 6af08b0f4394..a189484d36cf 100644 --- a/pkg/cli/demo.go +++ b/pkg/cli/demo.go @@ -18,7 +18,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/cli/cliflags" - "github.com/cockroachdb/cockroach/pkg/config" "github.com/cockroachdb/cockroach/pkg/security" "github.com/cockroachdb/cockroach/pkg/server" "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" @@ -27,7 +26,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/log/logflags" "github.com/cockroachdb/cockroach/pkg/workload" "github.com/cockroachdb/cockroach/pkg/workload/workloadsql" - "github.com/gogo/protobuf/proto" "github.com/spf13/cobra" "github.com/spf13/pflag" ) @@ -107,27 +105,11 @@ func setupTransientServers( } cleanup = func() { stopper.Stop(ctx) } - // Set up the default zone configuration. We are using an in-memory store - // so we really want to disable replication. - cfg := config.DefaultZoneConfig() - sysCfg := config.DefaultSystemZoneConfig() - - if demoCtx.nodes < 3 { - cfg.NumReplicas = proto.Int32(1) - sysCfg.NumReplicas = proto.Int32(1) - } - // Create the first transient server. The others will join this one. args := base.TestServerArgs{ PartOfCluster: true, Insecure: true, - Knobs: base.TestingKnobs{ - Server: &server.TestingKnobs{ - DefaultZoneConfigOverride: &cfg, - DefaultSystemZoneConfigOverride: &sysCfg, - }, - }, - Stopper: stopper, + Stopper: stopper, } serverFactory := server.TestServerFactory s := serverFactory.New(args).(*server.TestServer) @@ -142,6 +124,14 @@ func setupTransientServers( } } + if demoCtx.nodes < 3 { + // Set up the default zone configuration. We are using an in-memory store + // so we really want to disable replication. + if err := cliDisableReplication(ctx, s.Server); err != nil { + return ``, ``, cleanup, err + } + } + // Prepare the URL for use by the SQL shell. options := url.Values{} options.Add("sslmode", "disable") diff --git a/pkg/cli/disable_replication.go b/pkg/cli/disable_replication.go new file mode 100644 index 000000000000..a14e6bb798ad --- /dev/null +++ b/pkg/cli/disable_replication.go @@ -0,0 +1,52 @@ +// Copyright 2019 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package cli + +import ( + "context" + "fmt" + + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/sql" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/util/log" +) + +// cliDisableReplication changes the replication factor on +// all defined zones to become 1. This is used by start-single-node +// and demo to define single-node clusters, so as to avoid +// churn in the log files. +// +// The change is effected using the internal SQL interface of the +// given server object. +func cliDisableReplication(ctx context.Context, s *server.Server) error { + return s.RunLocalSQL(ctx, + func(ctx context.Context, ie *sql.InternalExecutor) error { + rows, err := ie.Query(ctx, "get-zones", nil, + "SELECT target FROM crdb_internal.zones") + if err != nil { + return err + } + + for _, row := range rows { + zone := string(*row[0].(*tree.DString)) + if _, err := ie.Exec(ctx, "set-zone", nil, + fmt.Sprintf("ALTER %s CONFIGURE ZONE USING num_replicas = 1", zone)); err != nil { + return err + } + } + log.Shout(ctx, log.Severity_INFO, + "Replication was disabled for this cluster.\n"+ + "When/if adding nodes in the future, update zone configurations to increase the replication factor.") + + return nil + }) +} diff --git a/pkg/cli/flags.go b/pkg/cli/flags.go index 04d32074f62b..cde510cb775e 100644 --- a/pkg/cli/flags.go +++ b/pkg/cli/flags.go @@ -26,7 +26,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/log/logflags" "github.com/cockroachdb/cockroach/pkg/util/netutil" "github.com/cockroachdb/errors" - "github.com/gogo/protobuf/proto" "github.com/spf13/cobra" "github.com/spf13/pflag" ) @@ -250,13 +249,6 @@ func init() { }) } - // start-single-node starts with default replication of 1. - AddPersistentPreRunE(startSingleNodeCmd, func(cmd *cobra.Command, _ []string) error { - serverCfg.DefaultSystemZoneConfig.NumReplicas = proto.Int32(1) - serverCfg.DefaultZoneConfig.NumReplicas = proto.Int32(1) - return nil - }) - // Map any flags registered in the standard "flag" package into the // top-level cockroach command. pf := cockroachCmd.PersistentFlags() diff --git a/pkg/cli/interactive_tests/test_disable_replication.tcl b/pkg/cli/interactive_tests/test_disable_replication.tcl new file mode 100644 index 000000000000..f2c73f3f4e8e --- /dev/null +++ b/pkg/cli/interactive_tests/test_disable_replication.tcl @@ -0,0 +1,51 @@ +#! /usr/bin/env expect -f + +source [file join [file dirname $argv0] common.tcl] + +spawn /bin/bash +send "PS1=':''/# '\r" +eexpect ":/# " + +start_test "Check that demo disables replication properly" +send "$argv demo -e 'show zone configuration for range default'\r" +eexpect "num_replicas = 1" +eexpect ":/# " +end_test + +start_test "Check that start-single-node disables replication properly" +system "rm -rf logs/db" +start_server $argv +send "$argv sql -e 'show zone configuration for range default'\r" +eexpect "num_replicas = 1" +eexpect ":/# " +end_test + +start_test "Check that it remains possible to reset the replication factor" +send "$argv sql -e 'alter range default configure zone using num_replicas = 3'\r" +eexpect "CONFIGURE ZONE" +eexpect ":/# " +stop_server $argv +start_server $argv +send "$argv sql -e 'show zone configuration for range default'\r" +eexpect "num_replicas = 3" +eexpect ":/# " +end_test + +stop_server $argv + +start_test "Check that start-single-node on a regular cluster does not reset the replication factor" +# make a fresh server but using the regular 'start' +system "rm -rf logs/db" +system "$argv start --insecure --pid-file=server_pid --background -s=path=logs/db >>logs/expect-cmd.log 2>&1; + $argv sql -e 'select 1'" +# restart with start-single-node +stop_server $argv +start_server $argv +# check that the replication factor was unchanged +send "$argv sql -e 'show zone configuration for range default'\r" +eexpect "num_replicas = 3" +eexpect ":/# " +end_test + +send "exit 0\r" +eexpect eof diff --git a/pkg/cli/interactive_tests/test_flags.tcl b/pkg/cli/interactive_tests/test_flags.tcl index 6b5b2ea04bc1..188c1602138a 100644 --- a/pkg/cli/interactive_tests/test_flags.tcl +++ b/pkg/cli/interactive_tests/test_flags.tcl @@ -80,14 +80,5 @@ interrupt eexpect ":/# " end_test -start_test "Check that start-single-node disables replication properly" -system "rm -rf logs/db" -start_server $argv -send "$argv sql -e 'show zone configuration for range default'\r" -eexpect "num_replicas = 1" -eexpect ":/# " -stop_server $argv -end_test - send "exit 0\r" eexpect eof diff --git a/pkg/cli/start.go b/pkg/cli/start.go index 667a84bf8101..14d75073f07b 100644 --- a/pkg/cli/start.go +++ b/pkg/cli/start.go @@ -416,18 +416,22 @@ func runStartSingleNode(cmd *cobra.Command, args []string) error { // Now actually set the flag as changed so that the start code // doesn't warn that it was not set. joinFlag.Changed = true - return runStart(cmd, args) + return runStart(cmd, args, true /*disableReplication*/) } func runStartJoin(cmd *cobra.Command, args []string) error { - return runStart(cmd, args) + return runStart(cmd, args, false /*disableReplication*/) } // runStart starts the cockroach node using --store as the list of // storage devices ("stores") on this machine and --join as the list // of other active nodes used to join this node to the cockroach // cluster, if this is its first time connecting. -func runStart(cmd *cobra.Command, args []string) error { +// +// If the argument disableReplication is true and we are starting +// a fresh cluster, the replication factor will be disabled in +// all zone configs. +func runStart(cmd *cobra.Command, args []string, disableReplication bool) error { tBegin := timeutil.Now() // First things first: if the user wants background processing, @@ -710,6 +714,18 @@ If problems persist, please see ` + base.DocsURL("cluster-setup-troubleshooting. s.PeriodicallyCheckForUpdates(ctx) } + initialBoot := s.InitialBoot() + + if disableReplication && initialBoot { + // For start-single-node, set the default replication factor to + // 1 so as to avoid warning message and unnecessary rebalance + // churn. + if err := cliDisableReplication(ctx, s); err != nil { + log.Errorf(ctx, "could not disable replication: %v", err) + return err + } + } + // Now inform the user that the server is running and tell the // user about its run-time derived parameters. var buf bytes.Buffer @@ -725,9 +741,9 @@ If problems persist, please see ` + base.DocsURL("cluster-setup-troubleshooting. pgURL, err := serverCfg.PGURL(url.User(security.RootUser)) if err != nil { log.Errorf(ctx, "failed computing the URL: %v", err) - } else { - fmt.Fprintf(tw, "sql:\t%s\n", pgURL) + return err } + fmt.Fprintf(tw, "sql:\t%s\n", pgURL) fmt.Fprintf(tw, "RPC client flags:\t%s\n", clientFlagsRPC()) if len(serverCfg.SocketFile) != 0 { @@ -754,7 +770,6 @@ If problems persist, please see ` + base.DocsURL("cluster-setup-troubleshooting. for i, spec := range serverCfg.Stores.Specs { fmt.Fprintf(tw, "store[%d]:\t%s\n", i, spec) } - initialBoot := s.InitialBoot() nodeID := s.NodeID() if initialBoot { if nodeID == server.FirstNodeID { diff --git a/pkg/server/server.go b/pkg/server/server.go index eef1bd8121a1..6432d4fc5df0 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -2225,3 +2225,19 @@ func listen( } return ln, nil } + +// RunLocalSQL calls fn on a SQL internal executor on this server. +// This is meant for use for SQL initialization during bootstrapping. +// +// The internal SQL interface should be used instead of a regular SQL +// network connection for SQL initializations when setting up a new +// sserver, because it's possible for the server to listen on a +// network interface that is not reachable from loopback. It is also +// possible for the TLS certificates to be invalid when used locally +// (e.g. if the hostname in the cert is an advertised address that's +// only reachable externally). +func (s *Server) RunLocalSQL( + ctx context.Context, fn func(ctx context.Context, sqlExec *sql.InternalExecutor) error, +) error { + return fn(ctx, s.internalExecutor) +}