diff --git a/Makefile b/Makefile index bf2dc0ae8d96..5a53943d65b5 100644 --- a/Makefile +++ b/Makefile @@ -1390,11 +1390,13 @@ pkg/ui/workspaces/db-console/dist/%.ccl.dll.js pkg/ui/workspaces/db-console/%.cc .PHONY: ui-test ui-test: $(UI_CCL_DLLS) $(UI_CCL_MANIFESTS) + $(info $(yellow)NOTE: consider using `./dev ui test` instead of `make ui-test`$(term-reset)) $(NODE_RUN) -C pkg/ui/workspaces/db-console $(KARMA) start $(NODE_RUN) -C pkg/ui/workspaces/cluster-ui yarn ci .PHONY: ui-test-watch ui-test-watch: $(UI_CCL_DLLS) $(UI_CCL_MANIFESTS) + $(info $(yellow)NOTE: consider using `./dev ui test --watch` instead of `make ui-test-watch`$(term-reset)) $(NODE_RUN) -C pkg/ui/workspaces/db-console $(KARMA) start --no-single-run --auto-watch & \ $(NODE_RUN) -C pkg/ui/workspaces/cluster-ui yarn test @@ -1430,11 +1432,13 @@ ui-watch ui-watch-secure: $(UI_CCL_DLLS) pkg/ui/yarn.opt.installed # # `node-run.sh` wrapper is removed because this command is supposed to be run in dev environment (not in docker of CI) # so it is safe to run yarn commands directly to preserve formatting and colors for outputs + $(info $(yellow)NOTE: consider using `./dev ui watch [--secure]` instead of `make ui-watch[-secure]`$(term-reset)) yarn --cwd pkg/ui/workspaces/cluster-ui build:watch & \ yarn --cwd pkg/ui/workspaces/db-console webpack-dev-server --config webpack.app.js --env.dist=ccl --env.WEBPACK_SERVE --port $(PORT) --mode "development" $(WEBPACK_DEV_SERVER_FLAGS) .PHONY: ui-clean ui-clean: ## Remove build artifacts. + $(info $(yellow)NOTE: consider using `./dev ui clean` instead of `make ui-clean`$(term-reset)) find pkg/ui/distccl/assets pkg/ui/distoss/assets -mindepth 1 -not -name .gitkeep -delete rm -rf pkg/ui/assets.ccl.installed pkg/ui/assets.oss.installed rm -rf pkg/ui/dist_vendor/* @@ -1445,6 +1449,7 @@ ui-clean: ## Remove build artifacts. .PHONY: ui-maintainer-clean ui-maintainer-clean: ## Like clean, but also remove installed dependencies ui-maintainer-clean: ui-clean + $(info $(yellow)NOTE: consider using `./dev ui clean --all` instead of `make ui-maintainer-clean`$(term-reset)) rm -rf pkg/ui/node_modules pkg/ui/workspaces/db-console/node_modules pkg/ui/yarn.installed pkg/ui/workspaces/cluster-ui/node_modules pkg/roachprod/vm/aws/embedded.go: bin/.bootstrap pkg/roachprod/vm/aws/config.json pkg/roachprod/vm/aws/old.json bin/terraformgen diff --git a/build/teamcity-check-genfiles.sh b/build/teamcity-check-genfiles.sh index 34e7a63776b0..d0e53bd1a0d6 100755 --- a/build/teamcity-check-genfiles.sh +++ b/build/teamcity-check-genfiles.sh @@ -9,6 +9,8 @@ tc_prepare tc_start_block "Ensure generated code is up-to-date" +begin_check_generated_code_tests + # NOTE(ricky): Please make sure any changes to the Bazel-related checks here are # propagated to build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh # as well. @@ -29,24 +31,26 @@ if grep TODO DEPS.bzl; then echo "Missing TODO comment in DEPS.bzl. Did you run \`./dev generate bazel --mirror\`?" exit 1 fi -check_workspace_clean "Run \`./dev generate bazel\` to automatically regenerate these." +check_workspace_clean 'dev_generate_bazel' "Run \`./dev generate bazel\` to automatically regenerate these." run build/builder.sh make generate &> artifacts/generate.log || (cat artifacts/generate.log && false) rm artifacts/generate.log -check_workspace_clean "Run \`make generate\` to automatically regenerate these." +check_workspace_clean 'make_generate' "Run \`make generate\` to automatically regenerate these." 
run build/builder.sh make buildshort &> artifacts/buildshort.log || (cat artifacts/buildshort.log && false) rm artifacts/buildshort.log -check_workspace_clean "Run \`make buildshort\` to automatically regenerate these." +check_workspace_clean 'make_buildshort' "Run \`make buildshort\` to automatically regenerate these." tc_end_block "Ensure generated code is up-to-date" # generated code can generate new dependencies; check dependencies after generated code. tc_start_block "Ensure dependencies are up-to-date" # Run go mod tidy and `make -k vendor_rebuild` and ensure nothing changes. run build/builder.sh go mod tidy -check_workspace_clean "Run \`go mod tidy\` and \`make -k vendor_rebuild\` to automatically regenerate these." +check_workspace_clean 'go_mod_tidy' "Run \`go mod tidy\` and \`make -k vendor_rebuild\` to automatically regenerate these." run build/builder.sh make -k vendor_rebuild cd vendor -check_workspace_clean "Run \`make -k vendor_rebuild\` to automatically regenerate these." +check_workspace_clean 'vendor_rebuild' "Run \`make -k vendor_rebuild\` to automatically regenerate these." cd .. + +end_check_generated_code_tests tc_end_block "Ensure dependencies are up-to-date" tc_start_block "Test web UI" diff --git a/build/teamcity-support.sh b/build/teamcity-support.sh index 70b0afa0317a..1638c7ac46eb 100755 --- a/build/teamcity-support.sh +++ b/build/teamcity-support.sh @@ -307,9 +307,18 @@ generate_ssh_key() { fi } -# Call this function with one argument, the error message to print if the -# workspace is dirty. +begin_check_generated_code_tests() { + echo "##teamcity[testSuiteStarted name='CheckGeneratedCode']" +} + +end_check_generated_code_tests() { + echo "##teamcity[testSuiteFinished name='CheckGeneratedCode']" +} + +# Call this function with two arguments: the name of the "test" that will be +# reported to teamcity and the error message to print if the workspace is dirty. check_workspace_clean() { + echo "##teamcity[testStarted name='CheckGeneratedCode/$1' captureStandardOutput='true']" # The workspace is clean iff `git status --porcelain` produces no output. Any # output is either an error message or a listing of an untracked/dirty file. if [[ "$(git status --porcelain 2>&1)" != "" ]]; then @@ -317,7 +326,10 @@ check_workspace_clean() { git diff -a >&2 || true echo "====================================================" >&2 echo "Some automatically generated code is not up to date." >&2 - echo $1 >&2 + echo $2 >&2 + echo "##teamcity[testFailed name='CheckGeneratedCode/$1']" + echo "##teamcity[testFinished name='CheckGeneratedCode/$1']" exit 1 fi + echo "##teamcity[testFinished name='CheckGeneratedCode/$1']" } diff --git a/build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh b/build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh index ed7e27822aa1..9aa5593d64ee 100755 --- a/build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh +++ b/build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh @@ -8,6 +8,8 @@ source "$dir/teamcity-support.sh" # For $root, check_workspace_clean mkdir -p artifacts +begin_check_generated_code_tests + # Buffer noisy output and only print it on failure. if ! (./build/bazelutil/check.sh &> artifacts/buildshort.log || (cat artifacts/buildshort.log && false)); then # The command will output instructions on how to fix the error. @@ -24,11 +26,13 @@ if grep TODO DEPS.bzl; then echo "Missing TODO comment in DEPS.bzl. Did you run \`./dev generate bazel --mirror\`?" 
exit 1 fi -check_workspace_clean "Run \`./dev generate bazel\` to automatically regenerate these." +check_workspace_clean 'dev_generate_bazel' "Run \`./dev generate bazel\` to automatically regenerate these." # Run go mod tidy and ensure nothing changes. # NB: If files are missing from any packages then `go mod tidy` will # fail. So we need to make sure that `.pb.go` sources are populated. bazel run //pkg/gen:go_proto bazel run @go_sdk//:bin/go --ui_event_filters=-DEBUG,-info,-stdout,-stderr --noshow_progress mod tidy -check_workspace_clean "Run \`go mod tidy\` to automatically regenerate these." +check_workspace_clean 'go_mod_tidy' "Run \`go mod tidy\` to automatically regenerate these." + +end_check_generated_code_tests diff --git a/build/teamcity/cockroach/nightlies/pebble_nightly_metamorphic_impl.sh b/build/teamcity/cockroach/nightlies/pebble_nightly_metamorphic_impl.sh index a3f668cc20b6..c2e472031075 100755 --- a/build/teamcity/cockroach/nightlies/pebble_nightly_metamorphic_impl.sh +++ b/build/teamcity/cockroach/nightlies/pebble_nightly_metamorphic_impl.sh @@ -26,6 +26,7 @@ $BAZEL_BIN/pkg/cmd/bazci/bazci_/bazci --config=ci test \ --run_under "@com_github_cockroachdb_stress//:stress -bazel -shardable-artifacts 'GO_TEST_JSON_OUTPUT_FILE=cat,XML_OUTPUT_FILE=$BAZEL_BIN/pkg/cmd/bazci/bazci_/bazci merge-test-xmls' -maxtime 3h -maxfails 1 -stderr -p 1" \ --test_arg -dir --test_arg $ARTIFACTS_DIR \ --test_arg -ops --test_arg "uniform:5000-10000" \ + --test_output streamed \ || exit_status=$? BAZEL_SUPPORT_EXTRA_GITHUB_POST_ARGS=--formatter=pebble-metamorphic process_test_json \ diff --git a/build/teamcity/cockroach/nightlies/stress_impl.sh b/build/teamcity/cockroach/nightlies/stress_impl.sh index 2032ab96b6b0..4b14aa2493fe 100755 --- a/build/teamcity/cockroach/nightlies/stress_impl.sh +++ b/build/teamcity/cockroach/nightlies/stress_impl.sh @@ -41,6 +41,7 @@ do --run_under "@com_github_cockroachdb_stress//:stress -bazel -shardable-artifacts 'GO_TEST_JSON_OUTPUT_FILE=cat,XML_OUTPUT_FILE=$BAZEL_BIN/pkg/cmd/bazci/bazci_/bazci merge-test-xmls' $STRESSFLAGS" \ --define "gotags=$TAGS" \ --nocache_test_results \ + --test_output streamed \ ${EXTRA_BAZEL_FLAGS} \ || exit_status=$? process_test_json \ diff --git a/dev b/dev index 16bf2332c629..2e6aa325b802 100755 --- a/dev +++ b/dev @@ -3,7 +3,7 @@ set -euo pipefail # Bump this counter to force rebuilding `dev` on all machines. -DEV_VERSION=25 +DEV_VERSION=26 THIS_DIR=$(cd "$(dirname "$0")" && pwd) BINARY_DIR=$THIS_DIR/bin/dev-versions diff --git a/docs/generated/http/full.md b/docs/generated/http/full.md index cb47c6a8cdbd..12305be2aa1a 100644 --- a/docs/generated/http/full.md +++ b/docs/generated/http/full.md @@ -6194,6 +6194,7 @@ If no NodeIDs are given, it targets the recipient node. | ----- | ---- | ----- | ----------- | -------------- | | node_ids | [int32](#cockroach.server.serverpb.DecommissionRequest-int32) | repeated | | [reserved](#support-status) | | target_membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionRequest-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | | [reserved](#support-status) | +| num_replica_report | [int32](#cockroach.server.serverpb.DecommissionRequest-int32) | | The number of decommissioning replicas to be reported. 
| [reserved](#support-status) | @@ -6230,6 +6231,21 @@ DecommissionStatusResponse lists decommissioning statuses for a number of NodeID | replica_count | [int64](#cockroach.server.serverpb.DecommissionStatusResponse-int64) | | The number of replicas on the node, computed by scanning meta2 ranges. | [reserved](#support-status) | | membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | The membership status of the given node. | [reserved](#support-status) | | draining | [bool](#cockroach.server.serverpb.DecommissionStatusResponse-bool) | | | [reserved](#support-status) | +| reported_replicas | [DecommissionStatusResponse.Replica](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.server.serverpb.DecommissionStatusResponse.Replica) | repeated | Decommissioning replicas on the given node to be reported. How many replicas are reported is determined by what was specified in the request. | [reserved](#support-status) | + + + + + + +#### DecommissionStatusResponse.Replica + + + +| Field | Type | Label | Description | Support status | +| ----- | ---- | ----- | ----------- | -------------- | +| replica_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) | +| range_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) | @@ -6258,6 +6274,7 @@ specified or, if none are specified, all nodes. | Field | Type | Label | Description | Support status | | ----- | ---- | ----- | ----------- | -------------- | | node_ids | [int32](#cockroach.server.serverpb.DecommissionStatusRequest-int32) | repeated | | [reserved](#support-status) | +| num_replica_report | [int32](#cockroach.server.serverpb.DecommissionStatusRequest-int32) | | The number of decommissioning replicas to be reported. | [reserved](#support-status) | @@ -6294,6 +6311,21 @@ DecommissionStatusResponse lists decommissioning statuses for a number of NodeID | replica_count | [int64](#cockroach.server.serverpb.DecommissionStatusResponse-int64) | | The number of replicas on the node, computed by scanning meta2 ranges. | [reserved](#support-status) | | membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | The membership status of the given node. | [reserved](#support-status) | | draining | [bool](#cockroach.server.serverpb.DecommissionStatusResponse-bool) | | | [reserved](#support-status) | +| reported_replicas | [DecommissionStatusResponse.Replica](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.server.serverpb.DecommissionStatusResponse.Replica) | repeated | Decommissioning replicas on the given node to be reported. How many replicas are reported is determined by what was specified in the request. 
| [reserved](#support-status) | + + + + + + +#### DecommissionStatusResponse.Replica + + + +| Field | Type | Label | Description | Support status | +| ----- | ---- | ----- | ----------- | -------------- | +| replica_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) | +| range_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) | diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go index 162eafd12d1e..2be5a1e6bb95 100644 --- a/pkg/acceptance/cluster/dockercluster.go +++ b/pkg/acceptance/cluster/dockercluster.go @@ -235,7 +235,7 @@ func (l *DockerCluster) OneShot( if err := l.oneshot.Start(ctx); err != nil { return err } - return l.oneshot.Wait(ctx, container.WaitConditionNextExit) + return l.oneshot.Wait(ctx, container.WaitConditionNotRunning) } // stopOnPanic is invoked as a deferred function in Start in order to attempt @@ -372,7 +372,7 @@ func (l *DockerCluster) initCluster(ctx context.Context) { // and it'll get in the way of future runs. l.vols = c maybePanic(c.Start(ctx)) - maybePanic(c.Wait(ctx, container.WaitConditionNextExit)) + maybePanic(c.Wait(ctx, container.WaitConditionNotRunning)) } // cockroachEntryPoint returns the value to be used as diff --git a/pkg/cli/node.go b/pkg/cli/node.go index 421142b22f4f..1ddf911f62ed 100644 --- a/pkg/cli/node.go +++ b/pkg/cli/node.go @@ -457,6 +457,17 @@ func runDecommissionNodeImpl( MaxBackoff: 20 * time.Second, } + // Log verbosity is increased when there is possibly a decommission stall. + // If the number of decommissioning replicas does not decrease after some time + // (i.e. the decommission status has not changed after + // sameStatusThreshold iterations), verbosity is automatically set. + // Some decommissioning replicas will be reported to the operator. + const sameStatusThreshold = 15 + var ( + numReplicaReport = 0 + sameStatusCount = 0 + ) + // Decommissioning a node is driven by a three-step process. // 1) Mark each node as 'decommissioning'. In doing so, all replicas are // slowly moved off of these nodes. @@ -472,6 +483,7 @@ func runDecommissionNodeImpl( req := &serverpb.DecommissionRequest{ NodeIDs: nodeIDs, TargetMembership: livenesspb.MembershipStatus_DECOMMISSIONING, + NumReplicaReport: int32(numReplicaReport), } resp, err := c.Decommission(ctx, req) if err != nil { @@ -479,14 +491,33 @@ func runDecommissionNodeImpl( return errors.Wrap(err, "while trying to mark as decommissioning") } + if numReplicaReport > 0 { + printDecommissionReplicas(ctx, *resp) + } + if !reflect.DeepEqual(&prevResponse, resp) { fmt.Fprintln(stderr) - if err := printDecommissionStatus(*resp); err != nil { + if err = printDecommissionStatus(*resp); err != nil { return err } prevResponse = *resp + + // The decommissioning status changed. Set `sameStatusCount` back to zero. + sameStatusCount = 0 + numReplicaReport = 0 } else { + // Print a marker indicating that there has been no progress, + // instead of printing the same status. fmt.Fprintf(stderr, ".") + + // Report decommissioning replicas if there's been significant time of + // no progress. + if sameStatusCount >= sameStatusThreshold && numReplicaReport == 0 { + // Configure a number of replicas to report. 
+ numReplicaReport = 5 + } else { + sameStatusCount++ + } } anyActive := false @@ -591,6 +622,21 @@ func printDecommissionStatus(resp serverpb.DecommissionStatusResponse) error { clisqlexec.NewRowSliceIter(decommissionResponseValueToRows(resp.Status), decommissionResponseAlignment())) } +func printDecommissionReplicas(ctx context.Context, resp serverpb.DecommissionStatusResponse) { + fmt.Fprintln(stderr, "\npossible decommission stall detected; reporting decommissioning replicas") + + for _, nodeStatus := range resp.Status { + for _, replica := range nodeStatus.ReportedReplicas { + fmt.Fprintf(stderr, + "n%d still has replica id %d for range r%d", + nodeStatus.NodeID, + replica.ReplicaID, + replica.RangeID, + ) + } + } +} + func runRecommissionNode(cmd *cobra.Command, args []string) error { ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/pkg/cmd/dev/testdata/datadriven/ui b/pkg/cmd/dev/testdata/datadriven/ui index c51b8e32af9a..46398fa0a7cd 100644 --- a/pkg/cmd/dev/testdata/datadriven/ui +++ b/pkg/cmd/dev/testdata/datadriven/ui @@ -73,3 +73,27 @@ cp -r sandbox/pkg/ui/workspaces/cluster-ui/dist crdb-checkout/pkg/ui/workspaces/ bazel info workspace --color=no bazel run @nodejs//:yarn -- --silent --cwd crdb-checkout/pkg/ui/workspaces/db-console karma:watch bazel run @nodejs//:yarn -- --silent --cwd crdb-checkout/pkg/ui/workspaces/cluster-ui jest --watch + +exec +dev ui clean +---- +bazel info workspace --color=no +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/src/js/protos.js +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/src/js/protos.d.ts +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/ccl/src/js/protos.js +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/ccl/src/js/protos.d.ts +rm -rf crdb-checkout/pkg/ui/workspaces/cluster-ui/dist + +exec +dev ui clean --all +---- +bazel info workspace --color=no +bazel info workspace --color=no +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/src/js/protos.js +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/src/js/protos.d.ts +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/ccl/src/js/protos.js +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/ccl/src/js/protos.d.ts +rm -rf crdb-checkout/pkg/ui/workspaces/cluster-ui/dist +rm -rf crdb-checkout/pkg/ui/node_modules +rm -rf crdb-checkout/pkg/ui/workspaces/db-console/node_modules +rm -rf crdb-checkout/pkg/ui/workspaces/cluster-ui/node_modules diff --git a/pkg/cmd/dev/ui.go b/pkg/cmd/dev/ui.go index fedda8d604a8..2dfe9940ab6b 100644 --- a/pkg/cmd/dev/ui.go +++ b/pkg/cmd/dev/ui.go @@ -35,9 +35,10 @@ func makeUICmd(d *dev) *cobra.Command { Long: "Builds UI & runs UI-related commands to ease development flows", } - uiCmd.AddCommand(makeUIWatchCmd(d)) + uiCmd.AddCommand(makeUICleanCmd(d)) uiCmd.AddCommand(makeUILintCmd(d)) uiCmd.AddCommand(makeUITestCmd(d)) + uiCmd.AddCommand(makeUIWatchCmd(d)) return uiCmd } @@ -248,6 +249,58 @@ Replaces 'make ui-lint'.`, return lintCmd } +func makeUICleanCmd(d *dev) *cobra.Command { + const ( + allFlag = "all" + ) + + cleanCmd := &cobra.Command{ + Use: "clean [--all]", + Short: "clean artifacts produced by `dev ui`", + Long: "Clean the workspace of artifacts produced by `dev ui`. 
Pass the --all option to also delete all installed dependencies.", + Args: cobra.MaximumNArgs(0), + RunE: func(cmd *cobra.Command, commandLine []string) error { + all := mustGetFlagBool(cmd, allFlag) + uiDirs, err := getUIDirs(d) + if err != nil { + return err + } + pathsToDelete := []string{ + filepath.Join(uiDirs.dbConsole, "src", "js", "protos.js"), + filepath.Join(uiDirs.dbConsole, "src", "js", "protos.d.ts"), + filepath.Join(uiDirs.dbConsole, "ccl", "src", "js", "protos.js"), + filepath.Join(uiDirs.dbConsole, "ccl", "src", "js", "protos.d.ts"), + filepath.Join(uiDirs.clusterUI, "dist"), + } + if all { + workspace, err := d.getWorkspace(d.cli.Context()) + if err != nil { + return err + } + + pathsToDelete = append( + pathsToDelete, + filepath.Join(workspace, "pkg", "ui", "node_modules"), + filepath.Join(uiDirs.dbConsole, "node_modules"), + filepath.Join(uiDirs.clusterUI, "node_modules"), + ) + } + + for _, toDelete := range pathsToDelete { + err := d.os.RemoveAll(toDelete) + if err != nil { + return err + } + } + return nil + }, + } + + cleanCmd.Flags().Bool(allFlag, false, "additionally clean installed dependencies") + + return cleanCmd +} + // arrangeFilesForTestWatchers moves files from Bazel's build output directory // into the locations they'd be found during a non-Bazel build, so that test // watchers can successfully operate outside of the Bazel sandbox. diff --git a/pkg/cmd/github-pull-request-make/main.go b/pkg/cmd/github-pull-request-make/main.go index 9ce817bc37ee..430e9441f32c 100644 --- a/pkg/cmd/github-pull-request-make/main.go +++ b/pkg/cmd/github-pull-request-make/main.go @@ -317,6 +317,7 @@ func main() { args = append(args, "--test_arg=-test.timeout", fmt.Sprintf("--test_arg=%s", timeout)) // Give the entire test 1 more minute than the duration to wrap up. args = append(args, fmt.Sprintf("--test_timeout=%d", int((duration+1*time.Minute).Seconds()))) + args = append(args, "--test_output", "streamed") args = append(args, "--run_under", fmt.Sprintf("%s -bazel -shardable-artifacts 'XML_OUTPUT_FILE=%s merge-test-xmls' -stderr -maxfails 1 -maxtime %s -p %d", bazelStressTarget, bazciPath, duration, parallelism)) cmd := exec.Command("bazci", args...) 
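For reference, the `dev ui clean` subcommand added in pkg/cmd/dev/ui.go above is the target of the new Makefile deprecation notes, and the datadriven expectations earlier in this diff spell out exactly what each form deletes. A minimal usage sketch (illustrative only):

    # Remove generated protobuf bindings and the cluster-ui dist bundle
    # (the `dev` analogue of `make ui-clean`).
    ./dev ui clean

    # Additionally remove the pkg/ui, db-console, and cluster-ui node_modules trees
    # (the `dev` analogue of `make ui-maintainer-clean`).
    ./dev ui clean --all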
diff --git a/pkg/cmd/roachtest/tests/decommission.go b/pkg/cmd/roachtest/tests/decommission.go index 71bee78138c1..062879e5d3a2 100644 --- a/pkg/cmd/roachtest/tests/decommission.go +++ b/pkg/cmd/roachtest/tests/decommission.go @@ -16,6 +16,7 @@ import ( "fmt" "math/rand" "reflect" + "regexp" "strconv" "strings" "time" @@ -26,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -99,6 +101,17 @@ func registerDecommission(r registry.Registry) { }, }) } + { + numNodes := 6 + r.Add(registry.TestSpec{ + Name: "decommission/slow", + Owner: registry.OwnerServer, + Cluster: r.MakeClusterSpec(numNodes), + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + runDecommissionSlow(ctx, t, c) + }, + }) + } } // runDrainAndDecommission marks 3 nodes in the test cluster as "draining" and @@ -1091,6 +1104,84 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster) require.NoError(t, cli.MatchCSV(o, expDecommissioned)) } +// runDecommissionSlow decommissions 5 nodes in a test cluster of 6 +// (with a replication factor of 5), which will guarantee a replica transfer +// stall. This test is meant to ensure that decommissioning replicas are +// reported when replica transfers stall. +func runDecommissionSlow(ctx context.Context, t test.Test, c cluster.Cluster) { + const ( + numNodes = 6 + pinnedNodeID = 1 + replicationFactor = 5 + ) + + var verboseStoreLogRe = regexp.MustCompile("possible decommission stall detected") + + err := c.PutE(ctx, t.L(), t.Cockroach(), "./cockroach", c.All()) + require.NoError(t, err) + + c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.All()) + + run := func(db *gosql.DB, query string) { + _, err = db.ExecContext(ctx, query) + require.NoError(t, err) + t.L().Printf("run: %s\n", query) + } + + { + db := c.Conn(ctx, t.L(), pinnedNodeID) + defer db.Close() + + // Set the replication factor to 5. + run(db, fmt.Sprintf(`ALTER RANGE default CONFIGURE ZONE USING num_replicas=%d`, replicationFactor)) + run(db, fmt.Sprintf(`ALTER DATABASE system CONFIGURE ZONE USING num_replicas=%d`, replicationFactor)) + + // Increase the speed of decommissioning. + run(db, `SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='2GiB'`) + run(db, `SET CLUSTER SETTING kv.snapshot_recovery.max_rate='2GiB'`) + + // Wait for initial up-replication. + err := WaitForReplication(ctx, t, db, replicationFactor) + require.NoError(t, err) + } + + // Decommission 5 nodes from the cluster, resulting in immovable replicas. + // Be prepared to cancel the context for the processes running decommissions + // since the decommissions will stall. + decomCtx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + m := c.NewMonitor(decomCtx) + for nodeID := 2; nodeID <= numNodes; nodeID++ { + id := nodeID + m.Go(func(ctx context.Context) error { + decom := func(id int) error { + t.Status(fmt.Sprintf("decommissioning node %d", id)) + return c.RunE(ctx, + c.Node(id), + fmt.Sprintf("./cockroach node decommission %d --insecure", id), + ) + } + return decom(id) + }) + } + + // Check for reported decommissioning replicas. 
+ t.Status("checking for decommissioning replicas report...") + testutils.SucceedsWithin(t, func() error { + for nodeID := 1; nodeID <= numNodes; nodeID++ { + if err = c.RunE(ctx, + c.Node(nodeID), + fmt.Sprintf("grep -q '%s' logs/cockroach.log", verboseStoreLogRe), + ); err == nil { + return nil + } + } + return errors.New("still waiting for decommissioning replicas report") + }, + 3*time.Minute, + ) +} + // Header from the output of `cockroach node decommission`. var decommissionHeader = []string{ "id", "is_live", "replicas", "is_decommissioning", "membership", "is_draining", diff --git a/pkg/cmd/roachtest/tests/disk_full.go b/pkg/cmd/roachtest/tests/disk_full.go index abb652d296a4..f33f2ccf8d75 100644 --- a/pkg/cmd/roachtest/tests/disk_full.go +++ b/pkg/cmd/roachtest/tests/disk_full.go @@ -41,12 +41,12 @@ func registerDiskFull(r registry.Registry) { // Node 1 will soon be killed, when the ballast file fills up its disk. To // ensure that the ranges containing system tables are available on other - // nodes, we wait here for at least two replicas of each range. Without - // this, it's possible that we end up deadlocked on a system query that - // requires a range on node 1, but node 1 will not restart until the query + // nodes, we wait here for 3x replication of each range. Without this, + // it's possible that we end up deadlocked on a system query that requires + // a range on node 1, but node 1 will not restart until the query // completes. db := c.Conn(ctx, t.L(), 1) - err := WaitForReplication(ctx, t, db, 2) + err := WaitFor3XReplication(ctx, t, db) require.NoError(t, err) _ = db.Close() diff --git a/pkg/internal/sqlsmith/relational.go b/pkg/internal/sqlsmith/relational.go index 67e136d12c05..e66ef908d520 100644 --- a/pkg/internal/sqlsmith/relational.go +++ b/pkg/internal/sqlsmith/relational.go @@ -206,6 +206,8 @@ func makeEquiJoinExpr(s *Smither, refs colRefs, forJoin bool) (tree.TableExpr, c return nil, nil, false } + s.lock.RLock() + defer s.lock.RUnlock() // Determine overlapping types. var available [][2]tree.TypedExpr for _, leftCol := range leftRefs { diff --git a/pkg/internal/sqlsmith/type.go b/pkg/internal/sqlsmith/type.go index 3dddb02ecc00..4dfd4490ea2e 100644 --- a/pkg/internal/sqlsmith/type.go +++ b/pkg/internal/sqlsmith/type.go @@ -59,9 +59,9 @@ func (s *Smither) randScalarType() *types.T { // isScalarType returns true if t is a member of types.Scalar, or a user defined // enum. +// Requires s.lock to be held. 
func (s *Smither) isScalarType(t *types.T) bool { - s.lock.RLock() - defer s.lock.RUnlock() + s.lock.AssertRHeld() scalarTypes := types.Scalar if s.types != nil { scalarTypes = s.types.scalarTypes diff --git a/pkg/kv/kvserver/testdata/replica_unavailable_error.txt b/pkg/kv/kvserver/testdata/replica_unavailable_error.txt index e685782c302e..c7f140c39a6d 100644 --- a/pkg/kv/kvserver/testdata/replica_unavailable_error.txt +++ b/pkg/kv/kvserver/testdata/replica_unavailable_error.txt @@ -1,3 +1,3 @@ echo ---- -replica (n1,s10):1 unable to serve request to r10:‹{a-z}› [(n1,s10):1, (n2,s20):2, next=3, gen=0]: raft status: {"id":"0","term":0,"vote":"0","commit":0,"lead":"0","raftState":"StateFollower","applied":0,"progress":{},"leadtransferee":"0"}: replicas on non-live nodes: (n2,s20):2 (lost quorum: true): probe failed +replica unavailable: (n1,s10):1 unable to serve request to r10:‹{a-z}› [(n1,s10):1, (n2,s20):2, next=3, gen=0]: raft status: {"id":"0","term":0,"vote":"0","commit":0,"lead":"0","raftState":"StateFollower","applied":0,"progress":{},"leadtransferee":"0"}: replicas on non-live nodes: (n2,s20):2 (lost quorum: true): probe failed diff --git a/pkg/roachpb/replica_unavailable_error.go b/pkg/roachpb/replica_unavailable_error.go index 4259ee97eb94..75d1e7c2d492 100644 --- a/pkg/roachpb/replica_unavailable_error.go +++ b/pkg/roachpb/replica_unavailable_error.go @@ -37,7 +37,7 @@ var _ errors.Wrapper = (*ReplicaUnavailableError)(nil) // SafeFormatError implements errors.SafeFormatter. func (e *ReplicaUnavailableError) SafeFormatError(p errors.Printer) error { - p.Printf("replica %s unable to serve request to %s: %s", e.Replica, e.Desc, e.Unwrap()) + p.Printf("replica unavailable: %s unable to serve request to %s: %s", e.Replica, e.Desc, e.Unwrap()) return nil } diff --git a/pkg/roachpb/testdata/replica_unavailable_error.txt b/pkg/roachpb/testdata/replica_unavailable_error.txt index f2ae80eb8b9b..cdb6a6dc4c53 100644 --- a/pkg/roachpb/testdata/replica_unavailable_error.txt +++ b/pkg/roachpb/testdata/replica_unavailable_error.txt @@ -1,4 +1,4 @@ echo ---- -replica (n1,s2):3 unable to serve request to r123:/M{in-ax} [(n1,s2):1, next=2, gen=0]: slow proposal -replica (n1,s2):3 unable to serve request to r123:‹/M{in-ax}› [(n1,s2):1, next=2, gen=0]: slow proposal +replica unavailable: (n1,s2):3 unable to serve request to r123:/M{in-ax} [(n1,s2):1, next=2, gen=0]: slow proposal +replica unavailable: (n1,s2):3 unable to serve request to r123:‹/M{in-ax}› [(n1,s2):1, next=2, gen=0]: slow proposal diff --git a/pkg/server/admin.go b/pkg/server/admin.go index 05dcfa9f208c..d4d153c7cbd5 100644 --- a/pkg/server/admin.go +++ b/pkg/server/admin.go @@ -2302,6 +2302,7 @@ func (s *adminServer) decommissionStatusHelper( // Get the number of replicas on each node. We *may* not need all of them, // but that would be more complicated than seems worth it right now. nodeIDs := req.NodeIDs + numReplicaReport := req.NumReplicaReport // If no nodeIDs given, use all nodes. if len(nodeIDs) == 0 { @@ -2314,6 +2315,24 @@ func (s *adminServer) decommissionStatusHelper( } } + // If the client specified a number of decommissioning replicas to report, + // prepare to get decommissioning replicas to report to the operator. + // numReplicaReport is the number of replicas reported for each node. 
+ var replicasToReport map[roachpb.NodeID][]*serverpb.DecommissionStatusResponse_Replica + if numReplicaReport > 0 { + log.Ops.Warning(ctx, "possible decommission stall detected; reporting decommissioning replicas") + replicasToReport = make(map[roachpb.NodeID][]*serverpb.DecommissionStatusResponse_Replica) + } + + isDecommissioningNode := func(n roachpb.NodeID) bool { + for _, nodeID := range nodeIDs { + if n == nodeID { + return true + } + } + return false + } + // Compute the replica counts for the target nodes only. This map doubles as // a lookup table to check whether we care about a given node. var replicaCounts map[roachpb.NodeID]int64 @@ -2331,6 +2350,24 @@ func (s *adminServer) decommissionStatusHelper( return errors.Wrapf(err, "%s: unable to unmarshal range descriptor", row.Key) } for _, r := range rangeDesc.Replicas().Descriptors() { + if numReplicaReport > 0 { + if len(replicasToReport[r.NodeID]) < int(numReplicaReport) { + if isDecommissioningNode(r.NodeID) { + replicasToReport[r.NodeID] = append(replicasToReport[r.NodeID], + &serverpb.DecommissionStatusResponse_Replica{ + ReplicaID: r.ReplicaID, + RangeID: rangeDesc.RangeID, + }, + ) + log.Ops.Warningf(ctx, + "n%d still has replica id %d for range r%d", + r.NodeID, + r.ReplicaID, + rangeDesc.RangeID, + ) + } + } + } if _, ok := replicaCounts[r.NodeID]; ok { replicaCounts[r.NodeID]++ } @@ -2368,15 +2405,15 @@ func (s *adminServer) decommissionStatusHelper( return nil, errors.Newf("unable to get liveness for %d", nodeID) } nodeResp := serverpb.DecommissionStatusResponse_Status{ - NodeID: l.NodeID, - ReplicaCount: replicaCounts[l.NodeID], - Membership: l.Membership, - Draining: l.Draining, + NodeID: l.NodeID, + ReplicaCount: replicaCounts[l.NodeID], + Membership: l.Membership, + Draining: l.Draining, + ReportedReplicas: replicasToReport[l.NodeID], } if l.IsLive(s.server.clock.Now().GoTime()) { nodeResp.IsLive = true } - res.Status = append(res.Status, nodeResp) } @@ -2415,7 +2452,7 @@ func (s *adminServer) Decommission( return &serverpb.DecommissionStatusResponse{}, nil } - return s.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: nodeIDs}) + return s.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: nodeIDs, NumReplicaReport: req.NumReplicaReport}) } // DataDistribution returns a count of replicas on each node for each table. diff --git a/pkg/server/serverpb/admin.proto b/pkg/server/serverpb/admin.proto index 42e65aea9c98..197c942b58bc 100644 --- a/pkg/server/serverpb/admin.proto +++ b/pkg/server/serverpb/admin.proto @@ -476,6 +476,8 @@ message DrainResponse { message DecommissionStatusRequest { repeated int32 node_ids = 1 [(gogoproto.customname) = "NodeIDs", (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"]; + // The number of decommissioning replicas to be reported. + int32 num_replica_report = 2; } // DecommissionRequest requests the server to set the membership status on @@ -486,10 +488,18 @@ message DecommissionRequest { repeated int32 node_ids = 1 [(gogoproto.customname) = "NodeIDs", (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"]; kv.kvserver.liveness.livenesspb.MembershipStatus target_membership = 2; + // The number of decommissioning replicas to be reported. + int32 num_replica_report = 3; } // DecommissionStatusResponse lists decommissioning statuses for a number of NodeIDs. 
message DecommissionStatusResponse { + message Replica { + int32 replica_id = 1 [ (gogoproto.customname) = "ReplicaID", + (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.ReplicaID"]; + int32 range_id = 2 [ (gogoproto.customname) = "RangeID", + (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"]; + } message Status { int32 node_id = 1 [ (gogoproto.customname) = "NodeID", (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"]; @@ -499,6 +509,10 @@ message DecommissionStatusResponse { // The membership status of the given node. kv.kvserver.liveness.livenesspb.MembershipStatus membership = 4; bool draining = 5; + // Decommissioning replicas on the given node to be reported. + // How many replicas are reported is determined by what was specified in the + // request. + repeated Replica reported_replicas = 6; } // Status of all affected nodes. repeated Status status = 2 [(gogoproto.nullable) = false];
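To close out this section: the new num_replica_report request field and reported_replicas response field above are what pkg/cli/node.go and pkg/server/admin.go wire together. A minimal client-side sketch, assuming an already-dialed serverpb.AdminClient (the helper name and the report size of 5 are invented for illustration; this is not part of the patch):

    package example

    import (
        "context"
        "fmt"

        "github.com/cockroachdb/cockroach/pkg/roachpb"
        "github.com/cockroachdb/cockroach/pkg/server/serverpb"
    )

    // reportStalledReplicas asks the server to include up to five decommissioning
    // replicas per node in the status response and prints whatever comes back.
    func reportStalledReplicas(
        ctx context.Context, admin serverpb.AdminClient, nodes []roachpb.NodeID,
    ) error {
        resp, err := admin.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{
            NodeIDs:          nodes,
            NumReplicaReport: 5, // report at most five decommissioning replicas per node
        })
        if err != nil {
            return err
        }
        for _, st := range resp.Status {
            for _, rep := range st.ReportedReplicas {
                fmt.Printf("n%d still has replica id %d for range r%d\n",
                    st.NodeID, rep.ReplicaID, rep.RangeID)
            }
        }
        return nil
    }

The CLI path in runDecommissionNodeImpl does essentially this, except that it only raises NumReplicaReport from zero to five after sameStatusThreshold consecutive status polls with no change, so the report is emitted only when a stall looks likely.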