diff --git a/Makefile b/Makefile
index bf2dc0ae8d96..5a53943d65b5 100644
--- a/Makefile
+++ b/Makefile
@@ -1390,11 +1390,13 @@ pkg/ui/workspaces/db-console/dist/%.ccl.dll.js pkg/ui/workspaces/db-console/%.cc
.PHONY: ui-test
ui-test: $(UI_CCL_DLLS) $(UI_CCL_MANIFESTS)
+ $(info $(yellow)NOTE: consider using `./dev ui test` instead of `make ui-test`$(term-reset))
$(NODE_RUN) -C pkg/ui/workspaces/db-console $(KARMA) start
$(NODE_RUN) -C pkg/ui/workspaces/cluster-ui yarn ci
.PHONY: ui-test-watch
ui-test-watch: $(UI_CCL_DLLS) $(UI_CCL_MANIFESTS)
+ $(info $(yellow)NOTE: consider using `./dev ui test --watch` instead of `make ui-test-watch`$(term-reset))
$(NODE_RUN) -C pkg/ui/workspaces/db-console $(KARMA) start --no-single-run --auto-watch & \
$(NODE_RUN) -C pkg/ui/workspaces/cluster-ui yarn test
@@ -1430,11 +1432,13 @@ ui-watch ui-watch-secure: $(UI_CCL_DLLS) pkg/ui/yarn.opt.installed
#
# `node-run.sh` wrapper is removed because this command is supposed to be run in dev environment (not in docker of CI)
# so it is safe to run yarn commands directly to preserve formatting and colors for outputs
+ $(info $(yellow)NOTE: consider using `./dev ui watch [--secure]` instead of `make ui-watch[-secure]`$(term-reset))
yarn --cwd pkg/ui/workspaces/cluster-ui build:watch & \
yarn --cwd pkg/ui/workspaces/db-console webpack-dev-server --config webpack.app.js --env.dist=ccl --env.WEBPACK_SERVE --port $(PORT) --mode "development" $(WEBPACK_DEV_SERVER_FLAGS)
.PHONY: ui-clean
ui-clean: ## Remove build artifacts.
+ $(info $(yellow)NOTE: consider using `./dev ui clean` instead of `make ui-clean`$(term-reset))
find pkg/ui/distccl/assets pkg/ui/distoss/assets -mindepth 1 -not -name .gitkeep -delete
rm -rf pkg/ui/assets.ccl.installed pkg/ui/assets.oss.installed
rm -rf pkg/ui/dist_vendor/*
@@ -1445,6 +1449,7 @@ ui-clean: ## Remove build artifacts.
.PHONY: ui-maintainer-clean
ui-maintainer-clean: ## Like clean, but also remove installed dependencies
ui-maintainer-clean: ui-clean
+ $(info $(yellow)NOTE: consider using `./dev ui clean --all` instead of `make ui-maintainer-clean`$(term-reset))
rm -rf pkg/ui/node_modules pkg/ui/workspaces/db-console/node_modules pkg/ui/yarn.installed pkg/ui/workspaces/cluster-ui/node_modules
pkg/roachprod/vm/aws/embedded.go: bin/.bootstrap pkg/roachprod/vm/aws/config.json pkg/roachprod/vm/aws/old.json bin/terraformgen
diff --git a/build/teamcity-check-genfiles.sh b/build/teamcity-check-genfiles.sh
index 34e7a63776b0..d0e53bd1a0d6 100755
--- a/build/teamcity-check-genfiles.sh
+++ b/build/teamcity-check-genfiles.sh
@@ -9,6 +9,8 @@ tc_prepare
tc_start_block "Ensure generated code is up-to-date"
+begin_check_generated_code_tests
+
# NOTE(ricky): Please make sure any changes to the Bazel-related checks here are
# propagated to build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh
# as well.
@@ -29,24 +31,26 @@ if grep TODO DEPS.bzl; then
echo "Missing TODO comment in DEPS.bzl. Did you run \`./dev generate bazel --mirror\`?"
exit 1
fi
-check_workspace_clean "Run \`./dev generate bazel\` to automatically regenerate these."
+check_workspace_clean 'dev_generate_bazel' "Run \`./dev generate bazel\` to automatically regenerate these."
run build/builder.sh make generate &> artifacts/generate.log || (cat artifacts/generate.log && false)
rm artifacts/generate.log
-check_workspace_clean "Run \`make generate\` to automatically regenerate these."
+check_workspace_clean 'make_generate' "Run \`make generate\` to automatically regenerate these."
run build/builder.sh make buildshort &> artifacts/buildshort.log || (cat artifacts/buildshort.log && false)
rm artifacts/buildshort.log
-check_workspace_clean "Run \`make buildshort\` to automatically regenerate these."
+check_workspace_clean 'make_buildshort' "Run \`make buildshort\` to automatically regenerate these."
tc_end_block "Ensure generated code is up-to-date"
# generated code can generate new dependencies; check dependencies after generated code.
tc_start_block "Ensure dependencies are up-to-date"
# Run go mod tidy and `make -k vendor_rebuild` and ensure nothing changes.
run build/builder.sh go mod tidy
-check_workspace_clean "Run \`go mod tidy\` and \`make -k vendor_rebuild\` to automatically regenerate these."
+check_workspace_clean 'go_mod_tidy' "Run \`go mod tidy\` and \`make -k vendor_rebuild\` to automatically regenerate these."
run build/builder.sh make -k vendor_rebuild
cd vendor
-check_workspace_clean "Run \`make -k vendor_rebuild\` to automatically regenerate these."
+check_workspace_clean 'vendor_rebuild' "Run \`make -k vendor_rebuild\` to automatically regenerate these."
cd ..
+
+end_check_generated_code_tests
tc_end_block "Ensure dependencies are up-to-date"
tc_start_block "Test web UI"
diff --git a/build/teamcity-support.sh b/build/teamcity-support.sh
index 70b0afa0317a..1638c7ac46eb 100755
--- a/build/teamcity-support.sh
+++ b/build/teamcity-support.sh
@@ -307,9 +307,18 @@ generate_ssh_key() {
fi
}
-# Call this function with one argument, the error message to print if the
-# workspace is dirty.
+begin_check_generated_code_tests() {
+ echo "##teamcity[testSuiteStarted name='CheckGeneratedCode']"
+}
+
+end_check_generated_code_tests() {
+ echo "##teamcity[testSuiteFinished name='CheckGeneratedCode']"
+}
+
+# Call this function with two arguments: the name of the "test" that will be
+# reported to teamcity and the error message to print if the workspace is dirty.
check_workspace_clean() {
+ echo "##teamcity[testStarted name='CheckGeneratedCode/$1' captureStandardOutput='true']"
# The workspace is clean iff `git status --porcelain` produces no output. Any
# output is either an error message or a listing of an untracked/dirty file.
if [[ "$(git status --porcelain 2>&1)" != "" ]]; then
@@ -317,7 +326,10 @@ check_workspace_clean() {
git diff -a >&2 || true
echo "====================================================" >&2
echo "Some automatically generated code is not up to date." >&2
- echo $1 >&2
+ echo "$2" >&2
+ echo "##teamcity[testFailed name='CheckGeneratedCode/$1']"
+ echo "##teamcity[testFinished name='CheckGeneratedCode/$1']"
exit 1
fi
+ echo "##teamcity[testFinished name='CheckGeneratedCode/$1']"
}
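
The `check_workspace_clean` change above threads a test name through so that each generated-code check is reported to TeamCity as its own test inside a `CheckGeneratedCode` suite, rather than as one opaque build step. A minimal Go sketch of the same service-message protocol, for illustration only (the helper names here are hypothetical and not part of this patch):

```go
// Minimal sketch of the TeamCity service-message pattern used above:
// every check is wrapped in testStarted / testFailed / testFinished
// inside a surrounding test suite. Illustrative only.
package main

import "fmt"

func reportCheck(suite, name string, check func() error) error {
	fmt.Printf("##teamcity[testStarted name='%s/%s' captureStandardOutput='true']\n", suite, name)
	err := check()
	if err != nil {
		fmt.Printf("##teamcity[testFailed name='%s/%s']\n", suite, name)
	}
	fmt.Printf("##teamcity[testFinished name='%s/%s']\n", suite, name)
	return err
}

func main() {
	fmt.Printf("##teamcity[testSuiteStarted name='CheckGeneratedCode']\n")
	_ = reportCheck("CheckGeneratedCode", "dev_generate_bazel", func() error { return nil })
	fmt.Printf("##teamcity[testSuiteFinished name='CheckGeneratedCode']\n")
}
```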
diff --git a/build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh b/build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh
index ed7e27822aa1..9aa5593d64ee 100755
--- a/build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh
+++ b/build/teamcity/cockroach/ci/tests/check_generated_code_impl.sh
@@ -8,6 +8,8 @@ source "$dir/teamcity-support.sh" # For $root, check_workspace_clean
mkdir -p artifacts
+begin_check_generated_code_tests
+
# Buffer noisy output and only print it on failure.
if ! (./build/bazelutil/check.sh &> artifacts/buildshort.log || (cat artifacts/buildshort.log && false)); then
# The command will output instructions on how to fix the error.
@@ -24,11 +26,13 @@ if grep TODO DEPS.bzl; then
echo "Missing TODO comment in DEPS.bzl. Did you run \`./dev generate bazel --mirror\`?"
exit 1
fi
-check_workspace_clean "Run \`./dev generate bazel\` to automatically regenerate these."
+check_workspace_clean 'dev_generate_bazel' "Run \`./dev generate bazel\` to automatically regenerate these."
# Run go mod tidy and ensure nothing changes.
# NB: If files are missing from any packages then `go mod tidy` will
# fail. So we need to make sure that `.pb.go` sources are populated.
bazel run //pkg/gen:go_proto
bazel run @go_sdk//:bin/go --ui_event_filters=-DEBUG,-info,-stdout,-stderr --noshow_progress mod tidy
-check_workspace_clean "Run \`go mod tidy\` to automatically regenerate these."
+check_workspace_clean 'go_mod_tidy' "Run \`go mod tidy\` to automatically regenerate these."
+
+end_check_generated_code_tests
diff --git a/build/teamcity/cockroach/nightlies/pebble_nightly_metamorphic_impl.sh b/build/teamcity/cockroach/nightlies/pebble_nightly_metamorphic_impl.sh
index a3f668cc20b6..c2e472031075 100755
--- a/build/teamcity/cockroach/nightlies/pebble_nightly_metamorphic_impl.sh
+++ b/build/teamcity/cockroach/nightlies/pebble_nightly_metamorphic_impl.sh
@@ -26,6 +26,7 @@ $BAZEL_BIN/pkg/cmd/bazci/bazci_/bazci --config=ci test \
--run_under "@com_github_cockroachdb_stress//:stress -bazel -shardable-artifacts 'GO_TEST_JSON_OUTPUT_FILE=cat,XML_OUTPUT_FILE=$BAZEL_BIN/pkg/cmd/bazci/bazci_/bazci merge-test-xmls' -maxtime 3h -maxfails 1 -stderr -p 1" \
--test_arg -dir --test_arg $ARTIFACTS_DIR \
--test_arg -ops --test_arg "uniform:5000-10000" \
+ --test_output streamed \
|| exit_status=$?
BAZEL_SUPPORT_EXTRA_GITHUB_POST_ARGS=--formatter=pebble-metamorphic process_test_json \
diff --git a/build/teamcity/cockroach/nightlies/stress_impl.sh b/build/teamcity/cockroach/nightlies/stress_impl.sh
index 2032ab96b6b0..4b14aa2493fe 100755
--- a/build/teamcity/cockroach/nightlies/stress_impl.sh
+++ b/build/teamcity/cockroach/nightlies/stress_impl.sh
@@ -41,6 +41,7 @@ do
--run_under "@com_github_cockroachdb_stress//:stress -bazel -shardable-artifacts 'GO_TEST_JSON_OUTPUT_FILE=cat,XML_OUTPUT_FILE=$BAZEL_BIN/pkg/cmd/bazci/bazci_/bazci merge-test-xmls' $STRESSFLAGS" \
--define "gotags=$TAGS" \
--nocache_test_results \
+ --test_output streamed \
${EXTRA_BAZEL_FLAGS} \
|| exit_status=$?
process_test_json \
diff --git a/dev b/dev
index 16bf2332c629..2e6aa325b802 100755
--- a/dev
+++ b/dev
@@ -3,7 +3,7 @@
set -euo pipefail
# Bump this counter to force rebuilding `dev` on all machines.
-DEV_VERSION=25
+DEV_VERSION=26
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
BINARY_DIR=$THIS_DIR/bin/dev-versions
diff --git a/docs/generated/http/full.md b/docs/generated/http/full.md
index cb47c6a8cdbd..12305be2aa1a 100644
--- a/docs/generated/http/full.md
+++ b/docs/generated/http/full.md
@@ -6194,6 +6194,7 @@ If no NodeIDs are given, it targets the recipient node.
| ----- | ---- | ----- | ----------- | -------------- |
| node_ids | [int32](#cockroach.server.serverpb.DecommissionRequest-int32) | repeated | | [reserved](#support-status) |
| target_membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionRequest-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | | [reserved](#support-status) |
+| num_replica_report | [int32](#cockroach.server.serverpb.DecommissionRequest-int32) | | The number of decommissioning replicas to be reported. | [reserved](#support-status) |
@@ -6230,6 +6231,21 @@ DecommissionStatusResponse lists decommissioning statuses for a number of NodeID
| replica_count | [int64](#cockroach.server.serverpb.DecommissionStatusResponse-int64) | | The number of replicas on the node, computed by scanning meta2 ranges. | [reserved](#support-status) |
| membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | The membership status of the given node. | [reserved](#support-status) |
| draining | [bool](#cockroach.server.serverpb.DecommissionStatusResponse-bool) | | | [reserved](#support-status) |
+| reported_replicas | [DecommissionStatusResponse.Replica](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.server.serverpb.DecommissionStatusResponse.Replica) | repeated | Decommissioning replicas on the given node to be reported. How many replicas are reported is determined by what was specified in the request. | [reserved](#support-status) |
+
+
+
+
+
+
+#### DecommissionStatusResponse.Replica
+
+
+
+| Field | Type | Label | Description | Support status |
+| ----- | ---- | ----- | ----------- | -------------- |
+| replica_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) |
+| range_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) |
@@ -6258,6 +6274,7 @@ specified or, if none are specified, all nodes.
| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| node_ids | [int32](#cockroach.server.serverpb.DecommissionStatusRequest-int32) | repeated | | [reserved](#support-status) |
+| num_replica_report | [int32](#cockroach.server.serverpb.DecommissionStatusRequest-int32) | | The number of decommissioning replicas to be reported. | [reserved](#support-status) |
@@ -6294,6 +6311,21 @@ DecommissionStatusResponse lists decommissioning statuses for a number of NodeID
| replica_count | [int64](#cockroach.server.serverpb.DecommissionStatusResponse-int64) | | The number of replicas on the node, computed by scanning meta2 ranges. | [reserved](#support-status) |
| membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | The membership status of the given node. | [reserved](#support-status) |
| draining | [bool](#cockroach.server.serverpb.DecommissionStatusResponse-bool) | | | [reserved](#support-status) |
+| reported_replicas | [DecommissionStatusResponse.Replica](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.server.serverpb.DecommissionStatusResponse.Replica) | repeated | Decommissioning replicas on the given node to be reported. How many replicas are reported is determined by what was specified in the request. | [reserved](#support-status) |
+
+
+
+
+
+
+#### DecommissionStatusResponse.Replica
+
+
+
+| Field | Type | Label | Description | Support status |
+| ----- | ---- | ----- | ----------- | -------------- |
+| replica_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) |
+| range_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) |
diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index 162eafd12d1e..2be5a1e6bb95 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -235,7 +235,7 @@ func (l *DockerCluster) OneShot(
if err := l.oneshot.Start(ctx); err != nil {
return err
}
- return l.oneshot.Wait(ctx, container.WaitConditionNextExit)
+ return l.oneshot.Wait(ctx, container.WaitConditionNotRunning)
}
// stopOnPanic is invoked as a deferred function in Start in order to attempt
@@ -372,7 +372,7 @@ func (l *DockerCluster) initCluster(ctx context.Context) {
// and it'll get in the way of future runs.
l.vols = c
maybePanic(c.Start(ctx))
- maybePanic(c.Wait(ctx, container.WaitConditionNextExit))
+ maybePanic(c.Wait(ctx, container.WaitConditionNotRunning))
}
// cockroachEntryPoint returns the value to be used as
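
Switching from `WaitConditionNextExit` to `WaitConditionNotRunning` avoids a race: the next-exit condition only fires on a future exit event, so a container that finished before `Wait` was called could block forever, while not-running resolves immediately for an already-exited container. An illustrative sketch against the stock Docker client (assumed `github.com/docker/docker/client` API; the `DockerCluster` wrapper itself is not shown):

```go
// Illustrative only: waiting on a container with the stock Docker client.
// WaitConditionNotRunning resolves immediately if the container has already
// exited, whereas WaitConditionNextExit waits for the *next* exit event.
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/client"
)

func waitForExit(ctx context.Context, cli *client.Client, id string) error {
	respCh, errCh := cli.ContainerWait(ctx, id, container.WaitConditionNotRunning)
	select {
	case err := <-errCh:
		return err
	case resp := <-respCh:
		if resp.StatusCode != 0 {
			return fmt.Errorf("container %s exited with status %d", id, resp.StatusCode)
		}
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	cli, err := client.NewClientWithOpts(client.FromEnv)
	if err != nil {
		log.Fatal(err)
	}
	_ = waitForExit(context.Background(), cli, "some-container-id")
}
```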
diff --git a/pkg/cli/node.go b/pkg/cli/node.go
index 421142b22f4f..1ddf911f62ed 100644
--- a/pkg/cli/node.go
+++ b/pkg/cli/node.go
@@ -457,6 +457,17 @@ func runDecommissionNodeImpl(
MaxBackoff: 20 * time.Second,
}
+ // Log verbosity is increased when a decommission stall is suspected: if the
+ // decommission status has not changed after sameStatusThreshold consecutive
+ // polls (i.e. the number of decommissioning replicas is not decreasing),
+ // the next request asks the server to report some of the decommissioning
+ // replicas to the operator.
+ const sameStatusThreshold = 15
+ var (
+ numReplicaReport = 0
+ sameStatusCount = 0
+ )
+
// Decommissioning a node is driven by a three-step process.
// 1) Mark each node as 'decommissioning'. In doing so, all replicas are
// slowly moved off of these nodes.
@@ -472,6 +483,7 @@ func runDecommissionNodeImpl(
req := &serverpb.DecommissionRequest{
NodeIDs: nodeIDs,
TargetMembership: livenesspb.MembershipStatus_DECOMMISSIONING,
+ NumReplicaReport: int32(numReplicaReport),
}
resp, err := c.Decommission(ctx, req)
if err != nil {
@@ -479,14 +491,33 @@ func runDecommissionNodeImpl(
return errors.Wrap(err, "while trying to mark as decommissioning")
}
+ if numReplicaReport > 0 {
+ printDecommissionReplicas(ctx, *resp)
+ }
+
if !reflect.DeepEqual(&prevResponse, resp) {
fmt.Fprintln(stderr)
- if err := printDecommissionStatus(*resp); err != nil {
+ if err = printDecommissionStatus(*resp); err != nil {
return err
}
prevResponse = *resp
+
+ // The decommissioning status changed. Set `sameStatusCount` back to zero.
+ sameStatusCount = 0
+ numReplicaReport = 0
} else {
+ // Print a marker indicating that there has been no progress,
+ // instead of printing the same status.
fmt.Fprintf(stderr, ".")
+
+ // Ask the server to report decommissioning replicas if there has been no
+ // progress for a while.
+ if sameStatusCount >= sameStatusThreshold && numReplicaReport == 0 {
+ // Configure a number of replicas to report.
+ numReplicaReport = 5
+ } else {
+ sameStatusCount++
+ }
}
anyActive := false
@@ -591,6 +622,21 @@ func printDecommissionStatus(resp serverpb.DecommissionStatusResponse) error {
clisqlexec.NewRowSliceIter(decommissionResponseValueToRows(resp.Status), decommissionResponseAlignment()))
}
+func printDecommissionReplicas(ctx context.Context, resp serverpb.DecommissionStatusResponse) {
+ fmt.Fprintln(stderr, "\npossible decommission stall detected; reporting decommissioning replicas")
+
+ for _, nodeStatus := range resp.Status {
+ for _, replica := range nodeStatus.ReportedReplicas {
+ fmt.Fprintf(stderr,
+ "n%d still has replica id %d for range r%d",
+ nodeStatus.NodeID,
+ replica.ReplicaID,
+ replica.RangeID,
+ )
+ }
+ }
+}
+
func runRecommissionNode(cmd *cobra.Command, args []string) error {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
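
The CLI side of the new reporting is a simple counter: each poll's response is compared with the previous one, and only after `sameStatusThreshold` identical responses does the next `Decommission` request set `NumReplicaReport`. A condensed, hypothetical sketch of that control flow (`poll` and `report` stand in for the real RPC and printing code):

```go
// Condensed sketch of the stall-detection loop added to
// runDecommissionNodeImpl; poll() and report() are stand-ins.
package main

import (
	"fmt"
	"reflect"
	"time"
)

type status struct{ decommissioningReplicas int }

func pollUntilDone(poll func(numReplicaReport int) status, report func(status)) {
	const sameStatusThreshold = 15
	var prev status
	sameStatusCount, numReplicaReport := 0, 0
	for {
		cur := poll(numReplicaReport)
		if numReplicaReport > 0 {
			report(cur) // print the replicas the server sent back
		}
		if !reflect.DeepEqual(prev, cur) {
			prev, sameStatusCount, numReplicaReport = cur, 0, 0 // progress: reset
		} else if sameStatusCount >= sameStatusThreshold && numReplicaReport == 0 {
			numReplicaReport = 5 // possible stall: ask for a report on the next poll
		} else {
			sameStatusCount++
		}
		if cur.decommissioningReplicas == 0 {
			return
		}
		time.Sleep(time.Second)
	}
}

func main() {
	n := 3
	pollUntilDone(
		func(int) status { n--; return status{decommissioningReplicas: n} },
		func(s status) { fmt.Println("report:", s) },
	)
}
```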
diff --git a/pkg/cmd/dev/testdata/datadriven/ui b/pkg/cmd/dev/testdata/datadriven/ui
index c51b8e32af9a..46398fa0a7cd 100644
--- a/pkg/cmd/dev/testdata/datadriven/ui
+++ b/pkg/cmd/dev/testdata/datadriven/ui
@@ -73,3 +73,27 @@ cp -r sandbox/pkg/ui/workspaces/cluster-ui/dist crdb-checkout/pkg/ui/workspaces/
bazel info workspace --color=no
bazel run @nodejs//:yarn -- --silent --cwd crdb-checkout/pkg/ui/workspaces/db-console karma:watch
bazel run @nodejs//:yarn -- --silent --cwd crdb-checkout/pkg/ui/workspaces/cluster-ui jest --watch
+
+exec
+dev ui clean
+----
+bazel info workspace --color=no
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/src/js/protos.js
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/src/js/protos.d.ts
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/ccl/src/js/protos.js
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/ccl/src/js/protos.d.ts
+rm -rf crdb-checkout/pkg/ui/workspaces/cluster-ui/dist
+
+exec
+dev ui clean --all
+----
+bazel info workspace --color=no
+bazel info workspace --color=no
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/src/js/protos.js
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/src/js/protos.d.ts
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/ccl/src/js/protos.js
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/ccl/src/js/protos.d.ts
+rm -rf crdb-checkout/pkg/ui/workspaces/cluster-ui/dist
+rm -rf crdb-checkout/pkg/ui/node_modules
+rm -rf crdb-checkout/pkg/ui/workspaces/db-console/node_modules
+rm -rf crdb-checkout/pkg/ui/workspaces/cluster-ui/node_modules
diff --git a/pkg/cmd/dev/ui.go b/pkg/cmd/dev/ui.go
index fedda8d604a8..2dfe9940ab6b 100644
--- a/pkg/cmd/dev/ui.go
+++ b/pkg/cmd/dev/ui.go
@@ -35,9 +35,10 @@ func makeUICmd(d *dev) *cobra.Command {
Long: "Builds UI & runs UI-related commands to ease development flows",
}
- uiCmd.AddCommand(makeUIWatchCmd(d))
+ uiCmd.AddCommand(makeUICleanCmd(d))
uiCmd.AddCommand(makeUILintCmd(d))
uiCmd.AddCommand(makeUITestCmd(d))
+ uiCmd.AddCommand(makeUIWatchCmd(d))
return uiCmd
}
@@ -248,6 +249,58 @@ Replaces 'make ui-lint'.`,
return lintCmd
}
+func makeUICleanCmd(d *dev) *cobra.Command {
+ const (
+ allFlag = "all"
+ )
+
+ cleanCmd := &cobra.Command{
+ Use: "clean [--all]",
+ Short: "clean artifacts produced by `dev ui`",
+ Long: "Clean the workspace of artifacts produced by `dev ui`. Pass the --all option to also delete all installed dependencies.",
+ Args: cobra.MaximumNArgs(0),
+ RunE: func(cmd *cobra.Command, commandLine []string) error {
+ all := mustGetFlagBool(cmd, allFlag)
+ uiDirs, err := getUIDirs(d)
+ if err != nil {
+ return err
+ }
+ pathsToDelete := []string{
+ filepath.Join(uiDirs.dbConsole, "src", "js", "protos.js"),
+ filepath.Join(uiDirs.dbConsole, "src", "js", "protos.d.ts"),
+ filepath.Join(uiDirs.dbConsole, "ccl", "src", "js", "protos.js"),
+ filepath.Join(uiDirs.dbConsole, "ccl", "src", "js", "protos.d.ts"),
+ filepath.Join(uiDirs.clusterUI, "dist"),
+ }
+ if all {
+ workspace, err := d.getWorkspace(d.cli.Context())
+ if err != nil {
+ return err
+ }
+
+ pathsToDelete = append(
+ pathsToDelete,
+ filepath.Join(workspace, "pkg", "ui", "node_modules"),
+ filepath.Join(uiDirs.dbConsole, "node_modules"),
+ filepath.Join(uiDirs.clusterUI, "node_modules"),
+ )
+ }
+
+ for _, toDelete := range pathsToDelete {
+ err := d.os.RemoveAll(toDelete)
+ if err != nil {
+ return err
+ }
+ }
+ return nil
+ },
+ }
+
+ cleanCmd.Flags().Bool(allFlag, false, "additionally clean installed dependencies")
+
+ return cleanCmd
+}
+
// arrangeFilesForTestWatchers moves files from Bazel's build output directory
// into the locations they'd be found during a non-Bazel build, so that test
// watchers can successfully operate outside of the Bazel sandbox.
diff --git a/pkg/cmd/github-pull-request-make/main.go b/pkg/cmd/github-pull-request-make/main.go
index 9ce817bc37ee..430e9441f32c 100644
--- a/pkg/cmd/github-pull-request-make/main.go
+++ b/pkg/cmd/github-pull-request-make/main.go
@@ -317,6 +317,7 @@ func main() {
args = append(args, "--test_arg=-test.timeout", fmt.Sprintf("--test_arg=%s", timeout))
// Give the entire test 1 more minute than the duration to wrap up.
args = append(args, fmt.Sprintf("--test_timeout=%d", int((duration+1*time.Minute).Seconds())))
+ args = append(args, "--test_output", "streamed")
args = append(args, "--run_under", fmt.Sprintf("%s -bazel -shardable-artifacts 'XML_OUTPUT_FILE=%s merge-test-xmls' -stderr -maxfails 1 -maxtime %s -p %d", bazelStressTarget, bazciPath, duration, parallelism))
cmd := exec.Command("bazci", args...)
diff --git a/pkg/cmd/roachtest/tests/decommission.go b/pkg/cmd/roachtest/tests/decommission.go
index 71bee78138c1..062879e5d3a2 100644
--- a/pkg/cmd/roachtest/tests/decommission.go
+++ b/pkg/cmd/roachtest/tests/decommission.go
@@ -16,6 +16,7 @@ import (
"fmt"
"math/rand"
"reflect"
+ "regexp"
"strconv"
"strings"
"time"
@@ -26,6 +27,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
+ "github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
@@ -99,6 +101,17 @@ func registerDecommission(r registry.Registry) {
},
})
}
+ {
+ numNodes := 6
+ r.Add(registry.TestSpec{
+ Name: "decommission/slow",
+ Owner: registry.OwnerServer,
+ Cluster: r.MakeClusterSpec(numNodes),
+ Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+ runDecommissionSlow(ctx, t, c)
+ },
+ })
+ }
}
// runDrainAndDecommission marks 3 nodes in the test cluster as "draining" and
@@ -1091,6 +1104,84 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster)
require.NoError(t, cli.MatchCSV(o, expDecommissioned))
}
+// runDecommissionSlow decommissions 5 nodes in a test cluster of 6
+// (with a replication factor of 5), which will guarantee a replica transfer
+// stall. This test is meant to ensure that decommissioning replicas are
+// reported when replica transfers stall.
+func runDecommissionSlow(ctx context.Context, t test.Test, c cluster.Cluster) {
+ const (
+ numNodes = 6
+ pinnedNodeID = 1
+ replicationFactor = 5
+ )
+
+ var verboseStoreLogRe = regexp.MustCompile("possible decommission stall detected")
+
+ err := c.PutE(ctx, t.L(), t.Cockroach(), "./cockroach", c.All())
+ require.NoError(t, err)
+
+ c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.All())
+
+ run := func(db *gosql.DB, query string) {
+ _, err = db.ExecContext(ctx, query)
+ require.NoError(t, err)
+ t.L().Printf("run: %s\n", query)
+ }
+
+ {
+ db := c.Conn(ctx, t.L(), pinnedNodeID)
+ defer db.Close()
+
+ // Set the replication factor to 5.
+ run(db, fmt.Sprintf(`ALTER RANGE default CONFIGURE ZONE USING num_replicas=%d`, replicationFactor))
+ run(db, fmt.Sprintf(`ALTER DATABASE system CONFIGURE ZONE USING num_replicas=%d`, replicationFactor))
+
+ // Increase the speed of decommissioning.
+ run(db, `SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='2GiB'`)
+ run(db, `SET CLUSTER SETTING kv.snapshot_recovery.max_rate='2GiB'`)
+
+ // Wait for initial up-replication.
+ err := WaitForReplication(ctx, t, db, replicationFactor)
+ require.NoError(t, err)
+ }
+
+ // Decommission 5 nodes from the cluster, resulting in immovable replicas.
+ // Be prepared to cancel the context for the processes running decommissions
+ // since the decommissions will stall.
+ decomCtx, cancel := context.WithTimeout(ctx, 10*time.Minute)
+ defer cancel()
+ m := c.NewMonitor(decomCtx)
+ for nodeID := 2; nodeID <= numNodes; nodeID++ {
+ id := nodeID
+ m.Go(func(ctx context.Context) error {
+ decom := func(id int) error {
+ t.Status(fmt.Sprintf("decommissioning node %d", id))
+ return c.RunE(ctx,
+ c.Node(id),
+ fmt.Sprintf("./cockroach node decommission %d --insecure", id),
+ )
+ }
+ return decom(id)
+ })
+ }
+
+ // Check for reported decommissioning replicas.
+ t.Status("checking for decommissioning replicas report...")
+ testutils.SucceedsWithin(t, func() error {
+ for nodeID := 1; nodeID <= numNodes; nodeID++ {
+ if err = c.RunE(ctx,
+ c.Node(nodeID),
+ fmt.Sprintf("grep -q '%s' logs/cockroach.log", verboseStoreLogRe),
+ ); err == nil {
+ return nil
+ }
+ }
+ return errors.New("still waiting for decommissioning replicas report")
+ },
+ 3*time.Minute,
+ )
+}
+
// Header from the output of `cockroach node decommission`.
var decommissionHeader = []string{
"id", "is_live", "replicas", "is_decommissioning", "membership", "is_draining",
diff --git a/pkg/cmd/roachtest/tests/disk_full.go b/pkg/cmd/roachtest/tests/disk_full.go
index abb652d296a4..f33f2ccf8d75 100644
--- a/pkg/cmd/roachtest/tests/disk_full.go
+++ b/pkg/cmd/roachtest/tests/disk_full.go
@@ -41,12 +41,12 @@ func registerDiskFull(r registry.Registry) {
// Node 1 will soon be killed, when the ballast file fills up its disk. To
// ensure that the ranges containing system tables are available on other
- // nodes, we wait here for at least two replicas of each range. Without
- // this, it's possible that we end up deadlocked on a system query that
- // requires a range on node 1, but node 1 will not restart until the query
+ // nodes, we wait here for 3x replication of each range. Without this,
+ // it's possible that we end up deadlocked on a system query that requires
+ // a range on node 1, but node 1 will not restart until the query
// completes.
db := c.Conn(ctx, t.L(), 1)
- err := WaitForReplication(ctx, t, db, 2)
+ err := WaitFor3XReplication(ctx, t, db)
require.NoError(t, err)
_ = db.Close()
diff --git a/pkg/internal/sqlsmith/relational.go b/pkg/internal/sqlsmith/relational.go
index 67e136d12c05..e66ef908d520 100644
--- a/pkg/internal/sqlsmith/relational.go
+++ b/pkg/internal/sqlsmith/relational.go
@@ -206,6 +206,8 @@ func makeEquiJoinExpr(s *Smither, refs colRefs, forJoin bool) (tree.TableExpr, c
return nil, nil, false
}
+ s.lock.RLock()
+ defer s.lock.RUnlock()
// Determine overlapping types.
var available [][2]tree.TypedExpr
for _, leftCol := range leftRefs {
diff --git a/pkg/internal/sqlsmith/type.go b/pkg/internal/sqlsmith/type.go
index 3dddb02ecc00..4dfd4490ea2e 100644
--- a/pkg/internal/sqlsmith/type.go
+++ b/pkg/internal/sqlsmith/type.go
@@ -59,9 +59,9 @@ func (s *Smither) randScalarType() *types.T {
// isScalarType returns true if t is a member of types.Scalar, or a user defined
// enum.
+// Requires s.lock to be held.
func (s *Smither) isScalarType(t *types.T) bool {
- s.lock.RLock()
- defer s.lock.RUnlock()
+ s.lock.AssertRHeld()
scalarTypes := types.Scalar
if s.types != nil {
scalarTypes = s.types.scalarTypes
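
The sqlsmith change shifts the read lock to the caller and turns `isScalarType` into an assert-only helper, which avoids re-acquiring `s.lock` while it is already held (a recursive `RLock` can deadlock if a writer is queued in between). A small sketch of that lock discipline using the same `syncutil.RWMutex` assertion (the types and helpers here are made up for illustration):

```go
// Sketch of the lock discipline adopted here: the caller takes the read lock
// once, and helpers only assert that it is held instead of re-acquiring it.
// AssertRHeld is a no-op in normal builds and checked in deadlock builds.
package main

import "github.com/cockroachdb/cockroach/pkg/util/syncutil"

type smither struct {
	lock  syncutil.RWMutex
	types []string
}

// isKnownType requires s.lock to be held by the caller.
func (s *smither) isKnownType(t string) bool {
	s.lock.AssertRHeld()
	for _, cur := range s.types {
		if cur == t {
			return true
		}
	}
	return false
}

func (s *smither) pickMatching(candidates []string) []string {
	s.lock.RLock()
	defer s.lock.RUnlock()
	var out []string
	for _, c := range candidates {
		if s.isKnownType(c) { // safe: the helper does not take a nested RLock
			out = append(out, c)
		}
	}
	return out
}

func main() {
	s := &smither{types: []string{"INT", "STRING"}}
	_ = s.pickMatching([]string{"INT", "BOOL"})
}
```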
diff --git a/pkg/kv/kvserver/testdata/replica_unavailable_error.txt b/pkg/kv/kvserver/testdata/replica_unavailable_error.txt
index e685782c302e..c7f140c39a6d 100644
--- a/pkg/kv/kvserver/testdata/replica_unavailable_error.txt
+++ b/pkg/kv/kvserver/testdata/replica_unavailable_error.txt
@@ -1,3 +1,3 @@
echo
----
-replica (n1,s10):1 unable to serve request to r10:‹{a-z}› [(n1,s10):1, (n2,s20):2, next=3, gen=0]: raft status: {"id":"0","term":0,"vote":"0","commit":0,"lead":"0","raftState":"StateFollower","applied":0,"progress":{},"leadtransferee":"0"}: replicas on non-live nodes: (n2,s20):2 (lost quorum: true): probe failed
+replica unavailable: (n1,s10):1 unable to serve request to r10:‹{a-z}› [(n1,s10):1, (n2,s20):2, next=3, gen=0]: raft status: {"id":"0","term":0,"vote":"0","commit":0,"lead":"0","raftState":"StateFollower","applied":0,"progress":{},"leadtransferee":"0"}: replicas on non-live nodes: (n2,s20):2 (lost quorum: true): probe failed
diff --git a/pkg/roachpb/replica_unavailable_error.go b/pkg/roachpb/replica_unavailable_error.go
index 4259ee97eb94..75d1e7c2d492 100644
--- a/pkg/roachpb/replica_unavailable_error.go
+++ b/pkg/roachpb/replica_unavailable_error.go
@@ -37,7 +37,7 @@ var _ errors.Wrapper = (*ReplicaUnavailableError)(nil)
// SafeFormatError implements errors.SafeFormatter.
func (e *ReplicaUnavailableError) SafeFormatError(p errors.Printer) error {
- p.Printf("replica %s unable to serve request to %s: %s", e.Replica, e.Desc, e.Unwrap())
+ p.Printf("replica unavailable: %s unable to serve request to %s: %s", e.Replica, e.Desc, e.Unwrap())
return nil
}
diff --git a/pkg/roachpb/testdata/replica_unavailable_error.txt b/pkg/roachpb/testdata/replica_unavailable_error.txt
index f2ae80eb8b9b..cdb6a6dc4c53 100644
--- a/pkg/roachpb/testdata/replica_unavailable_error.txt
+++ b/pkg/roachpb/testdata/replica_unavailable_error.txt
@@ -1,4 +1,4 @@
echo
----
-replica (n1,s2):3 unable to serve request to r123:/M{in-ax} [(n1,s2):1, next=2, gen=0]: slow proposal
-replica (n1,s2):3 unable to serve request to r123:‹/M{in-ax}› [(n1,s2):1, next=2, gen=0]: slow proposal
+replica unavailable: (n1,s2):3 unable to serve request to r123:/M{in-ax} [(n1,s2):1, next=2, gen=0]: slow proposal
+replica unavailable: (n1,s2):3 unable to serve request to r123:‹/M{in-ax}› [(n1,s2):1, next=2, gen=0]: slow proposal
diff --git a/pkg/server/admin.go b/pkg/server/admin.go
index 05dcfa9f208c..d4d153c7cbd5 100644
--- a/pkg/server/admin.go
+++ b/pkg/server/admin.go
@@ -2302,6 +2302,7 @@ func (s *adminServer) decommissionStatusHelper(
// Get the number of replicas on each node. We *may* not need all of them,
// but that would be more complicated than seems worth it right now.
nodeIDs := req.NodeIDs
+ numReplicaReport := req.NumReplicaReport
// If no nodeIDs given, use all nodes.
if len(nodeIDs) == 0 {
@@ -2314,6 +2315,24 @@ func (s *adminServer) decommissionStatusHelper(
}
}
+ // If the client specified a number of decommissioning replicas to report,
+ // prepare to collect them so they can be reported to the operator.
+ // numReplicaReport is the maximum number of replicas reported per node.
+ var replicasToReport map[roachpb.NodeID][]*serverpb.DecommissionStatusResponse_Replica
+ if numReplicaReport > 0 {
+ log.Ops.Warning(ctx, "possible decommission stall detected; reporting decommissioning replicas")
+ replicasToReport = make(map[roachpb.NodeID][]*serverpb.DecommissionStatusResponse_Replica)
+ }
+
+ isDecommissioningNode := func(n roachpb.NodeID) bool {
+ for _, nodeID := range nodeIDs {
+ if n == nodeID {
+ return true
+ }
+ }
+ return false
+ }
+
// Compute the replica counts for the target nodes only. This map doubles as
// a lookup table to check whether we care about a given node.
var replicaCounts map[roachpb.NodeID]int64
@@ -2331,6 +2350,24 @@ func (s *adminServer) decommissionStatusHelper(
return errors.Wrapf(err, "%s: unable to unmarshal range descriptor", row.Key)
}
for _, r := range rangeDesc.Replicas().Descriptors() {
+ if numReplicaReport > 0 {
+ if len(replicasToReport[r.NodeID]) < int(numReplicaReport) {
+ if isDecommissioningNode(r.NodeID) {
+ replicasToReport[r.NodeID] = append(replicasToReport[r.NodeID],
+ &serverpb.DecommissionStatusResponse_Replica{
+ ReplicaID: r.ReplicaID,
+ RangeID: rangeDesc.RangeID,
+ },
+ )
+ log.Ops.Warningf(ctx,
+ "n%d still has replica id %d for range r%d",
+ r.NodeID,
+ r.ReplicaID,
+ rangeDesc.RangeID,
+ )
+ }
+ }
+ }
if _, ok := replicaCounts[r.NodeID]; ok {
replicaCounts[r.NodeID]++
}
@@ -2368,15 +2405,15 @@ func (s *adminServer) decommissionStatusHelper(
return nil, errors.Newf("unable to get liveness for %d", nodeID)
}
nodeResp := serverpb.DecommissionStatusResponse_Status{
- NodeID: l.NodeID,
- ReplicaCount: replicaCounts[l.NodeID],
- Membership: l.Membership,
- Draining: l.Draining,
+ NodeID: l.NodeID,
+ ReplicaCount: replicaCounts[l.NodeID],
+ Membership: l.Membership,
+ Draining: l.Draining,
+ ReportedReplicas: replicasToReport[l.NodeID],
}
if l.IsLive(s.server.clock.Now().GoTime()) {
nodeResp.IsLive = true
}
-
res.Status = append(res.Status, nodeResp)
}
@@ -2415,7 +2452,7 @@ func (s *adminServer) Decommission(
return &serverpb.DecommissionStatusResponse{}, nil
}
- return s.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: nodeIDs})
+ return s.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: nodeIDs, NumReplicaReport: req.NumReplicaReport})
}
// DataDistribution returns a count of replicas on each node for each table.
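
From a client's perspective the new field is opt-in: setting `NumReplicaReport` on the request asks the server to attach up to that many decommissioning replicas per node in `ReportedReplicas`. A sketch of consuming the field via the generated admin client (how the client connection is obtained is elided and hypothetical):

```go
// Sketch only: asking the admin server for a decommission replica report.
// How `client` is obtained (RPC connection, auth) is elided and hypothetical.
package decommreport

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

func reportStuckReplicas(ctx context.Context, client serverpb.AdminClient) error {
	resp, err := client.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{
		// Ask for at most 5 decommissioning replicas per node.
		NumReplicaReport: 5,
	})
	if err != nil {
		return err
	}
	for _, st := range resp.Status {
		for _, rep := range st.ReportedReplicas {
			fmt.Printf("n%d still has replica id %d for range r%d\n",
				st.NodeID, rep.ReplicaID, rep.RangeID)
		}
	}
	return nil
}
```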
diff --git a/pkg/server/serverpb/admin.proto b/pkg/server/serverpb/admin.proto
index 42e65aea9c98..197c942b58bc 100644
--- a/pkg/server/serverpb/admin.proto
+++ b/pkg/server/serverpb/admin.proto
@@ -476,6 +476,8 @@ message DrainResponse {
message DecommissionStatusRequest {
repeated int32 node_ids = 1 [(gogoproto.customname) = "NodeIDs",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
+ // The number of decommissioning replicas to be reported.
+ int32 num_replica_report = 2;
}
// DecommissionRequest requests the server to set the membership status on
@@ -486,10 +488,18 @@ message DecommissionRequest {
repeated int32 node_ids = 1 [(gogoproto.customname) = "NodeIDs",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
kv.kvserver.liveness.livenesspb.MembershipStatus target_membership = 2;
+ // The number of decommissioning replicas to be reported.
+ int32 num_replica_report = 3;
}
// DecommissionStatusResponse lists decommissioning statuses for a number of NodeIDs.
message DecommissionStatusResponse {
+ message Replica {
+ int32 replica_id = 1 [ (gogoproto.customname) = "ReplicaID",
+ (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.ReplicaID"];
+ int32 range_id = 2 [ (gogoproto.customname) = "RangeID",
+ (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"];
+ }
message Status {
int32 node_id = 1 [ (gogoproto.customname) = "NodeID",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
@@ -499,6 +509,10 @@ message DecommissionStatusResponse {
// The membership status of the given node.
kv.kvserver.liveness.livenesspb.MembershipStatus membership = 4;
bool draining = 5;
+ // Decommissioning replicas on the given node to be reported.
+ // How many replicas are reported is determined by what was specified in the
+ // request.
+ repeated Replica reported_replicas = 6;
}
// Status of all affected nodes.
repeated Status status = 2 [(gogoproto.nullable) = false];