From ae0ad5ca29ff865324cfc52bf75c75357f8af592 Mon Sep 17 00:00:00 2001 From: Miral Gadani Date: Tue, 25 Oct 2022 18:01:40 +0000 Subject: [PATCH 1/7] roachtest: This change introduces a transparent-to-caller, default retry when encountering an error of 255 (SSH). The retry could be exposed to callers fairly simply, but this PR does not introduce that. Today there exists the concept of a "roachprod error" and a "command error", the former of which denotes an issue in roachprod handling code, the latter representing an error executing a command over SSH. This PR preserves this behaviour and introduces an updated function signature from `f(i int) ([]byte, error)` to `f(i int) (*RunResultDetails, error)`. RunResultDetails has been expanded to capture information about the execution of a command, such as stderr/out, exit code, error, attempt number. Any roachprod error will result in an error being returned, and set in RunResultDetails.Err. Any command error will only be set in RunResultDetails.Err with the returned error being nil. This allows callers to differentiate between a roachprod and a command error, which existing code depends on. Retry handling code can use a function's returned RunResultDetails to determine whether a retry should take place. 
The default retry handling occurs on `RunResultDetails.RemoteExitStatus == 255` Release note: None Epic: CRDB-21386 --- pkg/cmd/roachtest/tests/BUILD.bazel | 1 + pkg/cmd/roachtest/tests/activerecord.go | 15 +- pkg/cmd/roachtest/tests/django.go | 15 +- pkg/cmd/roachtest/tests/gopg.go | 29 +- pkg/cmd/roachtest/tests/jepsen.go | 2 +- .../mixed_version_decl_schemachange_compat.go | 6 +- pkg/cmd/roachtest/tests/nodejs_postgres.go | 15 +- pkg/cmd/roachtest/tests/pgx.go | 15 +- pkg/cmd/roachtest/tests/psycopg.go | 15 +- pkg/cmd/roachtest/tests/ruby_pg.go | 15 +- pkg/cmd/roachtest/tests/sqlalchemy.go | 15 +- pkg/roachprod/errors/errors.go | 51 +- pkg/roachprod/install/BUILD.bazel | 3 + pkg/roachprod/install/cluster_synced.go | 566 +++++++++++------- pkg/roachprod/install/cluster_synced_test.go | 96 +++ pkg/roachprod/install/cockroach.go | 39 +- pkg/roachprod/install/session.go | 79 +-- pkg/roachprod/roachprod.go | 44 +- pkg/util/retry/retry.go | 5 + 19 files changed, 571 insertions(+), 455 deletions(-) diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index 94c351adc5b1..1ab47dd54863 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -189,6 +189,7 @@ go_library( "//pkg/roachpb", "//pkg/roachprod", "//pkg/roachprod/config", + "//pkg/roachprod/errors", "//pkg/roachprod/install", "//pkg/roachprod/logger", "//pkg/roachprod/prometheus", diff --git a/pkg/cmd/roachtest/tests/activerecord.go b/pkg/cmd/roachtest/tests/activerecord.go index 90c2d2dcfd93..61a29b633621 100644 --- a/pkg/cmd/roachtest/tests/activerecord.go +++ b/pkg/cmd/roachtest/tests/activerecord.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/install" 
"github.com/cockroachdb/errors" ) @@ -181,16 +182,10 @@ func registerActiveRecord(r registry.Registry) { `sudo RUBYOPT="-W0" TESTOPTS="-v" bundle exec rake test`, ) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. - if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil. + // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } // Result error contains stdout, stderr, and any error content returned by exec package. diff --git a/pkg/cmd/roachtest/tests/django.go b/pkg/cmd/roachtest/tests/django.go index db00c1493949..d6f4155a04e2 100644 --- a/pkg/cmd/roachtest/tests/django.go +++ b/pkg/cmd/roachtest/tests/django.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/errors" ) @@ -186,16 +187,10 @@ func registerDjango(r registry.Registry) { t.Status("Running django test app ", testName) result, err := c.RunWithDetailsSingleNode(ctx, t.L(), node, fmt.Sprintf(djangoRunTestCmd, testName)) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. 
- if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil. + // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } rawResults := []byte(result.Stdout + result.Stderr) diff --git a/pkg/cmd/roachtest/tests/gopg.go b/pkg/cmd/roachtest/tests/gopg.go index b1e54a4ac741..1424536cda3f 100644 --- a/pkg/cmd/roachtest/tests/gopg.go +++ b/pkg/cmd/roachtest/tests/gopg.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/errors" ) @@ -115,16 +116,10 @@ func registerGopg(r registry.Registry) { destPath, removeColorCodes, resultsFilePath), ) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. - if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil. 
+ // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } rawResults := []byte(result.Stdout + result.Stderr) @@ -152,16 +147,10 @@ func registerGopg(r registry.Registry) { destPath, goPath, resultsFilePath, goPath), ) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. - if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil. + // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } xmlResults := []byte(result.Stdout + result.Stderr) diff --git a/pkg/cmd/roachtest/tests/jepsen.go b/pkg/cmd/roachtest/tests/jepsen.go index 9ac55335b99b..754e2b63f013 100644 --- a/pkg/cmd/roachtest/tests/jepsen.go +++ b/pkg/cmd/roachtest/tests/jepsen.go @@ -165,7 +165,7 @@ func initJepsen(ctx context.Context, t test.Test, c cluster.Cluster, j jepsenCon ctx, t.L(), controller, "sh", "-c", `"sudo DEBIAN_FRONTEND=noninteractive apt-get -qqy install openjdk-8-jre openjdk-8-jre-headless libjna-java gnuplot > /dev/null 2>&1"`, ); err != nil { - if result.RemoteExitStatus == "100" { + if result.RemoteExitStatus == 100 { t.Skip("apt-get failure (#31944)", result.Stdout+result.Stderr) } t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go index 39237170a775..48bd885ff987 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go +++ b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go @@ -84,14 +84,14 @@ func 
validateCorpusFile( // Detect validation failures in standard output first, and dump those out. failureRegex := regexp.MustCompile(`failed to validate.*`) if matches := failureRegex.FindAllString(details.Stdout, -1); len(matches) > 0 { - t.Fatalf("Validation of corpus has failed (exit status %s): \n%s", + t.Fatalf("Validation of corpus has failed (exit status %d): \n%s", details.RemoteExitStatus, strings.Join(matches, "\n")) } // If no error is logged dump out both stdout and std error. - if details.RemoteExitStatus != "0" { - t.Fatalf("Validation command failed with exist status %s, output:\n %s\n%s\n", + if details.RemoteExitStatus != 0 { + t.Fatalf("Validation command failed with exit status %d, output:\n %s\n%s\n", details.RemoteExitStatus, details.Stdout, details.Stderr, diff --git a/pkg/cmd/roachtest/tests/nodejs_postgres.go b/pkg/cmd/roachtest/tests/nodejs_postgres.go index 2b9a05fbf90c..094fe8d4ad2e 100644 --- a/pkg/cmd/roachtest/tests/nodejs_postgres.go +++ b/pkg/cmd/roachtest/tests/nodejs_postgres.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" @@ -141,16 +142,10 @@ PGSSLCERT=$HOME/certs/client.%s.crt PGSSLKEY=$HOME/certs/client.%s.key PGSSLROOT ), ) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. - if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. 
A roachprod error is when result.Err==nil. + // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } rawResultsStr := result.Stdout + result.Stderr diff --git a/pkg/cmd/roachtest/tests/pgx.go b/pkg/cmd/roachtest/tests/pgx.go index 6144ad3583ef..c59b6954d2aa 100644 --- a/pkg/cmd/roachtest/tests/pgx.go +++ b/pkg/cmd/roachtest/tests/pgx.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/errors" ) @@ -121,16 +122,10 @@ func registerPgx(r registry.Registry) { "`go env GOPATH`/bin/go-junit-report", ) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. - if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil. + // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } // Result error contains stdout, stderr, and any error content returned by exec package. 
diff --git a/pkg/cmd/roachtest/tests/psycopg.go b/pkg/cmd/roachtest/tests/psycopg.go index b556720c02f6..b9de3e7ca68f 100644 --- a/pkg/cmd/roachtest/tests/psycopg.go +++ b/pkg/cmd/roachtest/tests/psycopg.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/errors" ) @@ -124,16 +125,10 @@ func registerPsycopg(r registry.Registry) { make check PYTHON_VERSION=3`, ) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. - if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil. + // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } // Result error contains stdout, stderr, and any error content returned by exec package. 
diff --git a/pkg/cmd/roachtest/tests/ruby_pg.go b/pkg/cmd/roachtest/tests/ruby_pg.go index 7418263e99fc..5e3874e541ec 100644 --- a/pkg/cmd/roachtest/tests/ruby_pg.go +++ b/pkg/cmd/roachtest/tests/ruby_pg.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" @@ -152,16 +153,10 @@ func registerRubyPG(r registry.Registry) { `cd /mnt/data1/ruby-pg/ && bundle exec rake compile test`, ) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. - if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil. 
+ // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } rawResults := []byte(result.Stdout + result.Stderr) diff --git a/pkg/cmd/roachtest/tests/sqlalchemy.go b/pkg/cmd/roachtest/tests/sqlalchemy.go index 0951fea093da..da9604893c8b 100644 --- a/pkg/cmd/roachtest/tests/sqlalchemy.go +++ b/pkg/cmd/roachtest/tests/sqlalchemy.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/errors" ) @@ -171,16 +172,10 @@ func runSQLAlchemy(ctx context.Context, t test.Test, c cluster.Cluster) { test/test_suite_sqlalchemy.py `) - // Expected to fail but we should still scan the error to check if - // there's an SSH/roachprod error. - if err != nil { - // install.NonZeroExitCode includes unrelated to SSH errors ("255") - // or roachprod errors, so we call t.Fatal if the error is not an - // install.NonZeroExitCode error - commandError := (*install.NonZeroExitCode)(nil) - if !errors.As(err, &commandError) { - t.Fatal(err) - } + // Fatal for a roachprod or SSH error. A roachprod error is when result.Err==nil. 
+ // Proceed for any other (command) errors + if err != nil && (result.Err == nil || errors.Is(err, rperrors.ErrSSH255)) { + t.Fatal(err) } rawResults := []byte(result.Stdout + result.Stderr) diff --git a/pkg/roachprod/errors/errors.go b/pkg/roachprod/errors/errors.go index f0fab50e6396..a6f5823200ea 100644 --- a/pkg/roachprod/errors/errors.go +++ b/pkg/roachprod/errors/errors.go @@ -118,8 +118,8 @@ func ClassifyCmdError(err error) Error { return nil } - if exitErr, ok := asExitError(err); ok { - if exitErr.ExitCode() == 255 { + if exitCode, ok := GetExitCode(err); ok { + if exitCode == 255 { return SSH{errors.Mark(err, ErrSSH255)} } return Cmd{err} @@ -128,6 +128,16 @@ func ClassifyCmdError(err error) Error { return Unclassified{err} } +// GetExitCode returns an exit code, true if the error is an instance +// of an ExitError, or -1, false otherwise +func GetExitCode(err error) (int, bool) { + if exitErr, ok := asExitError(err); ok { + return exitErr.ExitCode(), true + } + + return -1, false +} + // Extract the ExitError from err's error tree or (nil, false) if none exists. 
func asExitError(err error) (*exec.ExitError, bool) { var exitErr *exec.ExitError @@ -145,40 +155,3 @@ func AsError(err error) (Error, bool) { } return nil, false } - -// SelectPriorityError selects an error from the list in this priority order: -// -// - the Error with the highest exit code -// - one of the `error`s -// - nil -func SelectPriorityError(errors []error) error { - var result Error - for _, err := range errors { - if err == nil { - continue - } - - rpErr, _ := AsError(err) - if result == nil { - result = rpErr - continue - } - - if rpErr.ExitCode() > result.ExitCode() { - result = rpErr - } - } - - if result != nil { - return result - } - - for _, err := range errors { - if err != nil { - return err - } - } - return nil -} - -var _ = SelectPriorityError diff --git a/pkg/roachprod/install/BUILD.bazel b/pkg/roachprod/install/BUILD.bazel index 184678ddc639..1b13417f9c6f 100644 --- a/pkg/roachprod/install/BUILD.bazel +++ b/pkg/roachprod/install/BUILD.bazel @@ -53,8 +53,11 @@ go_test( data = glob(["testdata/**"]), embed = [":install"], deps = [ + "//pkg/roachprod/logger", "//pkg/testutils", + "//pkg/util/retry", "@com_github_cockroachdb_datadriven//:datadriven", + "@com_github_cockroachdb_errors//:errors", "@com_github_stretchr_testify//require", ], ) diff --git a/pkg/roachprod/install/cluster_synced.go b/pkg/roachprod/install/cluster_synced.go index 3817e4b56ce7..930cfade3628 100644 --- a/pkg/roachprod/install/cluster_synced.go +++ b/pkg/roachprod/install/cluster_synced.go @@ -100,6 +100,64 @@ func NewSyncedCluster( return c, nil } +// ErrAfterRetry marks an error that has occurred/persisted after retries +var ErrAfterRetry = errors.New("error occurred after retries") + +// The first retry is after 5s, the second and final is after 25s +var defaultRunRetryOpt = retry.Options{ + InitialBackoff: 5 * time.Second, + Multiplier: 5, + MaxBackoff: 1 * time.Minute, + // This will run a total of 3 times `runWithMaybeRetry` + MaxRetries: 2, +} + +// 
runWithMaybeRetry will run the specified function `f` at least once. +// Any returned error from `f` is passed to the `shouldRetryFn` which, +// if it returns true, will result in `f` being retried using the `retryOpts`. +// If the `shouldRetryFn` is not specified (nil), then no retries will be +// performed. +// +// We operate on a pointer to RunResultDetails as it has already been +// captured in a []*RunResultDetails in Run, but here we may enrich with attempt +// number and a wrapper error. +func runWithMaybeRetry( + l *logger.Logger, + retryOpts retry.Options, + shouldRetryFn func(details *RunResultDetails) bool, + f func() (*RunResultDetails, error), +) (*RunResultDetails, error) { + var err error + var res *RunResultDetails + + var cmdErr error + + for r := retry.Start(retryOpts); r.Next(); { + res, err = f() + res.Attempt = r.CurrentAttempt() + 1 + // nil err indicates a potentially retryable res.Err + if err == nil && res.Err != nil { + cmdErr = errors.CombineErrors(cmdErr, res.Err) + if shouldRetryFn != nil && shouldRetryFn(res) { + l.Printf("Encountered [%v] on attempt %v of %v", res.Err, r.CurrentAttempt()+1, defaultRunRetryOpt.MaxRetries+1) + continue + } + } + break + } + + if res.Err != nil && res.Attempt > 1 { + // An error cannot be marked with more than one reference error. Since res.Err may already be marked, we create + // a new error here and mark it. + res.Err = errors.Mark(errors.Wrapf(cmdErr, "error persisted after %v attempts", res.Attempt), ErrAfterRetry) + } + return res, err +} + +func defaultShouldRetry(res *RunResultDetails) bool { + return errors.Is(res.Err, rperrors.ErrSSH255) +} + // Host returns the public IP of a node. 
func (c *SyncedCluster) Host(n Node) string { return c.VMs[n-1].PublicIP @@ -237,10 +295,11 @@ func (c *SyncedCluster) Stop( if wait { display += " and waiting" } - return c.Parallel(l, display, len(c.Nodes), 0, func(i int) ([]byte, error) { - sess, err := c.newSession(c.Nodes[i]) + return c.Parallel(l, display, len(c.Nodes), 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() @@ -263,8 +322,8 @@ func (c *SyncedCluster) Stop( done echo "${pid}: dead" >> %[1]s/roachprod.log done`, - c.LogDir(c.Nodes[i]), // [1] - maxWait, // [2] + c.LogDir(node), // [1] + maxWait, // [2] ) } @@ -281,12 +340,15 @@ if [ -n "${pids}" ]; then kill -%[3]d ${pids} %[4]s fi`, - c.LogDir(c.Nodes[i]), // [1] - c.roachprodEnvRegex(c.Nodes[i]), // [2] - sig, // [3] - waitCmd, // [4] + c.LogDir(node), // [1] + c.roachprodEnvRegex(node), // [2] + sig, // [3] + waitCmd, // [4] ) - return sess.CombinedOutput(ctx, cmd) + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + return res, res.Err }) } @@ -296,10 +358,11 @@ func (c *SyncedCluster) Wipe(ctx context.Context, l *logger.Logger, preserveCert if err := c.Stop(ctx, l, 9, true /* wait */, 0 /* maxWait */); err != nil { return err } - return c.Parallel(l, display, len(c.Nodes), 0, func(i int) ([]byte, error) { - sess, err := c.newSession(c.Nodes[i]) + return c.Parallel(l, display, len(c.Nodes), 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() @@ -324,7 +387,10 @@ sudo rm -fr logs && cmd += "sudo rm -fr tenant-certs* ;\n" } } - return sess.CombinedOutput(ctx, cmd) + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + return res, 
res.Err }) } @@ -341,19 +407,19 @@ type NodeStatus struct { func (c *SyncedCluster) Status(ctx context.Context, l *logger.Logger) ([]NodeStatus, error) { display := fmt.Sprintf("%s: status", c.Name) results := make([]NodeStatus, len(c.Nodes)) - if err := c.Parallel(l, display, len(c.Nodes), 0, func(i int) ([]byte, error) { - sess, err := c.newSession(c.Nodes[i]) + if err := c.Parallel(l, display, len(c.Nodes), 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + sess, err := c.newSession(node) if err != nil { - results[i] = NodeStatus{Err: err} - return nil, nil + return newRunResultDetails(node, err), err } defer sess.Close() - binary := cockroachNodeBinary(c, c.Nodes[i]) + binary := cockroachNodeBinary(c, node) cmd := fmt.Sprintf(`out=$(ps axeww -o pid -o ucomm -o command | \ sed 's/export ROACHPROD=//g' | \ awk '/%s/ {print $2, $1}'`, - c.roachprodEnvRegex(c.Nodes[i])) + c.roachprodEnvRegex(node)) cmd += ` | sort | uniq); vers=$(` + binary + ` version 2>/dev/null | awk '/Build Tag:/ {print $NF}') if [ -n "${out}" -a -n "${vers}" ]; then @@ -362,19 +428,23 @@ else echo ${out} fi ` - out, err := sess.CombinedOutput(ctx, cmd) - var msg string - if err != nil { - return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out) + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + + if res.Err != nil { + return res, errors.Wrapf(res.Err, "~ %s\n%s", cmd, res.CombinedOut) } - msg = strings.TrimSpace(string(out)) + + msg := strings.TrimSpace(string(res.CombinedOut)) if msg == "" { results[i] = NodeStatus{Running: false} - return nil, nil + return res, nil } info := strings.Split(msg, " ") results[i] = NodeStatus{Running: true, Version: info[0], Pid: info[1]} - return nil, nil + + return res, nil }); err != nil { return nil, err } @@ -574,49 +644,45 @@ type RunResultDetails struct { Node Node Stdout string Stderr string + CombinedOut []byte Err error - RemoteExitStatus string + RemoteExitStatus int + Attempt 
int } -func processStdout(stdout string) (string, string) { - retStdout := stdout - exitStatusPattern := "LAST EXIT STATUS: " - exitStatusIndex := strings.LastIndex(retStdout, exitStatusPattern) - remoteExitStatus := "-1" - // If exitStatusIndex is -1 then "echo LAST EXIT STATUS: $?" didn't run - // mostly due to an ssh error but avoid speculation and temporarily - // use "-1" for unknown error before checking if it's SSH related later. - if exitStatusIndex != -1 { - retStdout = stdout[:exitStatusIndex] - remoteExitStatus = strings.TrimSpace(stdout[exitStatusIndex+len(exitStatusPattern):]) - } - return retStdout, remoteExitStatus +func newRunResultDetails(node Node, err error) *RunResultDetails { + res := RunResultDetails{ + Node: node, + Err: err, + } + if exitCode, success := rperrors.GetExitCode(err); success { + res.RemoteExitStatus = exitCode + } + + return &res } -func runCmdOnSingleNode( - ctx context.Context, l *logger.Logger, c *SyncedCluster, node Node, cmd string, -) (RunResultDetails, error) { - result := RunResultDetails{Node: node} +func (c *SyncedCluster) runCmdOnSingleNode( + ctx context.Context, + l *logger.Logger, + node Node, + cmd string, + combined bool, + stdout, stderr io.Writer, +) (*RunResultDetails, error) { sess, err := c.newSession(node) if err != nil { - return result, err + return newRunResultDetails(node, err), err } defer sess.Close() - sess.SetWithExitStatus(true) - var stdoutBuffer, stderrBuffer bytes.Buffer - multStdout := io.MultiWriter(&stdoutBuffer, l.Stdout) - multStderr := io.MultiWriter(&stderrBuffer, l.Stderr) - sess.SetStdout(multStdout) - sess.SetStderr(multStderr) - // Argument template expansion is node specific (e.g. for {store-dir}). e := expander{ node: node, } expandedCmd, err := e.expand(ctx, l, c, cmd) if err != nil { - return result, err + return newRunResultDetails(node, err), err } // Be careful about changing these command strings. 
In particular, we need @@ -633,31 +699,29 @@ func runCmdOnSingleNode( nodeCmd = fmt.Sprintf("cd %s; %s", c.localVMDir(node), nodeCmd) } - err = sess.Run(ctx, nodeCmd) - result.Stderr = stderrBuffer.String() - result.Stdout, result.RemoteExitStatus = processStdout(stdoutBuffer.String()) + var res *RunResultDetails + if combined { + out, cmdErr := sess.CombinedOutput(ctx, nodeCmd) + res = newRunResultDetails(node, cmdErr) + res.CombinedOut = out + } else { + // We stream the output if running on a single node. + var stdoutBuffer, stderrBuffer bytes.Buffer + multStdout := io.MultiWriter(&stdoutBuffer, stdout) + multStderr := io.MultiWriter(&stderrBuffer, stderr) + sess.SetStdout(multStdout) + sess.SetStderr(multStderr) - if err != nil { - detailMsg := fmt.Sprintf("Node %d. Command with error:\n```\n%s\n```\n", node, cmd) - err = errors.WithDetail(err, detailMsg) - err = rperrors.ClassifyCmdError(err) - if errors.Is(err, rperrors.ErrSSH255) { - result.RemoteExitStatus = "255" - } - result.Err = err - } else if result.RemoteExitStatus != "0" { - result.Err = &NonZeroExitCode{fmt.Sprintf("Non-zero exit code: %s", result.RemoteExitStatus)} + res = newRunResultDetails(node, sess.Run(ctx, nodeCmd)) + res.Stderr = stderrBuffer.String() + res.Stdout = stdoutBuffer.String() } - return result, nil -} -// NonZeroExitCode is returned when a command executed by Run() exits with a non-zero status. -type NonZeroExitCode struct { - message string -} - -func (e *NonZeroExitCode) Error() string { - return e.message + if res.Err != nil { + detailMsg := fmt.Sprintf("Node %d. Command with error:\n```\n%s\n```\n", node, cmd) + res.Err = errors.WithDetail(res.Err, detailMsg) + } + return res, nil } // Run a command on >= 1 node in the cluster. 
@@ -681,74 +745,47 @@ func (c *SyncedCluster) Run( display = fmt.Sprintf("%s: %s", c.Name, title) } - errs := make([]error, len(nodes)) - results := make([]string, len(nodes)) - if err := c.Parallel(l, display, len(nodes), 0, func(i int) ([]byte, error) { - sess, err := c.newSession(nodes[i]) - if err != nil { - errs[i] = err - results[i] = err.Error() - return nil, nil - } - defer sess.Close() + results := make([]*RunResultDetails, len(nodes)) - // Argument template expansion is node specific (e.g. for {store-dir}). - e := expander{ - node: nodes[i], - } - expandedCmd, err := e.expand(ctx, l, c, cmd) - if err != nil { - return nil, err + // A result is the output of running a command (could be interpreted as an error) + if _, err := c.ParallelE(l, display, len(nodes), 0, func(i int) (*RunResultDetails, error) { + // An err returned here is an unexpected state within roachprod (non-command error). + // For errors that occur as part of running a command over ssh, the `result` will contain + // the actual error on a specific node. + result, err := c.runCmdOnSingleNode(ctx, l, nodes[i], cmd, !stream, stdout, stderr) + results[i] = result + return result, err + }); err != nil { + return err + } + + return processResults(results, stream, stdout) +} + +// processResults returns the error from the RunResultDetails with the highest RemoteExitStatus +func processResults(results []*RunResultDetails, stream bool, stdout io.Writer) error { + var resultWithError *RunResultDetails + for i, r := range results { + if !stream { + fmt.Fprintf(stdout, " %2d: %s\n%v\n", i+1, strings.TrimSpace(string(r.CombinedOut)), r.Err) } - // Be careful about changing these command strings. In particular, we need - // to support running commands in the background on both local and remote - // nodes. For example: - // - // roachprod run cluster -- "sleep 60 &> /dev/null < /dev/null &" - // - // That command should return immediately. 
And a "roachprod status" should - // reveal that the sleep command is running on the cluster. - nodeCmd := fmt.Sprintf(`export ROACHPROD=%s GOTRACEBACK=crash && bash -c %s`, - c.roachprodEnvValue(nodes[i]), ssh.Escape1(expandedCmd)) - if c.IsLocal() { - nodeCmd = fmt.Sprintf("cd %s; %s", c.localVMDir(nodes[i]), nodeCmd) - } - - if stream { - sess.SetStdout(stdout) - sess.SetStderr(stderr) - errs[i] = sess.Run(ctx, nodeCmd) - if errs[i] != nil { - detailMsg := fmt.Sprintf("Node %d. Command with error:\n```\n%s\n```\n", nodes[i], cmd) - err = errors.WithDetail(errs[i], detailMsg) - err = rperrors.ClassifyCmdError(err) - errs[i] = err + if r.Err != nil { + if resultWithError == nil { + resultWithError = r + continue } - return nil, nil - } - out, err := sess.CombinedOutput(ctx, nodeCmd) - msg := strings.TrimSpace(string(out)) - if err != nil { - detailMsg := fmt.Sprintf("Node %d. Command with error:\n```\n%s\n```\n", nodes[i], cmd) - err = errors.WithDetail(err, detailMsg) - err = rperrors.ClassifyCmdError(err) - errs[i] = err - msg += fmt.Sprintf("\n%v", err) - } - results[i] = msg - return nil, nil - }); err != nil { - return err + if r.RemoteExitStatus > resultWithError.RemoteExitStatus { + resultWithError = r + } + } } - if !stream { - for i, r := range results { - fmt.Fprintf(stdout, " %2d: %s\n", nodes[i], r) - } + if resultWithError != nil { + return resultWithError.Err } - return rperrors.SelectPriorityError(errs) + return nil } // RunWithDetails runs a command on the specified nodes and returns results details and an error. 
@@ -756,19 +793,24 @@ func (c *SyncedCluster) RunWithDetails( ctx context.Context, l *logger.Logger, nodes Nodes, title, cmd string, ) ([]RunResultDetails, error) { display := fmt.Sprintf("%s: %s", c.Name, title) - results := make([]RunResultDetails, len(nodes)) - failed, err := c.ParallelE(l, display, len(nodes), 0, func(i int) ([]byte, error) { - result, err := runCmdOnSingleNode(ctx, l, c, nodes[i], cmd) - if err != nil { - return nil, err - } - results[i] = result - return nil, nil + // We use pointers here as we are capturing the state of a result even though it may + // be processed further by the caller. + resultPtrs := make([]*RunResultDetails, len(nodes)) + + // Both return values are explicitly ignored because, in this case, resultPtrs + // capture both error and result state for each node + _, _ = c.ParallelE(l, display, len(nodes), 0, func(i int) (*RunResultDetails, error) { //nolint:errcheck + result, err := c.runCmdOnSingleNode(ctx, l, nodes[i], cmd, false, l.Stdout, l.Stderr) + resultPtrs[i] = result + return result, err }) - if err != nil { - for _, node := range failed { - results[node.Index].Err = node.Err + + // Return values to preserve API + results := make([]RunResultDetails, len(nodes)) + for i, v := range resultPtrs { + if v != nil { + results[i] = *v } } return results, nil @@ -807,9 +849,11 @@ func (c *SyncedCluster) RepeatRun( func (c *SyncedCluster) Wait(ctx context.Context, l *logger.Logger) error { display := fmt.Sprintf("%s: waiting for nodes to start", c.Name) errs := make([]error, len(c.Nodes)) - if err := c.Parallel(l, display, len(c.Nodes), 0, func(i int) ([]byte, error) { + if err := c.Parallel(l, display, len(c.Nodes), 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + res := &RunResultDetails{Node: node} for j := 0; j < 600; j++ { - sess, err := c.newSession(c.Nodes[i]) + sess, err := c.newSession(node) if err != nil { time.Sleep(500 * time.Millisecond) continue @@ -821,10 +865,11 @@ func (c *SyncedCluster) 
Wait(ctx context.Context, l *logger.Logger) error { time.Sleep(500 * time.Millisecond) continue } - return nil, nil + return res, nil } errs[i] = errors.New("timed out after 5m") - return nil, nil + res.Err = errs[i] + return res, nil }); err != nil { return err } @@ -842,6 +887,16 @@ func (c *SyncedCluster) Wait(ctx context.Context, l *logger.Logger) error { return nil } +// setupSession is a helper which creates a new session and +// populates RunResultDetails with an error if one occurs (unlikely +// given the code in `newSession`) +// RunResultDetails is used across all functions which +// create a session and holds error and stdout information +func (c *SyncedCluster) setupSession(node Node) (session, error) { + sess, err := c.newSession(node) + return sess, err +} + // SetupSSH configures the cluster for use with SSH. This is generally run after // the cloud.Cluster has been synced which resets the SSH credentials on the // machines and sets them up for the current user. This method enables the @@ -870,10 +925,10 @@ func (c *SyncedCluster) SetupSSH(ctx context.Context, l *logger.Logger) error { // Generate an ssh key that we'll distribute to all of the nodes in the // cluster in order to allow inter-node ssh.
var sshTar []byte - if err := c.Parallel(l, "generating ssh key", 1, 0, func(i int) ([]byte, error) { - sess, err := c.newSession(1) + if err := c.Parallel(l, "generating ssh key", 1, 0, func(i int) (*RunResultDetails, error) { + sess, err := c.setupSession(1) if err != nil { - return nil, err + return newRunResultDetails(1, err), err } defer sess.Close() @@ -892,30 +947,40 @@ tar cf - .ssh/id_rsa .ssh/id_rsa.pub .ssh/authorized_keys sess.SetStdout(&stdout) sess.SetStderr(&stderr) - if err := sess.Run(ctx, cmd); err != nil { - return nil, errors.Wrapf(err, "%s: stderr:\n%s", cmd, stderr.String()) + res := newRunResultDetails(1, sess.Run(ctx, cmd)) + + res.Stdout = stdout.String() + res.Stderr = stderr.String() + if res.Err != nil { + return res, errors.Wrapf(res.Err, "%s: stderr:\n%s", cmd, res.Stderr) } - sshTar = stdout.Bytes() - return nil, nil + sshTar = []byte(res.Stdout) + return res, nil }); err != nil { return err } // Skip the first node which is where we generated the key. nodes := c.Nodes[1:] - if err := c.Parallel(l, "distributing ssh key", len(nodes), 0, func(i int) ([]byte, error) { - sess, err := c.newSession(nodes[i]) + if err := c.Parallel(l, "distributing ssh key", len(nodes), 0, func(i int) (*RunResultDetails, error) { + node := nodes[i] + sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() sess.SetStdin(bytes.NewReader(sshTar)) cmd := `tar xf -` - if out, err := sess.CombinedOutput(ctx, cmd); err != nil { - return nil, errors.Wrapf(err, "%s: output:\n%s", cmd, out) + + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + + if res.Err != nil { + return res, errors.Wrapf(res.Err, "%s: output:\n%s", cmd, res.CombinedOut) } - return nil, nil + return res, nil }); err != nil { return err } @@ -925,19 +990,23 @@ tar cf - .ssh/id_rsa .ssh/id_rsa.pub .ssh/authorized_keys // known hosts file in unhashed format, 
working around a limitation of jsch // (which is used in jepsen tests). ips := make([]string, len(c.Nodes), len(c.Nodes)*2) - if err := c.Parallel(l, "retrieving hosts", len(c.Nodes), 0, func(i int) ([]byte, error) { + if err := c.Parallel(l, "retrieving hosts", len(c.Nodes), 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + res := &RunResultDetails{Node: node} for j := 0; j < 20 && ips[i] == ""; j++ { var err error - ips[i], err = c.GetInternalIP(ctx, c.Nodes[i]) + ips[i], err = c.GetInternalIP(ctx, node) if err != nil { - return nil, errors.Wrapf(err, "pgurls") + res.Err = errors.Wrapf(err, "pgurls") + return res, res.Err } time.Sleep(time.Second) } if ips[i] == "" { - return nil, fmt.Errorf("retrieved empty IP address") + res.Err = fmt.Errorf("retrieved empty IP address") + return res, res.Err } - return nil, nil + return res, nil }); err != nil { return err } @@ -946,10 +1015,11 @@ tar cf - .ssh/id_rsa .ssh/id_rsa.pub .ssh/authorized_keys ips = append(ips, c.Host(i)) } var knownHostsData []byte - if err := c.Parallel(l, "scanning hosts", 1, 0, func(i int) ([]byte, error) { - sess, err := c.newSession(c.Nodes[i]) + if err := c.Parallel(l, "scanning hosts", 1, 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() @@ -979,19 +1049,25 @@ exit 1 var stderr bytes.Buffer sess.SetStdout(&stdout) sess.SetStderr(&stderr) - if err := sess.Run(ctx, cmd); err != nil { - return nil, errors.Wrapf(err, "%s: stderr:\n%s", cmd, stderr.String()) + + res := newRunResultDetails(node, sess.Run(ctx, cmd)) + + res.Stdout = stdout.String() + res.Stderr = stderr.String() + if res.Err != nil { + return res, errors.Wrapf(res.Err, "%s: stderr:\n%s", cmd, res.Stderr) } knownHostsData = stdout.Bytes() - return nil, nil + return res, nil }); err != nil { return err } - if err := c.Parallel(l, "distributing known_hosts", len(c.Nodes), 
0, func(i int) ([]byte, error) { - sess, err := c.newSession(c.Nodes[i]) + if err := c.Parallel(l, "distributing known_hosts", len(c.Nodes), 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() @@ -1023,10 +1099,14 @@ if [[ "$(whoami)" != "` + config.SharedUser + `" ]]; then '"'"'{}'"'"' ~` + config.SharedUser + `/.ssh' \; fi ` - if out, err := sess.CombinedOutput(ctx, cmd); err != nil { - return nil, errors.Wrapf(err, "%s: output:\n%s", cmd, out) + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + + if res.Err != nil { + return res, errors.Wrapf(res.Err, "%s: output:\n%s", cmd, res.CombinedOut) } - return nil, nil + return res, nil }); err != nil { return err } @@ -1037,10 +1117,11 @@ fi // additional authorized_keys to both the current user (your username on // gce and the shared user on aws) as well as to the shared user on both // platforms. 
- if err := c.Parallel(l, "adding additional authorized keys", len(c.Nodes), 0, func(i int) ([]byte, error) { - sess, err := c.newSession(c.Nodes[i]) + if err := c.Parallel(l, "adding additional authorized keys", len(c.Nodes), 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() @@ -1067,10 +1148,14 @@ if [[ "$(whoami)" != "` + config.SharedUser + `" ]]; then "${tmp2}" ~` + config.SharedUser + `/.ssh/authorized_keys fi ` - if out, err := sess.CombinedOutput(ctx, cmd); err != nil { - return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out) + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + + if res.Err != nil { + return res, errors.Wrapf(res.Err, "~ %s\n%s", cmd, res.CombinedOut) } - return nil, nil + return res, nil }); err != nil { return err } @@ -1099,10 +1184,10 @@ func (c *SyncedCluster) DistributeCerts(ctx context.Context, l *logger.Logger) e // Generate the ca, client and node certificates on the first node. 
var msg string display := fmt.Sprintf("%s: initializing certs", c.Name) - if err := c.Parallel(l, display, 1, 0, func(i int) ([]byte, error) { - sess, err := c.newSession(1) + if err := c.Parallel(l, display, 1, 0, func(i int) (*RunResultDetails, error) { + sess, err := c.setupSession(1) if err != nil { - return nil, err + return newRunResultDetails(1, err), err } defer sess.Close() @@ -1119,10 +1204,15 @@ mkdir -p certs %[1]s cert create-node %[2]s --certs-dir=certs --ca-key=certs/ca.key tar cvf %[3]s certs `, cockroachNodeBinary(c, 1), strings.Join(nodeNames, " "), certsTarName) - if out, err := sess.CombinedOutput(ctx, cmd); err != nil { - msg = fmt.Sprintf("%s: %v", out, err) + + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(1, cmdErr) + res.CombinedOut = out + + if res.Err != nil { + msg = fmt.Sprintf("%s: %v", res.CombinedOut, res.Err) } - return nil, nil + return res, nil }); err != nil { return err } @@ -1183,11 +1273,11 @@ func (c *SyncedCluster) createTenantCertBundle( ctx context.Context, l *logger.Logger, bundleName string, tenantID int, nodeNames []string, ) error { display := fmt.Sprintf("%s: initializing tenant certs", c.Name) - return c.Parallel(l, display, 1, 0, func(i int) ([]byte, error) { + return c.Parallel(l, display, 1, 0, func(i int) (*RunResultDetails, error) { node := c.Nodes[i] sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() @@ -1221,10 +1311,14 @@ tar cvf %[5]s $CERT_DIR bundleName, ) - if out, err := sess.CombinedOutput(ctx, cmd); err != nil { - return nil, errors.Wrapf(err, "certificate creation error: %s", out) + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + + if res.Err != nil { + return res, errors.Wrapf(res.Err, "certificate creation error: %s", res.CombinedOut) } - return nil, nil + return res, nil }) } @@ -1299,14 +1393,20 @@ func (c *SyncedCluster) 
fileExistsOnFirstNode( ) bool { var existsErr error display := fmt.Sprintf("%s: checking %s", c.Name, path) - if err := c.Parallel(l, display, 1, 0, func(i int) ([]byte, error) { - sess, err := c.newSession(c.Nodes[i]) + if err := c.Parallel(l, display, 1, 0, func(i int) (*RunResultDetails, error) { + node := c.Nodes[i] + sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() - _, existsErr = sess.CombinedOutput(ctx, `test -e `+path) - return nil, nil + + out, cmdErr := sess.CombinedOutput(ctx, `test -e `+path) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + + existsErr = res.Err + return res, nil }); err != nil { return false } @@ -1325,10 +1425,13 @@ func (c *SyncedCluster) createNodeCertArguments( nodes := allNodes(len(c.VMs)) if !c.IsLocal() { ips = make([]string, len(nodes)) - if err := c.Parallel(l, "", len(nodes), 0, func(i int) ([]byte, error) { - var err error - ips[i], err = c.GetInternalIP(ctx, nodes[i]) - return nil, errors.Wrapf(err, "IPs") + if err := c.Parallel(l, "", len(nodes), 0, func(i int) (*RunResultDetails, error) { + node := nodes[i] + res := &RunResultDetails{Node: node} + + res.Stdout, res.Err = c.GetInternalIP(ctx, node) + ips[i] = res.Stdout + return res, errors.Wrapf(res.Err, "IPs") }); err != nil { return nil, err } @@ -1369,27 +1472,33 @@ func (c *SyncedCluster) distributeLocalCertsTar( } display := c.Name + ": distributing certs" - return c.Parallel(l, display, len(nodes), 0, func(i int) ([]byte, error) { - sess, err := c.newSession(nodes[i]) + return c.Parallel(l, display, len(nodes), 0, func(i int) (*RunResultDetails, error) { + node := nodes[i] + sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() sess.SetStdin(bytes.NewReader(certsTar)) var cmd string if c.IsLocal() { - cmd = fmt.Sprintf("cd %s ; ", c.localVMDir(nodes[i])) + cmd = fmt.Sprintf("cd %s ; ", 
c.localVMDir(node)) } if stripComponents > 0 { cmd += fmt.Sprintf("tar --strip-components=%d -xf -", stripComponents) } else { cmd += "tar xf -" } - if out, err := sess.CombinedOutput(ctx, cmd); err != nil { - return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out) + + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out + + if res.Err != nil { + return res, errors.Wrapf(res.Err, "~ %s\n%s", cmd, res.CombinedOut) } - return nil, nil + return res, nil }) } @@ -2055,10 +2164,12 @@ func (c *SyncedCluster) pghosts( ctx context.Context, l *logger.Logger, nodes Nodes, ) (map[Node]string, error) { ips := make([]string, len(nodes)) - if err := c.Parallel(l, "", len(nodes), 0, func(i int) ([]byte, error) { - var err error - ips[i], err = c.GetInternalIP(ctx, nodes[i]) - return nil, errors.Wrapf(err, "pghosts") + if err := c.Parallel(l, "", len(nodes), 0, func(i int) (*RunResultDetails, error) { + node := nodes[i] + res := &RunResultDetails{Node: node} + res.Stdout, res.Err = c.GetInternalIP(ctx, node) + ips[i] = res.Stdout + return res, errors.Wrapf(res.Err, "pghosts") }); err != nil { return nil, err } @@ -2161,7 +2272,10 @@ type ParallelResult struct { // // See ParallelE for more information. func (c *SyncedCluster) Parallel( - l *logger.Logger, display string, count, concurrency int, fn func(i int) ([]byte, error), + l *logger.Logger, + display string, + count, concurrency int, + fn func(i int) (*RunResultDetails, error), ) error { failed, err := c.ParallelE(l, display, count, concurrency, fn) if err != nil { @@ -2182,10 +2296,16 @@ func (c *SyncedCluster) Parallel( // `config.MaxConcurrency` if it is lower) in parallel. If `concurrency` is // 0, then it defaults to `count`. 
// +// The function returns a pointer to RunResultDetails as we may enrich +// the result with retry information (attempt number, wrapper error) +// // If err is non-nil, the slice of ParallelResults will contain the // results from any of the failed invocations. func (c *SyncedCluster) ParallelE( - l *logger.Logger, display string, count, concurrency int, fn func(i int) ([]byte, error), + l *logger.Logger, + display string, + count, concurrency int, + fn func(i int) (*RunResultDetails, error), ) ([]ParallelResult, error) { if concurrency == 0 || concurrency > count { concurrency = count @@ -2202,8 +2322,8 @@ func (c *SyncedCluster) ParallelE( startNext := func() { go func(i int) { defer wg.Done() - out, err := fn(i) - results <- ParallelResult{i, out, err} + res, err := runWithMaybeRetry(l, defaultRunRetryOpt, defaultShouldRetry, func() (*RunResultDetails, error) { return fn(i) }) + results <- ParallelResult{i, res.CombinedOut, err} }(index) index++ } @@ -2286,7 +2406,7 @@ func (c *SyncedCluster) ParallelE( for _, res := range failed { err = errors.CombineErrors(err, res.Err) } - return nil, errors.Wrap(err, "parallel execution failure") + return failed, errors.Wrap(err, "parallel execution failure") } return nil, nil } diff --git a/pkg/roachprod/install/cluster_synced_test.go b/pkg/roachprod/install/cluster_synced_test.go index fb25e09ea689..c0490e159ef0 100644 --- a/pkg/roachprod/install/cluster_synced_test.go +++ b/pkg/roachprod/install/cluster_synced_test.go @@ -12,7 +12,14 @@ package install import ( "fmt" + "io" "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/retry" + "github.com/cockroachdb/errors" + "github.com/stretchr/testify/require" ) // TestRoachprodEnv tests the roachprodEnvRegex and roachprodEnvValue methods. 
@@ -75,3 +82,92 @@ func TestRoachprodEnv(t *testing.T) { }) } } + +func TestRunWithMaybeRetry(t *testing.T) { + var testRetryOpts = retry.Options{ + InitialBackoff: 10 * time.Millisecond, + Multiplier: 2, + MaxBackoff: 1 * time.Second, + // With MaxRetries = 2, `runWithMaybeRetry` will run a total of 3 times + MaxRetries: 2, + } + + l := nilLogger() + + attempt := 0 + cases := []struct { + f func() (*RunResultDetails, error) + shouldRetryFn func(res *RunResultDetails) bool + expectedAttempts int + shouldError bool + }{ + { // Happy path: no error, no retry required + f: func() (*RunResultDetails, error) { + return newResult(0), nil + }, + expectedAttempts: 1, + shouldError: false, + }, + { // Error, but no retry function specified + f: func() (*RunResultDetails, error) { + return newResult(1), nil + }, + expectedAttempts: 1, + shouldError: true, + }, + { // Error, with retries exhausted + f: func() (*RunResultDetails, error) { + return newResult(255), nil + }, + shouldRetryFn: func(d *RunResultDetails) bool { return d.RemoteExitStatus == 255 }, + expectedAttempts: 3, + shouldError: true, + }, + { // Eventual success after retries + f: func() (*RunResultDetails, error) { + attempt++ + if attempt == 3 { + return newResult(0), nil + } + return newResult(255), nil + }, + shouldRetryFn: func(d *RunResultDetails) bool { return d.RemoteExitStatus == 255 }, + expectedAttempts: 3, + shouldError: false, + }, + } + + for idx, tc := range cases { + attempt = 0 + t.Run(fmt.Sprintf("%d", idx+1), func(t *testing.T) { + res, _ := runWithMaybeRetry(l, testRetryOpts, tc.shouldRetryFn, tc.f) + + require.Equal(t, tc.shouldError, res.Err != nil) + require.Equal(t, tc.expectedAttempts, res.Attempt) + + if tc.shouldError && tc.expectedAttempts == 3 { + require.True(t, errors.Is(res.Err, ErrAfterRetry)) + } + }) + } +} + +func newResult(exitCode int) *RunResultDetails { + var err error + if exitCode != 0 { + err = errors.Newf("Error with exit code %v", exitCode) + } + return
&RunResultDetails{RemoteExitStatus: exitCode, Err: err} +} + +func nilLogger() *logger.Logger { + lcfg := logger.Config{ + Stdout: io.Discard, + Stderr: io.Discard, + } + l, err := lcfg.NewLogger("" /* path */) + if err != nil { + panic(err) + } + return l +} diff --git a/pkg/roachprod/install/cockroach.go b/pkg/roachprod/install/cockroach.go index 7cbe288311d5..1afa27771e28 100644 --- a/pkg/roachprod/install/cockroach.go +++ b/pkg/roachprod/install/cockroach.go @@ -172,24 +172,25 @@ func (c *SyncedCluster) Start(ctx context.Context, l *logger.Logger, startOpts S } l.Printf("%s: starting nodes", c.Name) - if err := c.Parallel(l, "", len(nodes), parallelism, func(nodeIdx int) ([]byte, error) { + if err := c.Parallel(l, "", len(nodes), parallelism, func(nodeIdx int) (*RunResultDetails, error) { node := nodes[nodeIdx] - + res := &RunResultDetails{Node: node} // NB: if cockroach started successfully, we ignore the output as it is // some harmless start messaging. if _, err := c.startNode(ctx, l, node, startOpts); err != nil { - return nil, err + res.Err = err + return res, err } // Code that follows applies only for regular nodes. if startOpts.Target != StartDefault { - return nil, nil + return res, nil } // We reserve a few special operations (bootstrapping, and setting // cluster settings) to the InitTarget. if startOpts.GetInitTarget() != node { - return nil, nil + return res, nil } // NB: The code blocks below are not parallelized, so it's safe for us @@ -199,20 +200,22 @@ func (c *SyncedCluster) Start(ctx context.Context, l *logger.Logger, startOpts S // 2. We don't init when invoking with `start-single-node`. 
if startOpts.SkipInit { - return nil, nil + return res, nil } shouldInit := !c.useStartSingleNode() if shouldInit { if err := c.initializeCluster(ctx, l, node); err != nil { - return nil, errors.Wrap(err, "failed to initialize cluster") + res.Err = err + return res, errors.Wrap(err, "failed to initialize cluster") } } if err := c.setClusterSettings(ctx, l, node); err != nil { - return nil, errors.Wrap(err, "failed to set cluster settings") + res.Err = err + return res, errors.Wrap(err, "failed to set cluster settings") } - return nil, nil + return res, nil }); err != nil { return err } @@ -314,11 +317,11 @@ func (c *SyncedCluster) RunSQL(ctx context.Context, l *logger.Logger, args []str resultChan := make(chan result, len(c.Nodes)) display := fmt.Sprintf("%s: executing sql", c.Name) - if err := c.Parallel(l, display, len(c.Nodes), 0, func(nodeIdx int) ([]byte, error) { + if err := c.Parallel(l, display, len(c.Nodes), 0, func(nodeIdx int) (*RunResultDetails, error) { node := c.Nodes[nodeIdx] sess, err := c.newSession(node) if err != nil { - return nil, err + return newRunResultDetails(node, err), err } defer sess.Close() @@ -330,13 +333,15 @@ func (c *SyncedCluster) RunSQL(ctx context.Context, l *logger.Logger, args []str c.NodeURL("localhost", c.NodePort(node)) + " " + ssh.Escape(args) - out, err := sess.CombinedOutput(ctx, cmd) - if err != nil { - return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out) - } + out, cmdErr := sess.CombinedOutput(ctx, cmd) + res := newRunResultDetails(node, cmdErr) + res.CombinedOut = out - resultChan <- result{node: node, output: string(out)} - return nil, nil + if res.Err != nil { + return res, errors.Wrapf(res.Err, "~ %s\n%s", cmd, res.CombinedOut) + } + resultChan <- result{node: node, output: string(res.CombinedOut)} + return res, nil }); err != nil { return err } diff --git a/pkg/roachprod/install/session.go b/pkg/roachprod/install/session.go index f65092d57b34..ea2fd8fe9b17 100644 --- a/pkg/roachprod/install/session.go +++ 
b/pkg/roachprod/install/session.go @@ -16,17 +16,16 @@ import ( "os" "os/exec" "path/filepath" - "strings" "sync" "github.com/cockroachdb/cockroach/pkg/roachprod/config" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/errors" ) type session interface { CombinedOutput(ctx context.Context, cmd string) ([]byte, error) Run(ctx context.Context, cmd string) error - SetWithExitStatus(value bool) SetStdin(r io.Reader) SetStdout(w io.Writer) SetStderr(w io.Writer) @@ -41,9 +40,8 @@ type session interface { type remoteSession struct { *exec.Cmd - cancel func() - logfile string // captures ssh -vvv - withExitStatus bool + cancel func() + logfile string // captures ssh -vvv } func newRemoteSession(user, host string, logdir string) (*remoteSession, error) { @@ -56,7 +54,6 @@ func newRemoteSession(user, host string, logdir string) (*remoteSession, error) // fmt.Sprintf("ssh_%s_%s", host, timeutil.Now().Format(time.RFC3339)), // ) const logfile = "" - withExitStatus := false args := []string{ user + "@" + host, @@ -81,7 +78,7 @@ func newRemoteSession(user, host string, logdir string) (*remoteSession, error) args = append(args, sshAuthArgs()...) ctx, cancel := context.WithCancel(context.Background()) cmd := exec.CommandContext(ctx, "ssh", args...) 
- return &remoteSession{cmd, cancel, logfile, withExitStatus}, nil + return &remoteSession{cmd, cancel, logfile}, nil } func (s *remoteSession) errWithDebug(err error) error { @@ -93,13 +90,6 @@ func (s *remoteSession) errWithDebug(err error) error { } func (s *remoteSession) CombinedOutput(ctx context.Context, cmd string) ([]byte, error) { - if s.withExitStatus { - cmd = strings.TrimSpace(cmd) - if !strings.HasSuffix(cmd, ";") { - cmd += ";" - } - cmd += "echo -n 'LAST EXIT STATUS: '$?;" - } s.Cmd.Args = append(s.Cmd.Args, cmd) var b []byte @@ -117,18 +107,11 @@ func (s *remoteSession) CombinedOutput(ctx context.Context, cmd string) ([]byte, s.Close() return nil, ctx.Err() case <-commandFinished: - return b, err + return b, rperrors.ClassifyCmdError(err) } } func (s *remoteSession) Run(ctx context.Context, cmd string) error { - if s.withExitStatus { - cmd = strings.TrimSpace(cmd) - if !strings.HasSuffix(cmd, ";") { - cmd += ";" - } - cmd += "echo -n 'LAST EXIT STATUS: '$?;" - } s.Cmd.Args = append(s.Cmd.Args, cmd) var err error @@ -143,24 +126,13 @@ func (s *remoteSession) Run(ctx context.Context, cmd string) error { s.Close() return ctx.Err() case <-commandFinished: - return err + return rperrors.ClassifyCmdError(err) } } func (s *remoteSession) Start(cmd string) error { - if s.withExitStatus { - cmd = strings.TrimSpace(cmd) - if !strings.HasSuffix(cmd, ";") { - cmd += ";" - } - cmd += "echo -n 'LAST EXIT STATUS: '$?;" - } s.Cmd.Args = append(s.Cmd.Args, cmd) - return s.Cmd.Start() -} - -func (s *remoteSession) SetWithExitStatus(value bool) { - s.withExitStatus = value + return rperrors.ClassifyCmdError(s.Cmd.Start()) } func (s *remoteSession) SetStdin(r io.Reader) { @@ -207,25 +179,16 @@ func (s *remoteSession) Close() { type localSession struct { *exec.Cmd - cancel func() - withExitStatus bool + cancel func() } func newLocalSession() *localSession { ctx, cancel := context.WithCancel(context.Background()) - withExitStatus := false cmd := exec.CommandContext(ctx, 
"/bin/bash", "-c") - return &localSession{cmd, cancel, withExitStatus} + return &localSession{cmd, cancel} } func (s *localSession) CombinedOutput(ctx context.Context, cmd string) ([]byte, error) { - if s.withExitStatus { - cmd = strings.TrimSpace(cmd) - if !strings.HasSuffix(cmd, ";") { - cmd += ";" - } - cmd += "echo -n 'LAST EXIT STATUS: '$?;" - } s.Cmd.Args = append(s.Cmd.Args, cmd) var b []byte @@ -242,18 +205,11 @@ func (s *localSession) CombinedOutput(ctx context.Context, cmd string) ([]byte, s.Close() return nil, ctx.Err() case <-commandFinished: - return b, err + return b, rperrors.ClassifyCmdError(err) } } func (s *localSession) Run(ctx context.Context, cmd string) error { - if s.withExitStatus { - cmd = strings.TrimSpace(cmd) - if !strings.HasSuffix(cmd, ";") { - cmd += ";" - } - cmd += "echo -n 'LAST EXIT STATUS: '$?;" - } s.Cmd.Args = append(s.Cmd.Args, cmd) var err error @@ -268,24 +224,13 @@ func (s *localSession) Run(ctx context.Context, cmd string) error { s.Close() return ctx.Err() case <-commandFinished: - return err + return rperrors.ClassifyCmdError(err) } } func (s *localSession) Start(cmd string) error { - if s.withExitStatus { - cmd = strings.TrimSpace(cmd) - if !strings.HasSuffix(cmd, ";") { - cmd += ";" - } - cmd += "echo -n 'LAST EXIT STATUS: '$?;" - } s.Cmd.Args = append(s.Cmd.Args, cmd) - return s.Cmd.Start() -} - -func (s *localSession) SetWithExitStatus(value bool) { - s.withExitStatus = value + return rperrors.ClassifyCmdError(s.Cmd.Start()) } func (s *localSession) SetStdin(r io.Reader) { diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 197384dd529c..49f2d76207a2 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -449,9 +449,12 @@ func IP( } } else { var err error - if err := c.Parallel(l, "", len(nodes), 0, func(i int) ([]byte, error) { - ips[i], err = c.GetInternalIP(ctx, nodes[i]) - return nil, err + if err := c.Parallel(l, "", len(nodes), 0, func(i int) 
(*install.RunResultDetails, error) { + node := nodes[i] + res := &install.RunResultDetails{Node: node} + res.Stdout, res.Err = c.GetInternalIP(ctx, node) + ips[i] = res.Stdout + return res, err }); err != nil { return nil, err } @@ -861,9 +864,12 @@ func PgURL( } } else { var err error - if err := c.Parallel(l, "", len(nodes), 0, func(i int) ([]byte, error) { - ips[i], err = c.GetInternalIP(ctx, nodes[i]) - return nil, err + if err := c.Parallel(l, "", len(nodes), 0, func(i int) (*install.RunResultDetails, error) { + node := nodes[i] + res := &install.RunResultDetails{Node: node} + res.Stdout, res.Err = c.GetInternalIP(ctx, node) + ips[i] = res.Stdout + return res, err }); err != nil { return nil, err } @@ -991,8 +997,9 @@ func Pprof(l *logger.Logger, clusterName string, opts PprofOpts) error { httpClient := httputil.NewClientWithTimeout(timeout) startTime := timeutil.Now().Unix() nodes := c.TargetNodes() - failed, err := c.ParallelE(l, description, len(nodes), 0, func(i int) ([]byte, error) { + failed, err := c.ParallelE(l, description, len(nodes), 0, func(i int) (*install.RunResultDetails, error) { node := nodes[i] + res := &install.RunResultDetails{Node: node} host := c.Host(node) port := c.NodeUIPort(node) scheme := "http" @@ -1003,7 +1010,8 @@ func Pprof(l *logger.Logger, clusterName string, opts PprofOpts) error { outputDir := filepath.Dir(outputFile) file, err := os.CreateTemp(outputDir, ".pprof") if err != nil { - return nil, errors.Wrap(err, "create tmpfile for pprof download") + res.Err = errors.Wrap(err, "create tmpfile for pprof download") + return res, res.Err } defer func() { @@ -1020,31 +1028,37 @@ func Pprof(l *logger.Logger, clusterName string, opts PprofOpts) error { pprofURL := fmt.Sprintf("%s://%s:%d/%s", scheme, host, port, pprofPath) resp, err := httpClient.Get(context.Background(), pprofURL) if err != nil { - return nil, err + res.Err = err + return res, res.Err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return nil, 
errors.Newf("unexpected status from pprof endpoint: %s", resp.Status) + res.Err = errors.Newf("unexpected status from pprof endpoint: %s", resp.Status) + return res, res.Err } if _, err := io.Copy(file, resp.Body); err != nil { - return nil, err + res.Err = err + return res, res.Err } if err := file.Sync(); err != nil { - return nil, err + res.Err = err + return res, res.Err } if err := file.Close(); err != nil { - return nil, err + res.Err = err + return res, res.Err } if err := os.Rename(file.Name(), outputFile); err != nil { - return nil, err + res.Err = err + return res, res.Err } mu.Lock() outputFiles = append(outputFiles, outputFile) mu.Unlock() - return nil, nil + return res, nil }) for _, s := range outputFiles { diff --git a/pkg/util/retry/retry.go b/pkg/util/retry/retry.go index f6010085160c..b082a462c7bf 100644 --- a/pkg/util/retry/retry.go +++ b/pkg/util/retry/retry.go @@ -158,6 +158,11 @@ func (r *Retry) NextCh() <-chan time.Time { return time.After(r.retryIn()) } +// CurrentAttempt returns the current attempt +func (r *Retry) CurrentAttempt() int { + return r.currentAttempt +} + // Do invokes the closure according to the retry options until it returns // success or no more retries are possible. Always returns an error unless the // return is prompted by a successful invocation of `fn`. From d596620b3ed37046beab989c87139d7d43982bc6 Mon Sep 17 00:00:00 2001 From: Steven Danna Date: Tue, 15 Nov 2022 00:13:13 +0000 Subject: [PATCH 2/7] testutils: add job id to error message Release note: None --- pkg/testutils/jobutils/jobs_verification.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/testutils/jobutils/jobs_verification.go b/pkg/testutils/jobutils/jobs_verification.go index be8febec0f5b..7e5d8c9f1644 100644 --- a/pkg/testutils/jobutils/jobs_verification.go +++ b/pkg/testutils/jobutils/jobs_verification.go @@ -98,7 +98,7 @@ func WaitForJobToHaveNoLease(t testing.TB, db *sqlutils.SQLRunner, jobID jobspb. 
if sessionID == nil && !instanceID.Valid { return nil } - return errors.Newf("job %d still has claim information") + return errors.Newf("job %d still has claim information", jobID) }, 2*time.Minute) } From 330f74c406ec32fece3a2d4a5ba6ef6643476463 Mon Sep 17 00:00:00 2001 From: maryliag Date: Mon, 14 Nov 2022 19:14:30 -0500 Subject: [PATCH 3/7] ui: use format from cluster-ui The format.js file was duplicated on db console, making it hard to maintain. This commit replaces all usages of the format functions on db-console package, to the ones from cluster-ui. Epic: None Release note: None --- .../components/nodeOrLocality/capacityArc.tsx | 3 +- .../reports/containers/stores/encryption.tsx | 3 +- .../db-console/src/util/format.spec.ts | 51 ------ .../workspaces/db-console/src/util/format.ts | 160 ------------------ .../containers/clusterOverview/capacity.tsx | 4 +- .../containers/clusterOverview/index.tsx | 7 +- .../views/cluster/containers/events/index.tsx | 11 +- .../containers/nodeGraphs/summaryBar.tsx | 2 +- .../cluster/containers/nodeLogs/index.tsx | 3 +- .../cluster/containers/nodeOverview/index.tsx | 2 +- .../containers/nodesOverview/index.tsx | 7 +- .../db-console/src/views/hotRanges/index.tsx | 7 +- .../nodeHistory/decommissionedNodeHistory.tsx | 3 +- .../reports/containers/range/rangeTable.tsx | 5 +- .../reports/containers/settings/index.tsx | 3 +- .../statementDiagnosticsHistory/index.tsx | 4 +- 16 files changed, 32 insertions(+), 243 deletions(-) delete mode 100644 pkg/ui/workspaces/db-console/src/util/format.spec.ts delete mode 100644 pkg/ui/workspaces/db-console/src/util/format.ts diff --git a/pkg/ui/workspaces/db-console/ccl/src/views/clusterviz/components/nodeOrLocality/capacityArc.tsx b/pkg/ui/workspaces/db-console/ccl/src/views/clusterviz/components/nodeOrLocality/capacityArc.tsx index 40ed5066421a..2046d1a968d8 100644 --- a/pkg/ui/workspaces/db-console/ccl/src/views/clusterviz/components/nodeOrLocality/capacityArc.tsx +++ 
b/pkg/ui/workspaces/db-console/ccl/src/views/clusterviz/components/nodeOrLocality/capacityArc.tsx @@ -14,7 +14,7 @@ import { LIGHT_TEXT_BLUE, MAIN_BLUE, } from "src/views/shared/colors"; -import { Bytes } from "src/util/format"; +import { util } from "@cockroachlabs/cluster-ui"; import { NodeArcPercentageTooltip, NodeArcUsedCapacityTooltip, @@ -34,6 +34,7 @@ interface CapacityArcProps { export class CapacityArc extends React.Component { render() { + const { Bytes } = util; // Compute used percentage. const usedCapacity = this.props.usedCapacity; const capacity = this.props.usableCapacity; diff --git a/pkg/ui/workspaces/db-console/ccl/src/views/reports/containers/stores/encryption.tsx b/pkg/ui/workspaces/db-console/ccl/src/views/reports/containers/stores/encryption.tsx index 825fe0a265c3..a8fd1a87f951 100644 --- a/pkg/ui/workspaces/db-console/ccl/src/views/reports/containers/stores/encryption.tsx +++ b/pkg/ui/workspaces/db-console/ccl/src/views/reports/containers/stores/encryption.tsx @@ -14,7 +14,7 @@ import moment from "moment"; import * as protos from "src/js/protos"; import * as protosccl from "@cockroachlabs/crdb-protobuf-client-ccl"; import { EncryptionStatusProps } from "oss/src/views/reports/containers/stores/encryption"; -import { Bytes } from "src/util/format"; +import { util } from "@cockroachlabs/cluster-ui"; import { FixLong } from "src/util/fixLong"; const dateFormat = "Y-MM-DD HH:mm:ss"; @@ -104,6 +104,7 @@ export default class EncryptionStatus { } renderFileStats(stats: protos.cockroach.server.serverpb.IStoreDetails) { + const { Bytes } = util; const totalFiles = FixLong(stats.total_files); const totalBytes = FixLong(stats.total_bytes); if (totalFiles.eq(0) && totalBytes.eq(0)) { diff --git a/pkg/ui/workspaces/db-console/src/util/format.spec.ts b/pkg/ui/workspaces/db-console/src/util/format.spec.ts deleted file mode 100644 index 4ff739e9c363..000000000000 --- a/pkg/ui/workspaces/db-console/src/util/format.spec.ts +++ /dev/null @@ -1,51 +0,0 @@ -// 
Copyright 2018 The Cockroach Authors. -// -// Use of this software is governed by the Business Source License -// included in the file licenses/BSL.txt. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0, included in the file -// licenses/APL.txt. - -import { - DurationFitScale, - durationUnits, - BytesFitScale, - byteUnits, -} from "./format"; - -describe("Format utils", () => { - describe("DurationFitScale", () => { - it("converts nanoseconds to provided units", () => { - // test zero values - expect(DurationFitScale(durationUnits[0])(undefined)).toEqual("0.00 ns"); - expect(DurationFitScale(durationUnits[0])(0)).toEqual("0.00 ns"); - // "ns", "µs", "ms", "s" - expect(DurationFitScale(durationUnits[0])(32)).toEqual("32.00 ns"); - expect(DurationFitScale(durationUnits[1])(32120)).toEqual("32.12 µs"); - expect(DurationFitScale(durationUnits[2])(32122300)).toEqual("32.12 ms"); - expect(DurationFitScale(durationUnits[3])(32122343000)).toEqual( - "32.12 s", - ); - }); - }); - - describe("BytesFitScale", () => { - it("converts bytes to provided units", () => { - // test zero values - expect(BytesFitScale(byteUnits[0])(undefined)).toEqual("0.00 B"); - expect(BytesFitScale(byteUnits[0])(0)).toEqual("0.00 B"); - // "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" - expect(BytesFitScale(byteUnits[0])(1)).toEqual("1.00 B"); - expect(BytesFitScale(byteUnits[1])(10240)).toEqual("10.00 KiB"); - expect(BytesFitScale(byteUnits[2])(12582912)).toEqual("12.00 MiB"); - expect(BytesFitScale(byteUnits[3])(12884901888)).toEqual("12.00 GiB"); - expect(BytesFitScale(byteUnits[4])(1.319414e13)).toEqual("12.00 TiB"); - expect(BytesFitScale(byteUnits[5])(1.3510799e16)).toEqual("12.00 PiB"); - expect(BytesFitScale(byteUnits[6])(1.3835058e19)).toEqual("12.00 EiB"); - expect(BytesFitScale(byteUnits[7])(1.4167099e22)).toEqual("12.00 ZiB"); - 
expect(BytesFitScale(byteUnits[8])(1.450711e25)).toEqual("12.00 YiB"); - }); - }); -}); diff --git a/pkg/ui/workspaces/db-console/src/util/format.ts b/pkg/ui/workspaces/db-console/src/util/format.ts deleted file mode 100644 index 717d9f043d39..000000000000 --- a/pkg/ui/workspaces/db-console/src/util/format.ts +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2018 The Cockroach Authors. -// -// Use of this software is governed by the Business Source License -// included in the file licenses/BSL.txt. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0, included in the file -// licenses/APL.txt. - -export const kibi = 1024; -export const byteUnits: string[] = [ - "B", - "KiB", - "MiB", - "GiB", - "TiB", - "PiB", - "EiB", - "ZiB", - "YiB", -]; -export const durationUnits: string[] = ["ns", "µs", "ms", "s"]; - -interface UnitValue { - value: number; - units: string; -} - -// computePrefixExponent is used to compute an appropriate display unit for a value -// for which the units have metric-style prefixes available. For example, the -// value may be expressed in bytes, but we may want to display it on the graph -// as a larger prefix unit (such as "kilobytes" or "gigabytes") in order to make -// the numbers more readable. -export function ComputePrefixExponent( - value: number, - prefixMultiple: number, - prefixList: string[], -) { - // Compute the metric prefix that will be used to label the axis. - let maxUnits = Math.abs(value); - let prefixScale: number; - for ( - prefixScale = 0; - maxUnits >= prefixMultiple && prefixScale < prefixList.length - 1; - prefixScale++ - ) { - maxUnits /= prefixMultiple; - } - return prefixScale; -} - -/** - * ComputeByteScale calculates the appropriate scale factor and unit to use to - * display a given byte value, without actually converting the value. 
- * - * This is used to prescale byte values before passing them to a d3-axis. - */ -export function ComputeByteScale(bytes: number): UnitValue { - const scale = ComputePrefixExponent(bytes, kibi, byteUnits); - return { - value: Math.pow(kibi, scale), - units: byteUnits[scale], - }; -} - -export function BytesToUnitValue(bytes: number): UnitValue { - const scale = ComputeByteScale(bytes); - return { - value: bytes / scale.value, - units: scale.units, - }; -} - -/** - * Bytes creates a string representation for a number of bytes. For - * large numbers of bytes, the value will be converted into a large unit - * (e.g. Kibibytes, Mebibytes). - * - * This function was adapted from - * https://stackoverflow.com/questions/10420352/converting-file-size-in-bytes-to-human-readable - */ -export function Bytes(bytes: number): string { - return BytesWithPrecision(bytes, 1); -} - -/** - * BytesWithPrecision is like Bytes, but accepts a precision parameter - * indicating how many digits after the decimal point are desired. - */ -export function BytesWithPrecision(bytes: number, precision: number): string { - const unitVal = BytesToUnitValue(bytes); - if (!unitVal.value) { - return "0 B"; - } - return unitVal.value.toFixed(precision) + " " + unitVal.units; -} - -/** - * Cast bytes to provided scale units - */ -export const BytesFitScale = (scale: string) => (bytes: number) => { - if (!bytes) { - return `0.00 ${scale}`; - } - const n = byteUnits.indexOf(scale); - return `${(bytes / Math.pow(kibi, n)).toFixed(2)} ${scale}`; -}; - -/** - * Percentage creates a string representation of a fraction as a percentage. - */ -export function Percentage(numerator: number, denominator: number): string { - if (denominator === 0) { - return "--%"; - } - return Math.floor((numerator / denominator) * 100).toString() + "%"; -} - -/** - * ComputeDurationScale calculates an appropriate scale factor and unit to use - * to display a given duration value, without actually converting the value. 
- */ -export function ComputeDurationScale(nanoseconds: number): UnitValue { - const scale = ComputePrefixExponent(nanoseconds, 1000, durationUnits); - return { - value: Math.pow(1000, scale), - units: durationUnits[scale], - }; -} - -/** - * Duration creates a string representation for a duration. The expectation is - * that units are passed in nanoseconds; for larger durations, the value will - * be converted into larger units. - */ -export function Duration(nanoseconds: number): string { - const scale = ComputeDurationScale(nanoseconds); - const unitVal = nanoseconds / scale.value; - return unitVal.toFixed(1) + " " + scale.units; -} - -/** - * Cast nanoseconds to provided scale units - */ -export const DurationFitScale = (scale: string) => (nanoseconds: number) => { - if (!nanoseconds) { - return `0.00 ${scale}`; - } - const n = durationUnits.indexOf(scale); - return `${(nanoseconds / Math.pow(1000, n)).toFixed(2)} ${scale}`; -}; - -export const DATE_FORMAT = "MMM DD, YYYY [at] H:mm"; - -/** - * Alternate 24 hour UTC format - */ -export const DATE_FORMAT_24_UTC = "MMM DD, YYYY [at] H:mm UTC"; -export const DATE_WITH_SECONDS_FORMAT_24_UTC = "MMM DD, YYYY [at] H:mm:ss UTC"; diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/clusterOverview/capacity.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/clusterOverview/capacity.tsx index cf23968252d4..b7bf363879f6 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/clusterOverview/capacity.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/clusterOverview/capacity.tsx @@ -10,7 +10,7 @@ import d3 from "d3"; -import { ComputeByteScale } from "src/util/format"; +import { util } from "@cockroachlabs/cluster-ui"; const LOW_DISK_SPACE_RATIO = 0.15; @@ -47,7 +47,7 @@ function capacityChart() { // Compute the appropriate scale factor for a value slightly smaller than the // usable capacity, so that if the usable capacity is exactly 1 {MiB,GiB,etc} // we show 
the scale in the next-smaller unit. - const byteScale = ComputeByteScale(capacity.usable - 1); + const byteScale = util.ComputeByteScale(capacity.usable - 1); const scaled = { used: capacity.used / byteScale.value, diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/clusterOverview/index.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/clusterOverview/index.tsx index 228851b7e8bc..c325ceab6a1b 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/clusterOverview/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/clusterOverview/index.tsx @@ -17,7 +17,7 @@ import { createSelector } from "reselect"; import { AdminUIState } from "src/redux/state"; import { nodeSumsSelector } from "src/redux/nodes"; -import { Bytes as formatBytes } from "src/util/format"; +import { util } from "@cockroachlabs/cluster-ui"; import createChartComponent from "src/views/shared/util/d3-react"; import capacityChart from "./capacity"; import spinner from "assets/spinner.gif"; @@ -47,6 +47,7 @@ interface CapacityUsageProps { const formatPercentage = d3.format("0.1%"); function renderCapacityUsage(props: CapacityUsageProps) { + const { Bytes } = util; const { usedCapacity, usableCapacity } = props; const usedPercentage = usableCapacity !== 0 ? usedCapacity / usableCapacity : 0; @@ -69,13 +70,13 @@ function renderCapacityUsage(props: CapacityUsageProps) { Used ,
- {formatBytes(usedCapacity)} + {Bytes(usedCapacity)}
,
Usable
,
- {formatBytes(usableCapacity)} + {Bytes(usableCapacity)}
, ]; } diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/events/index.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/events/index.tsx index 210147c96448..41ece622aead 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/events/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/events/index.tsx @@ -23,11 +23,14 @@ import { } from "src/redux/events"; import { LocalSetting } from "src/redux/localsettings"; import { AdminUIState } from "src/redux/state"; -import { util } from "@cockroachlabs/cluster-ui"; import { getEventDescription } from "src/util/events"; -import { DATE_FORMAT_24_UTC } from "src/util/format"; import { ToolTipWrapper } from "src/views/shared/components/toolTip"; -import { Loading, SortSetting, SortedTable } from "@cockroachlabs/cluster-ui"; +import { + Loading, + SortSetting, + SortedTable, + util, +} from "@cockroachlabs/cluster-ui"; import "./events.styl"; type Event$Properties = protos.cockroach.server.serverpb.EventsResponse.IEvent; @@ -57,7 +60,7 @@ export function getEventInfo(e: Event$Properties): SimplifiedEvent { return { fromNowString: util .TimestampToMoment(e.timestamp) - .format(DATE_FORMAT_24_UTC) + .format(util.DATE_FORMAT_24_UTC) .replace("second", "sec") .replace("minute", "min"), content: {getEventDescription(e)}, diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/summaryBar.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/summaryBar.tsx index b0e457a16749..d556f79e04d0 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/summaryBar.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/summaryBar.tsx @@ -14,7 +14,6 @@ import { Link } from "react-router-dom"; import * as d3 from "d3"; import { nodeStatusesSelector, nodeSumsSelector } from "src/redux/nodes"; -import { Bytes } from "src/util/format"; import { util } from "@cockroachlabs/cluster-ui"; 
import { createSelector } from "reselect"; @@ -98,6 +97,7 @@ export interface ClusterSummaryProps { } export default function (props: ClusterSummaryProps) { + const { Bytes } = util; const nodeSums = useSelector(nodeSumsSelector); // Capacity math used in the summary status section. const { capacityUsed, capacityUsable } = nodeSums; diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeLogs/index.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeLogs/index.tsx index 02f4f9d1b352..8746558d9fe2 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeLogs/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeLogs/index.tsx @@ -26,7 +26,6 @@ import { getDisplayName } from "src/redux/nodes"; import { Loading, SortedTable, util } from "@cockroachlabs/cluster-ui"; import { getMatchParamByName } from "src/util/query"; import "./logs.styl"; -import { DATE_WITH_SECONDS_FORMAT_24_UTC } from "src/util/format"; type LogEntries = protos.cockroach.util.log.IEntry; @@ -58,7 +57,7 @@ export class Logs extends React.Component { cell: (logEntry: LogEntries) => util .LongToMoment(logEntry.time) - .format(DATE_WITH_SECONDS_FORMAT_24_UTC), + .format(util.DATE_WITH_SECONDS_FORMAT_24_UTC), }, { title: "Severity", diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeOverview/index.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeOverview/index.tsx index d6c1d75f3b20..1b42e0cf4b56 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeOverview/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeOverview/index.tsx @@ -24,7 +24,6 @@ import { } from "src/redux/nodes"; import { AdminUIState } from "src/redux/state"; import { nodeIDAttr } from "src/util/constants"; -import { Bytes, DATE_FORMAT_24_UTC, Percentage } from "src/util/format"; import { INodeStatus, MetricConstants, StatusMetrics } from "src/util/proto"; import { 
getMatchParamByName } from "src/util/query"; import { @@ -116,6 +115,7 @@ export class NodeOverview extends React.Component { render() { const { node, nodesSummary } = this.props; + const { Bytes, Percentage, DATE_FORMAT_24_UTC } = util; if (!node) { return (
diff --git a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodesOverview/index.tsx b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodesOverview/index.tsx index c2d620e0e3f1..97a5831f203b 100644 --- a/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodesOverview/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/cluster/containers/nodesOverview/index.tsx @@ -35,7 +35,6 @@ import { SortSetting, util, } from "@cockroachlabs/cluster-ui"; -import { DATE_FORMAT_24_UTC, Percentage } from "src/util/format"; import { FixLong } from "src/util/fixLong"; import { getNodeLocalityTiers } from "src/util/localities"; import { LocalityTier } from "src/redux/localities"; @@ -291,7 +290,7 @@ export class NodeList extends React.Component { ), render: (_text: string, record: NodeStatusRow) => - Percentage(record.usedCapacity, record.availableCapacity), + util.Percentage(record.usedCapacity, record.availableCapacity), sorter: (a: NodeStatusRow, b: NodeStatusRow) => a.usedCapacity / a.availableCapacity - b.usedCapacity / b.availableCapacity, @@ -302,7 +301,7 @@ export class NodeList extends React.Component { key: "memoryUse", title: Memory Use, render: (_text: string, record: NodeStatusRow) => - Percentage(record.usedMemory, record.availableMemory), + util.Percentage(record.usedMemory, record.availableMemory), sorter: (a: NodeStatusRow, b: NodeStatusRow) => a.usedMemory / a.availableMemory - b.usedMemory / b.availableMemory, className: "column--align-right", @@ -442,7 +441,7 @@ class DecommissionedNodeList extends React.Component - record.decommissionedDate.format(DATE_FORMAT_24_UTC), + record.decommissionedDate.format(util.DATE_FORMAT_24_UTC), }, { key: "status", diff --git a/pkg/ui/workspaces/db-console/src/views/hotRanges/index.tsx b/pkg/ui/workspaces/db-console/src/views/hotRanges/index.tsx index c336f38619fc..9774cd878ed0 100644 --- a/pkg/ui/workspaces/db-console/src/views/hotRanges/index.tsx +++ 
b/pkg/ui/workspaces/db-console/src/views/hotRanges/index.tsx @@ -12,10 +12,10 @@ import { cockroach } from "src/js/protos"; import { useDispatch, useSelector } from "react-redux"; import React, { useRef, useEffect, useState } from "react"; import { Helmet } from "react-helmet"; -import { refreshHotRanges } from "../../redux/apiReducers"; +import { refreshHotRanges } from "src/redux/apiReducers"; import HotRangesTable from "./hotRangesTable"; import ErrorBoundary from "../app/components/errorMessage/errorBoundary"; -import { Loading, Text, Anchor } from "@cockroachlabs/cluster-ui"; +import { Loading, Text, Anchor, util } from "@cockroachlabs/cluster-ui"; import classNames from "classnames/bind"; import styles from "./hotRanges.module.styl"; import { @@ -26,7 +26,6 @@ import { lastSetAtSelector, } from "src/redux/hotRanges"; import { selectNodeLocalities } from "src/redux/localities"; -import { DATE_FORMAT_24_UTC } from "src/util/format"; import { performanceBestPracticesHotSpots } from "src/util/docs"; import { HotRangesFilter } from "src/views/hotRanges/hotRangesFilter"; @@ -92,7 +91,7 @@ const HotRangesPage = () => { } diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/nodeHistory/decommissionedNodeHistory.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/nodeHistory/decommissionedNodeHistory.tsx index e7538d47beff..27dd63be8ddb 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/nodeHistory/decommissionedNodeHistory.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/nodeHistory/decommissionedNodeHistory.tsx @@ -31,7 +31,6 @@ import { util, } from "@cockroachlabs/cluster-ui"; import { createSelector } from "reselect"; -import { DATE_FORMAT_24_UTC } from "src/util/format"; const decommissionedNodesSortSetting = new LocalSetting< AdminUIState, @@ -89,7 +88,7 @@ export class DecommissionedNodeHistory extends React.Component { - return record.decommissionedDate.format(DATE_FORMAT_24_UTC); + return 
record.decommissionedDate.format(util.DATE_FORMAT_24_UTC); }, }, ]; diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx index 0e9a5f89e0e9..3882f6808339 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/range/rangeTable.tsx @@ -17,7 +17,6 @@ import * as protos from "src/js/protos"; import { cockroach } from "src/js/protos"; import { util } from "@cockroachlabs/cluster-ui"; import { FixLong } from "src/util/fixLong"; -import { Bytes } from "src/util/format"; import Lease from "src/views/reports/containers/range/lease"; import Print from "src/views/reports/containers/range/print"; import RangeInfo from "src/views/reports/containers/range/rangeInfo"; @@ -343,7 +342,7 @@ export default class RangeTable extends React.Component { } contentMVCC(bytes: Long, count: Long): RangeTableCellContent { - const humanizedBytes = Bytes(bytes.toNumber()); + const humanizedBytes = util.Bytes(bytes.toNumber()); return { value: [`${humanizedBytes} / ${count.toString()} count`], title: [ @@ -358,7 +357,7 @@ export default class RangeTable extends React.Component { className: string = null, toolTip: string = null, ): RangeTableCellContent { - const humanized = Bytes(bytes.toNumber()); + const humanized = util.Bytes(bytes.toNumber()); if (_.isNull(className)) { return { value: [humanized], diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/settings/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/settings/index.tsx index 790e9717882b..8a7ae1d85ff3 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/settings/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/settings/index.tsx @@ -17,7 +17,6 @@ import { RouteComponentProps, withRouter } from "react-router-dom"; import * as protos from 
"src/js/protos"; import { refreshSettings } from "src/redux/apiReducers"; import { AdminUIState } from "src/redux/state"; -import { DATE_FORMAT_24_UTC } from "src/util/format"; import { Loading, ColumnDescriptor, @@ -111,7 +110,7 @@ export class Settings extends React.Component { title: "Last Updated", cell: (setting: IterableSetting) => setting.last_updated - ? setting.last_updated.format(DATE_FORMAT_24_UTC) + ? setting.last_updated.format(util.DATE_FORMAT_24_UTC) : "No overrides", sort: (setting: IterableSetting) => setting.last_updated?.valueOf(), }, diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/statementDiagnosticsHistory/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/statementDiagnosticsHistory/index.tsx index fb886df17c0e..6c31352686e1 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/statementDiagnosticsHistory/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/statementDiagnosticsHistory/index.tsx @@ -45,10 +45,10 @@ import { SortedTable, SortSetting, ColumnDescriptor, + util, } from "@cockroachlabs/cluster-ui"; import { cancelStatementDiagnosticsReportAction } from "src/redux/statements"; import { trackCancelDiagnosticsBundleAction } from "src/redux/analyticsActions"; -import { DATE_FORMAT_24_UTC } from "src/util/format"; type StatementDiagnosticsHistoryViewProps = MapStateToProps & MapDispatchToProps; @@ -96,7 +96,7 @@ class StatementDiagnosticsHistoryView extends React.Component< title: "Activated on", name: "activated_on", cell: record => - moment.utc(record.requested_at).format(DATE_FORMAT_24_UTC), + moment.utc(record.requested_at).format(util.DATE_FORMAT_24_UTC), sort: record => { return moment.utc(record.requested_at).unix(); }, From e3447ad8609ee117d42d1ee711132d4658f9265a Mon Sep 17 00:00:00 2001 From: maryliag Date: Mon, 14 Nov 2022 19:20:24 -0500 Subject: [PATCH 4/7] ui: add leading zeros to hex value with less than 16 chars Previously, some fingerprint 
IDs in hex format would have a leading 0, that were not being returned from the DB calls. This was causing confusion when seeing the value on the UI and trying to find the same fingerprint on our tables. This commit checks the hex values used and adds the missing leading zeros back. Part Of: #91763 Release note (bug fix): Add leading zeros to fingerprint IDs with less than 16 characters. --- .../cluster-ui/src/api/insightsApi.ts | 29 ++++++++++--------- .../statementsPage.selectors.ts | 8 +++-- .../transactionsTable/transactionsTable.tsx | 11 +++++-- .../cluster-ui/src/util/format.spec.ts | 11 +++++++ .../workspaces/cluster-ui/src/util/format.ts | 15 ++++++++++ .../src/views/statements/statementsPage.tsx | 7 +++-- 6 files changed, 61 insertions(+), 20 deletions(-) diff --git a/pkg/ui/workspaces/cluster-ui/src/api/insightsApi.ts b/pkg/ui/workspaces/cluster-ui/src/api/insightsApi.ts index bbd7be686749..9971dc3a447a 100644 --- a/pkg/ui/workspaces/cluster-ui/src/api/insightsApi.ts +++ b/pkg/ui/workspaces/cluster-ui/src/api/insightsApi.ts @@ -19,17 +19,18 @@ import { } from "./sqlApi"; import { BlockedContentionDetails, + FlattenedStmtInsightEvent, + getInsightFromCause, + getInsightsFromProblemsAndCauses, InsightExecEnum, InsightNameEnum, - FlattenedStmtInsightEvent, - TxnContentionInsightEvent, TxnContentionInsightDetails, + TxnContentionInsightEvent, TxnInsightEvent, - getInsightsFromProblemsAndCauses, - getInsightFromCause, } from "src/insights"; import moment from "moment"; import { INTERNAL_APP_NAME_PREFIX } from "src/activeExecutions/activeStatementUtils"; +import { CheckHexValue } from "../util"; // Transaction contention insight events. 
@@ -86,7 +87,7 @@ function formatTxnContentionResults( return response.execution.txn_results[0].rows.map(row => ({ transactionID: row.waiting_txn_id, - transactionFingerprintID: row.waiting_txn_fingerprint_id, + transactionFingerprintID: CheckHexValue(row.waiting_txn_fingerprint_id), startTime: moment(row.collection_ts).utc(), contentionDuration: moment.duration(row.contention_duration), contentionThreshold: moment.duration(row.threshold).asMilliseconds(), @@ -133,7 +134,7 @@ function formatTxnFingerprintsResults( } return response.execution.txn_results[0].rows.map(row => ({ - transactionFingerprintID: row.transaction_fingerprint_id, + transactionFingerprintID: CheckHexValue(row.transaction_fingerprint_id), queryIDs: row.query_ids, application: row.app_name, })); @@ -168,7 +169,7 @@ function createStmtFingerprintToQueryMap( return idToQuery; } response.execution.txn_results[0].rows.forEach(row => { - idToQuery.set(row.statement_fingerprint_id, row.query); + idToQuery.set(CheckHexValue(row.statement_fingerprint_id), row.query); }); return idToQuery; @@ -363,7 +364,9 @@ function formatTxnContentionDetailsResponse( totalContentionTime += contentionTimeInMs; blockingContentionDetails[idx] = { blockingExecutionID: value.blocking_txn_id, - blockingTxnFingerprintID: value.blocking_txn_fingerprint_id, + blockingTxnFingerprintID: CheckHexValue( + value.blocking_txn_fingerprint_id, + ), blockingQueries: null, collectionTimeStamp: moment(value.collection_ts).utc(), contentionTimeMs: contentionTimeInMs, @@ -382,7 +385,7 @@ function formatTxnContentionDetailsResponse( const contentionThreshold = moment.duration(row.threshold).asMilliseconds(); return { transactionExecutionID: row.waiting_txn_id, - transactionFingerprintID: row.waiting_txn_fingerprint_id, + transactionFingerprintID: CheckHexValue(row.waiting_txn_fingerprint_id), startTime: moment(row.collection_ts).utc(), totalContentionTimeMs: totalContentionTime, blockingContentionDetails: blockingContentionDetails, @@ -493,13 
+496,11 @@ function buildTxnContentionInsightDetails( partialTxnContentionDetails.transactionFingerprintID, ); - const res = { + return { ...partialTxnContentionDetails, application: waitingTxn.application, queries: waitingTxn.queryIDs.map(id => stmtFingerprintToQuery.get(id)), }; - - return res; } type ExecutionInsightsResponseRow = { @@ -550,7 +551,7 @@ function organizeExecutionInsightsResponseIntoTxns( if (!txnInsight) { txnInsight = { transactionExecutionID: row.txn_id, - transactionFingerprintID: row.txn_fingerprint_id, + transactionFingerprintID: CheckHexValue(row.txn_fingerprint_id), implicitTxn: row.implicit_txn, databaseName: row.database_name, application: row.app_name, @@ -574,7 +575,7 @@ function organizeExecutionInsightsResponseIntoTxns( endTime: end, elapsedTimeMillis: end.diff(start, "milliseconds"), statementExecutionID: row.stmt_id, - statementFingerprintID: row.stmt_fingerprint_id, + statementFingerprintID: CheckHexValue(row.stmt_fingerprint_id), isFullScan: row.full_scan, rowsRead: row.rows_read, rowsWritten: row.rows_written, diff --git a/pkg/ui/workspaces/cluster-ui/src/statementsPage/statementsPage.selectors.ts b/pkg/ui/workspaces/cluster-ui/src/statementsPage/statementsPage.selectors.ts index f2d8ec08bc15..06fa07c2a0a7 100644 --- a/pkg/ui/workspaces/cluster-ui/src/statementsPage/statementsPage.selectors.ts +++ b/pkg/ui/workspaces/cluster-ui/src/statementsPage/statementsPage.selectors.ts @@ -12,6 +12,7 @@ import { createSelector } from "reselect"; import { aggregateStatementStats, appAttr, + CheckHexValue, combineStatementStats, ExecutionStatistics, flattenStatementStats, @@ -177,8 +178,9 @@ export const selectStatements = createSelector( if (!(key in statsByStatementKey)) { statsByStatementKey[key] = { statementFingerprintID: stmt.statement_fingerprint_id?.toString(), - statementFingerprintHexID: + statementFingerprintHexID: CheckHexValue( stmt.statement_fingerprint_id?.toString(16), + ), statement: stmt.statement, statementSummary: 
stmt.statement_summary, aggregatedTs: stmt.aggregated_ts, @@ -197,7 +199,9 @@ export const selectStatements = createSelector( const stmt = statsByStatementKey[key]; return { aggregatedFingerprintID: stmt.statementFingerprintID, - aggregatedFingerprintHexID: stmt.statementFingerprintHexID, + aggregatedFingerprintHexID: CheckHexValue( + stmt.statementFingerprintHexID, + ), label: stmt.statement, summary: stmt.statementSummary, aggregatedTs: stmt.aggregatedTs, diff --git a/pkg/ui/workspaces/cluster-ui/src/transactionsTable/transactionsTable.tsx b/pkg/ui/workspaces/cluster-ui/src/transactionsTable/transactionsTable.tsx index ed45505f5cd8..17ff5b72e8fb 100644 --- a/pkg/ui/workspaces/cluster-ui/src/transactionsTable/transactionsTable.tsx +++ b/pkg/ui/workspaces/cluster-ui/src/transactionsTable/transactionsTable.tsx @@ -28,7 +28,14 @@ import { import { statisticsTableTitles } from "../statsTableUtil/statsTableUtil"; import { tableClasses } from "./transactionsTableClasses"; import { transactionLink } from "./transactionsCells"; -import { Count, FixLong, longToInt, TimestampToString, unset } from "src/util"; +import { + CheckHexValue, + Count, + FixLong, + longToInt, + TimestampToString, + unset, +} from "src/util"; import { SortSetting } from "../sortedtable"; import { getStatementsByFingerprintId, @@ -243,7 +250,7 @@ export function makeTransactionsColumns( name: "transactionFingerprintId", title: statisticsTableTitles.transactionFingerprintId(statType), cell: (item: TransactionInfo) => - item.stats_data?.transaction_fingerprint_id.toString(16), + CheckHexValue(item.stats_data?.transaction_fingerprint_id.toString(16)), sort: (item: TransactionInfo) => item.stats_data?.transaction_fingerprint_id.toString(16), showByDefault: false, diff --git a/pkg/ui/workspaces/cluster-ui/src/util/format.spec.ts b/pkg/ui/workspaces/cluster-ui/src/util/format.spec.ts index f3c250aac373..818ac7374db7 100644 --- a/pkg/ui/workspaces/cluster-ui/src/util/format.spec.ts +++ 
b/pkg/ui/workspaces/cluster-ui/src/util/format.spec.ts @@ -15,6 +15,7 @@ import { BytesFitScale, byteUnits, HexStringToInt64String, + CheckHexValue, } from "./format"; describe("Format utils", () => { @@ -59,4 +60,14 @@ describe("Format utils", () => { ); }); }); + + describe("CheckHexValue", () => { + it("add leading 0 to hex values", () => { + expect(CheckHexValue(undefined)).toBe(""); + expect(CheckHexValue(null)).toBe(""); + expect(CheckHexValue("fb9111f22f2213b7")).toBe("fb9111f22f2213b7"); + expect(CheckHexValue("b9111f22f2213b7")).toBe("0b9111f22f2213b7"); + expect(CheckHexValue("9111f22f2213b7")).toBe("009111f22f2213b7"); + }); + }); }); diff --git a/pkg/ui/workspaces/cluster-ui/src/util/format.ts b/pkg/ui/workspaces/cluster-ui/src/util/format.ts index b0d2e5577609..60c1ddfbd913 100644 --- a/pkg/ui/workspaces/cluster-ui/src/util/format.ts +++ b/pkg/ui/workspaces/cluster-ui/src/util/format.ts @@ -256,6 +256,21 @@ export function HexStringToInt64String(s: string): string { return dec; } +// CheckHexValue adds the leading 0 on strings with hex value that +// have a length < 16. +export function CheckHexValue(s: string): string { + if (s === undefined || s === null || s.length === 0) { + return ""; + } + if (s?.length === 16) { + return s; + } + while (s.length < 16) { + s = `0${s}`; + } + return s; +} + // capitalize capitalizes a string. 
export function capitalize(str: string): string { if (!str) return str; diff --git a/pkg/ui/workspaces/db-console/src/views/statements/statementsPage.tsx b/pkg/ui/workspaces/db-console/src/views/statements/statementsPage.tsx index 5223d596b408..b513716c150b 100644 --- a/pkg/ui/workspaces/db-console/src/views/statements/statementsPage.tsx +++ b/pkg/ui/workspaces/db-console/src/views/statements/statementsPage.tsx @@ -142,8 +142,9 @@ export const selectStatements = createSelector( if (!(key in statsByStatementKey)) { statsByStatementKey[key] = { statementFingerprintID: stmt.statement_fingerprint_id?.toString(), - statementFingerprintHexID: + statementFingerprintHexID: util.CheckHexValue( stmt.statement_fingerprint_id?.toString(16), + ), statement: stmt.statement, statementSummary: stmt.statement_summary, aggregatedTs: stmt.aggregated_ts, @@ -162,7 +163,9 @@ export const selectStatements = createSelector( const stmt = statsByStatementKey[key]; return { aggregatedFingerprintID: stmt.statementFingerprintID, - aggregatedFingerprintHexID: stmt.statementFingerprintHexID, + aggregatedFingerprintHexID: util.CheckHexValue( + stmt.statementFingerprintHexID, + ), label: stmt.statement, summary: stmt.statementSummary, aggregatedTs: stmt.aggregatedTs, From a4ab9b6141e7b6ee8ee1d4b1ec63ee9bd13cfe2b Mon Sep 17 00:00:00 2001 From: Steven Danna Date: Tue, 15 Nov 2022 02:19:16 +0000 Subject: [PATCH 5/7] changfeedccl: deflake TestAlterChangefeedTelemetry The job system clears the lease asyncronously after cancelation. This lease clearing transaction can cause a restart in the alter changefeed transaction, which will lead to different feature counter counts. Thus, we want to wait for the lease clear. However, the lease clear isn't guaranteed to happen, so we only wait a few seconds for it. 
Release note: None --- pkg/ccl/changefeedccl/BUILD.bazel | 1 - .../changefeedccl/alter_changefeed_test.go | 28 +++++++++++++++++-- pkg/testutils/jobutils/jobs_verification.go | 13 --------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/pkg/ccl/changefeedccl/BUILD.bazel b/pkg/ccl/changefeedccl/BUILD.bazel index ce5b5782aeb6..5cccc7d139bf 100644 --- a/pkg/ccl/changefeedccl/BUILD.bazel +++ b/pkg/ccl/changefeedccl/BUILD.bazel @@ -250,7 +250,6 @@ go_test( "//pkg/sql/types", "//pkg/storage", "//pkg/testutils", - "//pkg/testutils/jobutils", "//pkg/testutils/serverutils", "//pkg/testutils/skip", "//pkg/testutils/sqlutils", diff --git a/pkg/ccl/changefeedccl/alter_changefeed_test.go b/pkg/ccl/changefeedccl/alter_changefeed_test.go index 759df1a35439..367c0966f030 100644 --- a/pkg/ccl/changefeedccl/alter_changefeed_test.go +++ b/pkg/ccl/changefeedccl/alter_changefeed_test.go @@ -27,7 +27,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/execinfra" "github.com/cockroachdb/cockroach/pkg/sql/tests" "github.com/cockroachdb/cockroach/pkg/testutils" - "github.com/cockroachdb/cockroach/pkg/testutils/jobutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" @@ -423,11 +422,34 @@ func TestAlterChangefeedTelemetry(t *testing.T) { testFeed := feed(t, f, `CREATE CHANGEFEED FOR foo, bar WITH diff`) defer closeFeed(t, testFeed) - feed := testFeed.(cdctest.EnterpriseTestFeed) require.NoError(t, feed.Pause()) - jobutils.WaitForJobToHaveNoLease(t, sqlDB, feed.JobID()) + + // The job system clears the lease asyncronously after + // cancellation. This lease clearing transaction can + // cause a restart in the alter changefeed + // transaction, which will lead to different feature + // counter counts. Thus, we want to wait for the lease + // clear. However, the lease clear isn't guaranteed to + // happen, so we only wait a few seconds for it. 
+ waitForNoLease := func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + for { + if ctx.Err() != nil { + return + } + var sessionID []byte + sqlDB.QueryRow(t, `SELECT claim_session_id FROM system.jobs WHERE id = $1`, feed.JobID()).Scan(&sessionID) + if sessionID == nil { + return + } + time.Sleep(250 * time.Millisecond) + } + } + + waitForNoLease() sqlDB.Exec(t, fmt.Sprintf(`ALTER CHANGEFEED %d DROP bar, foo ADD baz UNSET diff SET resolved, format=json`, feed.JobID())) counts := telemetry.GetFeatureCounts(telemetry.Raw, telemetry.ResetCounts) diff --git a/pkg/testutils/jobutils/jobs_verification.go b/pkg/testutils/jobutils/jobs_verification.go index 7e5d8c9f1644..1872596d41c7 100644 --- a/pkg/testutils/jobutils/jobs_verification.go +++ b/pkg/testutils/jobutils/jobs_verification.go @@ -89,19 +89,6 @@ func waitForJobToHaveStatus( }, 2*time.Minute) } -func WaitForJobToHaveNoLease(t testing.TB, db *sqlutils.SQLRunner, jobID jobspb.JobID) { - t.Helper() - testutils.SucceedsWithin(t, func() error { - var sessionID []byte - var instanceID gosql.NullInt64 - db.QueryRow(t, `SELECT claim_session_id, claim_instance_id FROM system.jobs WHERE id = $1`, jobID).Scan(&sessionID, &instanceID) - if sessionID == nil && !instanceID.Valid { - return nil - } - return errors.Newf("job %d still has claim information", jobID) - }, 2*time.Minute) -} - // RunJob runs the provided job control statement, initializing, notifying and // closing the chan at the passed pointer (see below for why) and returning the // jobID and error result. 
PAUSE JOB and CANCEL JOB are racy in that it's hard From 66730b8c20f0942e2821203419963e00cc1bd4fa Mon Sep 17 00:00:00 2001 From: maryliag Date: Mon, 14 Nov 2022 19:25:25 -0500 Subject: [PATCH 6/7] roachpb, server, ui: add fingerprint ID to details pages Previously, there was no easy way to get a statement or fingerprint id from their respective details pages, allowing then the value to be found on our internal tables using cli. This commit adds the fingerprint id on the response of the statement details page. It also adds the fingeprint ID to the Statement Details page and Transaction Details page. Fixes #91763 Release note (ui change): Add fingerprint ID in hex format to the Statement Details page and Transaction Details page. --- pkg/roachpb/app_stats.proto | 1 + pkg/server/combined_statement_stats.go | 12 ++++++--- .../sqlstatsutil/json_encoding.go | 8 ++++++ .../sqlstatsutil/json_encoding_test.go | 3 ++- .../sqlstatsutil/json_impl.go | 1 + .../cluster-ui/src/api/insightsApi.ts | 27 +++++++++++++------ .../src/statementDetails/statementDetails.tsx | 7 ++++- .../statementsPage.selectors.ts | 6 ++--- .../transactionDetails/transactionDetails.tsx | 9 +++++++ .../transactionsTable/transactionsTable.tsx | 6 +++-- .../cluster-ui/src/util/format.spec.ts | 18 ++++++++----- .../workspaces/cluster-ui/src/util/format.ts | 12 ++++----- .../src/views/statements/statementsPage.tsx | 4 +-- 13 files changed, 80 insertions(+), 34 deletions(-) diff --git a/pkg/roachpb/app_stats.proto b/pkg/roachpb/app_stats.proto index c5d3a8822143..658ef94698b5 100644 --- a/pkg/roachpb/app_stats.proto +++ b/pkg/roachpb/app_stats.proto @@ -224,6 +224,7 @@ message AggregatedStatementMetadata { optional int64 full_scan_count = 10 [(gogoproto.nullable) = false]; optional int64 vec_count = 11 [(gogoproto.nullable) = false]; optional int64 total_count = 12 [(gogoproto.nullable) = false]; + optional string fingerprint_id = 13 [(gogoproto.nullable) = false, (gogoproto.customname) = "FingerprintID"]; } 
// CollectedStatementStatistics wraps collected timings and metadata for some diff --git a/pkg/server/combined_statement_stats.go b/pkg/server/combined_statement_stats.go index fe26562bc2e7..6d115f49a059 100644 --- a/pkg/server/combined_statement_stats.go +++ b/pkg/server/combined_statement_stats.go @@ -480,7 +480,7 @@ func getStatementDetailsQueryClausesAndArgs( return whereClause, args, nil } -// getTotalStatementDetails return all the statistics for the selectec statement combined. +// getTotalStatementDetails return all the statistics for the selected statement combined. func getTotalStatementDetails( ctx context.Context, ie *sql.InternalExecutor, whereClause string, args []interface{}, ) (serverpb.StatementDetailsResponse_CollectedStatementSummary, error) { @@ -490,13 +490,15 @@ func getTotalStatementDetails( aggregation_interval, array_agg(app_name) as app_names, crdb_internal.merge_statement_stats(array_agg(statistics)) AS statistics, - max(sampled_plan) as sampled_plan + max(sampled_plan) as sampled_plan, + encode(fingerprint_id, 'hex') as fingerprint_id FROM crdb_internal.statement_statistics %s GROUP BY - aggregation_interval + aggregation_interval, + fingerprint_id LIMIT 1`, whereClause) - const expectedNumDatums = 5 + const expectedNumDatums = 6 var statement serverpb.StatementDetailsResponse_CollectedStatementSummary row, err := ie.QueryRowEx(ctx, "combined-stmts-details-total", nil, @@ -552,6 +554,8 @@ func getTotalStatementDetails( cfg.LineWidth = tree.ConsoleLineWidth aggregatedMetadata.FormattedQuery = cfg.Pretty(queryTree.AST) + aggregatedMetadata.FingerprintID = string(tree.MustBeDString(row[5])) + statement = serverpb.StatementDetailsResponse_CollectedStatementSummary{ Metadata: aggregatedMetadata, AggregationInterval: time.Duration(aggInterval.Nanos()), diff --git a/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding.go b/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding.go index 7d32b50963ba..09e527e03d43 100644 --- 
a/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding.go +++ b/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding.go @@ -267,7 +267,9 @@ func BuildTxnStatisticsJSON(statistics *roachpb.CollectedTransactionStatistics) // "properties": { // "stmtType": { "type": "string" }, // "query": { "type": "string" }, +// "fingerprintID": { "type": "string" }, // "querySummary": { "type": "string" }, +// "formattedQuery": { "type": "string" }, // "implicitTxn": { "type": "boolean" }, // "distSQLCount": { "type": "number" }, // "failedCount": { "type": "number" }, @@ -280,6 +282,12 @@ func BuildTxnStatisticsJSON(statistics *roachpb.CollectedTransactionStatistics) // "type": "string" // } // }, +// "appNames": { +// "type": "array", +// "items": { +// "type": "string" +// } +// }, // } // } func BuildStmtDetailsMetadataJSON( diff --git a/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding_test.go b/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding_test.go index acb5d2a895e5..d9e83f9c9248 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding_test.go +++ b/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding_test.go @@ -395,7 +395,8 @@ func TestSQLStatsJsonEncoding(t *testing.T) { "fullScanCount": {{.Int64}}, "totalCount": {{.Int64}}, "db": [{{joinStrings .StringArray}}], - "appNames": [{{joinStrings .StringArray}}] + "appNames": [{{joinStrings .StringArray}}], + "fingerprintID": "{{.String}}" } ` expectedAggregatedMetadataStr := fillTemplate(t, expectedAggregatedMetadataStrTemplate, data) diff --git a/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_impl.go b/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_impl.go index f68a806b1998..dc8d163bde22 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_impl.go +++ b/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_impl.go @@ -120,6 +120,7 @@ func (s *aggregatedMetadata) jsonFields() jsonFields { {"stmtType", (*jsonString)(&s.StmtType)}, 
{"vecCount", (*jsonInt)(&s.VecCount)}, {"totalCount", (*jsonInt)(&s.TotalCount)}, + {"fingerprintID", (*jsonString)(&s.FingerprintID)}, } } diff --git a/pkg/ui/workspaces/cluster-ui/src/api/insightsApi.ts b/pkg/ui/workspaces/cluster-ui/src/api/insightsApi.ts index 9971dc3a447a..003ce1c3797f 100644 --- a/pkg/ui/workspaces/cluster-ui/src/api/insightsApi.ts +++ b/pkg/ui/workspaces/cluster-ui/src/api/insightsApi.ts @@ -30,7 +30,7 @@ import { } from "src/insights"; import moment from "moment"; import { INTERNAL_APP_NAME_PREFIX } from "src/activeExecutions/activeStatementUtils"; -import { CheckHexValue } from "../util"; +import { FixFingerprintHexValue } from "../util"; // Transaction contention insight events. @@ -87,7 +87,9 @@ function formatTxnContentionResults( return response.execution.txn_results[0].rows.map(row => ({ transactionID: row.waiting_txn_id, - transactionFingerprintID: CheckHexValue(row.waiting_txn_fingerprint_id), + transactionFingerprintID: FixFingerprintHexValue( + row.waiting_txn_fingerprint_id, + ), startTime: moment(row.collection_ts).utc(), contentionDuration: moment.duration(row.contention_duration), contentionThreshold: moment.duration(row.threshold).asMilliseconds(), @@ -134,7 +136,9 @@ function formatTxnFingerprintsResults( } return response.execution.txn_results[0].rows.map(row => ({ - transactionFingerprintID: CheckHexValue(row.transaction_fingerprint_id), + transactionFingerprintID: FixFingerprintHexValue( + row.transaction_fingerprint_id, + ), queryIDs: row.query_ids, application: row.app_name, })); @@ -169,7 +173,10 @@ function createStmtFingerprintToQueryMap( return idToQuery; } response.execution.txn_results[0].rows.forEach(row => { - idToQuery.set(CheckHexValue(row.statement_fingerprint_id), row.query); + idToQuery.set( + FixFingerprintHexValue(row.statement_fingerprint_id), + row.query, + ); }); return idToQuery; @@ -364,7 +371,7 @@ function formatTxnContentionDetailsResponse( totalContentionTime += contentionTimeInMs; 
blockingContentionDetails[idx] = { blockingExecutionID: value.blocking_txn_id, - blockingTxnFingerprintID: CheckHexValue( + blockingTxnFingerprintID: FixFingerprintHexValue( value.blocking_txn_fingerprint_id, ), blockingQueries: null, @@ -385,7 +392,9 @@ function formatTxnContentionDetailsResponse( const contentionThreshold = moment.duration(row.threshold).asMilliseconds(); return { transactionExecutionID: row.waiting_txn_id, - transactionFingerprintID: CheckHexValue(row.waiting_txn_fingerprint_id), + transactionFingerprintID: FixFingerprintHexValue( + row.waiting_txn_fingerprint_id, + ), startTime: moment(row.collection_ts).utc(), totalContentionTimeMs: totalContentionTime, blockingContentionDetails: blockingContentionDetails, @@ -551,7 +560,9 @@ function organizeExecutionInsightsResponseIntoTxns( if (!txnInsight) { txnInsight = { transactionExecutionID: row.txn_id, - transactionFingerprintID: CheckHexValue(row.txn_fingerprint_id), + transactionFingerprintID: FixFingerprintHexValue( + row.txn_fingerprint_id, + ), implicitTxn: row.implicit_txn, databaseName: row.database_name, application: row.app_name, @@ -575,7 +586,7 @@ function organizeExecutionInsightsResponseIntoTxns( endTime: end, elapsedTimeMillis: end.diff(start, "milliseconds"), statementExecutionID: row.stmt_id, - statementFingerprintID: CheckHexValue(row.stmt_fingerprint_id), + statementFingerprintID: FixFingerprintHexValue(row.stmt_fingerprint_id), isFullScan: row.full_scan, rowsRead: row.rows_read, rowsWritten: row.rows_written, diff --git a/pkg/ui/workspaces/cluster-ui/src/statementDetails/statementDetails.tsx b/pkg/ui/workspaces/cluster-ui/src/statementDetails/statementDetails.tsx index cdfbb98ffe0c..081a61ada4d2 100644 --- a/pkg/ui/workspaces/cluster-ui/src/statementDetails/statementDetails.tsx +++ b/pkg/ui/workspaces/cluster-ui/src/statementDetails/statementDetails.tsx @@ -30,6 +30,7 @@ import { AlignedData, Options } from "uplot"; import { appAttr, appNamesAttr, + FixFingerprintHexValue, 
DATE_FORMAT_24_UTC, intersperse, queryByName, @@ -72,7 +73,6 @@ import { import { Delayed } from "../delayed"; import moment from "moment"; import { - CancelStmtDiagnosticRequest, InsertStmtDiagnosticRequest, StatementDiagnosticsReport, } from "../api"; @@ -514,6 +514,7 @@ export class StatementDetails extends React.Component< const { app_names, databases, + fingerprint_id, failed_count, full_scan_count, vec_count, @@ -651,6 +652,10 @@ export class StatementDetails extends React.Component< ", ", )} /> + diff --git a/pkg/ui/workspaces/cluster-ui/src/statementsPage/statementsPage.selectors.ts b/pkg/ui/workspaces/cluster-ui/src/statementsPage/statementsPage.selectors.ts index 06fa07c2a0a7..c42d81fa0663 100644 --- a/pkg/ui/workspaces/cluster-ui/src/statementsPage/statementsPage.selectors.ts +++ b/pkg/ui/workspaces/cluster-ui/src/statementsPage/statementsPage.selectors.ts @@ -12,7 +12,7 @@ import { createSelector } from "reselect"; import { aggregateStatementStats, appAttr, - CheckHexValue, + FixFingerprintHexValue, combineStatementStats, ExecutionStatistics, flattenStatementStats, @@ -178,7 +178,7 @@ export const selectStatements = createSelector( if (!(key in statsByStatementKey)) { statsByStatementKey[key] = { statementFingerprintID: stmt.statement_fingerprint_id?.toString(), - statementFingerprintHexID: CheckHexValue( + statementFingerprintHexID: FixFingerprintHexValue( stmt.statement_fingerprint_id?.toString(16), ), statement: stmt.statement, @@ -199,7 +199,7 @@ export const selectStatements = createSelector( const stmt = statsByStatementKey[key]; return { aggregatedFingerprintID: stmt.statementFingerprintID, - aggregatedFingerprintHexID: CheckHexValue( + aggregatedFingerprintHexID: FixFingerprintHexValue( stmt.statementFingerprintHexID, ), label: stmt.statement, diff --git a/pkg/ui/workspaces/cluster-ui/src/transactionDetails/transactionDetails.tsx b/pkg/ui/workspaces/cluster-ui/src/transactionDetails/transactionDetails.tsx index 6178e78b0ae6..f98aace149bb 100644 
--- a/pkg/ui/workspaces/cluster-ui/src/transactionDetails/transactionDetails.tsx +++ b/pkg/ui/workspaces/cluster-ui/src/transactionDetails/transactionDetails.tsx @@ -39,6 +39,7 @@ import { SummaryCard, SummaryCardItem } from "../summaryCard"; import { Bytes, calculateTotalWorkload, + FixFingerprintHexValue, Duration, formatNumberForDisplay, unset, @@ -410,6 +411,14 @@ export class TransactionDetails extends React.Component< : unset } /> +

- CheckHexValue(item.stats_data?.transaction_fingerprint_id.toString(16)), + FixFingerprintHexValue( + item.stats_data?.transaction_fingerprint_id.toString(16), + ), sort: (item: TransactionInfo) => item.stats_data?.transaction_fingerprint_id.toString(16), showByDefault: false, diff --git a/pkg/ui/workspaces/cluster-ui/src/util/format.spec.ts b/pkg/ui/workspaces/cluster-ui/src/util/format.spec.ts index 818ac7374db7..515f1992dd81 100644 --- a/pkg/ui/workspaces/cluster-ui/src/util/format.spec.ts +++ b/pkg/ui/workspaces/cluster-ui/src/util/format.spec.ts @@ -15,7 +15,7 @@ import { BytesFitScale, byteUnits, HexStringToInt64String, - CheckHexValue, + FixFingerprintHexValue, } from "./format"; describe("Format utils", () => { @@ -61,13 +61,17 @@ describe("Format utils", () => { }); }); - describe("CheckHexValue", () => { + describe("FixFingerprintHexValue", () => { it("add leading 0 to hex values", () => { - expect(CheckHexValue(undefined)).toBe(""); - expect(CheckHexValue(null)).toBe(""); - expect(CheckHexValue("fb9111f22f2213b7")).toBe("fb9111f22f2213b7"); - expect(CheckHexValue("b9111f22f2213b7")).toBe("0b9111f22f2213b7"); - expect(CheckHexValue("9111f22f2213b7")).toBe("009111f22f2213b7"); + expect(FixFingerprintHexValue(undefined)).toBe(""); + expect(FixFingerprintHexValue(null)).toBe(""); + expect(FixFingerprintHexValue("fb9111f22f2213b7")).toBe( + "fb9111f22f2213b7", + ); + expect(FixFingerprintHexValue("b9111f22f2213b7")).toBe( + "0b9111f22f2213b7", + ); + expect(FixFingerprintHexValue("9111f22f2213b7")).toBe("009111f22f2213b7"); }); }); }); diff --git a/pkg/ui/workspaces/cluster-ui/src/util/format.ts b/pkg/ui/workspaces/cluster-ui/src/util/format.ts index 60c1ddfbd913..11a4ed2a884e 100644 --- a/pkg/ui/workspaces/cluster-ui/src/util/format.ts +++ b/pkg/ui/workspaces/cluster-ui/src/util/format.ts @@ -256,15 +256,15 @@ export function HexStringToInt64String(s: string): string { return dec; } -// CheckHexValue adds the leading 0 on strings with hex value that -// 
have a length < 16. -export function CheckHexValue(s: string): string { +// FixFingerprintHexValue adds the leading 0 on strings with hex value that +// have a length < 16. This can occur because it was returned like this from the DB +// or because the hex value was generated using `.toString(16)` (which removes the +// leading zeros). +// The zeros need to be added back to match the value on our sql stats tables. +export function FixFingerprintHexValue(s: string): string { if (s === undefined || s === null || s.length === 0) { return ""; } - if (s?.length === 16) { - return s; - } while (s.length < 16) { s = `0${s}`; } diff --git a/pkg/ui/workspaces/db-console/src/views/statements/statementsPage.tsx b/pkg/ui/workspaces/db-console/src/views/statements/statementsPage.tsx index b513716c150b..c3a2cf3f7b3d 100644 --- a/pkg/ui/workspaces/db-console/src/views/statements/statementsPage.tsx +++ b/pkg/ui/workspaces/db-console/src/views/statements/statementsPage.tsx @@ -142,7 +142,7 @@ export const selectStatements = createSelector( if (!(key in statsByStatementKey)) { statsByStatementKey[key] = { statementFingerprintID: stmt.statement_fingerprint_id?.toString(), - statementFingerprintHexID: util.CheckHexValue( + statementFingerprintHexID: util.FixFingerprintHexValue( stmt.statement_fingerprint_id?.toString(16), ), statement: stmt.statement, @@ -163,7 +163,7 @@ export const selectStatements = createSelector( const stmt = statsByStatementKey[key]; return { aggregatedFingerprintID: stmt.statementFingerprintID, - aggregatedFingerprintHexID: util.CheckHexValue( + aggregatedFingerprintHexID: util.FixFingerprintHexValue( stmt.statementFingerprintHexID, ), label: stmt.statement, From 2f0313e0a79f84c629cda92309e1d29ace6d0ff4 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Fri, 11 Nov 2022 14:16:50 -0500 Subject: [PATCH 7/7] utilccl: fix a comment It was missing a key subject. 
Release note: None Epic: None --- pkg/ccl/utilccl/license_check.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/ccl/utilccl/license_check.go b/pkg/ccl/utilccl/license_check.go index 3fdb1cc82929..1c5349ae01bc 100644 --- a/pkg/ccl/utilccl/license_check.go +++ b/pkg/ccl/utilccl/license_check.go @@ -113,7 +113,8 @@ func ApplyTenantLicense() error { // IsEnterpriseEnabled() instead. // // The ClusterID argument should be the tenant-specific logical -// cluster ID. is not used for the check itself; it is merely embedded +// cluster ID. +// `feature` is not used for the check itself; it is merely embedded // in the URL displayed in the error message. func CheckEnterpriseEnabled(st *cluster.Settings, cluster uuid.UUID, feature string) error { return checkEnterpriseEnabledAt(st, timeutil.Now(), cluster, feature, true /* withDetails */) @@ -125,7 +126,8 @@ func CheckEnterpriseEnabled(st *cluster.Settings, cluster uuid.UUID, feature str // hot paths. // // The ClusterID argument should be the tenant-specific logical -// cluster ID. is not used for the check itself; it is merely embedded +// cluster ID. +// `feature` is not used for the check itself; it is merely embedded // in the URL displayed in the error message. func IsEnterpriseEnabled(st *cluster.Settings, cluster uuid.UUID, feature string) bool { return checkEnterpriseEnabledAt(