Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
65639: sql: deflake and unskip TestDistSQLDeadHosts r=rytaft a=rytaft

This commit deflakes and unskips `TestDistSQLDeadHosts`. It ensures that the
test never hangs due to unavailable ranges and allows the leaseholder of a
range to relocate to either of its replicas when the original node dies.

Fixes #49843

Release note: None

65643: jobs: use low-priority transactions for claim, adopt, cancel/pause r=ajwerner a=ajwerner

These transactions can be slow and long-running and they hold locks. This is
unfortunate for UX reasons.

Fixes #65077.

Release note: (bug fix): Improved availability of jobs table for reads in large,
global clusters by running background tasks at low priority.

65657: TEAMS: update KV triage column ID r=erikgrinaker a=tbg

This sets the column ID to that of the "Roachtest/Unit Test Backlog"
column, which is where we want all test flakes to end up.

Release note: None


65658: kv: massage IsExpectedRebalanceError r=knz a=tbg

The error that would bubble up when a raft group got deleted has changed
slightly, likely as a result of #62719.

Fixes #65546.

Release note: None


Co-authored-by: Rebecca Taft <[email protected]>
Co-authored-by: Andrew Werner <[email protected]>
Co-authored-by: Tobias Grieger <[email protected]>
  • Loading branch information
4 people committed May 25, 2021
5 parents 01a77cb + becf917 + aed014f + 0ca4bbe + 34908fd commit 6f9d115
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 24 deletions.
2 changes: 1 addition & 1 deletion TEAMS.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ cockroachdb/sql-observability:
triage_column_id: 12618343
cockroachdb/kv:
aliases: [cockroachdb/kv-triage]
triage_column_id: 3550674
triage_column_id: 14242655
cockroachdb/geospatial:
triage_column_id: 9487269
cockroachdb/dev-inf:
Expand Down
11 changes: 11 additions & 0 deletions pkg/jobs/adopt.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
Expand Down Expand Up @@ -47,6 +48,11 @@ const (
// available.
func (r *Registry) claimJobs(ctx context.Context, s sqlliveness.Session) error {
return r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
// Run the claim transaction at low priority to ensure that it does not
// contend with foreground reads.
if err := txn.SetUserPriority(roachpb.MinUserPriority); err != nil {
return errors.WithAssertionFailure(err)
}
numRows, err := r.ex.Exec(
ctx, "claim-jobs", txn, `
UPDATE system.jobs
Expand Down Expand Up @@ -283,6 +289,11 @@ func (r *Registry) runJob(

func (r *Registry) servePauseAndCancelRequests(ctx context.Context, s sqlliveness.Session) error {
return r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
// Run the claim transaction at low priority to ensure that it does not
// contend with foreground reads.
if err := txn.SetUserPriority(roachpb.MinUserPriority); err != nil {
return errors.WithAssertionFailure(err)
}
// Note that we have to buffer all rows first - before processing each
// job - because we have to make sure that the query executes without an
// error (otherwise, the system.jobs table might diverge from the jobs
Expand Down
20 changes: 15 additions & 5 deletions pkg/jobs/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security"
"github.com/cockroachdb/cockroach/pkg/server/telemetry"
"github.com/cockroachdb/cockroach/pkg/settings"
Expand Down Expand Up @@ -659,16 +660,25 @@ func (r *Registry) Start(
}

removeClaimsFromDeadSessions := func(ctx context.Context, s sqlliveness.Session) {
if _, err := r.ex.QueryRowEx(
ctx, "expire-sessions", nil,
sessiondata.InternalExecutorOverride{User: security.RootUserName()}, `
if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
// Run the expiration transaction at low priority to ensure that it does
// not contend with foreground reads. Note that the adoption and cancellation
// queries also use low priority so they will interact nicely.
if err := txn.SetUserPriority(roachpb.MinUserPriority); err != nil {
return errors.WithAssertionFailure(err)
}
_, err := r.ex.ExecEx(
ctx, "expire-sessions", nil,
sessiondata.InternalExecutorOverride{User: security.RootUserName()}, `
UPDATE system.jobs
SET claim_session_id = NULL
WHERE claim_session_id <> $1
AND status IN `+claimableStatusTupleString+`
AND NOT crdb_internal.sql_liveness_is_alive(claim_session_id)`,
s.ID().UnsafeBytes(),
); err != nil {
s.ID().UnsafeBytes(),
)
return err
}); err != nil {
log.Errorf(ctx, "error expiring job sessions: %s", err)
}
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/kv/test_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,13 @@ func IsExpectedRelocateError(err error) bool {
"unable to remove replica .* which is not present",
"unable to add replica .* which is already present",
"received invalid ChangeReplicasTrigger .* to remove self",
"failed to apply snapshot: raft group deleted",
"raft group deleted",
"snapshot failed",
"breaker open",
"unable to select removal target", // https://github.com/cockroachdb/cockroach/issues/49513
"cannot up-replicate to .*; missing gossiped StoreDescriptor",
"remote couldn't accept .* snapshot",
"cannot add placeholder",
}
pattern := "(" + strings.Join(allowlist, "|") + ")"
return testutils.IsError(err, pattern)
Expand Down
47 changes: 30 additions & 17 deletions pkg/sql/distsql_physical_planner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -453,8 +453,6 @@ func TestDistSQLDeadHosts(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

skip.WithIssue(t, 49843, "test is too slow; we need to tweak timeouts so connections die faster (see #14376)")

const n = 100
const numNodes = 5

Expand All @@ -475,14 +473,25 @@ func TestDistSQLDeadHosts(t *testing.T) {
r.Exec(t, fmt.Sprintf("ALTER TABLE t SPLIT AT VALUES (%d)", n*i/5))
}

// Evenly spread the ranges between the first 4 nodes. Only the last range
// has a replica on the fifth node.
for i := 0; i < numNodes; i++ {
r.Exec(t, fmt.Sprintf(
"ALTER TABLE t EXPERIMENTAL_RELOCATE VALUES (ARRAY[%d,%d,%d], %d)",
i+1, (i+1)%5+1, (i+2)%5+1, n*i/5,
i+1, (i+1)%4+1, (i+2)%4+1, n*i/5,
))
}

r.Exec(t, "SHOW RANGES FROM TABLE t")
r.CheckQueryResults(t,
"SELECT start_key, end_key, lease_holder, replicas FROM [SHOW RANGES FROM TABLE t]",
[][]string{
{"NULL", "/0", "1", "{1}"},
{"/0", "/20", "1", "{1,2,3}"},
{"/20", "/40", "2", "{2,3,4}"},
{"/40", "/60", "3", "{1,3,4}"},
{"/60", "/80", "4", "{1,2,4}"},
{"/80", "NULL", "5", "{2,3,5}"},
},
)

r.Exec(t, fmt.Sprintf("INSERT INTO t SELECT i, i*i FROM generate_series(1, %d) AS g(i)", n))

Expand All @@ -508,28 +517,32 @@ func TestDistSQLDeadHosts(t *testing.T) {
// Verify the plan (should include all 5 nodes).
r.CheckQueryResults(t,
"SELECT info FROM [EXPLAIN (DISTSQL) SELECT sum(xsquared) FROM t] WHERE info LIKE 'Diagram%'",
[][]string{{"Diagram: https://cockroachdb.github.io/distsqlplan/decode.html#eJy8k09LwzAYxu9-CnlOCu9h7bo5e5rHHXQy9SQ91OalFLamJCkoo99d1iDaIskgo8f8-T2_PG1yRC0FP-UH1kjfEYEQgzAHIQFhgYzQKFmw1lKdtlhgIz6RzghV3bTmNJ0RCqkY6RGmMntGitf8Y887zgUrEASbvNr3kkZVh1x9rQ0I29ak1-sYWUeQrflJ6-h8z0NZKi5zI0eal7fHm3V0e3b0b2JbSyVYsRgEZt2F5dFE38_jCakQT1TB4wmpMJ-ogscTUiGZqILHc6mH-E_0jnUja82jBznMywgsSrZvWctWFfysZNGH2-G2391PCNbGrkZ2sKnt0ulYf-HICccDOBrDsdvsUc-ddOKGk5BzL5zw0m1ehpjvnPDKbV6FmO_d_2rmuSbuSzZ2Z93VdwAAAP__XTV6BQ=="}},
[][]string{{"Diagram: https://cockroachdb.github.io/distsqlplan/decode.html#eJyslF-Lm0AUxd_7KeRC2QQm-CfGdX3K0rVUmt1sY0oLiw_TeCtC4rgzI2wJ-e5FLawJuzOKfXTmnnvO7zrcI4jnPQQQh6vw09bIi9_M-LxZ3xtP4c_H1W30YEzuongbf1tNjX81ojpMXsRzRTmm07ZYJsaPL-EmbPWr6GtoXN3lNOP08PEKCBQsxQd6QAHBE9hAwAECcyDgAoEFJARKznYoBON1ybERROkLBBaBvCgrWR8nBHaMIwRHkLncIwSwpb_2uEGaIjctIJCipPm-sZHLkucHyv8AgbikhQiMmenURetKBsbSgeREgFXytbuQNEMI7BPpn-A2yzhmVDJuLs4DxN_vJ0t7-q6N867Na_eqYDxFjulZ6-SkDmJbw5LMz5LY_Udua0duOtbMdIdPXROiA-uNmbrTn9XRs7rWzPSGs2pCdFivx7DO-7PO9ayeNTP94ayaEB1Wfwyr25_V1bP61mwwqCZBB_Tmf62NN2w2KEpWCLxYH293tuq1gmmG7Q4SrOI7fORs19i0n-tG1xykKGR7a7cfUdFe1QG7Ylspds7E9qXYUTtrrOdKtasWu2NyL5RiT-3sjXG-Vop9tbM_xvlG_a8szTNRP7JL7-T04W8AAAD__wQH0sk="}},
)

// Stop node 5.
tc.StopServer(4)

testutils.SucceedsSoon(t, runQuery)

r.CheckQueryResults(t,
"SELECT info FROM [EXPLAIN (DISTSQL) SELECT sum(xsquared) FROM t] WHERE info LIKE 'Diagram%'",
[][]string{{"Diagram: https://cockroachdb.github.io/distsqlplan/decode.html#eJy8k8FK7DAYhff3KS5npZCF6dRx7KouZ6Ejo64ki9j8lEKnKUkKytB3lzaItkg60qHL5M93vpySHlFpRQ_yQBbJKzgYIjCswBBDMNRGZ2StNt3YH96qdyRXDEVVN67bFgyZNoTkCFe4kpDgWb6VtCepyIBBkZNF2QtqUxyk-UgdGHaNS_6nEUTLoBv3lday0z13eW4ol06PNE8v9xcpvzw5-juxqbRRZEgNAkV7Zjlf6PtNeOZUiBaqMOGZU2G1UIUJz7le8S_Re7K1riyNXvMwTzCQysn_CFY3JqNHo7M-3C93_el-Q5F1fsr9Ylv5UXetnzAPwtEA5mM4CsK3YfMqCMdhOJ5z7esgvA6b13PMN0F4EzZv_mQW7b_PAAAA__-DuA-E"}},
)
// The leaseholder for the last range should have moved to either node 2 or 3.
query := "SELECT info FROM [EXPLAIN (DISTSQL) SELECT sum(xsquared) FROM t] WHERE info LIKE 'Diagram%'"
exp2 := [][]string{{"Diagram: https://cockroachdb.github.io/distsqlplan/decode.html#eJysk9Fr2zAQxt_3V4iD0QQUbMuum_kpZfWYWdp0ccYGxQ9adPMMieVKMnSE_O_D9qBJaeWY7NG6-933fZJvB_pxAxGk8Tz-uCJF-UuST8vFLXmIf9zPr5M7MrpJ0lX6dT4m_3p0vR096ceaKxTjrtlk5PvneBl3_Dz5EpOLm4Lnim_fXwCFUgq841vUED2ABxQYUPCBQgAZhUrJNWotVVPetc2JeILIpVCUVW2a44zCWiqEaAemMBuECFb85waXyAUqxwUKAg0vNq2EmVWq2HL1ByikFS91RCYOa5oWtYnIjEG2pyBr8zxdG54jRN6enu7gOs8V5txI5QTHBtJvt6OZN35Thr0p8zy9LqUSqFAcjc72diPTYUb8IyPe6Tfu9d64w9yJE7iEl4J4RJrfqAY_QI-hg9yX5zwAOz03688duBMnHP6z9Zg4yBqek9U_PavfnzV0J850eNYeEwdZr_7XYr0is0RdyVLjiwV7fbLbLB6KHLst1bJWa7xXct3KdJ-LlmsPBGrTVb3uIym7UmPwEPasMDuCvZcws8If7Mq-FQ7scHCO7UsrHNqVw3OUr6zw1K48HaSc7d_9DQAA__-rUWHE"}}
exp3 := [][]string{{"Diagram: https://cockroachdb.github.io/distsqlplan/decode.html#eJysk9Fr2zAQxt_3V4iD0QQUbMuum_kpZfWYWdp0ccYGxQ9adPMMieVKMnSE_O_D9qBJaeWY7NG6-933fWduB_pxAxGk8Tz-uCJF-UuST8vFLXmIf9zPr5M7MrpJ0lX6dT4m_3p0vR096ceaKxTjrtlk5PvneBl3_Dz5EpOLm4Lnim_fXwCFUgq841vUED2ABxQYUPCBQgAZhUrJNWotVVPetc2JeILIpVCUVW2a54zCWiqEaAemMBuECFb85waXyAUqxwUKAg0vNq2EmVWq2HL1ByikFS91RCYOa5oWtYnIjEG2pyBr8zxdG54jRN6enu7gOs8V5txI5QTHBtJvt6OZN35Thr0p8zy9LqUSqFAcjc72diPTYUb8IyPe6Rv3ejfuMHfiBMOX3mPiIOvlOUtnp2dl_VkDd-KELuGlIB6R5jeqwbl7DB3kDs_J7Z-e2-_PHboTZzr8H_eYOMh69b8O6xWZJepKlhpfHNjrk93m8FDk2F2plrVa472S61am-1y0XPsgUJuu6nUfSdmVGoOHsGeF2RHsvYSZFf5gV_atcGCHg3NsX1rh0K4cnqN8ZYWnduXpIOVs_-5vAAAA__-ToWHE"}}

// Stop node 2; note that no range had replicas on both 2 and 5.
tc.StopServer(1)
res := r.QueryStr(t, query)
if !reflect.DeepEqual(res, exp2) {
if !reflect.DeepEqual(res, exp3) {
t.Errorf("query '%s': expected:\neither\n%vor\n%v\ngot:\n%v\n",
query, sqlutils.MatrixToStr(exp2), sqlutils.MatrixToStr(exp3), sqlutils.MatrixToStr(res),
)
}
}

testutils.SucceedsSoon(t, runQuery)
// Stop node 4; note that no range had replicas on both 4 and 5.
tc.StopServer(3)

r.CheckQueryResults(t,
"SELECT info FROM [EXPLAIN (DISTSQL) SELECT sum(xsquared) FROM t] WHERE info LIKE 'Diagram%'",
[][]string{{"Diagram: https://cockroachdb.github.io/distsqlplan/decode.html#eJy8kkFLwzAUx-9-CvmfFHIwXZ3QUz3uoJOpJ8khNo9S6JrykoIy-t2lDaItkk02dkxe_r_fe-Ht0FhDj3pLDtkbJAQWEEihBFq2BTlneSiFhyvzgexGoGrazg_XSqCwTMh28JWvCRle9HtNG9KGGAKGvK7qEd5ytdX8mXsIrDufXeYJVC9gO_9N68XhnvuyZCq1tzPN8-vDVS6vD0b_ELvGsiEmMwGq_sRyeab_2-M5ZoTkTCPs8ZxqBf5Ab8i1tnE0W4UpTwmQKSlskbMdF_TEthjh4bgeX48XhpwPVRkOqyaUhrZ-h2U0nEzCch5OouG7uHkRDafxcHpM27fR8DJuXv7LrPqLrwAAAP__vMyldA=="}},
)
testutils.SucceedsSoon(t, runQuery)
}

func TestDistSQLDrainingHosts(t *testing.T) {
Expand Down

0 comments on commit 6f9d115

Please sign in to comment.