roachtest: add replicate/wide roachtest
Add the `replicate/wide` roachtest, which starts up a 9-node cluster,
sets the replication factor for all zones to 9, waits for full
replication, and then restarts the cluster, bringing up only nodes
1-6. Previously, such a restart would trigger down-replication, and
that down-replication could leave ranges unavailable.

Further, test decommissioning one of the nodes and verify that the
replica count per range falls to 7. Lastly, decrease the replication
factor to 5 and verify that the per-range replica count falls again.

See cockroachdb#34122

Release note: None
petermattis committed Jan 19, 2019
1 parent 1739426 commit 1ef1592
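
For context on why this restart pattern must remain available, here is a minimal sketch of the Raft quorum arithmetic the commit message alludes to (an editorial illustration, not part of the commit): a range needs a strict majority of its replicas live to make progress, so with 9 replicas and only nodes 1-6 up, 6 >= quorum(9) = 5 and every range can still serve traffic; if ranges instead down-replicate while 3 nodes are down, a range can end up with a majority of its smaller replica set stranded on the dead nodes.

package main

import "fmt"

// quorum returns the number of live replicas a Raft group of size n
// needs to make progress: a strict majority.
func quorum(n int) int {
	return n/2 + 1
}

func main() {
	// 9 replicas with nodes 7-9 down: 6 live replicas >= quorum(9) == 5,
	// so ranges stay available.
	fmt.Println(quorum(9)) // 5

	// But if a range down-replicates to 3 while those nodes are down and
	// 2 of its replicas sit on dead stores, only 1 < quorum(3) == 2
	// replicas are live and the range becomes unavailable.
	fmt.Println(quorum(3)) // 2
}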
Showing 1 changed file with 137 additions and 2 deletions.
pkg/cmd/roachtest/allocator.go
@@ -20,6 +20,7 @@ import (
 	gosql "database/sql"
 	"fmt"
 	"math"
+	"strings"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
@@ -76,19 +77,25 @@ func registerAllocator(r *registry) {
 	}
 
 	r.Add(testSpec{
-		Name:  `upreplicate/1to3`,
+		Name:  `replicate/up/1to3`,
 		Nodes: nodes(3),
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			runAllocator(ctx, t, c, 1, 10.0)
 		},
 	})
 	r.Add(testSpec{
-		Name:  `rebalance/3to5`,
+		Name:  `replicate/rebalance/3to5`,
 		Nodes: nodes(5),
 		Run: func(ctx context.Context, t *test, c *cluster) {
 			runAllocator(ctx, t, c, 3, 42.0)
 		},
 	})
+	r.Add(testSpec{
+		Name:    `replicate/wide`,
+		Timeout: 10 * time.Minute,
+		Nodes:   nodes(9, cpu(1)),
+		Run:     runWideReplication,
+	})
 }
 
 // printRebalanceStats prints the time it took for rebalancing to finish and the
@@ -243,3 +250,131 @@ func waitForRebalance(ctx context.Context, l *logger, db *gosql.DB, maxStdDev fl
 		}
 	}
 }
+
+func runWideReplication(ctx context.Context, t *test, c *cluster) {
+	nodes := c.nodes
+	if nodes != 9 {
+		t.Fatalf("9-node cluster required")
+	}
+
+	args := startArgs("--env=COCKROACH_SCAN_MAX_IDLE_TIME=5ms")
+	c.Put(ctx, cockroach, "./cockroach")
+	c.Start(ctx, t, c.All(), args)
+
+	db := c.Conn(ctx, 1)
+	defer db.Close()
+
+	zones := func() []string {
+		rows, err := db.Query(`SELECT zone_name FROM crdb_internal.zones`)
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer rows.Close()
+		var results []string
+		for rows.Next() {
+			var name string
+			if err := rows.Scan(&name); err != nil {
+				t.Fatal(err)
+			}
+			results = append(results, name)
+		}
+		return results
+	}
+
+	run := func(stmt string) {
+		t.l.Printf("%s\n", stmt)
+		if _, err := db.ExecContext(ctx, stmt); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	setReplication := func(width int) {
+		// Change every zone to have the same number of replicas as the number of
+		// nodes in the cluster.
+		for _, zone := range zones() {
+			which := "RANGE"
+			if zone[0] == '.' {
+				zone = zone[1:]
+			} else if strings.Count(zone, ".") == 0 {
+				which = "DATABASE"
+			} else {
+				which = "TABLE"
+			}
+			run(fmt.Sprintf(`ALTER %s %s CONFIGURE ZONE USING num_replicas = %d`,
+				which, zone, width))
+		}
+	}
+	setReplication(nodes)
+
+	countMisreplicated := func(width int) int {
+		var count int
+		if err := db.QueryRow(
+			"SELECT COUNT(*) FROM crdb_internal.ranges WHERE array_length(replicas,1) != $1",
+			width,
+		).Scan(&count); err != nil {
+			t.Fatal(err)
+		}
+		return count
+	}
+
+	waitForReplication := func(width int) {
+		for count := -1; count != 0; time.Sleep(time.Second) {
+			count = countMisreplicated(width)
+			t.l.Printf("%d mis-replicated ranges\n", count)
+		}
+	}
+
+	waitForReplication(nodes)
+
+	numRanges := func() int {
+		var count int
+		if err := db.QueryRow(`SELECT COUNT(*) FROM crdb_internal.ranges`).Scan(&count); err != nil {
+			t.Fatal(err)
+		}
+		return count
+	}()
+
+	// Stop the cluster and restart 2/3 of the nodes.
+	c.Stop(ctx)
+	c.Start(ctx, t, c.Range(1, 6), args)
+
+	waitForUnderReplicated := func(count int) {
+		for ; ; time.Sleep(time.Second) {
+			query := `
+SELECT sum((metrics->>'ranges.unavailable')::DECIMAL)::INT AS ranges_unavailable,
+       sum((metrics->>'ranges.underreplicated')::DECIMAL)::INT AS ranges_underreplicated
+FROM crdb_internal.kv_store_status
+`
+			var unavailable, underReplicated int
+			if err := db.QueryRow(query).Scan(&unavailable, &underReplicated); err != nil {
+				t.Fatal(err)
+			}
+			t.l.Printf("%d unavailable, %d under-replicated ranges\n", unavailable, underReplicated)
+			if unavailable != 0 {
+				t.Fatalf("%d unavailable ranges", unavailable)
+			}
+			if underReplicated >= count {
+				break
+			}
+		}
+	}
+
+	waitForUnderReplicated(numRanges)
+	if n := countMisreplicated(9); n != 0 {
+		t.Fatalf("expected 0 mis-replicated ranges, but found %d", n)
+	}
+
+	decom := func(id int) {
+		c.Run(ctx, c.Node(1),
+			fmt.Sprintf("./cockroach node decommission --insecure --wait=none %d", id))
+	}
+
+	// Decommission a node. The ranges should down-replicate to 7 replicas.
+	decom(9)
+	waitForReplication(7)
+
+	// Set the replication width to 5. The replicas should down-replicate.
+	// TODO(peter): This doesn't work. It should. There seems to be a bug.
+	// setReplication(5)
+	// waitForReplication(5)
+}
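
As a footnote on the setReplication helper above: crdb_internal.zones reports names like `.default` (a named range), `system` (a database), and `system.jobs` (a table), and the helper maps each shape to the matching ALTER ... CONFIGURE ZONE target. Below is a standalone sketch of that classification, with the sample zone names assumed purely for illustration.

package main

import (
	"fmt"
	"strings"
)

// zoneTarget mirrors the classification in setReplication: a leading '.'
// marks a named range (".default" -> RANGE default), a dot-free name is a
// database, and anything else is treated as a table.
func zoneTarget(zone string) (string, string) {
	which := "RANGE"
	if zone[0] == '.' {
		zone = zone[1:]
	} else if strings.Count(zone, ".") == 0 {
		which = "DATABASE"
	} else {
		which = "TABLE"
	}
	return which, zone
}

func main() {
	// Sample zone names (assumed for illustration).
	for _, zone := range []string{".default", "system", "system.jobs"} {
		which, name := zoneTarget(zone)
		fmt.Printf("ALTER %s %s CONFIGURE ZONE USING num_replicas = 9\n", which, name)
	}
}

To try the new test itself, an invocation along the lines of `roachtest run replicate/wide` should select it by name (exact flags depend on the local roachtest setup).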
