Skip to content

Commit

Permalink
roachtest: Add sstable-corruption roachtest
Browse files Browse the repository at this point in the history
Adds a new roachtest, sstable-corruption/table, that
imports TPCC, then finds sstables that contain
table keys (which are replicated) on one node, then
calls `dd` to write random bytes into the middle
of each of them. It then checks that the corrupt node
either crashes on restart, or crashes after the
tpcc workload is run on it.

Informs #67568.

Release note: None.
  • Loading branch information
itsbilal committed Oct 14, 2021
1 parent acc7306 commit 607db33
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ func RegisterTests(r registry.Registry) {
registerSequelize(r)
registerSQLAlchemy(r)
registerSQLSmith(r)
registerSSTableCorruption(r)
registerSyncTest(r)
registerSysbench(r)
registerTLP(r)
Expand Down
111 changes: 111 additions & 0 deletions pkg/cmd/roachtest/tests/sstable_corruption.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package tests

import (
"context"
"fmt"
"strconv"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/stretchr/testify/require"
)

// runSSTableCorruption checks that a node whose sstables have been
// overwritten with random bytes does not keep running as if nothing happened.
//
// The test proceeds in these steps:
//  1. Start cockroach on every node and import a TPCC fixture.
//  2. Stop the cluster and list, on corruptNode, the sstables that the pebble
//     manifest dump reports with a "[/Table" key bound.
//  3. Overwrite 128 bytes at offset 256 in up to maxCorruptedFiles of them.
//  4. Restart the cluster. A failed restart ends the test successfully.
//     Otherwise a tpcc workload is run and the monitor is required to report
//     an error.
func runSSTableCorruption(ctx context.Context, t test.Test, c cluster.Cluster) {
	crdbNodes := c.Range(1, c.Spec().NodeCount)
	workloadNode := c.Node(1)
	const corruptNode = 3
	// Upper bound on the number of sstables that get corrupted.
	const maxCorruptedFiles = 6

	t.Status("installing cockroach")
	c.Put(ctx, t.Cockroach(), "./cockroach", crdbNodes)
	c.Start(ctx, crdbNodes)

	{
		m := c.NewMonitor(ctx, crdbNodes)

		m.Go(func(ctx context.Context) error {
			// We don't really need tpcc, we just need a good amount of data. Enough
			// to have multiple ranges, and some sstables with only table keys.
			t.Status("importing tpcc fixture")
			c.Run(ctx, workloadNode,
				"./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false")
			return nil
		})
		m.Wait()
	}

	// The files are modified while no cockroach process is running.
	c.Stop(ctx, crdbNodes)

	// Keep only manifest dump lines that mention "[/Table" and are not
	// "added"/"deleted" lines. Each remaining line is expected to start with
	// "<file number>:".
	tableSSTs, err := c.RunWithBuffer(ctx, t.L(), c.Node(corruptNode),
		"./cockroach debug pebble manifest dump {store-dir}/MANIFEST-* | grep -v added | grep -v deleted | grep \"\\[/Table\"")
	if err != nil {
		t.Fatal(err)
	}

	// Corrupt up to maxCorruptedFiles SSTs containing table keys.
	corruptedFiles := 0
	for _, sstLine := range strings.Split(string(tableSSTs), "\n") {
		sstLine = strings.TrimSpace(sstLine)
		if sstLine == "" {
			// strings.Split yields an empty element after a trailing newline
			// (and for empty output); there is nothing to parse on such lines.
			continue
		}
		firstFileIdx := strings.Index(sstLine, ":")
		if firstFileIdx < 0 {
			// Without this guard the slice expression below would panic.
			t.Fatalf("unexpected manifest dump line %q: missing ':' after file number", sstLine)
		}
		fileNum := sstLine[:firstFileIdx]
		// The prefix is only validated as an integer; the original text is what
		// gets used to build the file name.
		if _, err := strconv.Atoi(fileNum); err != nil {
			t.Fatalf("error when converting %s to int: %s", fileNum, err.Error())
		}

		t.Status(fmt.Sprintf("corrupting sstable %s on node %d", fileNum, corruptNode))
		// bs=1 seek=256 count=128 overwrites bytes [256, 384) of the file;
		// conv=notrunc leaves the rest of the file in place.
		c.Run(ctx, c.Node(corruptNode), fmt.Sprintf("dd if=/dev/urandom of={store-dir}/%s.sst seek=256 count=128 bs=1 conv=notrunc", fileNum))
		corruptedFiles++
		if corruptedFiles >= maxCorruptedFiles {
			break
		}
	}
	if corruptedFiles == 0 {
		t.Fatal("expected at least one sst containing table keys only, got none")
	}

	if err := c.StartE(ctx, crdbNodes); err != nil {
		// Node detected corruption on start and crashed. This is good. No need
		// to run workload; the test is complete.
		return
	}

	{
		m := c.NewMonitor(ctx)
		// Run a workload to try to get the node to notice corruption and crash.
		m.Go(func(ctx context.Context) error {
			_ = c.RunE(ctx, workloadNode,
				fmt.Sprintf("./cockroach workload run tpcc --warehouses=100 --tolerate-errors --duration=%s", 2*time.Minute))
			// Don't return an error from the workload. We want outcome of WaitE to be
			// determined by the monitor noticing that a node died. The workload may
			// also fail, despite --tolerate-errors, if a node crashes too early.
			return nil
		})
		require.Error(t, m.WaitE())
	}

	// Exempt corrupted node from roachtest harness' post-test liveness checks.
	_ = c.WipeE(ctx, t.L(), c.Node(corruptNode))
}

// registerSSTableCorruption adds the "sstable-corruption/table" test to the
// given registry. The test is owned by the storage team, asks for a three-node
// cluster, and is executed by runSSTableCorruption.
func registerSSTableCorruption(r registry.Registry) {
	spec := registry.TestSpec{
		Name:    "sstable-corruption/table",
		Owner:   registry.OwnerStorage,
		Cluster: r.MakeClusterSpec(3),
		// runSSTableCorruption already has the signature Run expects, so it is
		// used directly rather than through a forwarding closure.
		Run: runSSTableCorruption,
	}
	r.Add(spec)
}

0 comments on commit 607db33

Please sign in to comment.