Skip to content

Commit

Permalink
roachtest: Add sstable-corruption roachtest
Browse files Browse the repository at this point in the history
Adds a new roachtest, sstable-corruption/table, that
imports TPCC, then finds sstables that contain
table keys (which are replicated) on one node, then
calls `dd` to write random bytes into the middle
of each of them. It then checks that the corrupt node
either crashes on restart, or crashes after the
tpcc workload is run on it.

Informs cockroachdb#67568.

Release note: None.
  • Loading branch information
itsbilal committed Oct 14, 2021
1 parent acc7306 commit 607db33
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ func RegisterTests(r registry.Registry) {
registerSequelize(r)
registerSQLAlchemy(r)
registerSQLSmith(r)
registerSSTableCorruption(r)
registerSyncTest(r)
registerSysbench(r)
registerTLP(r)
Expand Down
111 changes: 111 additions & 0 deletions pkg/cmd/roachtest/tests/sstable_corruption.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Copyright 2021 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package tests

import (
"context"
"fmt"
"strconv"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/stretchr/testify/require"
)

// runSSTableCorruption imports a TPCC fixture, stops the cluster, and
// overwrites part of up to maxCorruptedFiles sstables containing table keys
// on corruptNode with random bytes. The test passes if restarting the cluster
// fails, or if the monitor reports an error while a TPCC workload runs
// against the restarted cluster.
func runSSTableCorruption(ctx context.Context, t test.Test, c cluster.Cluster) {
	crdbNodes := c.Range(1, c.Spec().NodeCount)
	workloadNode := c.Node(1)
	const (
		// corruptNode is the node whose sstables get overwritten.
		corruptNode = 3
		// maxCorruptedFiles caps how many sstables are overwritten.
		maxCorruptedFiles = 6
	)

	t.Status("installing cockroach")
	c.Put(ctx, t.Cockroach(), "./cockroach", crdbNodes)
	c.Start(ctx, crdbNodes)

	{
		m := c.NewMonitor(ctx, crdbNodes)

		m.Go(func(ctx context.Context) error {
			// We don't really need tpcc, we just need a good amount of data. Enough
			// to have multiple ranges, and some sstables with only table keys.
			t.Status("importing tpcc fixture")
			c.Run(ctx, workloadNode,
				"./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false")
			return nil
		})
		m.Wait()
	}

	c.Stop(ctx, crdbNodes)

	// List the manifest entries on the node to corrupt, keeping only lines that
	// mention a "[/Table" key bound and dropping "added"/"deleted" lines.
	tableSSTs, err := c.RunWithBuffer(ctx, t.L(), c.Node(corruptNode),
		"./cockroach debug pebble manifest dump {store-dir}/MANIFEST-* | grep -v added | grep -v deleted | grep \"\\[/Table\"")
	if err != nil {
		t.Fatal(err)
	}

	// Corrupt up to maxCorruptedFiles SSTs containing table keys.
	corruptedFiles := 0
	for _, sstLine := range strings.Split(string(tableSSTs), "\n") {
		sstLine = strings.TrimSpace(sstLine)
		if sstLine == "" {
			// Splitting on "\n" yields an empty element for blank lines and for
			// a trailing newline; there is no file number to parse on those.
			continue
		}
		// The file number is the text before the first ':' on the line.
		firstFileIdx := strings.Index(sstLine, ":")
		if firstFileIdx < 0 {
			t.Fatalf("unexpected manifest dump line %q: missing ':' after file number", sstLine)
		}
		fileNum := sstLine[:firstFileIdx]
		if _, err := strconv.Atoi(fileNum); err != nil {
			t.Fatalf("error when converting %s to int: %s", fileNum, err.Error())
		}

		t.Status(fmt.Sprintf("corrupting sstable %s on node %d", fileNum, corruptNode))
		// Overwrite 128 bytes starting at byte offset 256 with random data;
		// conv=notrunc leaves the rest of the file intact.
		c.Run(ctx, c.Node(corruptNode), fmt.Sprintf("dd if=/dev/urandom of={store-dir}/%s.sst seek=256 count=128 bs=1 conv=notrunc", fileNum))
		corruptedFiles++
		if corruptedFiles >= maxCorruptedFiles {
			break
		}
	}
	if corruptedFiles == 0 {
		t.Fatal("expected at least one sst containing table keys only, got none")
	}

	if err := c.StartE(ctx, crdbNodes); err != nil {
		// Node detected corruption on start and crashed. This is good. No need
		// to run workload; the test is complete.
		return
	}

	{
		m := c.NewMonitor(ctx)
		// Run a workload to try to get the node to notice corruption and crash.
		m.Go(func(ctx context.Context) error {
			_ = c.RunE(ctx, workloadNode,
				fmt.Sprintf("./cockroach workload run tpcc --warehouses=100 --tolerate-errors --duration=%s", 2*time.Minute))
			// Don't return an error from the workload. We want outcome of WaitE to be
			// determined by the monitor noticing that a node died. The workload may
			// also fail, despite --tolerate-errors, if a node crashes too early.
			return nil
		})
		require.Error(t, m.WaitE())
	}

	// Exempt corrupted node from roachtest harness' post-test liveness checks.
	_ = c.WipeE(ctx, t.L(), c.Node(corruptNode))
}

// registerSSTableCorruption adds the sstable-corruption/table test to the
// registry. The test is owned by the storage team and runs
// runSSTableCorruption on a cluster spec built with MakeClusterSpec(3).
func registerSSTableCorruption(r registry.Registry) {
	spec := registry.TestSpec{
		Name:    "sstable-corruption/table",
		Owner:   registry.OwnerStorage,
		Cluster: r.MakeClusterSpec(3),
		// runSSTableCorruption already has the signature Run expects, so it is
		// passed directly as the test body.
		Run: runSSTableCorruption,
	}
	r.Add(spec)
}

0 comments on commit 607db33

Please sign in to comment.