From d2bfa7aa9290c383163aae08890bc6967caa7a69 Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Tue, 28 Sep 2021 15:29:11 -0400 Subject: [PATCH] roachtest: Add sstable-corruption roachtest Adds a new roachtest, sstable-corruption/table, that imports TPCC, then finds sstables that contain table keys (which are replicated) on one node, then calls `dd` to write random bytes into the middle of them. It then checks that the corrupt node either crashes on restart, or crashes after the tpcc workload is run on it. Informs #67568. Release note: None. --- .../roachtest/cluster/monitor_interface.go | 1 + pkg/cmd/roachtest/monitor.go | 5 + pkg/cmd/roachtest/tests/registry.go | 1 + pkg/cmd/roachtest/tests/sstable_corruption.go | 124 ++++++++++++++++++ 4 files changed, 131 insertions(+) create mode 100644 pkg/cmd/roachtest/tests/sstable_corruption.go diff --git a/pkg/cmd/roachtest/cluster/monitor_interface.go b/pkg/cmd/roachtest/cluster/monitor_interface.go index d2072cf78315..25e8207bea23 100644 --- a/pkg/cmd/roachtest/cluster/monitor_interface.go +++ b/pkg/cmd/roachtest/cluster/monitor_interface.go @@ -17,6 +17,7 @@ type Monitor interface { ExpectDeath() ExpectDeaths(count int32) ResetDeaths() + NumExpectedDeaths() int32 Go(fn func(context.Context) error) WaitE() error Wait() diff --git a/pkg/cmd/roachtest/monitor.go b/pkg/cmd/roachtest/monitor.go index 53d5cef2b34b..dfd5d005708b 100644 --- a/pkg/cmd/roachtest/monitor.go +++ b/pkg/cmd/roachtest/monitor.go @@ -78,6 +78,11 @@ func (m *monitorImpl) ResetDeaths() { atomic.StoreInt32(&m.expDeaths, 0) } +// NumExpectedDeaths is the number of expected deaths that have yet to happen.
+func (m *monitorImpl) NumExpectedDeaths() int32 {
+	return atomic.LoadInt32(&m.expDeaths)
+}
+
 var errTestFatal = errors.New("t.Fatal() was called")
 
 func (m *monitorImpl) Go(fn func(context.Context) error) {
diff --git a/pkg/cmd/roachtest/tests/registry.go b/pkg/cmd/roachtest/tests/registry.go
index b206eee6111d..e2a7aae7c29d 100644
--- a/pkg/cmd/roachtest/tests/registry.go
+++ b/pkg/cmd/roachtest/tests/registry.go
@@ -99,6 +99,7 @@ func RegisterTests(r registry.Registry) {
 	registerSequelize(r)
 	registerSQLAlchemy(r)
 	registerSQLSmith(r)
+	registerSstableCorruption(r)
 	registerSyncTest(r)
 	registerSysbench(r)
 	registerTLP(r)
diff --git a/pkg/cmd/roachtest/tests/sstable_corruption.go b/pkg/cmd/roachtest/tests/sstable_corruption.go
new file mode 100644
index 000000000000..bbed7ce11b0d
--- /dev/null
+++ b/pkg/cmd/roachtest/tests/sstable_corruption.go
@@ -0,0 +1,150 @@
+// Copyright 2021 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package tests
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
+)
+
+// runSstableCorruption imports a TPCC fixture, stops the cluster, and uses dd
+// to overwrite a byte range in up to maxCorruptSSTs sstables that the pebble
+// manifest dump lists with table keys on node corruptNode. It then restarts
+// the cluster and passes if either the restart fails or the monitor has no
+// outstanding expected deaths two minutes after a TPCC workload is started.
+func runSstableCorruption(ctx context.Context, t test.Test, c cluster.Cluster) {
+	crdbNodes := c.Range(1, c.Spec().NodeCount)
+	workloadNode := c.Node(1)
+	const corruptNode = 3
+	// maxCorruptSSTs caps how many sstables are overwritten.
+	const maxCorruptSSTs = 6
+
+	t.Status("installing cockroach")
+	c.Put(ctx, t.Cockroach(), "./cockroach", crdbNodes)
+	c.Start(ctx, crdbNodes)
+
+	// We don't really need tpcc, we just need a good amount of data. Enough
+	// to have multiple ranges, and some sstables with only table keys.
+	t.Status("importing tpcc fixture")
+	c.Run(ctx, workloadNode,
+		"./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false")
+
+	m := c.NewMonitor(ctx, crdbNodes)
+	// Buffered so that the single send below can never block, even if the
+	// workload goroutine has already returned because ctx was canceled.
+	signalChan := make(chan bool, 1)
+
+	m.Go(func(ctx context.Context) error {
+		// Wait for a signal from the other goroutine below before running the
+		// workload. This will only be called after the cluster has been restarted
+		// with corrupt sstables on one node.
+		select {
+		case proceed := <-signalChan:
+			if !proceed {
+				return nil
+			}
+		case <-ctx.Done():
+			return ctx.Err()
+		}
+		// The workload's error is deliberately discarded; whether the test
+		// passes is decided by the expected-death check in the goroutine below.
+		_ = c.RunE(ctx, workloadNode, "./cockroach workload run tpcc --warehouses=100 "+
+			fmt.Sprintf("--tolerate-errors --duration=%s", 5*time.Minute))
+		return nil
+	})
+
+	m.Go(func(ctx context.Context) error {
+		// Every cockroach node is stopped below, so expect one death per node.
+		m.ExpectDeaths(int32(c.Spec().NodeCount))
+		c.Stop(ctx, crdbNodes)
+
+		tableSSTs, err := c.RunWithBuffer(ctx, t.L(), c.Node(corruptNode),
+			"./cockroach debug pebble manifest dump {store-dir}/MANIFEST-* | grep -v added | grep -v deleted | grep \"\\[/Table\"")
+		if err != nil {
+			return err
+		}
+
+		// Corrupt up to maxCorruptSSTs SSTs containing table keys. The text
+		// before the first ':' of each line is used as the sstable's file number.
+		corruptedFiles := 0
+		for _, sstLine := range strings.Split(string(tableSSTs), "\n") {
+			sstLine = strings.TrimSpace(sstLine)
+			if sstLine == "" {
+				// strings.Split yields an empty element after a trailing newline;
+				// slicing it at Index's -1 result would panic.
+				continue
+			}
+			firstFileIdx := strings.Index(sstLine, ":")
+			if firstFileIdx < 0 {
+				t.Fatalf("unexpected format for sst line: %s", sstLine)
+			}
+			fileNum := sstLine[:firstFileIdx]
+			if _, err := strconv.Atoi(fileNum); err != nil {
+				t.Fatalf("error when converting %s to int: %s", fileNum, err.Error())
+			}
+
+			t.Status(fmt.Sprintf("corrupting sstable %s on node %d", fileNum, corruptNode))
+			c.Run(ctx, c.Node(corruptNode), fmt.Sprintf("dd if=/dev/urandom of={store-dir}/%s.sst seek=256 count=128 bs=1 conv=notrunc", fileNum))
+			corruptedFiles++
+			if corruptedFiles >= maxCorruptSSTs {
+				break
+			}
+		}
+		// strings.Split never returns an empty slice, so count the files that
+		// were actually corrupted instead of checking the slice length.
+		if corruptedFiles == 0 {
+			t.Fatal("expected at least one sst containing table keys only, got none")
+		}
+
+		m.ExpectDeath()
+		if err := c.StartE(ctx, crdbNodes); err != nil {
+			// Node detected corruption on start and crashed. This is good. No need
+			// to run workload.
+			signalChan <- false
+			return nil
+		}
+		// Start the workload. This should cause the node to crash.
+		signalChan <- true
+		time.Sleep(2 * time.Minute)
+
+		if num := m.NumExpectedDeaths(); num > 0 {
+			t.Fatalf("expected node death to have occurred, still waiting on %d deaths", num)
+		}
+		// Reset deaths and restart the corrupt node as a clean node. This is
+		// necessary for the test to not fail upon cleanup.
+		m.ResetDeaths()
+		_ = c.WipeE(ctx, t.L(), c.Node(corruptNode))
+		c.Start(ctx, c.Node(corruptNode))
+		return nil
+	})
+	m.Wait()
+}
+
+// registerSstableCorruption registers the sstable-corruption/table test,
+// which runs runSstableCorruption on a three-node cluster.
+func registerSstableCorruption(r registry.Registry) {
+	r.Add(registry.TestSpec{
+		Name:    "sstable-corruption/table",
+		Owner:   registry.OwnerStorage,
+		Cluster: r.MakeClusterSpec(3),
+		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+			runSstableCorruption(ctx, t, c)
+		},
+	})
+}