e2e/daily-txsource: Enable crash points in daily tests
ptrus committed Nov 10, 2020
1 parent 969bc09 commit a80b664
Showing 9 changed files with 74 additions and 6 deletions.
8 changes: 8 additions & 0 deletions go/oasis-test-runner/oasis/args.go
@@ -11,6 +11,7 @@ import (
"github.com/spf13/viper"

"github.com/oasisprotocol/oasis-core/go/common"
"github.com/oasisprotocol/oasis-core/go/common/crash"
commonGrpc "github.com/oasisprotocol/oasis-core/go/common/grpc"
"github.com/oasisprotocol/oasis-core/go/common/sgx"
"github.com/oasisprotocol/oasis-core/go/consensus/tendermint"
@@ -494,6 +495,13 @@ func (args *argBuilder) appendSeedNodes(seeds []*Seed) *argBuilder {
return args
}

func (args *argBuilder) configureDebugCrashPoints(prob float64) *argBuilder {
args.vec = append(args.vec,
"--"+crash.CfgDefaultCrashPointProbability, fmt.Sprintf("%f", prob),
)
return args
}

func (args *argBuilder) appendNodeMetrics(node *Node) *argBuilder {
args.vec = append(args.vec, []string{
"--" + metrics.CfgMetricsMode, metrics.MetricsModePush,
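Note: the following is a minimal, self-contained sketch (not part of the commit) of the argBuilder pattern used in args.go above, showing how configureDebugCrashPoints turns the probability into a CLI flag for the spawned node. The flag string here is illustrative only; the real name comes from crash.CfgDefaultCrashPointProbability in go/common/crash.

package main

import "fmt"

// argBuilder is a stripped-down stand-in for the builder in
// go/oasis-test-runner/oasis/args.go.
type argBuilder struct {
	vec []string
}

// configureDebugCrashPoints appends the default crash-point probability flag,
// mirroring the method added in this commit (flag name is illustrative).
func (args *argBuilder) configureDebugCrashPoints(prob float64) *argBuilder {
	args.vec = append(args.vec,
		"--debug.crash.default-probability", fmt.Sprintf("%f", prob),
	)
	return args
}

func main() {
	b := (&argBuilder{}).configureDebugCrashPoints(0.0005)
	fmt.Println(b.vec) // [--debug.crash.default-probability 0.000500]
}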
4 changes: 3 additions & 1 deletion go/oasis-test-runner/oasis/compute.go
@@ -104,6 +104,7 @@ func (worker *Compute) startNode() error {
workerRuntimeProvisioner(worker.runtimeProvisioner).
workerRuntimeSGXLoader(worker.net.cfg.RuntimeSGXLoaderBinary).
workerExecutorScheduleCheckTxEnabled().
configureDebugCrashPoints(worker.crashPointsProbability).
appendNetwork(worker.net).
appendSeedNodes(worker.net.seeds).
appendEntity(worker.entity)
@@ -155,10 +156,11 @@ func (net *Network) NewCompute(cfg *ComputeCfg) (*Compute, error) {
dir: computeDir,
termEarlyOk: cfg.AllowEarlyTermination,
termErrorOk: cfg.AllowErrorTermination,
noAutoStart: cfg.NoAutoStart,
crashPointsProbability: cfg.CrashPointsProbability,
disableDefaultLogWatcherHandlerFactories: cfg.DisableDefaultLogWatcherHandlerFactories,
logWatcherHandlerFactories: cfg.LogWatcherHandlerFactories,
consensus: cfg.Consensus,
noAutoStart: cfg.NoAutoStart,
},
entity: cfg.Entity,
runtimeProvisioner: cfg.RuntimeProvisioner,
17 changes: 16 additions & 1 deletion go/oasis-test-runner/oasis/fixture.go
@@ -158,6 +158,8 @@ type ValidatorFixture struct { // nolint: maligned

NoAutoStart bool `json:"no_auto_start,omitempty"`

CrashPointsProbability float64 `json:"crash_points_probability,omitempty"`

Entity int `json:"entity"`

LogWatcherHandlerFactories []log.WatcherHandlerFactory `json:"-"`
@@ -186,6 +188,7 @@ func (f *ValidatorFixture) Create(net *Network) (*Validator, error) {
LogWatcherHandlerFactories: f.LogWatcherHandlerFactories,
Consensus: f.Consensus,
NoAutoStart: f.NoAutoStart,
CrashPointsProbability: f.CrashPointsProbability,
},
Entity: entity,
Sentries: sentries,
@@ -291,6 +294,8 @@ type KeymanagerFixture struct {
// Consensus contains configuration for the consensus backend.
Consensus ConsensusFixture `json:"consensus"`

CrashPointsProbability float64 `json:"crash_points_probability,omitempty"`

LogWatcherHandlerFactories []log.WatcherHandlerFactory `json:"-"`
}

@@ -314,6 +319,7 @@ func (f *KeymanagerFixture) Create(net *Network) (*Keymanager, error) {
AllowEarlyTermination: f.AllowEarlyTermination,
AllowErrorTermination: f.AllowErrorTermination,
LogWatcherHandlerFactories: f.LogWatcherHandlerFactories,
CrashPointsProbability: f.CrashPointsProbability,
Consensus: f.Consensus,
NoAutoStart: f.NoAutoStart,
},
@@ -347,6 +353,8 @@ type StorageWorkerFixture struct { // nolint: maligned
IgnoreApplies bool `json:"ignore_applies,omitempty"`
CheckpointSyncEnabled bool `json:"checkpoint_sync_enabled,omitempty"`

CrashPointsProbability float64 `json:"crash_points_probability,omitempty"`

// Runtimes contains the indexes of the runtimes to enable. Leave
// empty or nil for the default behaviour (i.e. include all runtimes).
Runtimes []int `json:"runtimes,omitempty"`
@@ -363,6 +371,7 @@ func (f *StorageWorkerFixture) Create(net *Network) (*Storage, error) {
NodeCfg: NodeCfg{
AllowEarlyTermination: f.AllowEarlyTermination,
AllowErrorTermination: f.AllowErrorTermination,
CrashPointsProbability: f.CrashPointsProbability,
NoAutoStart: f.NoAutoStart,
LogWatcherHandlerFactories: f.LogWatcherHandlerFactories,
Consensus: f.Consensus,
@@ -372,7 +381,7 @@ func (f *StorageWorkerFixture) Create(net *Network) (*Storage, error) {
SentryIndices: f.Sentries,
CheckpointCheckInterval: f.CheckpointCheckInterval,
IgnoreApplies: f.IgnoreApplies,
// The checkpoint syncing flas is intentionally flipped here.
// The checkpoint syncing flag is intentionally flipped here.
// Syncing should normally be enabled, but is normally disabled in tests.
CheckpointSyncDisabled: !f.CheckpointSyncEnabled,
DisableCertRotation: f.DisableCertRotation,
@@ -394,6 +403,8 @@ type ComputeWorkerFixture struct {
// Consensus contains configuration for the consensus backend.
Consensus ConsensusFixture `json:"consensus"`

CrashPointsProbability float64 `json:"crash_point_probability"`

LogWatcherHandlerFactories []log.WatcherHandlerFactory `json:"-"`

// Runtimes contains the indexes of the runtimes to enable.
@@ -412,6 +423,7 @@ func (f *ComputeWorkerFixture) Create(net *Network) (*Compute, error) {
AllowEarlyTermination: f.AllowEarlyTermination,
AllowErrorTermination: f.AllowErrorTermination,
NoAutoStart: f.NoAutoStart,
CrashPointsProbability: f.CrashPointsProbability,
LogWatcherHandlerFactories: f.LogWatcherHandlerFactories,
Consensus: f.Consensus,
},
@@ -437,6 +449,8 @@ func (f *SeedFixture) Create(net *Network) (*Seed, error) {
type SentryFixture struct {
LogWatcherHandlerFactories []log.WatcherHandlerFactory `json:"-"`

CrashPointsProbability float64 `json:"crash_points_probability,omitempty"`

Validators []int `json:"validators"`
StorageWorkers []int `json:"storage_workers"`
KeymanagerWorkers []int `json:"keymanager_workers"`
@@ -447,6 +461,7 @@ func (f *SentryFixture) Create(net *Network) (*Sentry, error) {
return net.NewSentry(&SentryCfg{
NodeCfg: NodeCfg{
LogWatcherHandlerFactories: f.LogWatcherHandlerFactories,
CrashPointsProbability: f.CrashPointsProbability,
},
ValidatorIndices: f.Validators,
StorageIndices: f.StorageWorkers,
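Note: a small illustration (assumed, not from the commit) of how the new crash_points_probability field serializes when a fixture is written out as JSON; only the crash-point-related fields of ValidatorFixture are reproduced here.

package main

import (
	"encoding/json"
	"fmt"
)

// validatorFixture reproduces just the fields relevant to this example from
// the ValidatorFixture struct shown above.
type validatorFixture struct {
	NoAutoStart            bool    `json:"no_auto_start,omitempty"`
	CrashPointsProbability float64 `json:"crash_points_probability,omitempty"`
	Entity                 int     `json:"entity"`
}

func main() {
	f := validatorFixture{CrashPointsProbability: 0.0005, Entity: 1}
	out, _ := json.Marshal(f)
	fmt.Println(string(out)) // {"crash_points_probability":0.0005,"entity":1}
}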
2 changes: 2 additions & 0 deletions go/oasis-test-runner/oasis/keymanager.go
@@ -278,6 +278,7 @@ func (km *Keymanager) startNode() error {
workerRuntimePath(km.runtime.id, km.runtime.binaries[0]).
workerKeymanagerEnabled().
workerKeymanagerRuntimeID(km.runtime.id).
configureDebugCrashPoints(km.crashPointsProbability).
appendNetwork(km.net).
appendEntity(km.entity)

@@ -342,6 +343,7 @@ func (net *Network) NewKeymanager(cfg *KeymanagerCfg) (*Keymanager, error) {
dir: kmDir,
termEarlyOk: cfg.AllowEarlyTermination,
termErrorOk: cfg.AllowErrorTermination,
crashPointsProbability: cfg.CrashPointsProbability,
disableDefaultLogWatcherHandlerFactories: cfg.DisableDefaultLogWatcherHandlerFactories,
logWatcherHandlerFactories: cfg.LogWatcherHandlerFactories,
consensus: cfg.Consensus,
19 changes: 17 additions & 2 deletions go/oasis-test-runner/oasis/oasis.go
@@ -6,6 +6,7 @@ import (
"crypto"
"crypto/x509"
"encoding/json"
"errors"
"fmt"
"io"
"io/ioutil"
@@ -19,6 +20,7 @@ import (

"github.com/spf13/viper"

"github.com/oasisprotocol/oasis-core/go/common/crash"
"github.com/oasisprotocol/oasis-core/go/common/crypto/drbg"
"github.com/oasisprotocol/oasis-core/go/common/crypto/signature"
fileSigner "github.com/oasisprotocol/oasis-core/go/common/crypto/signature/signers/file"
@@ -85,6 +87,8 @@ type Node struct { // nolint: maligned
isStopping bool
noAutoStart bool

crashPointsProbability float64

disableDefaultLogWatcherHandlerFactories bool
logWatcherHandlerFactories []log.WatcherHandlerFactory

@@ -231,8 +235,9 @@ func (n *Node) SetConsensusStateSync(cfg *ConsensusStateSyncCfg) {

// NodeCfg defines the common node configuration options.
type NodeCfg struct { // nolint: maligned
AllowEarlyTermination bool
AllowErrorTermination bool
AllowEarlyTermination bool
AllowErrorTermination bool
CrashPointsProbability float64

NoAutoStart bool

@@ -855,6 +860,16 @@ func (net *Network) startOasisNode(
}

if err := node.handleExit(cmdErr); err != nil {
var exitErr *exec.ExitError
if errors.As(err, &exitErr) && exitErr.ExitCode() == crash.CrashDefaultExitCode {
// Termination due to crasher. Restart node.
net.logger.Info("Node debug crash point triggered. Restarting...", "node", node.Name)
if err = net.startOasisNode(node, subCmd, extraArgs); err != nil {
net.errCh <- fmt.Errorf("oasis: %s failed restarting node after crash point: %w", node.Name, err)
}
return
}

net.errCh <- fmt.Errorf("oasis: %s node terminated: %w", node.Name, err)
}
}()
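Note: a minimal sketch of the restart-on-crash check added to startOasisNode above. The exit-code value is an assumption made for the example; the real constant is crash.CrashDefaultExitCode in go/common/crash.

package main

import (
	"errors"
	"fmt"
	"os/exec"
)

// crashDefaultExitCode stands in for crash.CrashDefaultExitCode (value assumed).
const crashDefaultExitCode = 47

// crashPointTriggered reports whether a node process exited with the dedicated
// crash exit code, in which case the test runner restarts the node instead of
// treating the exit as a failure.
func crashPointTriggered(err error) bool {
	var exitErr *exec.ExitError
	return errors.As(err, &exitErr) && exitErr.ExitCode() == crashDefaultExitCode
}

func main() {
	// Simulate a node process that hit a crash point.
	err := exec.Command("sh", "-c", "exit 47").Run()
	fmt.Println(crashPointTriggered(err)) // true -> the node would be restarted
}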
2 changes: 2 additions & 0 deletions go/oasis-test-runner/oasis/sentry.go
@@ -79,6 +79,7 @@ func (sentry *Sentry) startNode() error {
tendermintCoreAddress(sentry.consensusPort).
tendermintPrune(sentry.consensus.PruneNumKept).
tendermintRecoverCorruptedWAL(sentry.consensus.TendermintRecoverCorruptedWAL).
configureDebugCrashPoints(sentry.crashPointsProbability).
appendNetwork(sentry.net).
appendSeedNodes(sentry.net.seeds).
internalSocketAddress(sentry.net.validators[0].SocketPath())
@@ -148,6 +149,7 @@ func (net *Network) NewSentry(cfg *SentryCfg) (*Sentry, error) {
Name: sentryName,
net: net,
dir: sentryDir,
crashPointsProbability: cfg.CrashPointsProbability,
disableDefaultLogWatcherHandlerFactories: cfg.DisableDefaultLogWatcherHandlerFactories,
logWatcherHandlerFactories: cfg.LogWatcherHandlerFactories,
},
4 changes: 4 additions & 0 deletions go/oasis-test-runner/oasis/storage.go
@@ -121,6 +121,7 @@ func (worker *Storage) startNode() error {
workerStorageDebugIgnoreApplies(worker.ignoreApplies).
workerStorageDebugDisableCheckpointSync(worker.checkpointSyncDisabled).
workerStorageCheckpointCheckInterval(worker.checkpointCheckInterval).
configureDebugCrashPoints(worker.crashPointsProbability).
appendNetwork(worker.net).
appendEntity(worker.entity)

@@ -192,6 +193,9 @@ func (net *Network) NewStorage(cfg *StorageCfg) (*Storage, error) {
net: net,
dir: storageDir,
noAutoStart: cfg.NoAutoStart,
termEarlyOk: cfg.AllowEarlyTermination,
termErrorOk: cfg.AllowErrorTermination,
crashPointsProbability: cfg.CrashPointsProbability,
disableDefaultLogWatcherHandlerFactories: cfg.DisableDefaultLogWatcherHandlerFactories,
logWatcherHandlerFactories: cfg.LogWatcherHandlerFactories,
consensus: cfg.Consensus,
2 changes: 2 additions & 0 deletions go/oasis-test-runner/oasis/validator.go
@@ -89,6 +89,7 @@ func (val *Validator) startNode() error {
tendermintSubmissionGasPrice(val.consensus.SubmissionGasPrice).
tendermintPrune(val.consensus.PruneNumKept).
tendermintRecoverCorruptedWAL(val.consensus.TendermintRecoverCorruptedWAL).
configureDebugCrashPoints(val.crashPointsProbability).
appendNetwork(val.net).
appendEntity(val.entity)

@@ -134,6 +135,7 @@ func (net *Network) NewValidator(cfg *ValidatorCfg) (*Validator, error) {
dir: valDir,
termEarlyOk: cfg.AllowEarlyTermination,
termErrorOk: cfg.AllowErrorTermination,
crashPointsProbability: cfg.CrashPointsProbability,
disableDefaultLogWatcherHandlerFactories: cfg.DisableDefaultLogWatcherHandlerFactories,
logWatcherHandlerFactories: cfg.LogWatcherHandlerFactories,
consensus: cfg.Consensus,
22 changes: 20 additions & 2 deletions go/oasis-test-runner/scenario/e2e/runtime/txsource.go
@@ -44,6 +44,8 @@ const (
nodeLongRestartDuration = 10 * time.Minute
livenessCheckInterval = 1 * time.Minute
txSourceGasPrice = 1

crashPointProbability = 0.0005
)

// TxSourceMultiShort uses multiple workloads for a short time.
@@ -95,15 +97,16 @@ var TxSourceMulti scenario.Scenario = &txSourceImpl{
consensusPruneDisabledProbability: 0.1,
consensusPruneMinKept: 100,
consensusPruneMaxKept: 1000,
enableCrashPoints: true,
// Nodes getting killed commonly result in corrupted tendermint WAL when the
// node is restarted. Enable automatic corrupted WAL recovery for validator
// nodes.
tendermintRecoverCorruptedWAL: true,
// Use 4 storage nodes so runtime continues to work when one of the nodes
// is shut down.
numStorageNodes: 4,
// In tests with long restarts we want to have 3 worker nodes nodes in the
// runtime executor worker committee. That is so that each published runtime
// In tests with long restarts we want to have 3 worker nodes in the runtime
// executor worker committee. That is so that each published runtime
// transaction will be received by at least one active executor worker.
// In the worst case, 2 nodes can be offline at the same time. Additionally we
// need one backup node and one extra node.
@@ -128,6 +131,8 @@ type txSourceImpl struct { // nolint: maligned

tendermintRecoverCorruptedWAL bool

enableCrashPoints bool

// Configurable number of storage nodes. If running tests with long node
// shutdowns enabled, make sure this is at least `MinWriteReplication+1`,
// so that the runtime continues to work, even if one of the nodes is shut
@@ -373,13 +378,19 @@ func (sc *txSourceImpl) Fixture() (*oasis.NetworkFixture, error) {
// for long period can sync from it.
// Note: validator-0 is also never restarted.
sc.generateConsensusFixture(&f.Validators[i].Consensus, i == 0)
if i > 0 && sc.enableCrashPoints {
f.Validators[i].CrashPointsProbability = crashPointProbability
}
}
// Update all other nodes to use a specific gas price.
for i := range f.Keymanagers {
f.Keymanagers[i].Consensus.SubmissionGasPrice = txSourceGasPrice
// Enable recovery from corrupted WAL.
f.Keymanagers[i].Consensus.TendermintRecoverCorruptedWAL = sc.tendermintRecoverCorruptedWAL
sc.generateConsensusFixture(&f.Keymanagers[i].Consensus, false)
if i > 0 && sc.enableCrashPoints {
f.Keymanagers[i].CrashPointsProbability = crashPointProbability
}
}
for i := range f.StorageWorkers {
f.StorageWorkers[i].Consensus.SubmissionGasPrice = txSourceGasPrice
@@ -388,13 +399,19 @@ func (sc *txSourceImpl) Fixture() (*oasis.NetworkFixture, error) {
sc.generateConsensusFixture(&f.StorageWorkers[i].Consensus, false)
if i > 0 {
f.StorageWorkers[i].CheckpointSyncEnabled = true
if sc.enableCrashPoints {
f.StorageWorkers[i].CrashPointsProbability = crashPointProbability
}
}
}
for i := range f.ComputeWorkers {
f.ComputeWorkers[i].Consensus.SubmissionGasPrice = txSourceGasPrice
// Enable recovery from corrupted WAL.
f.ComputeWorkers[i].Consensus.TendermintRecoverCorruptedWAL = sc.tendermintRecoverCorruptedWAL
sc.generateConsensusFixture(&f.ComputeWorkers[i].Consensus, false)
if i > 0 && sc.enableCrashPoints {
f.ComputeWorkers[i].CrashPointsProbability = crashPointProbability
}
}
for i := range f.ByzantineNodes {
f.ByzantineNodes[i].Consensus.SubmissionGasPrice = txSourceGasPrice
@@ -656,6 +673,7 @@ func (sc *txSourceImpl) Clone() scenario.Scenario {
consensusPruneMinKept: sc.consensusPruneMinKept,
consensusPruneMaxKept: sc.consensusPruneMaxKept,
tendermintRecoverCorruptedWAL: sc.tendermintRecoverCorruptedWAL,
enableCrashPoints: sc.enableCrashPoints,
numStorageNodes: sc.numStorageNodes,
numComputeNodes: sc.numComputeNodes,
seed: sc.seed,
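Note: a self-contained sketch of the crash-point mechanism these daily tests exercise: at named points in the node code the process exits with a dedicated exit code with a small configured probability, and the test runner restarts it. The helper, the point name and the exit code below are illustrative assumptions; the real implementation lives in go/common/crash.

package main

import (
	"fmt"
	"math/rand"
	"os"
)

const crashDefaultExitCode = 47 // assumed value, see go/common/crash

// crashPointProbability matches the value used by the txsource scenario above.
var crashPointProbability = 0.0005

// crashHere exits the process with the crash exit code with the configured
// probability; node code would call something like this at interesting points.
func crashHere(point string) {
	if rand.Float64() < crashPointProbability {
		fmt.Fprintf(os.Stderr, "crash point triggered: %s\n", point)
		os.Exit(crashDefaultExitCode)
	}
}

func main() {
	for i := 0; i < 1000; i++ {
		crashHere("example.worker.step")
	}
	fmt.Println("no crash point triggered this run")
}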
