From 5bb676ed99f35c878a7ba59a19a6c6ec3137c3d5 Mon Sep 17 00:00:00 2001
From: ptrus
Date: Wed, 17 Mar 2021 13:23:11 +0100
Subject: [PATCH] go/roothash/reindexBlocks: return latest round if no rounds
 reindexed

---
 .changelog/3791.bugfix.md                     |  4 ++++
 go/consensus/tendermint/roothash/roothash.go  | 11 +++++++++++
 .../scenario/e2e/runtime/runtime_dynamic.go   | 18 ++++++++++++++++++
 3 files changed, 33 insertions(+)
 create mode 100644 .changelog/3791.bugfix.md

diff --git a/.changelog/3791.bugfix.md b/.changelog/3791.bugfix.md
new file mode 100644
index 00000000000..e1ff171380f
--- /dev/null
+++ b/.changelog/3791.bugfix.md
@@ -0,0 +1,4 @@
+go/roothash/reindexBlocks: return latest known round if no new rounds indexed
+
+This fixes a case where a storage node would not register if restarted while
+synced and there were no new runtime rounds (e.g. the runtime is suspended).

diff --git a/go/consensus/tendermint/roothash/roothash.go b/go/consensus/tendermint/roothash/roothash.go
index 576a4de4e0e..03661e4da3b 100644
--- a/go/consensus/tendermint/roothash/roothash.go
+++ b/go/consensus/tendermint/roothash/roothash.go
@@ -474,6 +474,17 @@ func (sc *serviceClient) reindexBlocks(currentHeight int64, bh api.BlockHistory)
 		}
 	}
 
+	if lastRound == api.RoundInvalid {
+		sc.logger.Debug("no new rounds reindexed, returning latest known round")
+		switch blk, err := bh.GetLatestBlock(sc.ctx); err {
+		case api.ErrNotFound:
+		case nil:
+			lastRound = blk.Header.Round
+		default:
+			return lastRound, fmt.Errorf("failed to get latest block: %w", err)
+		}
+	}
+
 	sc.logger.Debug("block reindex complete",
 		"last_round", lastRound,
 	)

diff --git a/go/oasis-test-runner/scenario/e2e/runtime/runtime_dynamic.go b/go/oasis-test-runner/scenario/e2e/runtime/runtime_dynamic.go
index 9596a7b1074..03962c50594 100644
--- a/go/oasis-test-runner/scenario/e2e/runtime/runtime_dynamic.go
+++ b/go/oasis-test-runner/scenario/e2e/runtime/runtime_dynamic.go
@@ -466,6 +466,24 @@ func (sc *runtimeDynamicImpl) Run(childEnv *env.Env) error { // nolint: gocyclo
 		return err
 	}
 
+	// Restart nodes to test that they will re-register even though
+	// the runtime is suspended.
+	sc.Logger.Info("Restarting storage node to ensure it re-registers")
+	if err = sc.Net.StorageWorkers()[0].Stop(); err != nil {
+		return fmt.Errorf("failed to stop node: %w", err)
+	}
+	if err = sc.Net.StorageWorkers()[0].Start(); err != nil {
+		return fmt.Errorf("failed to start node: %w", err)
+	}
+
+	sc.Logger.Info("Restarting compute node to ensure it re-registers")
+	if err = sc.Net.ComputeWorkers()[0].Stop(); err != nil {
+		return fmt.Errorf("failed to stop node: %w", err)
+	}
+	if err = sc.Net.ComputeWorkers()[0].Start(); err != nil {
+		return fmt.Errorf("failed to start node: %w", err)
+	}
+
 	// Another epoch transition to make sure the runtime keeps being suspended.
 	if err = sc.epochTransition(ctx); err != nil {
 		return err
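
For context, below is a minimal, self-contained sketch of the fallback this patch adds: if the reindex loop finishes without seeing any new runtime rounds, return the latest block already present in the local history instead of `api.RoundInvalid`, so a node that restarts while synced still obtains a usable round. Everything in the sketch (`blockHistory`, `errNotFound`, `memHistory`, the simplified `reindexBlocks` signature) is a stand-in invented for illustration, not the real oasis-core API.

```go
package main

import (
	"errors"
	"fmt"
)

// Stand-ins for api.RoundInvalid and api.ErrNotFound (simplified, not the
// real oasis-core definitions).
const roundInvalid = ^uint64(0)

var errNotFound = errors.New("roothash: block not found")

// block keeps only what the sketch needs: the round number.
type block struct {
	round uint64
}

// blockHistory is a minimal stand-in for the node's local block history.
type blockHistory interface {
	getLatestBlock() (*block, error)
}

// reindexBlocks mimics the patched control flow: rounds is whatever the
// reindex loop produced from consensus history.
func reindexBlocks(bh blockHistory, rounds []uint64) (uint64, error) {
	lastRound := roundInvalid
	for _, r := range rounds {
		lastRound = r // the loop tracks the last reindexed round
	}

	// The fix: no new rounds were reindexed (e.g. the runtime is
	// suspended), so fall back to the latest round the history already
	// knows instead of returning roundInvalid.
	if lastRound == roundInvalid {
		switch blk, err := bh.getLatestBlock(); err {
		case errNotFound:
			// History is genuinely empty; keep roundInvalid.
		case nil:
			lastRound = blk.round
		default:
			return lastRound, fmt.Errorf("failed to get latest block: %w", err)
		}
	}
	return lastRound, nil
}

// memHistory is a trivial in-memory history for exercising the sketch.
type memHistory struct{ latest *block }

func (h *memHistory) getLatestBlock() (*block, error) {
	if h.latest == nil {
		return nil, errNotFound
	}
	return h.latest, nil
}

func main() {
	h := &memHistory{latest: &block{round: 41}}

	// Nothing new to reindex: before the fix this path yielded
	// roundInvalid; with the fallback it yields the known round, 41.
	last, err := reindexBlocks(h, nil)
	fmt.Println(last, err) // 41 <nil>
}
```

The real patch uses the same switch-on-error shape against `bh.GetLatestBlock(sc.ctx)`. Per the changelog entry, the effect is that "synced, but no new rounds" looks like a normal successful reindex to callers, so a restarted storage node can still register while the runtime is suspended; the e2e scenario's node restarts exercise exactly that path.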