Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

go/consensus/tendermint: sync-worker additionally check block timestamps #2873

Merged
merged 2 commits into from
Apr 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changelog/2873.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
go/consensus/tendermint: sync-worker additionally check block timestamps

Sync-worker relied on Tendermint fast-sync to determine if the node is still
catching up. This PR adds an additional condition that the latest block is not
older than 1 minute. This prevents cases where node would report as caught up
after stopping fast-sync, but before it has actually caught up.
35 changes: 31 additions & 4 deletions go/consensus/tendermint/tendermint.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,12 @@ const (
// StateDir is the name of the directory located inside the node's data
// directory which contains the tendermint state.
StateDir = "tendermint"

// Time difference threshold used when considering if node is done with
// initial syncing. If difference is greater than the specified threshold
// the node is considered not yet synced.
// NOTE: this is only used during the initial sync.
syncWorkerLastBlockTimeDiffThreshold = 1 * time.Minute
)

var (
Expand Down Expand Up @@ -1195,17 +1201,38 @@ func (t *tendermintService) syncWorker() {
case <-t.node.Quit():
return
case <-time.After(1 * time.Second):
isSyncing, err := checkSyncFn()
isFastSyncing, err := checkSyncFn()
if err != nil {
t.Logger.Error("Failed to poll FastSync",
"err", err,
)
return
}
if !isSyncing {
if !isFastSyncing {
t.Logger.Info("Tendermint Node finished fast-sync")
close(t.syncedCh)
return

// Check latest block time.
tmBlock, err := t.GetTendermintBlock(t.ctx, consensusAPI.HeightLatest)
if err != nil {
t.Logger.Error("Failed to get tendermint block",
"err", err,
)
return
}

now := time.Now()
// No committed blocks or latest block within threshold.
if tmBlock == nil || now.Sub(tmBlock.Header.Time) < syncWorkerLastBlockTimeDiffThreshold {
t.Logger.Info("Tendermint Node finished initial sync")
close(t.syncedCh)
ptrus marked this conversation as resolved.
Show resolved Hide resolved
return
}

t.Logger.Debug("Node still syncing",
"currentTime", now,
"latestBlockTime", tmBlock.Time,
"diff", now.Sub(tmBlock.Time),
)
}
}
}
Expand Down
73 changes: 73 additions & 0 deletions go/oasis-test-runner/scenario/e2e/late_start.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package e2e

import (
"time"

"github.com/oasislabs/oasis-core/go/oasis-test-runner/env"
"github.com/oasislabs/oasis-core/go/oasis-test-runner/oasis"
"github.com/oasislabs/oasis-core/go/oasis-test-runner/scenario"
)

// LateStart is the scenario in which the client node is started only after
// the rest of the network has already been running for a while.
var LateStart scenario.Scenario = newLateStartImpl("late-start", "simple-keyvalue-client", nil)

// lateStartInitialWait is how long the network runs on its own before the
// client node is created and started.
const lateStartInitialWait time.Duration = 2 * time.Minute

// lateStartImpl is the late-start scenario. It embeds basicImpl and overrides
// Fixture and Run.
type lateStartImpl struct{ basicImpl }

// newLateStartImpl builds a late-start scenario on top of a basic scenario
// created with the given name, client binary and client arguments.
func newLateStartImpl(name, clientBinary string, clientArgs []string) scenario.Scenario {
	base := newBasicImpl(name, clientBinary, clientArgs)
	return &lateStartImpl{basicImpl: *base}
}

// Fixture returns the basic scenario's network fixture with the list of
// client nodes replaced by an empty one, so the network starts without a
// client. Any error from the basic fixture is returned unchanged.
func (sc *lateStartImpl) Fixture() (*oasis.NetworkFixture, error) {
	fixture, err := sc.basicImpl.Fixture()
	if err != nil {
		return nil, err
	}

	// No client nodes at network start; Run creates one later.
	fixture.Clients = []oasis.ClientFixture{}
	return fixture, nil
}

// Run starts the network, sleeps for lateStartInitialWait, and only then
// creates and starts a client node followed by the client binary.
//
// It returns the first error hit while starting the network, the client node
// or the client binary; otherwise it returns whatever sc.wait reports.
func (sc *lateStartImpl) Run(childEnv *env.Env) error {
	// Start the network.
	if err := sc.net.Start(); err != nil {
		return err
	}

	sc.logger.Info("Waiting before starting the client node",
		"wait_for", lateStartInitialWait,
	)
	time.Sleep(lateStartInitialWait)

	// Bring up the client node now that the network has been running for a
	// while.
	sc.logger.Info("Starting the client node")
	clientFixture := &oasis.ClientFixture{}
	client, err := clientFixture.Create(sc.net)
	if err != nil {
		return err
	}
	if err = client.Start(); err != nil {
		return err
	}

	sc.logger.Info("Starting the basic client")
	cmd, err := startClient(childEnv, sc.net, resolveClientBinary(sc.clientBinary), sc.clientArgs)
	if err != nil {
		return err
	}

	// Buffered with capacity 1 so the goroutine below can always deliver the
	// client's exit status and terminate, even if sc.wait returns without
	// receiving from the channel. An unbuffered channel would leave the
	// goroutine blocked on the send forever in that case.
	clientErrCh := make(chan error, 1)
	go func() {
		clientErrCh <- cmd.Wait()
	}()

	return sc.wait(childEnv, cmd, clientErrCh)
}
2 changes: 2 additions & 0 deletions go/oasis-test-runner/test-runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ func main() {
_ = cmd.Register(e2e.NodeUpgradeCancel)
// Debonding entries from genesis test.
_ = cmd.Register(e2e.Debond)
// Late start test.
_ = cmd.Register(e2e.LateStart)

// Register the remote signer test cases.
rootCmd.Flags().AddFlagSet(remotesigner.Flags)
Expand Down