Remove bootstrapping retry config #2301

Merged · 12 commits · Nov 17, 2023
60 changes: 26 additions & 34 deletions chains/manager.go
@@ -174,34 +174,32 @@ type ManagerConfig struct {
StakingBLSKey *bls.SecretKey
TracingEnabled bool
// Must not be used unless [TracingEnabled] is true as this may be nil.
- Tracer trace.Tracer
- Log logging.Logger
- LogFactory logging.Factory
- VMManager vms.Manager // Manage mappings from vm ID --> vm
- BlockAcceptorGroup snow.AcceptorGroup
- TxAcceptorGroup snow.AcceptorGroup
- VertexAcceptorGroup snow.AcceptorGroup
- DB database.Database
- MsgCreator message.OutboundMsgBuilder // message creator, shared with network
- Router router.Router // Routes incoming messages to the appropriate chain
- Net network.Network // Sends consensus messages to other validators
- Validators validators.Manager // Validators validating on this chain
- NodeID ids.NodeID // The ID of this node
- NetworkID uint32 // ID of the network this node is connected to
- PartialSyncPrimaryNetwork bool
- Server server.Server // Handles HTTP API calls
- Keystore keystore.Keystore
- AtomicMemory *atomic.Memory
- AVAXAssetID ids.ID
- XChainID ids.ID // ID of the X-Chain,
- CChainID ids.ID // ID of the C-Chain,
- CriticalChains set.Set[ids.ID] // Chains that can't exit gracefully
- TimeoutManager timeout.Manager // Manages request timeouts when sending messages to other validators
- Health health.Registerer
- RetryBootstrap bool // Should Bootstrap be retried
- RetryBootstrapWarnFrequency int // Max number of times to retry bootstrap before warning the node operator
- SubnetConfigs map[ids.ID]subnets.Config // ID -> SubnetConfig
- ChainConfigs map[string]ChainConfig // alias -> ChainConfig
+ Tracer trace.Tracer
+ Log logging.Logger
+ LogFactory logging.Factory
+ VMManager vms.Manager // Manage mappings from vm ID --> vm
+ BlockAcceptorGroup snow.AcceptorGroup
+ TxAcceptorGroup snow.AcceptorGroup
+ VertexAcceptorGroup snow.AcceptorGroup
+ DB database.Database
+ MsgCreator message.OutboundMsgBuilder // message creator, shared with network
+ Router router.Router // Routes incoming messages to the appropriate chain
+ Net network.Network // Sends consensus messages to other validators
+ Validators validators.Manager // Validators validating on this chain
+ NodeID ids.NodeID // The ID of this node
+ NetworkID uint32 // ID of the network this node is connected to
+ PartialSyncPrimaryNetwork bool
+ Server server.Server // Handles HTTP API calls
+ Keystore keystore.Keystore
+ AtomicMemory *atomic.Memory
+ AVAXAssetID ids.ID
+ XChainID ids.ID // ID of the X-Chain,
+ CChainID ids.ID // ID of the C-Chain,
+ CriticalChains set.Set[ids.ID] // Chains that can't exit gracefully
+ TimeoutManager timeout.Manager // Manages request timeouts when sending messages to other validators
+ Health health.Registerer
+ SubnetConfigs map[ids.ID]subnets.Config // ID -> SubnetConfig
+ ChainConfigs map[string]ChainConfig // alias -> ChainConfig
// ShutdownNodeFunc allows the chain manager to issue a request to shutdown the node
ShutdownNodeFunc func(exitCode int)
MeterVMEnabled bool // Should each VM be wrapped with a MeterVM
@@ -889,8 +887,6 @@ func (m *manager) createAvalancheChain(
Sender: snowmanMessageSender,
BootstrapTracker: sb,
Timer: h,
- RetryBootstrap: m.RetryBootstrap,
- RetryBootstrapWarnFrequency: m.RetryBootstrapWarnFrequency,
AncestorsMaxContainersReceived: m.BootstrapAncestorsMaxContainersReceived,
SharedCfg: &common.SharedConfig{},
},
@@ -941,8 +937,6 @@ func (m *manager) createAvalancheChain(
Sender: avalancheMessageSender,
BootstrapTracker: sb,
Timer: h,
- RetryBootstrap: m.RetryBootstrap,
- RetryBootstrapWarnFrequency: m.RetryBootstrapWarnFrequency,
AncestorsMaxContainersReceived: m.BootstrapAncestorsMaxContainersReceived,
SharedCfg: &common.SharedConfig{},
},
@@ -1244,8 +1238,6 @@ func (m *manager) createSnowmanChain(
Sender: messageSender,
BootstrapTracker: sb,
Timer: h,
- RetryBootstrap: m.RetryBootstrap,
- RetryBootstrapWarnFrequency: m.RetryBootstrapWarnFrequency,
AncestorsMaxContainersReceived: m.BootstrapAncestorsMaxContainersReceived,
SharedCfg: &common.SharedConfig{},
}
2 changes: 0 additions & 2 deletions config/config.go
@@ -521,8 +521,6 @@ func getStateSyncConfig(v *viper.Viper) (node.StateSyncConfig, error) {

func getBootstrapConfig(v *viper.Viper, networkID uint32) (node.BootstrapConfig, error) {
config := node.BootstrapConfig{
- RetryBootstrap: v.GetBool(RetryBootstrapKey),
- RetryBootstrapWarnFrequency: v.GetInt(RetryBootstrapWarnFrequencyKey),
BootstrapBeaconConnectionTimeout: v.GetDuration(BootstrapBeaconConnectionTimeoutKey),
BootstrapMaxTimeGetAncestors: v.GetDuration(BootstrapMaxTimeGetAncestorsKey),
BootstrapAncestorsMaxContainersSent: int(v.GetUint(BootstrapAncestorsMaxContainersSentKey)),
2 changes: 0 additions & 2 deletions config/flags.go
@@ -292,8 +292,6 @@ func addNodeFlags(fs *pflag.FlagSet) {
// TODO: combine "BootstrapIPsKey" and "BootstrapIDsKey" into one flag
fs.String(BootstrapIPsKey, "", "Comma separated list of bootstrap peer ips to connect to. Example: 127.0.0.1:9630,127.0.0.1:9631")
fs.String(BootstrapIDsKey, "", "Comma separated list of bootstrap peer ids to connect to. Example: NodeID-JR4dVmy6ffUGAKCBDkyCbeZbyHQBeDsET,NodeID-8CrVPQZ4VSqgL8zTdvL14G8HqAfrBr4z")
- fs.Bool(RetryBootstrapKey, true, "Specifies whether bootstrap should be retried")
- fs.Int(RetryBootstrapWarnFrequencyKey, 50, "Specifies how many times bootstrap should be retried before warning the operator")
fs.Duration(BootstrapBeaconConnectionTimeoutKey, time.Minute, "Timeout before emitting a warn log when connecting to bootstrapping beacons")
fs.Duration(BootstrapMaxTimeGetAncestorsKey, 50*time.Millisecond, "Max Time to spend fetching a container and its ancestors when responding to a GetAncestors")
fs.Uint(BootstrapAncestorsMaxContainersSentKey, 2000, "Max number of containers in an Ancestors message sent by this node")
2 changes: 0 additions & 2 deletions config/keys.go
@@ -163,8 +163,6 @@ const (
RouterHealthMaxOutstandingRequestsKey = "router-health-max-outstanding-requests"
HealthCheckFreqKey = "health-check-frequency"
HealthCheckAveragerHalflifeKey = "health-check-averager-halflife"
- RetryBootstrapKey = "bootstrap-retry-enabled"
- RetryBootstrapWarnFrequencyKey = "bootstrap-retry-warn-frequency"
PluginDirKey = "plugin-dir"
BootstrapBeaconConnectionTimeoutKey = "bootstrap-beacon-connection-timeout"
BootstrapMaxTimeGetAncestorsKey = "bootstrap-max-time-get-ancestors"
6 changes: 0 additions & 6 deletions node/config.go
@@ -107,12 +107,6 @@ type StateSyncConfig struct {
}

type BootstrapConfig struct {
- // Should Bootstrap be retried
- RetryBootstrap bool `json:"retryBootstrap"`
-
- // Max number of times to retry bootstrap before warning the node operator
- RetryBootstrapWarnFrequency int `json:"retryBootstrapWarnFrequency"`
-
// Timeout before emitting a warn log when connecting to bootstrapping beacons
BootstrapBeaconConnectionTimeout time.Duration `json:"bootstrapBeaconConnectionTimeout"`

2 changes: 0 additions & 2 deletions node/node.go
@@ -1009,8 +1009,6 @@ func (n *Node) initChainManager(avaxAssetID ids.ID) error {
CriticalChains: criticalChains,
TimeoutManager: n.timeoutManager,
Health: n.health,
- RetryBootstrap: n.Config.RetryBootstrap,
- RetryBootstrapWarnFrequency: n.Config.RetryBootstrapWarnFrequency,
ShutdownNodeFunc: n.Shutdown,
MeterVMEnabled: n.Config.MeterVMEnabled,
Metrics: n.MetricsGatherer,
2 changes: 1 addition & 1 deletion snow/engine/avalanche/bootstrap/bootstrapper.go
@@ -626,7 +626,7 @@ func (b *bootstrapper) checkFinish(ctx context.Context) error {
}
if !linearized {
b.Ctx.Log.Debug("checking for stop vertex before finishing bootstrapping")
- return b.Restart(ctx, true)
+ return b.Restart(ctx)
}

// Invariant: edge will only be the stop vertex after its acceptance.
48 changes: 11 additions & 37 deletions snow/engine/common/bootstrapper.go
@@ -38,7 +38,7 @@ type Bootstrapper interface {
AcceptedHandler
Haltable
Startup(context.Context) error
- Restart(ctx context.Context, reset bool) error
+ Restart(ctx context.Context) error
Contributor: It looks like this common bootstrapper is really only used by snowman chains (not avalanche ones anymore). Should we just embed it into the snowman bootstrapper and be done with the common thing? Maybe outside the scope of this PR, but we'd probably simplify the code more aggressively.

Contributor Author: That's the goal!

}

// It collects mechanisms common to both snowman and avalanche bootstrappers
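
As a rough illustration of the suggestion in the thread above, here is a minimal sketch of what folding the common bootstrapper's bookkeeping into the snowman bootstrapper could look like. All names and types are hypothetical stand-ins, not the actual avalanchego declarations.

// Hypothetical sketch only: the snowman bootstrapper absorbs the state that
// currently lives in snow/engine/common/bootstrapper.go, so no separate
// common.Bootstrapper needs to be wired up.
package bootstrap

import "context"

type nodeID = [20]byte // stand-in for ids.NodeID
type id = [32]byte     // stand-in for ids.ID

type snowmanBootstrapper struct {
	// existing snowman bootstrapping state (blocked jobs, fetched blocks, ...) omitted

	// bookkeeping folded in from the common bootstrapper
	pendingSendAcceptedFrontier map[nodeID]struct{}
	failedAcceptedFrontier      map[nodeID]struct{}
	acceptedVotes               map[id]uint64
	acceptedFrontier            []id
	restarted                   bool
}

// Startup clears the bookkeeping above and re-requests accepted frontiers.
func (b *snowmanBootstrapper) Startup(ctx context.Context) error {
	b.failedAcceptedFrontier = map[nodeID]struct{}{}
	b.acceptedVotes = map[id]uint64{}
	// ...sample beacons and send GetAcceptedFrontier messages...
	return nil
}

// Restart marks the run as restarted and starts over, mirroring the
// one-argument Restart(ctx) introduced by this PR.
func (b *snowmanBootstrapper) Restart(ctx context.Context) error {
	b.restarted = true
	return b.Startup(ctx)
}
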
@@ -70,9 +70,6 @@ type bootstrapper struct {
// marked them as accepted
acceptedVotes map[ids.ID]uint64
acceptedFrontier []ids.ID

- // number of times the bootstrap has been attempted
- bootstrapAttempts int
}

func NewCommonBootstrapper(config Config) Bootstrapper {
@@ -161,20 +158,12 @@ func (b *bootstrapper) markAcceptedFrontierReceived(ctx context.Context, nodeID

// fail the bootstrap if the weight is not enough to bootstrap
if float64(totalSampledWeight)-newAlpha < float64(failedBeaconWeight) {
- if b.Config.RetryBootstrap {
- b.Ctx.Log.Debug("restarting bootstrap",
- zap.String("reason", "not enough frontiers received"),
- zap.Int("numBeacons", b.Beacons.Count(b.Ctx.SubnetID)),
- zap.Int("numFailedBootstrappers", b.failedAcceptedFrontier.Len()),
- zap.Int("numBootstrapAttemps", b.bootstrapAttempts),
- )
- return b.Restart(ctx, false)
- }
-
- b.Ctx.Log.Debug("didn't receive enough frontiers",
- zap.Int("numFailedValidators", b.failedAcceptedFrontier.Len()),
- zap.Int("numBootstrapAttempts", b.bootstrapAttempts),
+ b.Ctx.Log.Debug("restarting bootstrap",
+ zap.String("reason", "not enough frontiers received"),
+ zap.Int("numBeacons", b.Beacons.Count(b.Ctx.SubnetID)),
+ zap.Int("numFailedBootstrappers", b.failedAcceptedFrontier.Len()),
+ )
+ return b.Startup(ctx)
}

b.Config.SharedCfg.RequestID++
@@ -253,14 +242,13 @@ func (b *bootstrapper) Accepted(ctx context.Context, nodeID ids.NodeID, requestI
return fmt.Errorf("failed to get total weight of failed beacons for subnet %s: %w", b.Ctx.SubnetID, err)
}
votingStakes := beaconTotalWeight - failedBeaconWeight
- if b.Config.RetryBootstrap && votingStakes < b.Alpha {
+ if votingStakes < b.Alpha {
b.Ctx.Log.Debug("restarting bootstrap",
zap.String("reason", "not enough votes received"),
zap.Int("numBeacons", b.Beacons.Count(b.Ctx.SubnetID)),
zap.Int("numFailedBootstrappers", b.failedAccepted.Len()),
zap.Int("numBootstrapAttempts", b.bootstrapAttempts),
)
- return b.Restart(ctx, false)
+ return b.Startup(ctx)
}
}

@@ -329,7 +317,6 @@ func (b *bootstrapper) Startup(ctx context.Context) error {
b.failedAccepted.Clear()
b.acceptedVotes = make(map[ids.ID]uint64)

- b.bootstrapAttempts++
if b.pendingSendAcceptedFrontier.Len() == 0 {
b.Ctx.Log.Info("bootstrapping skipped",
zap.String("reason", "no provided bootstraps"),
return nil
}

- func (b *bootstrapper) Restart(ctx context.Context, reset bool) error {
Contributor Author: Restart(ctx, false) is replaced with Startup(ctx) and Restart(ctx, true) is replaced with Restart(ctx).

- // resets the attempts when we're pulling blocks/vertices we don't want to
- // fail the bootstrap at that stage
- if reset {
- b.Ctx.Log.Debug("Checking for new frontiers")
-
- b.Config.SharedCfg.Restarted = true
- b.bootstrapAttempts = 0
- }
-
- if b.bootstrapAttempts > 0 && b.bootstrapAttempts%b.RetryBootstrapWarnFrequency == 0 {
- b.Ctx.Log.Debug("check internet connection",
- zap.Int("numBootstrapAttempts", b.bootstrapAttempts),
- )
- }
-
+ func (b *bootstrapper) Restart(ctx context.Context) error {
+ b.Ctx.Log.Debug("Checking for new frontiers")
+ b.Config.SharedCfg.Restarted = true
return b.Startup(ctx)
}
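
To make the mapping in the author's comment above concrete, here is a small hypothetical shim (not part of the PR) whose interface mirrors the Startup/Restart methods shown in this diff:

package common // hypothetical example file, not in the PR

import "context"

// retrier mirrors the two methods kept on the Bootstrapper interface above.
type retrier interface {
	Startup(context.Context) error
	Restart(context.Context) error
}

// restartCompat shows how the old two-argument call sites translate:
// Restart(ctx, true) becomes Restart(ctx), which marks the run as restarted;
// Restart(ctx, false) becomes a plain Startup(ctx), i.e. simply try again.
func restartCompat(ctx context.Context, b retrier, reset bool) error {
	if reset {
		return b.Restart(ctx)
	}
	return b.Startup(ctx)
}
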

6 changes: 0 additions & 6 deletions snow/engine/common/config.go
@@ -23,12 +23,6 @@ type Config struct {
BootstrapTracker BootstrapTracker
Timer Timer

- // Should Bootstrap be retried
- RetryBootstrap bool
-
- // Max number of times to retry bootstrap before warning the node operator
- RetryBootstrapWarnFrequency int
-
// This node will only consider the first [AncestorsMaxContainersReceived]
// containers in an ancestors message it receives.
AncestorsMaxContainersReceived int
6 changes: 3 additions & 3 deletions snow/engine/snowman/bootstrap/bootstrapper.go
@@ -283,7 +283,7 @@ func (b *bootstrapper) Timeout(ctx context.Context) error {
b.awaitingTimeout = false

if !b.Config.BootstrapTracker.IsBootstrapped() {
- return b.Restart(ctx, true)
+ return b.Restart(ctx)
}
b.fetchETA.Set(0)
return b.OnFinished(ctx, b.Config.SharedCfg.RequestID)
@@ -587,8 +587,8 @@ func (b *bootstrapper) checkFinish(ctx context.Context) error {
// Note that executedBlocks < c*previouslyExecuted ( 0 <= c < 1 ) is enforced
// so that the bootstrapping process will terminate even as new blocks are
// being issued.
- if b.Config.RetryBootstrap && executedBlocks > 0 && executedBlocks < previouslyExecuted/2 {
- return b.Restart(ctx, true)
+ if executedBlocks > 0 && executedBlocks < previouslyExecuted/2 {
Contributor Author: This change is why tests needed to be updated (because b.Config.RetryBootstrap was false in a number of tests previously).

+ return b.Restart(ctx)
}

// If there is an additional callback, notify them that this chain has been
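
Regarding the author's note above, a tiny standalone illustration (not engine code) of why the halving check bounds the number of restarts now that it applies unconditionally:

package main

import "fmt"

// shouldRestart mirrors the simplified condition in checkFinish: restart the
// fetch phase only when this round executed some blocks, but fewer than half
// as many as the previous round.
func shouldRestart(executedBlocks, previouslyExecuted uint64) bool {
	return executedBlocks > 0 && executedBlocks < previouslyExecuted/2
}

func main() {
	// Each restart requires the executed count to at least halve, so the
	// rounds shrink geometrically and bootstrapping terminates even while
	// new blocks keep being issued.
	previously := uint64(20000)
	for _, executed := range []uint64{8000, 3000, 1000, 1000} {
		fmt.Printf("executed=%d previously=%d restart=%v\n",
			executed, previously, shouldRestart(executed, previously))
		previously = executed
	}
	// prints restart=true, true, true, false
}
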
29 changes: 5 additions & 24 deletions snow/engine/snowman/syncer/state_syncer.go
@@ -84,9 +84,6 @@ type stateSyncer struct {
// we keep a list of deduplicated height ready for voting
summariesHeights set.Set[uint64]
uniqueSummariesHeights []uint64

- // number of times the state sync has been attempted
- attempts int
}

func New(
@@ -207,15 +204,11 @@ func (ss *stateSyncer) receivedStateSummaryFrontier(ctx context.Context) error {

frontierStake := frontiersTotalWeight - failedBeaconWeight
if float64(frontierStake) < frontierAlpha {
ss.Ctx.Log.Debug("didn't receive enough frontiers",
ss.Ctx.Log.Debug("restarting state sync",
zap.String("reason", "didn't receive enough frontiers"),
zap.Int("numFailedValidators", ss.failedSeeders.Len()),
zap.Int("numStateSyncAttempts", ss.attempts),
)

- if ss.Config.RetryBootstrap {
- ss.Ctx.Log.Debug("restarting state sync")
- return ss.restart(ctx)
- }
+ return ss.startup(ctx)
}

ss.requestID++
@@ -326,14 +319,13 @@ func (ss *stateSyncer) AcceptedStateSummary(ctx context.Context, nodeID ids.Node
return fmt.Errorf("failed to get total weight of state sync beacons for subnet %s: %w", ss.Ctx.SubnetID, err)
}
votingStakes := beaconsTotalWeight - failedVotersWeight
- if ss.Config.RetryBootstrap && votingStakes < ss.Alpha {
+ if votingStakes < ss.Alpha {
ss.Ctx.Log.Debug("restarting state sync",
zap.String("reason", "not enough votes received"),
zap.Int("numBeacons", ss.StateSyncBeacons.Count(ss.Ctx.SubnetID)),
zap.Int("numFailedSyncers", ss.failedVoters.Len()),
zap.Int("numAttempts", ss.attempts),
)
- return ss.restart(ctx)
+ return ss.startup(ctx)
}

ss.Ctx.Log.Info("skipping state sync",
@@ -509,7 +501,6 @@ func (ss *stateSyncer) startup(ctx context.Context) error {
}

// initiate messages exchange
- ss.attempts++
if ss.targetSeeders.Len() == 0 {
ss.Ctx.Log.Info("State syncing skipped due to no provided syncers")
return ss.onDoneStateSyncing(ctx, ss.requestID)
return nil
}

- func (ss *stateSyncer) restart(ctx context.Context) error {
- if ss.attempts > 0 && ss.attempts%ss.RetryBootstrapWarnFrequency == 0 {
- ss.Ctx.Log.Debug("check internet connection",
- zap.Int("numSyncAttempts", ss.attempts),
- )
- }
-
- return ss.startup(ctx)
- }
-
// Ask up to [common.MaxOutstandingBroadcastRequests] state sync validators at a time
// to send their accepted state summary. It is called again until there are
// no more seeders to be reached in the pending set