diff --git a/.changelog/5005.bugfix.1.md b/.changelog/5005.bugfix.1.md new file mode 100644 index 00000000000..c3db957acbc --- /dev/null +++ b/.changelog/5005.bugfix.1.md @@ -0,0 +1,5 @@ +go/worker/common: Reorder state determination checks + +Otherwise the shown state would be misleading, e.g. showing that it is +waiting for runtime host being provisioned while it is actually blocked +in initialization like storage sync. diff --git a/.changelog/5005.bugfix.2.md b/.changelog/5005.bugfix.2.md new file mode 100644 index 00000000000..552a10a62b9 --- /dev/null +++ b/.changelog/5005.bugfix.2.md @@ -0,0 +1,7 @@ +go/worker/storage: Fix case when checkpoint sync disabled but forced + +If checkpoint sync is disabled but sync has been forced (e.g. because +the state at genesis is non-empty), we must request to sync the +checkpoint at genesis as otherwise we will jump to a later state which +may not be desired given that checkpoint sync has been explicitly +disabled via config. diff --git a/.changelog/5005.bugfix.3.md b/.changelog/5005.bugfix.3.md new file mode 100644 index 00000000000..f0c9924acf2 --- /dev/null +++ b/.changelog/5005.bugfix.3.md @@ -0,0 +1 @@ +go/storage/mkvs/checkpoint: Exclude initial version when pruning diff --git a/.changelog/5005.bugfix.4.md b/.changelog/5005.bugfix.4.md new file mode 100644 index 00000000000..4b95b23f27b --- /dev/null +++ b/.changelog/5005.bugfix.4.md @@ -0,0 +1 @@ +go/p2p/rpc: Fix multi call dispatch to different peers diff --git a/go/p2p/rpc/client.go b/go/p2p/rpc/client.go index 6e0f42c8382..0f2e79d316a 100644 --- a/go/p2p/rpc/client.go +++ b/go/p2p/rpc/client.go @@ -355,6 +355,7 @@ func (c *client) CallMulti( } var resultChs []channels.SimpleOutChannel for _, peer := range c.GetBestPeers() { + peer := peer // Make sure goroutine below operates on the right instance. if !c.isPeerAcceptable(peer) { continue } diff --git a/go/p2p/rpc/peermgmt.go b/go/p2p/rpc/peermgmt.go index de748982e21..190db110f83 100644 --- a/go/p2p/rpc/peermgmt.go +++ b/go/p2p/rpc/peermgmt.go @@ -119,6 +119,7 @@ func (mgr *peerManager) AddPeer(peerID core.PeerID) { mgr.logger.Debug("added new peer", "peer_id", peerID, + "num_peers", len(mgr.peers), ) } @@ -134,6 +135,7 @@ func (mgr *peerManager) RemovePeer(peerID core.PeerID) { mgr.logger.Debug("removed peer", "peer_id", peerID, + "num_peers", len(mgr.peers), ) } diff --git a/go/storage/mkvs/checkpoint/checkpointer.go b/go/storage/mkvs/checkpoint/checkpointer.go index 1ae9eae8f3e..98245e8a0cc 100644 --- a/go/storage/mkvs/checkpoint/checkpointer.go +++ b/go/storage/mkvs/checkpoint/checkpointer.go @@ -250,7 +250,10 @@ func (c *checkpointer) maybeCheckpoint(ctx context.Context, version uint64, para } } - // Garbage collect old checkpoints. + // Garbage collect old checkpoints, first making sure that genesis checkpoint is excluded. + if len(cpVersions) > 0 && cpVersions[0] == params.InitialVersion { + cpVersions = cpVersions[1:] + } if int(params.NumKept) < len(cpVersions) { c.logger.Info("performing checkpoint garbage collection", "num_checkpoints", len(cpVersions), diff --git a/go/worker/common/committee/node.go b/go/worker/common/committee/node.go index 84ac40c11af..f33fcf36b4f 100644 --- a/go/worker/common/committee/node.go +++ b/go/worker/common/committee/node.go @@ -206,15 +206,15 @@ func (n *Node) getStatusStateLocked() api.StatusState { if atomic.LoadUint32(&n.keymanagerAvailable) == 0 { return api.StatusStateWaitingKeymanager } - if atomic.LoadUint32(&n.hostedRuntimeProvisioned) == 0 { - return api.StatusStateWaitingHostedRuntime - } if atomic.LoadUint32(&n.historyReindexingDone) == 0 { return api.StatusStateWaitingHistoryReindex } if atomic.LoadUint32(&n.workersInitialized) == 0 { return api.StatusStateWaitingWorkersInit } + if atomic.LoadUint32(&n.hostedRuntimeProvisioned) == 0 { + return api.StatusStateWaitingHostedRuntime + } // If resumeCh exists the runtime is suspended (safe to check since the cross node lock should be held). if n.resumeCh != nil { return api.StatusStateRuntimeSuspended diff --git a/go/worker/storage/committee/checkpoint_sync.go b/go/worker/storage/committee/checkpoint_sync.go index cc7fafbde85..39a47e18801 100644 --- a/go/worker/storage/committee/checkpoint_sync.go +++ b/go/worker/storage/committee/checkpoint_sync.go @@ -331,7 +331,7 @@ func (n *Node) checkCheckpointUsable(cp *storageSync.Checkpoint, remainingMask o return false } -func (n *Node) syncCheckpoints(genesisRound uint64) (*blockSummary, error) { +func (n *Node) syncCheckpoints(genesisRound uint64, wantOnlyGenesis bool) (*blockSummary, error) { // Store roots and round info for checkpoints that finished syncing. // Round and namespace info will get overwritten as rounds are skipped // for errors, driven by remainingRoots. @@ -343,6 +343,17 @@ func (n *Node) syncCheckpoints(genesisRound uint64) (*blockSummary, error) { return nil, fmt.Errorf("can't get checkpoint list from peers: %w", err) } + // If we only want the genesis checkpoint, filter it out. + if wantOnlyGenesis && len(cps) > 0 { + var filteredCps []*storageSync.Checkpoint + for _, cp := range cps { + if cp.Root.Version == genesisRound { + filteredCps = append(filteredCps, cp) + } + } + cps = filteredCps + } + // Try all the checkpoints now, from most recent backwards. var ( prevVersion = ^uint64(0) diff --git a/go/worker/storage/committee/node.go b/go/worker/storage/committee/node.go index abd2e040f40..5be09e050b8 100644 --- a/go/worker/storage/committee/node.go +++ b/go/worker/storage/committee/node.go @@ -911,6 +911,11 @@ func (n *Node) worker() { // nolint: gocyclo // // - We haven't synced anything yet and checkpoint sync is not disabled. // + // If checkpoint sync is disabled but sync has been forced (e.g. because the state at genesis + // is non-empty), we must request to sync the checkpoint at genesis as otherwise we will jump + // to a later state which may not be desired given that checkpoint sync has been explicitly + // disabled via config. + // if (isInitialStartup && !n.checkpointSyncCfg.Disabled) || n.checkpointSyncForced { var ( summary *blockSummary @@ -918,7 +923,7 @@ func (n *Node) worker() { // nolint: gocyclo ) CheckpointSyncRetry: for { - summary, err = n.syncCheckpoints(genesisBlock.Header.Round) + summary, err = n.syncCheckpoints(genesisBlock.Header.Round, n.checkpointSyncCfg.Disabled) if err == nil { break }