From 2ef89c9ee405563a48ed5f8d78909448c1783eaa Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Mon, 10 Apr 2023 14:40:25 -0400 Subject: [PATCH] server: (re-)fix data race in server initialization Fixes #91414. Fixes #101010. Fixes #100902. There was a race between registerEnginesForDiskStatsMap and SetPebbleMetricsProvider, the latter making use of a map constructed by the former but appeared in the opposite order in code. Release note: None --- pkg/server/server.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pkg/server/server.go b/pkg/server/server.go index c4892a820042..f8a44844f4db 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -1788,6 +1788,15 @@ func (s *Server) PreStart(ctx context.Context) error { // stores) s.node.waitForAdditionalStoreInit() + // Connect the engines to the disk stats map constructor. This needs to + // wait until after waitForAdditionalStoreInit returns since it realizes on + // wholly initialized stores (it reads the StoreIdentKeys). It also needs + // to come before the call into SetPebbleMetricsProvider, which internally + // uses the disk stats map we're initializing. + if err := s.node.registerEnginesForDiskStatsMap(s.cfg.Stores.Specs, s.engines); err != nil { + return errors.Wrapf(err, "failed to register engines for the disk stats map") + } + // Stores have been initialized, so Node can now provide Pebble metrics. // // Note that all existing stores will be operational before Pebble-level @@ -1798,13 +1807,6 @@ func (s *Server) PreStart(ctx context.Context) error { // to bypass admission control. s.storeGrantCoords.SetPebbleMetricsProvider(ctx, s.node, s.node) - // Connect the engines to the disk stats map constructor. - // This also needs to wait until after `waitForAdditionalStoreInit` returns, - // as the store IDs may not be known until then. - if err := s.node.registerEnginesForDiskStatsMap(s.cfg.Stores.Specs, s.engines); err != nil { - return errors.Wrapf(err, "failed to register engines for the disk stats map") - } - // Once all stores are initialized, check if offline storage recovery // was done prior to start and record any actions appropriately. logPendingLossOfQuorumRecoveryEvents(workersCtx, s.node.stores)