From cf55cab8a71a01d5ad328f69db1abe812db773ac Mon Sep 17 00:00:00 2001 From: 9547 Date: Fri, 2 Apr 2021 11:11:25 +0800 Subject: [PATCH] cluster: start pd, dm-master sequentially (#1262) --- pkg/cluster/manager/builder.go | 2 +- pkg/cluster/operation/action.go | 20 ++++++++++++++++++++ pkg/cluster/operation/operation.go | 1 + 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pkg/cluster/manager/builder.go b/pkg/cluster/manager/builder.go index 6c01ce3ffc..fcbe520b71 100644 --- a/pkg/cluster/manager/builder.go +++ b/pkg/cluster/manager/builder.go @@ -306,7 +306,7 @@ func buildScaleOutTask( return m.specManager.SaveMeta(name, metadata) }). Func("StartCluster", func(ctx context.Context) error { - return operator.Start(ctx, newPart, operator.Options{OptTimeout: gOpt.OptTimeout}, tlsCfg) + return operator.Start(ctx, newPart, operator.Options{OptTimeout: gOpt.OptTimeout, Operation: operator.ScaleOutOperation}, tlsCfg) }). Parallel(false, refreshConfigTasks...). Parallel(false, buildReloadPromTasks(metadata.GetTopology())...) 
diff --git a/pkg/cluster/operation/action.go b/pkg/cluster/operation/action.go index a2cc9f07f0..f30921594f 100644 --- a/pkg/cluster/operation/action.go +++ b/pkg/cluster/operation/action.go @@ -460,6 +460,14 @@ func StartComponent(ctx context.Context, instances []spec.Instance, options Opti name := instances[0].ComponentName() log.Infof("Starting component %s", name) + // Start instances serially for Raft-related components during scale-out. + // e.g. PD places stricter restrictions on the scale-out process: + // at most one node may be in the peer-join stage at any time. + // ref https://github.com/tikv/pd/blob/d38b36714ccee70480c39e07126e3456b5fb292d/server/join/join.go#L179-L191 + if options.Operation == ScaleOutOperation && (name == spec.ComponentPD || name == spec.ComponentDMMaster) { + return serialStartInstances(ctx, instances, options, tlsCfg) + } + errg, _ := errgroup.WithContext(ctx) for _, ins := range instances { @@ -484,6 +492,18 @@ func StartComponent(ctx context.Context, instances []spec.Instance, options Opti return errg.Wait() } +func serialStartInstances(ctx context.Context, instances []spec.Instance, options Options, tlsCfg *tls.Config) error { + for _, ins := range instances { + if err := ins.PrepareStart(ctx, tlsCfg); err != nil { + return err + } + if err := startInstance(ctx, ins, options.OptTimeout); err != nil { + return err + } + } + return nil +} + // StopMonitored stop BlackboxExporter and NodeExporter func StopMonitored(ctx context.Context, instance spec.Instance, options *spec.MonitoredOptions, timeout uint64) error { ports := map[string]int{ diff --git a/pkg/cluster/operation/operation.go b/pkg/cluster/operation/operation.go index 7a79130cc8..a729dad539 100644 --- a/pkg/cluster/operation/operation.go +++ b/pkg/cluster/operation/operation.go @@ -43,6 +43,7 @@ type Options struct { // Show uptime or not ShowUptime bool + Operation Operation } // Operation represents the type of cluster operation