diff --git a/cmd/cmd.go b/cmd/cmd.go index 3e5a7dba3..f2328df01 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -357,6 +357,9 @@ func (c Cmd) Execute() (cmdErr error) { case *CloudCheckOpts: return NewCloudCheckCmd(c.deployment(), deps.UI).Run(*opts) + case *CreateRecoveryPlanOpts: + return NewCreateRecoveryPlanCmd(c.deployment(), deps.UI, deps.FS).Run(*opts) + case *CleanUpOpts: return NewCleanUpCmd(deps.UI, c.director()).Run(*opts) diff --git a/cmd/create_recovery_plan.go b/cmd/create_recovery_plan.go new file mode 100644 index 000000000..396e36c3e --- /dev/null +++ b/cmd/create_recovery_plan.go @@ -0,0 +1,213 @@ +package cmd + +import ( + "fmt" + "sort" + + "gopkg.in/yaml.v2" + + bosherr "github.com/cloudfoundry/bosh-utils/errors" + boshsys "github.com/cloudfoundry/bosh-utils/system" + + boshdir "github.com/cloudfoundry/bosh-cli/v7/director" + boshui "github.com/cloudfoundry/bosh-cli/v7/ui" + boshtbl "github.com/cloudfoundry/bosh-cli/v7/ui/table" + + . "github.com/cloudfoundry/bosh-cli/v7/cmd/opts" +) + +type InstanceGroupPlan struct { + Name string `yaml:"name"` + MaxInFlightOverride string `yaml:"max_in_flight_override,omitempty"` + PlannedResolutions map[string]string `yaml:"planned_resolutions"` +} + +type RecoveryPlan struct { + InstanceGroupsPlan []InstanceGroupPlan `yaml:"instance_groups_plan"` +} + +type CreateRecoveryPlanCmd struct { + deployment boshdir.Deployment + ui boshui.UI + fs boshsys.FileSystem +} + +func NewCreateRecoveryPlanCmd(deployment boshdir.Deployment, ui boshui.UI, fs boshsys.FileSystem) CreateRecoveryPlanCmd { + return CreateRecoveryPlanCmd{deployment: deployment, ui: ui, fs: fs} +} + +func (c CreateRecoveryPlanCmd) Run(opts CreateRecoveryPlanOpts) error { + problemsByInstanceGroup, err := c.getProblemsByInstanceGroup() + if err != nil { + return err + } + + if len(problemsByInstanceGroup) == 0 { + c.ui.PrintLinef("No problems found\n") + return nil + } + + maxInFlightByInstanceGroup, err := c.getMaxInFlightByInstanceGroup() + if err != nil { + return err + } + + var plan RecoveryPlan + for _, instanceGroup := range sortedMapKeys(problemsByInstanceGroup) { + c.ui.PrintLinef("Instance Group '%s'\n", instanceGroup) + + instanceGroupResolutions, err := c.processProblemsByType(problemsByInstanceGroup[instanceGroup]) + if err != nil { + return err + } + + instanceGroupCurrentMaxInFlight := maxInFlightByInstanceGroup[instanceGroup] + var instanceGroupMaxInFlightOverride string + if c.ui.AskForConfirmationWithLabel( + fmt.Sprintf("Override current max_in_flight value of '%s'?", instanceGroupCurrentMaxInFlight), + ) == nil { + instanceGroupMaxInFlightOverride, err = c.ui.AskForTextWithDefaultValue( + fmt.Sprintf("max_in_flight override for '%s'", instanceGroup), + instanceGroupCurrentMaxInFlight, + ) + if err != nil { + return err + } + } + + plan.InstanceGroupsPlan = append(plan.InstanceGroupsPlan, InstanceGroupPlan{ + Name: instanceGroup, + MaxInFlightOverride: instanceGroupMaxInFlightOverride, + PlannedResolutions: instanceGroupResolutions, + }) + } + + bytes, err := yaml.Marshal(plan) + if err != nil { + return err + } + + return c.fs.WriteFile(opts.Args.RecoveryPlan.ExpandedPath, bytes) +} + +type updateInstanceGroup struct { + Name string `yaml:"name"` + Update map[string]interface{} `yaml:"update"` +} + +type updateManifest struct { + InstanceGroups []updateInstanceGroup `yaml:"instance_groups"` + Update map[string]interface{} `yaml:"update"` +} + +func (c CreateRecoveryPlanCmd) getMaxInFlightByInstanceGroup() (map[string]string, error) { + rawManifest, err := c.deployment.Manifest() + if err != nil { + return nil, err + } + + var updateManifest updateManifest + err = yaml.Unmarshal([]byte(rawManifest), &updateManifest) + if err != nil { + return nil, err + } + + globalMaxInFlight := updateManifest.Update["max_in_flight"] + flightMap := make(map[string]string) + for _, instanceGroup := range updateManifest.InstanceGroups { + groupMaxInFlight := instanceGroup.Update["max_in_flight"] + if groupMaxInFlight == nil { + groupMaxInFlight = globalMaxInFlight + } + flightMap[instanceGroup.Name] = fmt.Sprintf("%v", groupMaxInFlight) + } + + return flightMap, nil +} + +func sortedMapKeys(problemMap map[string][]boshdir.Problem) []string { + var keys []string + for k := range problemMap { + keys = append(keys, k) + } + sort.Strings(keys) + + return keys +} + +func (c CreateRecoveryPlanCmd) processProblemsByType(problems []boshdir.Problem) (map[string]string, error) { + problemsByType := mapProblemsByTrait(problems, func(p boshdir.Problem) string { return p.Type }) + + resolutions := make(map[string]string) + for _, problemType := range sortedMapKeys(problemsByType) { + problemsForType := problemsByType[problemType] + c.printProblemTable(problemType, problemsForType) + + var opts []string + for _, res := range problemsForType[0].Resolutions { + opts = append(opts, res.Plan) + } + + chosenIndex, err := c.ui.AskForChoice(problemType, opts) + if err != nil { + return nil, err + } + + resolutions[problemType] = *problemsForType[0].Resolutions[chosenIndex].Name + } + + return resolutions, nil +} + +func (c CreateRecoveryPlanCmd) printProblemTable(problemType string, problemsForType []boshdir.Problem) { + table := boshtbl.Table{ + Title: fmt.Sprintf("Problem type: %s", problemType), + Content: fmt.Sprintf("%s problems", problemType), + Header: []boshtbl.Header{ + boshtbl.NewHeader("#"), + boshtbl.NewHeader("Description"), + }, + SortBy: []boshtbl.ColumnSort{{Column: 0, Asc: true}}, + } + + for _, p := range problemsForType { + table.Rows = append(table.Rows, []boshtbl.Value{ + boshtbl.NewValueInt(p.ID), + boshtbl.NewValueString(p.Description), + }) + } + + c.ui.PrintTable(table) +} + +func (c CreateRecoveryPlanCmd) getProblemsByInstanceGroup() (map[string][]boshdir.Problem, error) { + problems, err := c.deployment.ScanForProblems() + if err != nil { + return nil, err + } + + if anyProblemsHaveNoInstanceGroups(problems) { + return nil, bosherr.Error("Director does not support this command. Try 'bosh cloud-check' instead") + } + + return mapProblemsByTrait(problems, func(p boshdir.Problem) string { return p.InstanceGroup }), nil +} + +func anyProblemsHaveNoInstanceGroups(problems []boshdir.Problem) bool { + for _, p := range problems { + if p.InstanceGroup == "" { + return true + } + } + + return false +} + +func mapProblemsByTrait(problems []boshdir.Problem, traitFunc func(p boshdir.Problem) string) map[string][]boshdir.Problem { + probMap := make(map[string][]boshdir.Problem) + for _, p := range problems { + probMap[traitFunc(p)] = append(probMap[traitFunc(p)], p) + } + + return probMap +} diff --git a/cmd/create_recovery_plan_test.go b/cmd/create_recovery_plan_test.go new file mode 100644 index 000000000..d0e6b06dd --- /dev/null +++ b/cmd/create_recovery_plan_test.go @@ -0,0 +1,306 @@ +package cmd_test + +import ( + "errors" + + fakesys "github.com/cloudfoundry/bosh-utils/system/fakes" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "gopkg.in/yaml.v2" + + . "github.com/cloudfoundry/bosh-cli/v7/cmd" + . "github.com/cloudfoundry/bosh-cli/v7/cmd/opts" + boshdir "github.com/cloudfoundry/bosh-cli/v7/director" + fakedir "github.com/cloudfoundry/bosh-cli/v7/director/directorfakes" + fakeui "github.com/cloudfoundry/bosh-cli/v7/ui/fakes" + boshtbl "github.com/cloudfoundry/bosh-cli/v7/ui/table" +) + +func createResolution(name, plan string) boshdir.ProblemResolution { + return boshdir.ProblemResolution{ + Name: &name, + Plan: plan, + } +} + +var _ = Describe("CreateRecoveryPlanCmd", func() { + skipResolution := createResolution("ignore", "Skip for now") + recreateResolution := createResolution("recreate_vm", "Recreate VM") + rebootResolution := createResolution("reboot_vm", "Reboot VM") + deleteVmReferenceResolution := createResolution("delete_vm_reference", "Delete VM reference") + deleteDiskReferenceResolution := createResolution("delete_disk_reference", "Delete disk reference (DANGEROUS!)") + reattachDiskResolution := createResolution("reattach_disk", "Reattach disk to instance") + reattachDiskAndRebootResolution := createResolution("reattach_disk_and_reboot", "Reattach disk and reboot instance") + + var ( + deployment *fakedir.FakeDeployment + ui *fakeui.FakeUI + fakeFS *fakesys.FakeFileSystem + command CreateRecoveryPlanCmd + ) + + BeforeEach(func() { + deployment = &fakedir.FakeDeployment{} + ui = &fakeui.FakeUI{} + fakeFS = fakesys.NewFakeFileSystem() + command = NewCreateRecoveryPlanCmd(deployment, ui, fakeFS) + }) + + Describe("Run", func() { + var ( + opts CreateRecoveryPlanOpts + severalProbs []boshdir.Problem + ) + + BeforeEach(func() { + opts = CreateRecoveryPlanOpts{ + Args: CreateRecoveryPlanArgs{ + RecoveryPlan: FileArg{ + ExpandedPath: "/tmp/foo.yml", + FS: fakeFS, + }, + }, + } + + severalProbs = []boshdir.Problem{ + { + ID: 3, + + Type: "unresponsive_agent", + Description: "problem1-desc", + InstanceGroup: "diego_cell", + + Resolutions: []boshdir.ProblemResolution{ + skipResolution, + recreateResolution, + deleteVmReferenceResolution, + }, + }, + { + ID: 4, + + Type: "missing_vm", + Description: "problem2-desc", + InstanceGroup: "router", + + Resolutions: []boshdir.ProblemResolution{ + skipResolution, + recreateResolution, + rebootResolution, + deleteDiskReferenceResolution, + }, + }, + { + ID: 5, + + Type: "mount_info_mismatch", + Description: "problem3-desc", + InstanceGroup: "router", + + Resolutions: []boshdir.ProblemResolution{ + skipResolution, + reattachDiskResolution, + reattachDiskAndRebootResolution, + }, + }, + } + }) + + act := func() error { return command.Run(opts) } + + Context("scanning for problems failed", func() { + BeforeEach(func() { + deployment.ScanForProblemsReturns(nil, errors.New("fake-err")) + }) + + It("returns error", func() { + err := act() + Expect(err).To(MatchError(ContainSubstring("fake-err"))) + }) + }) + + Context("no problems are found", func() { + BeforeEach(func() { + deployment.ScanForProblemsReturns([]boshdir.Problem{}, nil) + }) + + It("tells the user", func() { + err := act() + Expect(err).ToNot(HaveOccurred()) + + Expect(ui.Said).To(ContainElement("No problems found\n")) + }) + + It("does not ask for confirmation or with choices", func() { + err := act() + Expect(err).ToNot(HaveOccurred()) + + Expect(ui.AskedChoiceCalled).To(BeFalse()) + Expect(ui.AskedConfirmationCalled).To(BeFalse()) + }) + + It("does not write a file", func() { + err := act() + Expect(err).ToNot(HaveOccurred()) + + Expect(fakeFS.WriteFileCallCount).To(BeZero()) + }) + }) + + Context("problems are found", func() { + BeforeEach(func() { + deployment.ScanForProblemsReturns(severalProbs, nil) + ui.AskedChoiceChosens = []int{0, 1, 2} + ui.AskedChoiceErrs = []error{nil, nil, nil} + ui.AskedConfirmationErr = nil + ui.AskedText = []fakeui.Answer{ + {Text: "10", Error: nil}, + {Text: "50%", Error: nil}, + } + }) + + It("shows problems by instance group and type", func() { + err := act() + Expect(err).ToNot(HaveOccurred()) + + Expect(ui.Said).To(ContainElements( + "Instance Group 'router'\n", + "Instance Group 'diego_cell'\n", + )) + Expect(ui.Tables).To( + ContainElements( + boshtbl.Table{ + Title: "Problem type: unresponsive_agent", + Content: "unresponsive_agent problems", + + Header: []boshtbl.Header{ + boshtbl.NewHeader("#"), + boshtbl.NewHeader("Description"), + }, + + SortBy: []boshtbl.ColumnSort{{Column: 0, Asc: true}}, + + Rows: [][]boshtbl.Value{ + { + boshtbl.NewValueInt(3), + boshtbl.NewValueString("problem1-desc"), + }, + }, + }, + boshtbl.Table{ + Title: "Problem type: missing_vm", + Content: "missing_vm problems", + + Header: []boshtbl.Header{ + boshtbl.NewHeader("#"), + boshtbl.NewHeader("Description"), + }, + + SortBy: []boshtbl.ColumnSort{{Column: 0, Asc: true}}, + + Rows: [][]boshtbl.Value{ + { + boshtbl.NewValueInt(4), + boshtbl.NewValueString("problem2-desc"), + }, + }, + }, + boshtbl.Table{ + Title: "Problem type: mount_info_mismatch", + Content: "mount_info_mismatch problems", + + Header: []boshtbl.Header{ + boshtbl.NewHeader("#"), + boshtbl.NewHeader("Description"), + }, + + SortBy: []boshtbl.ColumnSort{{Column: 0, Asc: true}}, + + Rows: [][]boshtbl.Value{ + { + boshtbl.NewValueInt(5), + boshtbl.NewValueString("problem3-desc"), + }, + }, + }, + ), + ) + }) + + It("writes a recovery plan based on answers", func() { + err := act() + Expect(err).ToNot(HaveOccurred()) + + Expect(ui.AskedChoiceCalled).To(BeTrue()) + + Expect(fakeFS.WriteFileCallCount).To(Equal(1)) + Expect(fakeFS.FileExists("/tmp/foo.yml")).To(BeTrue()) + bytes, err := fakeFS.ReadFile("/tmp/foo.yml") + Expect(err).ToNot(HaveOccurred()) + + var actualPlan RecoveryPlan + Expect(yaml.Unmarshal(bytes, &actualPlan)).ToNot(HaveOccurred()) + + Expect(actualPlan.InstanceGroupsPlan).To(HaveLen(2)) + + Expect(actualPlan.InstanceGroupsPlan[0].Name).To(Equal("diego_cell")) + Expect(actualPlan.InstanceGroupsPlan[0].MaxInFlightOverride).To(Equal("10")) + Expect(actualPlan.InstanceGroupsPlan[0].PlannedResolutions).To(HaveLen(1)) + Expect(actualPlan.InstanceGroupsPlan[0].PlannedResolutions).To(HaveKeyWithValue("unresponsive_agent", *skipResolution.Name)) + + Expect(actualPlan.InstanceGroupsPlan[1].Name).To(Equal("router")) + Expect(actualPlan.InstanceGroupsPlan[1].MaxInFlightOverride).To(Equal("50%")) + Expect(actualPlan.InstanceGroupsPlan[1].PlannedResolutions).To(HaveLen(2)) + Expect(actualPlan.InstanceGroupsPlan[1].PlannedResolutions).To(HaveKeyWithValue("missing_vm", *recreateResolution.Name)) + Expect(actualPlan.InstanceGroupsPlan[1].PlannedResolutions).To(HaveKeyWithValue("mount_info_mismatch", *reattachDiskAndRebootResolution.Name)) + }) + + It("returns an error if asking fails", func() { + ui.AskedChoiceErrs = []error{nil, errors.New("fake-err"), nil} + + err := act() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("fake-err")) + }) + + It("does not override max_in_flight if not confirmed", func() { + ui.AskedConfirmationErr = errors.New("fake-err") + + err := act() + Expect(err).ToNot(HaveOccurred()) + + Expect(fakeFS.WriteFileCallCount).To(Equal(1)) + Expect(fakeFS.FileExists("/tmp/foo.yml")).To(BeTrue()) + bytes, err := fakeFS.ReadFile("/tmp/foo.yml") + Expect(err).ToNot(HaveOccurred()) + + var actualPlan RecoveryPlan + Expect(yaml.Unmarshal(bytes, &actualPlan)).ToNot(HaveOccurred()) + + Expect(actualPlan.InstanceGroupsPlan).To(HaveLen(2)) + + Expect(actualPlan.InstanceGroupsPlan[0].Name).To(Equal("diego_cell")) + Expect(actualPlan.InstanceGroupsPlan[0].MaxInFlightOverride).To(BeEmpty()) + + Expect(actualPlan.InstanceGroupsPlan[1].Name).To(Equal("router")) + Expect(actualPlan.InstanceGroupsPlan[1].MaxInFlightOverride).To(BeEmpty()) + }) + + Context("director does not return instance group", func() { + BeforeEach(func() { + grouplessProbs := severalProbs + for i := range grouplessProbs { + grouplessProbs[i].InstanceGroup = "" + } + + deployment.ScanForProblemsReturns(grouplessProbs, nil) + }) + + It("informs the user and exits", func() { + err := act() + Expect(err).To(MatchError(ContainSubstring("does not support"))) + }) + }) + }) + }) +}) diff --git a/cmd/opts/opts.go b/cmd/opts/opts.go index 4c7c9a406..c5b2aacf7 100644 --- a/cmd/opts/opts.go +++ b/cmd/opts/opts.go @@ -137,6 +137,7 @@ type BoshOpts struct { Ignore IgnoreOpts `command:"ignore" description:"Ignore an instance"` Unignore UnignoreOpts `command:"unignore" description:"Unignore an instance"` CloudCheck CloudCheckOpts `command:"cloud-check" alias:"cck" alias:"cloudcheck" description:"Cloud consistency check and interactive repair"` //nolint:staticcheck + CreateRecoveryPlan CreateRecoveryPlanOpts `command:"create-recovery-plan" description:"Interactively generate a recovery plan for disaster repair"` OrphanedVMs OrphanedVMsOpts `command:"orphaned-vms" description:"List all the orphaned VMs in all deployments"` // Instance management @@ -816,6 +817,15 @@ type CloudCheckOpts struct { cmd } +type CreateRecoveryPlanOpts struct { + Args CreateRecoveryPlanArgs `positional-args:"true" required:"true"` + cmd +} + +type CreateRecoveryPlanArgs struct { + RecoveryPlan FileArg `positional-arg-name:"PATH" description:"Create recovery plan file at path"` +} + type OrphanedVMsOpts struct { cmd } diff --git a/cmd/opts/opts_test.go b/cmd/opts/opts_test.go index 13f202902..5afeb5e9c 100644 --- a/cmd/opts/opts_test.go +++ b/cmd/opts/opts_test.go @@ -2455,6 +2455,38 @@ var _ = Describe("Opts", func() { }) }) + Describe("CreateRecoveryPlanOpts", func() { + var opts *CreateRecoveryPlanOpts + + BeforeEach(func() { + opts = &CreateRecoveryPlanOpts{} + }) + + Describe("Args", func() { + It("contains desired values", func() { + Expect(getStructTagForName("Args", opts)).To(Equal( + `positional-args:"true" required:"true"`, + )) + }) + }) + }) + + Describe("CreateRecoveryPlanArgs", func() { + var opts *CreateRecoveryPlanArgs + + BeforeEach(func() { + opts = &CreateRecoveryPlanArgs{} + }) + + Describe("RecoveryPlan", func() { + It("contains desired values", func() { + Expect(getStructTagForName("RecoveryPlan", opts)).To(Equal( + `positional-arg-name:"PATH" description:"Create recovery plan file at path"`, + )) + }) + }) + }) + Describe("UpdateResurrectionOpts", func() { var opts *UpdateResurrectionOpts diff --git a/director/problems.go b/director/problems.go index f52392079..9d2537634 100644 --- a/director/problems.go +++ b/director/problems.go @@ -12,8 +12,9 @@ import ( type Problem struct { ID int // e.g. 4 - Type string // e.g. "unresponsive_agent" - Description string // e.g. "api/1 (5efd2cb8-d73b-4e45-6df4-58f5dd5ec2ec) is not responding" + Type string // e.g. "unresponsive_agent" + Description string // e.g. "api/1 (5efd2cb8-d73b-4e45-6df4-58f5dd5ec2ec) is not responding" + InstanceGroup string `json:"instance_group"` // e.g. "database" Data interface{} Resolutions []ProblemResolution @@ -33,8 +34,8 @@ type ProblemResolution struct { } type ProblemAnswer struct { - ProblemID int - Resolution ProblemResolution + ProblemID int `yaml:"problem_id"` + Resolution ProblemResolution `yaml:"resolution"` } func (d DeploymentImpl) ScanForProblems() ([]Problem, error) { diff --git a/director/problems_test.go b/director/problems_test.go index 95633a568..6668df010 100644 --- a/director/problems_test.go +++ b/director/problems_test.go @@ -52,6 +52,7 @@ var _ = Describe("Director", func() { "id": 4, "type": "unresponsive_agent", "description": "desc1", + "instance_group": "diego_cell", "resolutions": [ {"name": "Skip for now", "plan": "ignore"}, {"name": "Reboot VM", "plan": "reboot_vm"} @@ -61,6 +62,7 @@ var _ = Describe("Director", func() { "id": 5, "type": "unresponsive_agent", "description": "desc2", + "instance_group": "router", "resolutions": [ {"name": "Skip for now", "plan": "ignore"} ] @@ -78,6 +80,7 @@ var _ = Describe("Director", func() { Expect(problem0.ID).To(Equal(4)) Expect(problem0.Type).To(Equal("unresponsive_agent")) Expect(problem0.Description).To(Equal("desc1")) + Expect(problem0.InstanceGroup).To(Equal("diego_cell")) problem0Resolutions := problem0.Resolutions Expect(len(problem0Resolutions)).To(Equal(2)) Expect(*problem0Resolutions[0].Name).To(Equal("Skip for now")) @@ -89,6 +92,7 @@ var _ = Describe("Director", func() { Expect(problem1.ID).To(Equal(5)) Expect(problem1.Type).To(Equal("unresponsive_agent")) Expect(problem1.Description).To(Equal("desc2")) + Expect(problem1.InstanceGroup).To(Equal("router")) problem1Resolutions := problem1.Resolutions Expect(len(problem1Resolutions)).To(Equal(1)) Expect(*problem1Resolutions[0].Name).To(Equal("Skip for now"))