From 4be9b0d04fd99e87f265469af22472f52c755b1c Mon Sep 17 00:00:00 2001 From: nouseforaname <34882943+nouseforaname@users.noreply.github.com> Date: Thu, 12 Sep 2024 10:07:16 +0200 Subject: [PATCH] write lockfiles for in flight services we need a mechanism for the bosh drain lifecycle to know if there are terraform processes in flight. the lockfiles do just that. they filesystem approach was favoured over a in memory map to allow an easier interface with bosh. assuming the csb process crashes while shutting down, recovering the SIs that were in flight is tricky because the in memory state got lost when the crash happened. additionally having the files use the GUIDs for their names allows us to log the ecact SIs that were not succesfully finished via the drain script. --- brokerapi/broker/brokerfakes/fake_storage.go | 148 ++++++++++++++++++ internal/storage/storage.go | 24 ++- internal/storage/terraform_deployment.go | 28 ++++ internal/storage/terraform_deployment_test.go | 46 +++++- .../fake_service_provider_storage.go | 148 ++++++++++++++++++ pkg/broker/service_provider.go | 2 + pkg/providers/tf/deployment_manager.go | 14 +- pkg/providers/tf/provider.go | 5 +- 8 files changed, 405 insertions(+), 10 deletions(-) diff --git a/brokerapi/broker/brokerfakes/fake_storage.go b/brokerapi/broker/brokerfakes/fake_storage.go index f0de811b3..cd6ce9688 100644 --- a/brokerapi/broker/brokerfakes/fake_storage.go +++ b/brokerapi/broker/brokerfakes/fake_storage.go @@ -197,6 +197,17 @@ type FakeStorage struct { result1 storage.TerraformDeployment result2 error } + RemoveLockFileStub func(string) error + removeLockFileMutex sync.RWMutex + removeLockFileArgsForCall []struct { + arg1 string + } + removeLockFileReturns struct { + result1 error + } + removeLockFileReturnsOnCall map[int]struct { + result1 error + } StoreBindRequestDetailsStub func(storage.BindRequestDetails) error storeBindRequestDetailsMutex sync.RWMutex storeBindRequestDetailsArgsForCall []struct { @@ -242,6 +253,17 @@ type FakeStorage struct { storeTerraformDeploymentReturnsOnCall map[int]struct { result1 error } + WriteLockFileStub func(string) error + writeLockFileMutex sync.RWMutex + writeLockFileArgsForCall []struct { + arg1 string + } + writeLockFileReturns struct { + result1 error + } + writeLockFileReturnsOnCall map[int]struct { + result1 error + } invocations map[string][][]interface{} invocationsMutex sync.RWMutex } @@ -1193,6 +1215,67 @@ func (fake *FakeStorage) GetTerraformDeploymentReturnsOnCall(i int, result1 stor }{result1, result2} } +func (fake *FakeStorage) RemoveLockFile(arg1 string) error { + fake.removeLockFileMutex.Lock() + ret, specificReturn := fake.removeLockFileReturnsOnCall[len(fake.removeLockFileArgsForCall)] + fake.removeLockFileArgsForCall = append(fake.removeLockFileArgsForCall, struct { + arg1 string + }{arg1}) + stub := fake.RemoveLockFileStub + fakeReturns := fake.removeLockFileReturns + fake.recordInvocation("RemoveLockFile", []interface{}{arg1}) + fake.removeLockFileMutex.Unlock() + if stub != nil { + return stub(arg1) + } + if specificReturn { + return ret.result1 + } + return fakeReturns.result1 +} + +func (fake *FakeStorage) RemoveLockFileCallCount() int { + fake.removeLockFileMutex.RLock() + defer fake.removeLockFileMutex.RUnlock() + return len(fake.removeLockFileArgsForCall) +} + +func (fake *FakeStorage) RemoveLockFileCalls(stub func(string) error) { + fake.removeLockFileMutex.Lock() + defer fake.removeLockFileMutex.Unlock() + fake.RemoveLockFileStub = stub +} + +func (fake *FakeStorage) RemoveLockFileArgsForCall(i int) string { + fake.removeLockFileMutex.RLock() + defer fake.removeLockFileMutex.RUnlock() + argsForCall := fake.removeLockFileArgsForCall[i] + return argsForCall.arg1 +} + +func (fake *FakeStorage) RemoveLockFileReturns(result1 error) { + fake.removeLockFileMutex.Lock() + defer fake.removeLockFileMutex.Unlock() + fake.RemoveLockFileStub = nil + fake.removeLockFileReturns = struct { + result1 error + }{result1} +} + +func (fake *FakeStorage) RemoveLockFileReturnsOnCall(i int, result1 error) { + fake.removeLockFileMutex.Lock() + defer fake.removeLockFileMutex.Unlock() + fake.RemoveLockFileStub = nil + if fake.removeLockFileReturnsOnCall == nil { + fake.removeLockFileReturnsOnCall = make(map[int]struct { + result1 error + }) + } + fake.removeLockFileReturnsOnCall[i] = struct { + result1 error + }{result1} +} + func (fake *FakeStorage) StoreBindRequestDetails(arg1 storage.BindRequestDetails) error { fake.storeBindRequestDetailsMutex.Lock() ret, specificReturn := fake.storeBindRequestDetailsReturnsOnCall[len(fake.storeBindRequestDetailsArgsForCall)] @@ -1438,6 +1521,67 @@ func (fake *FakeStorage) StoreTerraformDeploymentReturnsOnCall(i int, result1 er }{result1} } +func (fake *FakeStorage) WriteLockFile(arg1 string) error { + fake.writeLockFileMutex.Lock() + ret, specificReturn := fake.writeLockFileReturnsOnCall[len(fake.writeLockFileArgsForCall)] + fake.writeLockFileArgsForCall = append(fake.writeLockFileArgsForCall, struct { + arg1 string + }{arg1}) + stub := fake.WriteLockFileStub + fakeReturns := fake.writeLockFileReturns + fake.recordInvocation("WriteLockFile", []interface{}{arg1}) + fake.writeLockFileMutex.Unlock() + if stub != nil { + return stub(arg1) + } + if specificReturn { + return ret.result1 + } + return fakeReturns.result1 +} + +func (fake *FakeStorage) WriteLockFileCallCount() int { + fake.writeLockFileMutex.RLock() + defer fake.writeLockFileMutex.RUnlock() + return len(fake.writeLockFileArgsForCall) +} + +func (fake *FakeStorage) WriteLockFileCalls(stub func(string) error) { + fake.writeLockFileMutex.Lock() + defer fake.writeLockFileMutex.Unlock() + fake.WriteLockFileStub = stub +} + +func (fake *FakeStorage) WriteLockFileArgsForCall(i int) string { + fake.writeLockFileMutex.RLock() + defer fake.writeLockFileMutex.RUnlock() + argsForCall := fake.writeLockFileArgsForCall[i] + return argsForCall.arg1 +} + +func (fake *FakeStorage) WriteLockFileReturns(result1 error) { + fake.writeLockFileMutex.Lock() + defer fake.writeLockFileMutex.Unlock() + fake.WriteLockFileStub = nil + fake.writeLockFileReturns = struct { + result1 error + }{result1} +} + +func (fake *FakeStorage) WriteLockFileReturnsOnCall(i int, result1 error) { + fake.writeLockFileMutex.Lock() + defer fake.writeLockFileMutex.Unlock() + fake.WriteLockFileStub = nil + if fake.writeLockFileReturnsOnCall == nil { + fake.writeLockFileReturnsOnCall = make(map[int]struct { + result1 error + }) + } + fake.writeLockFileReturnsOnCall[i] = struct { + result1 error + }{result1} +} + func (fake *FakeStorage) Invocations() map[string][][]interface{} { fake.invocationsMutex.RLock() defer fake.invocationsMutex.RUnlock() @@ -1471,6 +1615,8 @@ func (fake *FakeStorage) Invocations() map[string][][]interface{} { defer fake.getServiceInstanceDetailsMutex.RUnlock() fake.getTerraformDeploymentMutex.RLock() defer fake.getTerraformDeploymentMutex.RUnlock() + fake.removeLockFileMutex.RLock() + defer fake.removeLockFileMutex.RUnlock() fake.storeBindRequestDetailsMutex.RLock() defer fake.storeBindRequestDetailsMutex.RUnlock() fake.storeProvisionRequestDetailsMutex.RLock() @@ -1479,6 +1625,8 @@ func (fake *FakeStorage) Invocations() map[string][][]interface{} { defer fake.storeServiceInstanceDetailsMutex.RUnlock() fake.storeTerraformDeploymentMutex.RLock() defer fake.storeTerraformDeploymentMutex.RUnlock() + fake.writeLockFileMutex.RLock() + defer fake.writeLockFileMutex.RUnlock() copiedInvocations := map[string][][]interface{}{} for key, value := range fake.invocations { copiedInvocations[key] = value diff --git a/internal/storage/storage.go b/internal/storage/storage.go index 96a992208..335c90a18 100644 --- a/internal/storage/storage.go +++ b/internal/storage/storage.go @@ -1,17 +1,31 @@ // Package storage implements a Database Access Object (DAO) package storage -import "gorm.io/gorm" +import ( + "os" + + "gorm.io/gorm" +) type Storage struct { - db *gorm.DB - encryptor Encryptor + db *gorm.DB + encryptor Encryptor + lockFileDir string } func New(db *gorm.DB, encryptor Encryptor) *Storage { + // the VM based HA deployment requires a drain mechanism. LockFiles are a simple solution. + // but not every environment will opt for using VM based deployments. So detect if the lockfile + // director is present. + + dirDefault := os.Getenv("CSB_LOCKFILE_DIR") + if _, err := os.Stat(dirDefault); err != nil { + dirDefault, _ = os.MkdirTemp("/tmp/", "lockfiles") + } return &Storage{ - db: db, - encryptor: encryptor, + db: db, + encryptor: encryptor, + lockFileDir: dirDefault, } } diff --git a/internal/storage/terraform_deployment.go b/internal/storage/terraform_deployment.go index 9d6e022aa..223793293 100644 --- a/internal/storage/terraform_deployment.go +++ b/internal/storage/terraform_deployment.go @@ -2,6 +2,8 @@ package storage import ( "fmt" + "os" + "strings" "time" "github.com/hashicorp/go-version" @@ -156,3 +158,29 @@ func (s *Storage) loadTerraformDeploymentIfExists(id string, receiver any) error return s.db.Where("id = ?", id).First(receiver).Error } + +func (s *Storage) LockFilesExist() bool { + entries, _ := os.ReadDir(s.lockFileDir) + return len(entries) != 0 +} + +func (s *Storage) WriteLockFile(deploymentID string) error { + return os.WriteFile(fmt.Sprintf("%s/%s", s.lockFileDir, sanitizeFileName(deploymentID)), []byte{}, 0o644) +} + +func (s *Storage) RemoveLockFile(deploymentID string) error { + return os.Remove(fmt.Sprintf("%s/%s", s.lockFileDir, sanitizeFileName(deploymentID))) +} + +func (s *Storage) GetLockedDeploymentIds() ([]string, error) { + entries, err := os.ReadDir(s.lockFileDir) + var names []string + for _, entry := range entries { + names = append(names, strings.ReplaceAll(entry.Name(), "_", ":")) + } + return names, err +} + +func sanitizeFileName(name string) string { + return strings.ReplaceAll(name, ":", "_") +} diff --git a/internal/storage/terraform_deployment_test.go b/internal/storage/terraform_deployment_test.go index 1dfe0ada9..86bd3a8fc 100644 --- a/internal/storage/terraform_deployment_test.go +++ b/internal/storage/terraform_deployment_test.go @@ -206,10 +206,54 @@ var _ = Describe("TerraformDeployments", func() { Expect(store.DeleteTerraformDeployment("not-there")).NotTo(HaveOccurred()) }) }) + + Describe("LockFileExists", func() { + It("reports correct status", func() { + Expect(store.WriteLockFile("1234")).To(Succeed()) + Expect(store.WriteLockFile("5678")).To(Succeed()) + + Expect(store.LockFilesExist()).To(BeTrue()) + + Expect(store.RemoveLockFile("1234")).To(Succeed()) + + Expect(store.LockFilesExist()).To(BeTrue()) + + Expect(store.RemoveLockFile("5678")).To(Succeed()) + + Expect(store.LockFilesExist()).To(BeFalse()) + }) + }) + + Describe("GetLockedDeploymentIds", func() { + It("returns correct names", func() { + names, err := store.GetLockedDeploymentIds() + Expect(err).NotTo(HaveOccurred()) + Expect(names).To(BeEmpty()) + + Expect(store.WriteLockFile("tf:1234:")).To(Succeed()) + Expect(store.WriteLockFile("tf:5678:9123")).To(Succeed()) + + names, err = store.GetLockedDeploymentIds() + Expect(err).NotTo(HaveOccurred()) + Expect(names).To(ContainElements("tf:1234:", "tf:5678:9123")) + + Expect(store.RemoveLockFile("tf:1234:")).To(Succeed()) + + names, err = store.GetLockedDeploymentIds() + Expect(err).NotTo(HaveOccurred()) + Expect(names).To(ContainElements("tf:5678:9123")) + Expect(names).ToNot(ContainElements("tf:1234:")) + + Expect(store.RemoveLockFile("tf:5678:9123")).To(Succeed()) + + names, err = store.GetLockedDeploymentIds() + Expect(err).NotTo(HaveOccurred()) + Expect(names).To(BeEmpty()) + }) + }) }) func addFakeTerraformDeployments() { - Expect(db.Create(&models.TerraformDeployment{ ID: "fake-id-1", Workspace: fakeWorkspace("fake-1", "1.2.3"), diff --git a/pkg/broker/brokerfakes/fake_service_provider_storage.go b/pkg/broker/brokerfakes/fake_service_provider_storage.go index 726eb81e4..c194ea7dd 100644 --- a/pkg/broker/brokerfakes/fake_service_provider_storage.go +++ b/pkg/broker/brokerfakes/fake_service_provider_storage.go @@ -59,6 +59,17 @@ type FakeServiceProviderStorage struct { result1 storage.TerraformDeployment result2 error } + RemoveLockFileStub func(string) error + removeLockFileMutex sync.RWMutex + removeLockFileArgsForCall []struct { + arg1 string + } + removeLockFileReturns struct { + result1 error + } + removeLockFileReturnsOnCall map[int]struct { + result1 error + } StoreTerraformDeploymentStub func(storage.TerraformDeployment) error storeTerraformDeploymentMutex sync.RWMutex storeTerraformDeploymentArgsForCall []struct { @@ -70,6 +81,17 @@ type FakeServiceProviderStorage struct { storeTerraformDeploymentReturnsOnCall map[int]struct { result1 error } + WriteLockFileStub func(string) error + writeLockFileMutex sync.RWMutex + writeLockFileArgsForCall []struct { + arg1 string + } + writeLockFileReturns struct { + result1 error + } + writeLockFileReturnsOnCall map[int]struct { + result1 error + } invocations map[string][][]interface{} invocationsMutex sync.RWMutex } @@ -327,6 +349,67 @@ func (fake *FakeServiceProviderStorage) GetTerraformDeploymentReturnsOnCall(i in }{result1, result2} } +func (fake *FakeServiceProviderStorage) RemoveLockFile(arg1 string) error { + fake.removeLockFileMutex.Lock() + ret, specificReturn := fake.removeLockFileReturnsOnCall[len(fake.removeLockFileArgsForCall)] + fake.removeLockFileArgsForCall = append(fake.removeLockFileArgsForCall, struct { + arg1 string + }{arg1}) + stub := fake.RemoveLockFileStub + fakeReturns := fake.removeLockFileReturns + fake.recordInvocation("RemoveLockFile", []interface{}{arg1}) + fake.removeLockFileMutex.Unlock() + if stub != nil { + return stub(arg1) + } + if specificReturn { + return ret.result1 + } + return fakeReturns.result1 +} + +func (fake *FakeServiceProviderStorage) RemoveLockFileCallCount() int { + fake.removeLockFileMutex.RLock() + defer fake.removeLockFileMutex.RUnlock() + return len(fake.removeLockFileArgsForCall) +} + +func (fake *FakeServiceProviderStorage) RemoveLockFileCalls(stub func(string) error) { + fake.removeLockFileMutex.Lock() + defer fake.removeLockFileMutex.Unlock() + fake.RemoveLockFileStub = stub +} + +func (fake *FakeServiceProviderStorage) RemoveLockFileArgsForCall(i int) string { + fake.removeLockFileMutex.RLock() + defer fake.removeLockFileMutex.RUnlock() + argsForCall := fake.removeLockFileArgsForCall[i] + return argsForCall.arg1 +} + +func (fake *FakeServiceProviderStorage) RemoveLockFileReturns(result1 error) { + fake.removeLockFileMutex.Lock() + defer fake.removeLockFileMutex.Unlock() + fake.RemoveLockFileStub = nil + fake.removeLockFileReturns = struct { + result1 error + }{result1} +} + +func (fake *FakeServiceProviderStorage) RemoveLockFileReturnsOnCall(i int, result1 error) { + fake.removeLockFileMutex.Lock() + defer fake.removeLockFileMutex.Unlock() + fake.RemoveLockFileStub = nil + if fake.removeLockFileReturnsOnCall == nil { + fake.removeLockFileReturnsOnCall = make(map[int]struct { + result1 error + }) + } + fake.removeLockFileReturnsOnCall[i] = struct { + result1 error + }{result1} +} + func (fake *FakeServiceProviderStorage) StoreTerraformDeployment(arg1 storage.TerraformDeployment) error { fake.storeTerraformDeploymentMutex.Lock() ret, specificReturn := fake.storeTerraformDeploymentReturnsOnCall[len(fake.storeTerraformDeploymentArgsForCall)] @@ -388,6 +471,67 @@ func (fake *FakeServiceProviderStorage) StoreTerraformDeploymentReturnsOnCall(i }{result1} } +func (fake *FakeServiceProviderStorage) WriteLockFile(arg1 string) error { + fake.writeLockFileMutex.Lock() + ret, specificReturn := fake.writeLockFileReturnsOnCall[len(fake.writeLockFileArgsForCall)] + fake.writeLockFileArgsForCall = append(fake.writeLockFileArgsForCall, struct { + arg1 string + }{arg1}) + stub := fake.WriteLockFileStub + fakeReturns := fake.writeLockFileReturns + fake.recordInvocation("WriteLockFile", []interface{}{arg1}) + fake.writeLockFileMutex.Unlock() + if stub != nil { + return stub(arg1) + } + if specificReturn { + return ret.result1 + } + return fakeReturns.result1 +} + +func (fake *FakeServiceProviderStorage) WriteLockFileCallCount() int { + fake.writeLockFileMutex.RLock() + defer fake.writeLockFileMutex.RUnlock() + return len(fake.writeLockFileArgsForCall) +} + +func (fake *FakeServiceProviderStorage) WriteLockFileCalls(stub func(string) error) { + fake.writeLockFileMutex.Lock() + defer fake.writeLockFileMutex.Unlock() + fake.WriteLockFileStub = stub +} + +func (fake *FakeServiceProviderStorage) WriteLockFileArgsForCall(i int) string { + fake.writeLockFileMutex.RLock() + defer fake.writeLockFileMutex.RUnlock() + argsForCall := fake.writeLockFileArgsForCall[i] + return argsForCall.arg1 +} + +func (fake *FakeServiceProviderStorage) WriteLockFileReturns(result1 error) { + fake.writeLockFileMutex.Lock() + defer fake.writeLockFileMutex.Unlock() + fake.WriteLockFileStub = nil + fake.writeLockFileReturns = struct { + result1 error + }{result1} +} + +func (fake *FakeServiceProviderStorage) WriteLockFileReturnsOnCall(i int, result1 error) { + fake.writeLockFileMutex.Lock() + defer fake.writeLockFileMutex.Unlock() + fake.WriteLockFileStub = nil + if fake.writeLockFileReturnsOnCall == nil { + fake.writeLockFileReturnsOnCall = make(map[int]struct { + result1 error + }) + } + fake.writeLockFileReturnsOnCall[i] = struct { + result1 error + }{result1} +} + func (fake *FakeServiceProviderStorage) Invocations() map[string][][]interface{} { fake.invocationsMutex.RLock() defer fake.invocationsMutex.RUnlock() @@ -399,8 +543,12 @@ func (fake *FakeServiceProviderStorage) Invocations() map[string][][]interface{} defer fake.getServiceBindingIDsForServiceInstanceMutex.RUnlock() fake.getTerraformDeploymentMutex.RLock() defer fake.getTerraformDeploymentMutex.RUnlock() + fake.removeLockFileMutex.RLock() + defer fake.removeLockFileMutex.RUnlock() fake.storeTerraformDeploymentMutex.RLock() defer fake.storeTerraformDeploymentMutex.RUnlock() + fake.writeLockFileMutex.RLock() + defer fake.writeLockFileMutex.RUnlock() copiedInvocations := map[string][][]interface{}{} for key, value := range fake.invocations { copiedInvocations[key] = value diff --git a/pkg/broker/service_provider.go b/pkg/broker/service_provider.go index 1cd6ad535..f28cc0758 100644 --- a/pkg/broker/service_provider.go +++ b/pkg/broker/service_provider.go @@ -77,4 +77,6 @@ type ServiceProviderStorage interface { DeleteTerraformDeployment(id string) error ExistsTerraformDeployment(id string) (bool, error) GetServiceBindingIDsForServiceInstance(serviceInstanceID string) ([]string, error) + WriteLockFile(guid string) error + RemoveLockFile(guid string) error } diff --git a/pkg/providers/tf/deployment_manager.go b/pkg/providers/tf/deployment_manager.go index 188581ec5..3a3f6c19b 100644 --- a/pkg/providers/tf/deployment_manager.go +++ b/pkg/providers/tf/deployment_manager.go @@ -51,7 +51,7 @@ func (d *DeploymentManager) MarkOperationStarted(deployment *storage.TerraformDe return err } - return nil + return d.store.WriteLockFile(deployment.ID) } func (d *DeploymentManager) MarkOperationFinished(deployment *storage.TerraformDeployment, err error) error { @@ -74,8 +74,16 @@ func (d *DeploymentManager) MarkOperationFinished(deployment *storage.TerraformD }) } - - return d.store.StoreTerraformDeployment(*deployment) + err = d.store.StoreTerraformDeployment(*deployment) + if err != nil { + d.logger.Error("store-state", err, lager.Data{ + "deploymentID": deployment.ID, + "message": deployment.LastOperationMessage, + }) + } else { + d.logger.Info(fmt.Sprintf("successfully stored state for %s", deployment.ID)) + } + return d.store.RemoveLockFile(deployment.ID) } func (d *DeploymentManager) OperationStatus(deploymentID string) (bool, string, error) { diff --git a/pkg/providers/tf/provider.go b/pkg/providers/tf/provider.go index 6c9828995..6e2eaeae7 100644 --- a/pkg/providers/tf/provider.go +++ b/pkg/providers/tf/provider.go @@ -96,7 +96,10 @@ func (provider *TerraformProvider) create(ctx context.Context, vars *varcontext. } else { err = provider.DefaultInvoker().Apply(ctx, newWorkspace) } - _ = provider.MarkOperationFinished(&deployment, err) + err = provider.MarkOperationFinished(&deployment, err) + if err != nil { + provider.logger.Error("MarkOperationFinished", err) + } }() return tfID, nil