-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[core] avoid stuck updateTaskStatus due to multiple mesos updates
The issue described in OCTRL-953 occurs when we schedule the kill of a task and, while waiting for an acknowledgment, we receive two mesos updates: ``` [2024-11-11T12:14:02+01:00] TRACE scheduler: task status update received detector=TRG message= srcHost=alio2-cr1-flp163 state=TASK_FAILED task=2qwA9EYEgnY [2024-11-11T12:14:02+01:00] TRACE scheduler: task status update received detector=TRG message=Reconciliation: Task is unknown to the agent srcHost=alio2-cr1-flp163 state=TASK_LOST task=2qwA9EYEgnY ``` Both updates then trigger the discussed ack. Since it's unclear to me whether we can surely ignore TASK_LOST and trust that we will always receive either TASK_FAILED or TASK_FINISHED, I went for the approach of improving safeAcks to produce an error when subsequent acks are sent instead of blocking some goroutines.
- Loading branch information
Showing
5 changed files
with
264 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
/* | ||
* === This file is part of ALICE O² === | ||
* | ||
* Copyright 2020 CERN and copyright holders of ALICE O². | ||
* Author: Miltiadis Alexis <[email protected]> | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
* | ||
* In applying this license CERN does not waive the privileges and | ||
* immunities granted to it by virtue of its status as an | ||
* Intergovernmental Organization or submit itself to any jurisdiction. | ||
*/ | ||
|
||
package safeacks | ||
|
||
import ( | ||
"fmt" | ||
"sync" | ||
) | ||
|
||
// SafeAcks is a thread-safe structure which handles acknowledgment exchanges
// with N senders and one receiver per key. The first sender succeeds, then an
// error is returned to the subsequent ones. This way, subsequent senders are
// not stuck sending an acknowledgment when nothing expects it anymore.
// The signaling design is inspired by point 2 in https://go101.org/article/channel-closing.html
// SafeAcks can be used to acknowledge that an action happened to the task such as task KILLED.
// At the moment we utilize SafeAcks to acknowledge that all the requested tasks were killed by mesos (task/manager.go).
//
// The zero value is ready to use; NewAcks remains the conventional constructor.
type SafeAcks struct {
	// mu guards acks.
	mu sync.RWMutex
	// acks maps a key to the channels of its pending acknowledgment.
	acks map[string]ackChannels
}

// ackChannels is the pair of channels backing one pending acknowledgment.
type ackChannels struct {
	// ack is the unbuffered channel the acknowledgment is sent on.
	ack chan struct{}
	// stop is closed when acks are no longer expected, which releases any
	// sender still blocked on ack.
	stop chan struct{}
}

// deleteKey removes the entry for key, if any.
func (a *SafeAcks) deleteKey(key string) {
	a.mu.Lock()
	defer a.mu.Unlock()

	delete(a.acks, key)
}

// ExpectsAck reports whether an acknowledgment is currently registered for key.
func (a *SafeAcks) ExpectsAck(key string) bool {
	a.mu.RLock()
	defer a.mu.RUnlock()

	_, ok := a.acks[key]
	return ok
}

// RegisterAck declares that an acknowledgment is expected for key.
// It returns an error if one is already registered for that key.
func (a *SafeAcks) RegisterAck(key string) error {
	a.mu.Lock()
	defer a.mu.Unlock()

	if _, hasKey := a.acks[key]; hasKey {
		return fmt.Errorf("an acknowledgment was already registered for key '%s'", key)
	}

	// Writing to a nil map panics, so allocate lazily to keep a zero-value
	// SafeAcks usable.
	if a.acks == nil {
		a.acks = make(map[string]ackChannels)
	}
	a.acks[key] = ackChannels{
		ack:  make(chan struct{}),
		stop: make(chan struct{}),
	}
	return nil
}

// getValue returns the channels registered for key and whether they exist.
// It only reads the map, so the shared (read) lock is sufficient.
func (a *SafeAcks) getValue(key string) (ackChannels, bool) {
	a.mu.RLock()
	defer a.mu.RUnlock()

	channels, ok := a.acks[key]
	return channels, ok
}

// TrySendAck checks if an acknowledgment is expected and if it is, it blocks until it is received.
// If an acknowledgment is not expected at the moment of the call (or already was received), nil is returned.
// If more than one goroutine attempts to send an acknowledgment before it is received, all but one goroutines will
// receive an error.
func (a *SafeAcks) TrySendAck(key string) error {
	channels, ok := a.getValue(key)
	if !ok {
		// NOTE: perhaps an error should be returned here as well, but returning nil
		// preserves the behaviour of safeAcks before the refactoring. The rest of
		// the code may assume it is fine to blindly try sending an ack "just in
		// case", so this should not be changed lightly.
		return nil
	}

	// Either the receiver takes the ack, or stop is closed because somebody
	// else's ack was taken first; in both cases the sender is released.
	select {
	case <-channels.stop:
		return fmt.Errorf("an acknowledgment has been already received for key '%s'", key)
	case channels.ack <- struct{}{}:
		return nil
	}
}

// TryReceiveAck blocks until an acknowledgment is received and then returns true.
// It will return false if an acknowledgment for a given key is not expected.
//
// Only one goroutine may wait on a given key at a time: a second concurrent
// receiver for the same key is not supported, as it could block forever or
// attempt to close the stop channel a second time.
func (a *SafeAcks) TryReceiveAck(key string) bool {
	channels, ok := a.getValue(key)
	if !ok {
		return false
	}

	<-channels.ack
	// Release the remaining senders before forgetting the key, so that none of
	// them stays blocked on a channel nobody reads anymore.
	close(channels.stop)
	a.deleteKey(key)
	return true
}

// NewAcks returns an empty SafeAcks ready to register acknowledgments.
func NewAcks() *SafeAcks {
	return &SafeAcks{
		acks: make(map[string]ackChannels),
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package safeacks | ||
|
||
import ( | ||
. "github.com/onsi/ginkgo/v2" | ||
. "github.com/onsi/gomega" | ||
"sync" | ||
"testing" | ||
"time" | ||
) | ||
|
||
var _ = Describe("SafeAcks", func() { | ||
var sa *SafeAcks | ||
|
||
BeforeEach(func() { | ||
sa = NewAcks() | ||
}) | ||
|
||
Describe("RegisterAck", func() { | ||
It("should register a new ack", func(ctx SpecContext) { | ||
err := sa.RegisterAck("test") | ||
Expect(err).NotTo(HaveOccurred()) | ||
Expect(sa.ExpectsAck("test")).To(BeTrue()) | ||
}, SpecTimeout(5*time.Second)) | ||
|
||
It("should return error when an ack is already registered", func(ctx SpecContext) { | ||
err := sa.RegisterAck("test") | ||
Expect(err).NotTo(HaveOccurred()) | ||
Expect(sa.ExpectsAck("test")).To(BeTrue()) | ||
|
||
err = sa.RegisterAck("test") | ||
Expect(err).To(HaveOccurred()) | ||
|
||
Expect(sa.ExpectsAck("test")).To(BeTrue()) | ||
}, SpecTimeout(5*time.Second)) | ||
}) | ||
// TODO add timeout for this test | ||
Describe("TrySendAck and TryReceiveAck", func() { | ||
It("should return nil for non-existent key", func(ctx SpecContext) { | ||
err := sa.TrySendAck("nonexistent") | ||
Expect(err).To(BeNil()) | ||
}, SpecTimeout(5*time.Second)) | ||
|
||
It("should send ack successfully", func(ctx SpecContext) { | ||
err := sa.RegisterAck("test") | ||
Expect(err).NotTo(HaveOccurred()) | ||
|
||
var wg sync.WaitGroup | ||
wg.Add(1) | ||
|
||
go func() { | ||
defer wg.Done() | ||
err := sa.TrySendAck("test") | ||
Expect(err).To(BeNil()) | ||
}() | ||
Expect(sa.TryReceiveAck("test")).To(BeTrue()) | ||
|
||
wg.Wait() | ||
}, SpecTimeout(5*time.Second)) | ||
|
||
It("should return error when ack was already sent once", func(ctx SpecContext) { | ||
err := sa.RegisterAck("test") | ||
Expect(err).NotTo(HaveOccurred()) | ||
|
||
result1 := make(chan error) | ||
result2 := make(chan error) | ||
go func() { | ||
result1 <- sa.TrySendAck("test") | ||
}() | ||
|
||
go func() { | ||
result2 <- sa.TrySendAck("test") | ||
}() | ||
|
||
// I really don't like relying on a sleep call to test this, but I see no other way... | ||
// The goal is to have both `TrySendAck` blocked at channel send before invoking TryReceiveAck. | ||
// Hopefully 1 second is enough to avoid having a shaky test. | ||
time.Sleep(1000 * time.Millisecond) | ||
|
||
ok := sa.TryReceiveAck("test") | ||
Expect(ok).To(BeTrue()) | ||
|
||
oneErrorHaveOccured := (<-result1 != nil) != (<-result2 != nil) | ||
Expect(oneErrorHaveOccured).To(BeTrue()) | ||
}, SpecTimeout(5*time.Second)) | ||
}) | ||
|
||
Describe("ExpectsAck", func() { | ||
It("should return false for non-existent key", func(ctx SpecContext) { | ||
Expect(sa.ExpectsAck("nonexistent")).To(BeFalse()) | ||
}, SpecTimeout(5*time.Second)) | ||
|
||
It("should return true for registered key", func(ctx SpecContext) { | ||
err := sa.RegisterAck("test") | ||
Expect(err).NotTo(HaveOccurred()) | ||
Expect(sa.ExpectsAck("test")).To(BeTrue()) | ||
}, SpecTimeout(5*time.Second)) | ||
|
||
It("should not be permanently blocked by another call", func(ctx SpecContext) { | ||
err := sa.RegisterAck("test") | ||
Expect(err).NotTo(HaveOccurred()) | ||
go func() { | ||
sa.TryReceiveAck("test") | ||
}() | ||
|
||
// I really don't like relying on a sleep call to test this, but I see no other way... | ||
// The goal is to have `TryReceiveAck` blocked at channel receive before invoking ExpectsAck. | ||
// Hopefully 1 second is enough to avoid having a shaky test. | ||
time.Sleep(1000 * time.Millisecond) | ||
|
||
Expect(sa.ExpectsAck("test")).To(BeTrue()) | ||
}, SpecTimeout(5*time.Second)) | ||
}) | ||
}) | ||
|
||
func TestSafeAcks(t *testing.T) { | ||
RegisterFailHandler(Fail) | ||
RunSpecs(t, "Component SafeAcks Test Suite") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.