Skip to content

Commit

Permalink
Fix concurrency issue between AbandonPendingBacklog() and CheckBacklo…
Browse files Browse the repository at this point in the history
…gForTimeouts(), and remove backlog locking.
  • Loading branch information
kornelpal committed Apr 5, 2023
1 parent 571d832 commit 9035679
Showing 1 changed file with 32 additions and 28 deletions.
60 changes: 32 additions & 28 deletions src/StackExchange.Redis/PhysicalBridge.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ internal sealed class PhysicalBridge : IDisposable
private int _backlogProcessorIsRunning = 0;
private int _backlogCurrentEnqueued = 0;
private long _backlogTotalEnqueued = 0;
private Exception? _abandonPendingBacklogException;

private int activeWriters = 0;
private int beating;
Expand Down Expand Up @@ -483,11 +484,18 @@ internal void OnDisconnected(ConnectionFailureType failureType, PhysicalConnecti

private void AbandonPendingBacklog(Exception ex)
{
// Peeking at the backlog, checking message and then dequeuing is not thread-safe.
// CheckBacklogForTimeouts() depends on this being set to properly complete dequeued messages.
Volatile.Write(ref _abandonPendingBacklogException, ex);

while (BacklogTryDequeue(out Message? next))
{
Multiplexer.OnMessageFaulted(next, ex);
next.SetExceptionAndComplete(ex, this);
}

// Best effort cleanup to avoid false positive thread safey check failures in CheckBacklogForTimeouts().
if (_backlogStatus != BacklogStatus.CheckingForTimeout) Interlocked.CompareExchange(ref _abandonPendingBacklogException, null, ex);
}

internal void OnFullyEstablished(PhysicalConnection connection, string source)
Expand Down Expand Up @@ -888,24 +896,29 @@ private void CheckBacklogForTimeouts()
var now = Environment.TickCount;
var timeout = TimeoutMilliseconds;

// Because peeking at the backlog, checking message and then dequeuing, is not thread-safe, we do have to use
// a lock here, for mutual exclusion of backlog DEQUEUERS. Unfortunately.
// But we reduce contention by only locking if we see something that looks timed out.
// Peeking at the backlog, checking message and then dequeuing is not thread-safe.
// Because AbandonPendingBacklog() is the only dequeuer that can run concurrently,
// locking can be avoided by throwing the AbandonPendingBacklog() exception here.
while (_backlog.TryPeek(out Message? message))
{
// See if the message has pass our async timeout threshold
// Note: All timed out messages must be dequeued, even when no completion is needed, to be able to dequeue and complete other timed out messages.
if (!message.HasTimedOut(now, timeout, out var _)) break; // not a timeout - we can stop looking
lock (_backlog)
if (!BacklogTryDequeue(out var message2)) message2 = null; // consume it for real
if (message != message2)
{
// Peek again since we didn't have lock before...
// and rerun the exact same checks as above, note that it may be a different message now
if (!_backlog.TryPeek(out message)) break;
if (!message.HasTimedOut(now, timeout, out var _)) break;
var ex = Volatile.Read(ref _abandonPendingBacklogException);
var isAbandonPendingBacklog = ex != null;
ex ??= new RedisException("Thread safety bug detected! A queue message disappeared when AbandonPendingBacklog() was not running.");
message2?.SetExceptionAndComplete(ex, this);

if (!BacklogTryDequeue(out var message2) || (message != message2)) // consume it for real
if (isAbandonPendingBacklog)
{
throw new RedisException("Thread safety bug detected! A queue message disappeared while we had the backlog lock");
break;
}
else
{
throw ex;
}
}

Expand Down Expand Up @@ -976,20 +989,15 @@ private async Task ProcessBacklogAsync()
if (isDisposed && BacklogHasItems)
{
_backlogStatus = BacklogStatus.NotifyingDisposed;
// Because peeking at the backlog, checking message and then dequeuing, is not thread-safe, we do have to use
// a lock here, for mutual exclusion of backlog DEQUEUERS. Unfortunately.
// But we reduce contention by only locking if we see something that looks timed out.
// Peeking at the backlog, checking message and then dequeuing is not thread-safe.
// CheckBacklogForTimeouts() depends on not running concurrently with this.
while (BacklogHasItems)
{
Message? message = null;
lock (_backlog)
if (!BacklogTryDequeue(out Message? message))
{
if (!BacklogTryDequeue(out message))
{
break;
}
break;
}

var ex = ExceptionFactory.Timeout(Multiplexer, "The message was in the backlog when connection was disposed", message, ServerEndPoint, WriteResult.TimeoutBeforeWrite, this);
message.SetExceptionAndComplete(ex, this);
}
Expand Down Expand Up @@ -1073,17 +1081,13 @@ private async Task ProcessBridgeBacklogAsync()
// If we can't write them, abort and wait for the next heartbeat or activation to try this again.
while (IsConnected && physical?.HasOutputPipe == true)
{
Message? message;
_backlogStatus = BacklogStatus.CheckingForWork;

lock (_backlog)
// Note that we're actively taking it off the queue here, not peeking
// If there's nothing left in queue, we're done.
if (!BacklogTryDequeue(out Message? message))
{
// Note that we're actively taking it off the queue here, not peeking
// If there's nothing left in queue, we're done.
if (!BacklogTryDequeue(out message))
{
break;
}
break;
}

try
Expand Down

0 comments on commit 9035679

Please sign in to comment.