Skip to content

Commit c21b996

Browse files
authored
Merge pull request #24 from cloudtoid/Deadlock
solve a potential deadlock
2 parents 547c530 + 6dfa9b3 commit c21b996

2 files changed

Lines changed: 64 additions & 12 deletions

File tree

src/Interprocess.Tests/QueueTests.cs

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using FluentAssertions;
22
using Microsoft.Extensions.DependencyInjection;
33
using Microsoft.Extensions.Logging;
4+
using Microsoft.Extensions.Logging.Abstractions;
45
using Xunit;
56
using Xunit.Abstractions;
67

@@ -238,11 +239,47 @@ public void CanRejectLargeMessages()
238239
p.TryEnqueue(ByteArray50).Should().BeFalse(); // failed here
239240
}
240241

242+
[Fact]
243+
[TestBeforeAfter]
244+
public void CanRecoverIfPublisherCrashes()
245+
{
246+
// This is very complicated test that is trying to replicate a crash scenario when the publisher
247+
// crashes after indicating that it is writing the message but before completing the operation.
248+
249+
using var dp = new DeadlockCausingPublisher(new("qn", fixture.Path, 1024), NullLoggerFactory.Instance);
250+
dp.TryEnqueue(ByteArray3).Should().BeTrue();
251+
252+
using var p = CreatePublisher(1024);
253+
p.TryEnqueue(ByteArray1).Should().BeTrue();
254+
using var s = CreateSubscriber(1024);
255+
256+
// This line should take 10 seconds to return (that is how long the timeout is set in the code)
257+
// After the 10 seconds expires, we should have lost all other messages that were in the queue when we started the dequeue process.
258+
s.TryDequeue(default, out _).Should().BeFalse();
259+
260+
// But then, after this 10 seconds delay, system should fully recover and continue with new messages
261+
p.TryEnqueue(ByteArray1).Should().BeTrue();
262+
s.TryDequeue(default, out var message).Should().BeTrue();
263+
message.ToArray().Should().BeEquivalentTo(ByteArray1);
264+
}
265+
241266
private IPublisher CreatePublisher(long capacity) =>
242-
queueFactory.CreatePublisher(
243-
new QueueOptions("qn", fixture.Path, capacity));
267+
queueFactory.CreatePublisher(new("qn", fixture.Path, capacity));
244268

245269
private ISubscriber CreateSubscriber(long capacity) =>
246-
queueFactory.CreateSubscriber(
247-
new QueueOptions("qn", fixture.Path, capacity));
270+
queueFactory.CreateSubscriber(new("qn", fixture.Path, capacity));
271+
272+
private sealed class DeadlockCausingPublisher(QueueOptions options, ILoggerFactory loggerFactory) :
273+
Queue(options, loggerFactory),
274+
IPublisher
275+
{
276+
public unsafe bool TryEnqueue(ReadOnlySpan<byte> message)
277+
{
278+
var bodyLength = message.Length;
279+
var messageLength = GetPaddedMessageLength(bodyLength);
280+
var header = *Header;
281+
Header->WriteOffset = SafeIncrementMessageOffset(header.WriteOffset, messageLength);
282+
return true;
283+
}
284+
}
248285
}

src/Interprocess/Queue/Subscriber.cs

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -107,14 +107,14 @@ private unsafe bool TryDequeueImpl(
107107
return false;
108108

109109
var readLockTimestamp = header.ReadLockTimestamp;
110-
var now = DateTime.UtcNow.Ticks;
110+
var start = DateTime.UtcNow.Ticks;
111111

112112
// is there already a read-lock or has the previous lock timed out meaning that a subscriber crashed?
113-
if (now - readLockTimestamp < TicksForTenSeconds)
113+
if (start - readLockTimestamp < TicksForTenSeconds)
114114
return false;
115115

116116
// take a read-lock so no other thread can read a message
117-
if (Interlocked.CompareExchange(ref Header->ReadLockTimestamp, now, readLockTimestamp) != readLockTimestamp)
117+
if (Interlocked.CompareExchange(ref Header->ReadLockTimestamp, start, readLockTimestamp) != readLockTimestamp)
118118
return false;
119119

120120
try
@@ -125,14 +125,29 @@ private unsafe bool TryDequeueImpl(
125125

126126
// now finally have a read-lock and the queue is not empty
127127
var readOffset = Header->ReadOffset;
128+
var writeOffset = Header->WriteOffset;
128129
var messageHeader = (MessageHeader*)Buffer.GetPointer(readOffset);
129130

130-
// was this message fully written by the publisher? if not, wait for the publisher to finish writing it
131-
while (Interlocked.CompareExchange(
132-
ref messageHeader->State,
133-
MessageHeader.LockedToBeConsumedState,
134-
MessageHeader.ReadyToBeConsumedState) != MessageHeader.ReadyToBeConsumedState)
131+
while (true)
135132
{
133+
// was this message fully written by the publisher? if not, wait for the publisher to finish writing it
134+
var state = Interlocked.CompareExchange(
135+
ref messageHeader->State,
136+
MessageHeader.LockedToBeConsumedState,
137+
MessageHeader.ReadyToBeConsumedState);
138+
139+
if (state == MessageHeader.ReadyToBeConsumedState)
140+
break;
141+
142+
// but if the publisher crashed, we will never get the message, so we need to handle that case by timing out
143+
if (DateTime.UtcNow.Ticks - start > TicksForTenSeconds)
144+
{
145+
// the publisher crashed and we will never get the message
146+
// so we need to release the read-lock and advance the queue for everyone.
147+
// some messages might be lost in this case but this is the best we can do.
148+
Interlocked.Exchange(ref Header->ReadOffset, writeOffset);
149+
return false;
150+
}
136151
Thread.Yield();
137152
}
138153

0 commit comments

Comments
 (0)