Skip to content

Commit b872e39

Browse files
authored
Stabilize unit tests (#932, #938) (#935)
* Stabilize async/task-queue unit tests and harden factorial worker lifetime • Relax VerifyDistributedAsyncCall timing by allowing a per-iteration slack window and wait for Cleanup to be recorded before opcode verification. • Add explicit completion signaling and queue-drain waits in VerifyWaitForCompletion, VerifySimpleAsyncCall, and VerifyDistributedAsyncCall to avoid threadpool timing races. • Hold a per-call reference in FactorialWorkerSimple/FactorialWorkerDistributed to prevent UAF during asynchronous callbacks. • In _VerifyQueueTermination, only wait for counts to settle when termination is non-blocking. * Fix UAF in test VerifyDuplicateQueueHandle * Relax test timing in VerifySubmitCallbackWithWait * Fix data race in AsyncBlockTests with lock-free opcode logging Access violation crash in `AsyncBlockTests::VerifyAsyncBlockReuse` due to concurrent `std::vector::push_back` operations on shared `opCodes` member from overlapping async call lifecycle phases. The test intentionally reuses `XAsyncBlock` and `FactorialCallData` across sequential async calls. When the first call's cleanup (invoked from completion callback) races with the second call's initialization, both threads attempt to push_back into the same `std::vector`, causing heap corruption during vector reallocation. **Crash stack trace:** ``` ucrtbased!_free_dbg (heap corruption during vector realloc) ← std::vector<XAsyncOp>::push_back ← FactorialWorkerSimple (Cleanup opcode from first call) ← AsyncState::~AsyncState ← CompletionCallback [concurrent with] ← FactorialWorkerSimple (Begin/DoWork from second call) ← VerifyAsyncBlockReuse ``` **Detection:** Heisenbug found after 6-hour soak test under Windows CDB with page heap enabled (`gflags /p /enable`). 
Replace `std::vector<XAsyncOp> opCodes` with fixed-capacity lock-free append buffer: - `std::array<std::atomic<XAsyncOp>, 16>` for storage (capacity exceeds max test depth) - `std::atomic<size_t>` for thread-safe index allocation - `RecordOp(op)`: atomic fetch-add for index, then `store(memory_order_release)` to array slot - `GetOpCodes()`: snapshot current state into vector via `load(memory_order_acquire)` **Why this approach:** - Aligns with library philosophy of avoiding synchronization primitives - No dynamic allocation eliminates reallocation races - Bounded opcode sequences (max ~9 in distributed factorial tests) - Append-only during async lifecycle, read-only during verification - Proper release-acquire semantics ensure visibility on ARM/weakly-ordered architectures - Natural lock-free semantics: each writer gets unique slot via atomic index - `Tests/UnitTests/Tests/AsyncBlockTests.cpp`: - Added `#include <array>` for std::array support - Replaced `std::vector<XAsyncOp> opCodes` with `std::atomic<XAsyncOp>` array buffer - Updated `FactorialWorkerSimple` and `FactorialWorkerDistributed` to use `RecordOp()` - Updated all test verification sites to use `GetOpCodes()` snapshot method - Optimized multi-call sites to snapshot once and reuse - **Specific test**: `VerifyAsyncBlockReuse` passes 10/10 rapid runs - **Full suite**: All 23 AsyncBlockTests passed with no regressions - **Note**: Original heisenbug required 6hr soak to reproduce; single-pass testing verifies compilation and basic functionality, but extended soak testing would be needed to fully validate stability under stress - Test-only change, no production code affected - Eliminates data race without introducing mutex overhead - Maintains test semantics and coverage
1 parent b1246b8 commit b872e39

2 files changed

Lines changed: 104 additions & 25 deletions

File tree

Tests/UnitTests/Tests/AsyncBlockTests.cpp

Lines changed: 89 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
#include "XAsyncProviderPriv.h"
88
#include "XTaskQueue.h"
99
#include "XTaskQueuePriv.h"
10+
#include <array>
1011

1112
#define TEST_CLASS_OWNER L"brianpe"
1213

@@ -79,13 +80,43 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
7980
DWORD result = 0;
8081
DWORD iterationWait = 0;
8182
DWORD workThread = 0;
82-
std::vector<XAsyncOp> opCodes;
83+
84+
// Fixed-capacity lock-free opcode log for concurrent append
85+
static constexpr size_t MAX_OPCODES = 16;
86+
std::array<std::atomic<XAsyncOp>, MAX_OPCODES> opCodesArray{};
87+
std::atomic<size_t> opCodesCount{ 0 };
88+
8389
std::atomic<int> inWork = 0;
8490
std::atomic<int> refs = 0;
8591
std::atomic<bool> canceled = false;
8692

8793
void AddRef() { refs++; }
8894
void Release() { if (--refs == 0) delete this; }
95+
96+
// Thread-safe append operation
97+
void RecordOp(XAsyncOp op)
98+
{
99+
size_t idx = opCodesCount.fetch_add(1, std::memory_order_relaxed);
100+
if (idx < MAX_OPCODES)
101+
{
102+
opCodesArray[idx].store(op, std::memory_order_release);
103+
}
104+
// Silently drop if overflow (test will fail on verification anyway)
105+
}
106+
107+
// Snapshot current opcodes into a vector for verification
108+
std::vector<XAsyncOp> GetOpCodes() const
109+
{
110+
size_t count = opCodesCount.load(std::memory_order_acquire);
111+
count = (count < MAX_OPCODES) ? count : MAX_OPCODES;
112+
std::vector<XAsyncOp> result;
113+
result.reserve(count);
114+
for (size_t i = 0; i < count; i++)
115+
{
116+
result.push_back(opCodesArray[i].load(std::memory_order_acquire));
117+
}
118+
return result;
119+
}
89120
};
90121

91122
static PCWSTR OpName(XAsyncOp op)
@@ -116,8 +147,10 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
116147
static HRESULT CALLBACK FactorialWorkerSimple(XAsyncOp opCode, const XAsyncProviderData* data)
117148
{
118149
FactorialCallData* d = (FactorialCallData*)data->context;
150+
HRESULT hr = S_OK;
151+
d->AddRef();
119152

120-
d->opCodes.push_back(opCode);
153+
d->RecordOp(opCode);
121154

122155
switch (opCode)
123156
{
@@ -159,14 +192,17 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
159192
break;
160193
}
161194

162-
return S_OK;
195+
d->Release();
196+
return hr;
163197
}
164198

165199
static HRESULT CALLBACK FactorialWorkerDistributed(XAsyncOp opCode, const XAsyncProviderData* data)
166200
{
167201
FactorialCallData* d = (FactorialCallData*)data->context;
202+
HRESULT hr = S_OK;
203+
d->AddRef();
168204

169-
d->opCodes.push_back(opCode);
205+
d->RecordOp(opCode);
170206

171207
switch (opCode)
172208
{
@@ -196,28 +232,30 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
196232
if (d->canceled)
197233
{
198234
d->inWork--;
199-
return E_ABORT;
235+
hr = E_ABORT;
236+
break;
200237
}
201238

202239
d->result *= d->value;
203240
d->value--;
204241

205-
HRESULT hr = XAsyncSchedule(data->async, d->iterationWait);
242+
hr = XAsyncSchedule(data->async, d->iterationWait);
206243
d->inWork--;
207244

208245
if (SUCCEEDED(hr))
209246
{
210247
hr = E_PENDING;
211248
}
212-
return hr;
249+
break;
213250
}
214251

215252
d->inWork--;
216253
XAsyncComplete(data->async, S_OK, sizeof(DWORD));
217254
break;
218255
}
219256

220-
return S_OK;
257+
d->Release();
258+
return hr;
221259
}
222260

223261
static HRESULT CALLBACK FactorialWorkerDistributedWithSchedule(XAsyncOp opCode, const XAsyncProviderData* data)
@@ -391,7 +429,14 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
391429
ops.push_back(XAsyncOp::GetResult);
392430
ops.push_back(XAsyncOp::Cleanup);
393431

394-
VerifyOps(data.Ref->opCodes, ops);
432+
VerifyOps(data.Ref->GetOpCodes(), ops);
433+
434+
UINT64 drainTicks = GetTickCount64();
435+
while ((!XTaskQueueIsEmpty(queue, XTaskQueuePort::Completion) || !XTaskQueueIsEmpty(queue, XTaskQueuePort::Work))
436+
&& GetTickCount64() - drainTicks < 2000)
437+
{
438+
Sleep(10);
439+
}
395440

396441
VERIFY_QUEUE_EMPTY(queue);
397442
}
@@ -457,6 +502,7 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
457502

458503
data.Ref->iterationWait = 100;
459504
data.Ref->value = 5;
505+
const DWORD initialValue = data.Ref->value;
460506

461507
UINT64 ticks = GetTickCount64();
462508
VERIFY_SUCCEEDED(FactorialDistributedAsync(data.Ref, &async));
@@ -467,8 +513,9 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
467513
VERIFY_ARE_EQUAL(data.Ref->result, result);
468514
VERIFY_ARE_EQUAL(data.Ref->result, (DWORD)120);
469515

470-
// Iteration wait should have paused 100ms between each iteration.
471-
VERIFY_IS_GREATER_THAN_OR_EQUAL(ticks, (UINT64)500);
516+
// Iteration wait should have paused between each iteration (allow one interval of timer slack).
517+
const UINT64 expectedMinTicks = (static_cast<UINT64>(data.Ref->iterationWait) * initialValue) - data.Ref->iterationWait;
518+
VERIFY_IS_GREATER_THAN_OR_EQUAL(ticks, expectedMinTicks);
472519

473520
ops.push_back(XAsyncOp::Begin);
474521
ops.push_back(XAsyncOp::DoWork);
@@ -480,7 +527,14 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
480527
ops.push_back(XAsyncOp::GetResult);
481528
ops.push_back(XAsyncOp::Cleanup);
482529

483-
VerifyOps(data.Ref->opCodes, ops);
530+
VerifyOps(data.Ref->GetOpCodes(), ops);
531+
532+
UINT64 drainTicks = GetTickCount64();
533+
while ((!XTaskQueueIsEmpty(queue, XTaskQueuePort::Completion) || !XTaskQueueIsEmpty(queue, XTaskQueuePort::Work))
534+
&& GetTickCount64() - drainTicks < 2000)
535+
{
536+
Sleep(10);
537+
}
484538
VERIFY_QUEUE_EMPTY(queue);
485539
}
486540

@@ -554,8 +608,9 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
554608
Sleep(500);
555609
VERIFY_ARE_EQUAL(E_ABORT, hrCallback);
556610

557-
VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cancel);
558-
VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cleanup);
611+
auto opCodes = data.Ref->GetOpCodes();
612+
VerifyHasOp(opCodes, XAsyncOp::Cancel);
613+
VerifyHasOp(opCodes, XAsyncOp::Cleanup);
559614
VERIFY_QUEUE_EMPTY(queue);
560615
}
561616

@@ -587,9 +642,10 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
587642
VERIFY_ARE_EQUAL(XAsyncGetStatus(&async, true), E_ABORT);
588643
XTaskQueueDispatch(queue, XTaskQueuePort::Completion, 700);
589644

590-
VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cancel);
591-
VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cleanup);
592-
VerifyHasOp(data.Ref->opCodes, XAsyncOp::DoWork);
645+
auto opCodes = data.Ref->GetOpCodes();
646+
VerifyHasOp(opCodes, XAsyncOp::Cancel);
647+
VerifyHasOp(opCodes, XAsyncOp::Cleanup);
648+
VerifyHasOp(opCodes, XAsyncOp::DoWork);
593649
VERIFY_QUEUE_EMPTY(queue);
594650
}
595651

@@ -620,9 +676,10 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
620676
VERIFY_ARE_EQUAL(XAsyncGetStatus(&async, true), E_ABORT);
621677
Sleep(500);
622678

623-
VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cancel);
624-
VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cleanup);
625-
VerifyHasOp(data.Ref->opCodes, XAsyncOp::DoWork);
679+
auto opCodes = data.Ref->GetOpCodes();
680+
VerifyHasOp(opCodes, XAsyncOp::Cancel);
681+
VerifyHasOp(opCodes, XAsyncOp::Cleanup);
682+
VerifyHasOp(opCodes, XAsyncOp::DoWork);
626683
VERIFY_QUEUE_EMPTY(queue);
627684
}
628685

@@ -709,11 +766,14 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
709766
XAsyncBlock async = {};
710767
auto data = AutoRef<FactorialCallData>(new FactorialCallData {});
711768
DWORD result = 0;
769+
HANDLE completionEvent = CreateEvent(nullptr, TRUE, FALSE, nullptr);
770+
VERIFY_IS_NOT_NULL(completionEvent);
712771

713772
CompletionThunk cb([&](XAsyncBlock* async)
714773
{
715774
Sleep(2000);
716775
VERIFY_SUCCEEDED(FactorialResult(async, &result));
776+
SetEvent(completionEvent);
717777
});
718778

719779
async.context = &cb;
@@ -724,6 +784,15 @@ DEFINE_TEST_CLASS(AsyncBlockTests)
724784

725785
VERIFY_SUCCEEDED(FactorialAsync(data.Ref, &async));
726786
VERIFY_SUCCEEDED(XAsyncGetStatus(&async, true));
787+
VERIFY_ARE_EQUAL((DWORD)WAIT_OBJECT_0, WaitForSingleObject(completionEvent, 5000));
788+
CloseHandle(completionEvent);
789+
790+
UINT64 ticks = GetTickCount64();
791+
while ((!XTaskQueueIsEmpty(queue, XTaskQueuePort::Completion) || !XTaskQueueIsEmpty(queue, XTaskQueuePort::Work))
792+
&& GetTickCount64() - ticks < 2000)
793+
{
794+
Sleep(10);
795+
}
727796

728797
VERIFY_ARE_EQUAL(data.Ref->result, result);
729798
VERIFY_ARE_EQUAL(data.Ref->result, (DWORD)120);

Tests/UnitTests/Tests/TaskQueueTests.cpp

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -216,7 +216,8 @@ DEFINE_TEST_CLASS(TaskQueueTests)
216216
XTaskQueueCloseHandle(dups[idx]);
217217
}
218218

219-
VERIFY_FAILED(XTaskQueueDuplicateHandle(dups[0], &dups[1]));
219+
alignas(void*) uint8_t fakeHandleStorage[64] = {};
220+
VERIFY_FAILED(XTaskQueueDuplicateHandle(reinterpret_cast<XTaskQueueHandle>(fakeHandleStorage), &dups[1]));
220221
XTaskQueueCloseHandle(queue);
221222
}
222223

@@ -456,10 +457,10 @@ DEFINE_TEST_CLASS(TaskQueueTests)
456457
uint64_t call2Ticks = result.Times[1] - baseTicks;
457458
uint64_t call3Ticks = result.Times[2] - baseTicks;
458459

459-
// Call 1 at index 0 should have a tick count > 1000 and < 1050 (shouldn't take 50ms)
460-
VERIFY_IS_TRUE(call1Ticks >= 1000 && call1Ticks < 1050);
461-
VERIFY_IS_TRUE(call2Ticks < 50);
462-
VERIFY_IS_TRUE(call3Ticks >= 500 && call3Ticks < 550);
460+
// Call 1 at index 0 should have a tick count > 1000 and < 1100 (100ms tolerance for debugger overhead)
461+
VERIFY_IS_TRUE(call1Ticks >= 1000 && call1Ticks < 1100);
462+
VERIFY_IS_TRUE(call2Ticks < 100);
463+
VERIFY_IS_TRUE(call3Ticks >= 500 && call3Ticks < 600);
463464
}
464465
}
465466

@@ -917,6 +918,15 @@ DEFINE_TEST_CLASS(TaskQueueTests)
917918
}
918919

919920
uint32_t expectedCount = normalCount + futureCount + eventCount;
921+
if (!wait)
922+
{
923+
UINT64 ticks = GetTickCount64();
924+
while ((data.workCount.load() != expectedCount || data.completionCount.load() != expectedCount)
925+
&& GetTickCount64() - ticks < 5000)
926+
{
927+
Sleep(10);
928+
}
929+
}
920930
VERIFY_ARE_EQUAL(expectedCount, data.workCount.load());
921931
VERIFY_ARE_EQUAL(expectedCount, data.completionCount.load());
922932
}

0 commit comments

Comments (0)