Skip to content

Commit 9c6b888

Browse files
committed
[trace-guest] Avoid deadlock when calling outb in exceptions context to report trace data
When guest tracing is enabled and an exception occurs while the guest trace state lock is acquired (e.g. no more memory left to allocate a trace event on the heap), the guest tries to report to the host what the error was by calling the `out32` function. This function reports the guest trace data by serializing it on the spot, which needs to acquire the lock again, this leads to a deadlock. Signed-off-by: Doru Blânzeanu <dblnz@pm.me>
1 parent 4041bd6 commit 9c6b888

File tree

6 files changed

+205
-34
lines changed

6 files changed

+205
-34
lines changed

src/hyperlight_guest/src/exit.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ use hyperlight_common::outb::OutBAction;
2121
use tracing::instrument;
2222

2323
/// Halt the execution of the guest and returns control to the host.
24+
/// Halt is generally called for a successful completion of the guest's work,
25+
/// this means we can instrument it as a trace point because the trace state
26+
/// shall not be locked at this point (we are not in an exception context).
2427
#[inline(never)]
2528
#[instrument(skip_all, level = "Trace")]
2629
pub fn halt() {
@@ -122,7 +125,12 @@ pub(crate) fn outb(port: u16, data: &[u8]) {
122125
}
123126

124127
/// OUT function for sending a 32-bit value to the host.
125-
#[instrument(skip_all, level = "Trace")]
128+
/// `out32` can be called from an exception context, so we must be careful
129+
/// with the tracing state that might be locked at that time.
130+
/// The tracing state calls `try_lock` internally to avoid deadlocks.
131+
/// Furthermore, the instrument macro is not used here to avoid creating spans
132+
/// in exception contexts. Because if the trace state is already locked, trying to create a span
133+
/// would cause a panic, which is undesirable in exception handling.
126134
pub(crate) unsafe fn out32(port: u16, val: u32) {
127135
#[cfg(feature = "trace_guest")]
128136
{

src/hyperlight_guest_tracing/src/lib.rs

Lines changed: 75 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,11 @@ mod trace {
5555
use crate::subscriber::GuestSubscriber;
5656

5757
/// Weak reference to the guest state so we can manually trigger flush to host
58+
/// The `GuestState` is ONLY accessed from two places:
59+
/// - The tracing subscriber, when spans/events are created in the guest
60+
/// - The guest tracing API, when we want manual control to flush the events to the host
61+
///
62+
/// The mutex ensures safe access to the state from both places.
5863
static GUEST_STATE: spin::Once<Weak<Mutex<GuestState>>> = spin::Once::new();
5964

6065
/// Initialize the guest tracing subscriber as global default.
@@ -72,12 +77,27 @@ mod trace {
7277
let _ = tracing_core::dispatcher::set_global_default(tracing_core::Dispatch::new(sub));
7378
}
7479

75-
/// Sets the guset starting timestamp reported to the host on a VMExit
80+
/// Sets the guest starting timestamp reported to the host on a VMExit
81+
/// NOTE: Panics if unable to lock the guest state.
7682
pub fn set_start_tsc(guest_start_tsc: u64) {
7783
if let Some(w) = GUEST_STATE.get()
78-
&& let Some(state) = w.upgrade()
84+
&& let Some(state_mutex) = w.upgrade()
7985
{
80-
state.lock().set_start_tsc(guest_start_tsc);
86+
// We want to protect against re-entrancy issues produced by tracing code that locks
87+
// the state and then causes an exception that tries to lock the state again.
88+
//
89+
// For example:
90+
// - 1. A span is created, locking the state
91+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
92+
// - 3. The exception handler uses the tracing API to send the trace data to the host
93+
// or just create spans/events for logging purposes.
94+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
95+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
96+
// the issue.
97+
let mut state = state_mutex
98+
.try_lock()
99+
.expect("guest_tracing: Unable to lock guest tracing state in `set_start_tsc`");
100+
state.set_start_tsc(guest_start_tsc);
81101
}
82102
}
83103

@@ -87,32 +107,78 @@ mod trace {
87107
/// This expects an outb call to send the spans to the host.
88108
/// After calling this function, the internal state is marked
89109
/// for cleaning on the next access.
110+
///
111+
/// NOTE: Panics if unable to lock the guest state.
90112
pub fn end_trace() {
91113
if let Some(w) = GUEST_STATE.get()
92-
&& let Some(state) = w.upgrade()
114+
&& let Some(state_mutex) = w.upgrade()
93115
{
94-
state.lock().end_trace();
116+
// We want to protect against re-entrancy issues produced by tracing code that locks
117+
// the state and then causes an exception that tries to lock the state again.
118+
//
119+
// For example:
120+
// - 1. A span is created, locking the state
121+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
122+
// - 3. The exception handler uses the tracing API to send the trace data to the host
123+
// or just create spans/events for logging purposes.
124+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
125+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
126+
// the issue.
127+
let mut state = state_mutex
128+
.try_lock()
129+
.expect("guest_tracing: Unable to lock guest tracing state in `end_trace`");
130+
state.end_trace();
95131
}
96132
}
97133

98134
/// Cleans the internal trace state by removing closed spans and events.
99135
/// This ensures that after a VM exit, we keep the spans that
100136
/// are still active (in the stack) and remove all other spans and events.
137+
/// NOTE: Panics if unable to lock the guest state.
101138
pub fn clean_trace_state() {
102139
if let Some(w) = GUEST_STATE.get()
103-
&& let Some(state) = w.upgrade()
140+
&& let Some(state_mutex) = w.upgrade()
104141
{
105-
state.lock().clean();
142+
// We want to protect against re-entrancy issues produced by tracing code that locks
143+
// the state and then causes an exception that tries to lock the state again.
144+
//
145+
// For example:
146+
// - 1. A span is created, locking the state
147+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
148+
// - 3. The exception handler uses the tracing API to send the trace data to the host
149+
// or just create spans/events for logging purposes.
150+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
151+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
152+
// the issue.
153+
let mut state = state_mutex
154+
.try_lock()
155+
.expect("guest_tracing: Unable to lock guest tracing state in `clean_trace_state`");
156+
state.clean();
106157
}
107158
}
108159

109160
/// Returns information about the current trace state needed by the host to read the spans.
161+
/// NOTE: Panics if unable to lock the guest state.
110162
pub fn guest_trace_info() -> Option<TraceBatchInfo> {
111163
let mut res = None;
112164
if let Some(w) = GUEST_STATE.get()
113-
&& let Some(state) = w.upgrade()
165+
&& let Some(state_mutex) = w.upgrade()
114166
{
115-
res = Some(state.lock().guest_trace_info());
167+
// We want to protect against re-entrancy issues produced by tracing code that locks
168+
// the state and then causes an exception that tries to lock the state again.
169+
//
170+
// For example:
171+
// - 1. A span is created, locking the state
172+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
173+
// - 3. The exception handler uses the tracing API to send the trace data to the host
174+
// or just create spans/events for logging purposes.
175+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
176+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
177+
// the issue.
178+
let mut state = state_mutex
179+
.try_lock()
180+
.expect("guest_tracing: Unable to lock guest tracing state in `guest_trace_info`");
181+
res = Some(state.guest_trace_info());
116182
}
117183
res
118184
}

src/hyperlight_guest_tracing/src/subscriber.rs

Lines changed: 103 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ pub(crate) struct GuestSubscriber {
2929
/// Internal state that holds the spans and events
3030
/// Protected by a Mutex for inner mutability
3131
/// A reference to this state is stored in a static variable
32+
/// so it can be accessed from the guest tracing API
3233
state: Arc<Mutex<GuestState>>,
3334
}
3435

@@ -49,27 +50,123 @@ impl Subscriber for GuestSubscriber {
4950
}
5051

5152
fn new_span(&self, attrs: &Attributes<'_>) -> Id {
52-
self.state.lock().new_span(attrs)
53+
// We want to protect against re-entrancy issues produced by tracing code that locks
54+
// the state and then causes an exception that tries to lock the state again.
55+
//
56+
// For example:
57+
// - 1. A span is created, locking the state
58+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
59+
// - 3. The exception handler uses the tracing API to send the trace data to the host
60+
// or just create spans/events for logging purposes.
61+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
62+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
63+
// the issue.
64+
let mut state = self
65+
.state
66+
.try_lock()
67+
.expect("guest_tracing: Unable to lock guest tracing state in `new_span`");
68+
69+
state.new_span(attrs)
5370
}
5471

5572
fn record(&self, id: &Id, values: &Record<'_>) {
56-
self.state.lock().record(id, values)
73+
// We want to protect against re-entrancy issues produced by tracing code that locks
74+
// the state and then causes an exception that tries to lock the state again.
75+
//
76+
// For example:
77+
// - 1. A span is created, locking the state
78+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
79+
// - 3. The exception handler uses the tracing API to send the trace data to the host
80+
// or just create spans/events for logging purposes.
81+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
82+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
83+
// the issue.
84+
let mut state = self
85+
.state
86+
.try_lock()
87+
.expect("guest_tracing: Unable to lock guest tracing state in `record`");
88+
89+
state.record(id, values)
5790
}
5891

5992
fn event(&self, event: &Event<'_>) {
60-
self.state.lock().event(event)
93+
// We want to protect against re-entrancy issues produced by tracing code that locks
94+
// the state and then causes an exception that tries to lock the state again.
95+
//
96+
// For example:
97+
// - 1. A span is created, locking the state
98+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
99+
// - 3. The exception handler uses the tracing API to send the trace data to the host
100+
// or just create spans/events for logging purposes.
101+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
102+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
103+
// the issue.
104+
let mut state = self
105+
.state
106+
.try_lock()
107+
.expect("guest_tracing: Unable to lock guest tracing state in `event`");
108+
109+
state.event(event)
61110
}
62111

63112
fn enter(&self, id: &Id) {
64-
self.state.lock().enter(id)
113+
// We want to protect against re-entrancy issues produced by tracing code that locks
114+
// the state and then causes an exception that tries to lock the state again.
115+
//
116+
// For example:
117+
// - 1. A span is created, locking the state
118+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
119+
// - 3. The exception handler uses the tracing API to send the trace data to the host
120+
// or just create spans/events for logging purposes.
121+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
122+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
123+
// the issue.
124+
let mut state = self
125+
.state
126+
.try_lock()
127+
.expect("guest_tracing: Unable to lock guest tracing state in `enter`");
128+
129+
state.enter(id)
65130
}
66131

67132
fn exit(&self, id: &Id) {
68-
self.state.lock().exit(id)
133+
// We want to protect against re-entrancy issues produced by tracing code that locks
134+
// the state and then causes an exception that tries to lock the state again.
135+
//
136+
// For example:
137+
// - 1. A span is created, locking the state
138+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
139+
// - 3. The exception handler uses the tracing API to send the trace data to the host
140+
// or just create spans/events for logging purposes.
141+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
142+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
143+
// the issue.
144+
let mut state = self
145+
.state
146+
.try_lock()
147+
.expect("guest_tracing: Unable to lock guest tracing state in `exit`");
148+
149+
state.exit(id)
69150
}
70151

71152
fn try_close(&self, id: Id) -> bool {
72-
self.state.lock().try_close(id)
153+
// We want to protect against re-entrancy issues produced by tracing code that locks
154+
// the state and then causes an exception that tries to lock the state again.
155+
//
156+
// For example:
157+
// - 1. A span is created, locking the state
158+
// - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.)
159+
// - 3. The exception handler uses the tracing API to send the trace data to the host
160+
// or just create spans/events for logging purposes.
161+
// - 4. The tracing API tries to lock the state again, causing a deadlock.
162+
// To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal
163+
// the issue.
164+
let mut state = self
165+
.state
166+
.try_lock()
167+
.expect("guest_tracing: Unable to lock guest tracing state in `try_close`");
168+
169+
state.try_close(id)
73170
}
74171

75172
fn record_follows_from(&self, _span: &Id, _follows: &Id) {

src/tests/rust_guests/dummyguest/Cargo.lock

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/tests/rust_guests/simpleguest/Cargo.lock

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/tests/rust_guests/witguest/Cargo.lock

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)