Skip to content

Commit 35afc11

Browse files
committed
fix(pathfinder): keep canary probes script-safe
Run the CTK canary probe via a dedicated module subprocess so that it still gets a fresh interpreter and independent loader state without re-entering the caller's script as `__main__`.

Made-with: Cursor
1 parent 3ed5217 commit 35afc11

File tree

5 files changed

+147
-26
lines changed

5 files changed

+147
-26
lines changed

cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44

5+
import sys
6+
from collections.abc import Sequence
57
import json
68

79
from cuda.pathfinder._dynamic_libs.lib_descriptor import LIB_DESCRIPTORS
@@ -27,3 +29,15 @@ def _probe_canary_abs_path(libname: str) -> str | None:
2729

2830
def probe_canary_abs_path_and_print_json(libname: str) -> None:
    """Probe the canary library and print the result as a single JSON value.

    The printed value is whatever ``_probe_canary_abs_path(libname)`` returns,
    serialized with ``json.dumps``. That helper is annotated as returning
    ``str | None``, so the output is either a JSON string or ``null``.
    """
    payload = json.dumps(_probe_canary_abs_path(libname))
    print(payload)
32+
33+
34+
def main(argv: Sequence[str] | None = None) -> int:
35+
args = list(sys.argv[1:] if argv is None else argv)
36+
if len(args) != 1:
37+
raise SystemExit("Usage: python -m cuda.pathfinder._dynamic_libs.canary_probe_subprocess <libname>")
38+
probe_canary_abs_path_and_print_json(args[0])
39+
return 0
40+
41+
42+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())

cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
import functools
77
import json
88
import struct
9+
import subprocess
910
import sys
1011
from typing import TYPE_CHECKING
1112

12-
from cuda.pathfinder._dynamic_libs.canary_probe_subprocess import probe_canary_abs_path_and_print_json
1313
from cuda.pathfinder._dynamic_libs.lib_descriptor import LIB_DESCRIPTORS
1414
from cuda.pathfinder._dynamic_libs.load_dl_common import (
1515
DynamicLibNotAvailableError,
@@ -28,7 +28,6 @@
2828
run_find_steps,
2929
)
3030
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
31-
from cuda.pathfinder._utils.spawned_process_runner import run_in_spawned_child_process
3231

3332
if TYPE_CHECKING:
3433
from cuda.pathfinder._dynamic_libs.lib_descriptor import LibDescriptor
@@ -40,6 +39,8 @@
4039
name for name, desc in LIB_DESCRIPTORS.items() if (desc.windows_dlls if IS_WINDOWS else desc.linux_sonames)
4140
)
4241
_PLATFORM_NAME = "Windows" if IS_WINDOWS else "Linux"
42+
_CANARY_PROBE_MODULE = "cuda.pathfinder._dynamic_libs.canary_probe_subprocess"
43+
_CANARY_PROBE_TIMEOUT_SECONDS = 10.0
4344

4445
# Driver libraries: shipped with the NVIDIA display driver, always on the
4546
# system linker path. These skip all CTK search steps (site-packages,
@@ -67,15 +68,46 @@ def _load_driver_lib_no_cache(desc: LibDescriptor) -> LoadedDL:
6768
)
6869

6970

71+
def _coerce_subprocess_output(output: str | bytes | None) -> str:
72+
if isinstance(output, bytes):
73+
return output.decode(errors="replace")
74+
return "" if output is None else output
75+
76+
77+
def _raise_canary_probe_child_process_error(
    *,
    returncode: int | None = None,
    timeout: float | None = None,
    stderr: str | bytes | None = None,
) -> None:
    """Raise ``ChildProcessError`` describing a failed canary probe subprocess.

    This helper never returns normally.

    Args:
        returncode: Exit status of the child process; reported when
            ``timeout`` is ``None``.
        timeout: Timeout in seconds that expired. When given, the message
            reports a timeout instead of an exit code.
        stderr: Captured stderr of the child (``str``, ``bytes``, or ``None``),
            embedded in the error message.

    Raises:
        ChildProcessError: Always.
    """
    if timeout is None:
        error_line = f"Canary probe child process exited with code {returncode}."
    else:
        error_line = f"Canary probe child process timed out after {timeout} seconds."

    stderr_text = _coerce_subprocess_output(stderr)
    # Captured stderr may not end with a newline (e.g. partial output collected
    # when the child timed out). Terminate it so the end marker always starts
    # on its own line instead of being glued onto the last stderr line.
    if stderr_text and not stderr_text.endswith("\n"):
        stderr_text += "\n"

    raise ChildProcessError(
        f"{error_line}\n"
        "--- stderr-from-child-process ---\n"
        f"{stderr_text}"
        "<end-of-stderr-from-child-process>\n"
    )
93+
94+
7095
@functools.cache
7196
def _resolve_system_loaded_abs_path_in_subprocess(libname: str) -> str | None:
72-
"""Resolve a canary library's absolute path in a spawned child process."""
73-
result = run_in_spawned_child_process(
74-
probe_canary_abs_path_and_print_json,
75-
args=(libname,),
76-
timeout=10.0,
77-
rethrow=True,
78-
)
97+
"""Resolve a canary library's absolute path in a fresh Python subprocess."""
98+
try:
99+
result = subprocess.run(
100+
[sys.executable, "-m", _CANARY_PROBE_MODULE, libname],
101+
capture_output=True,
102+
text=True,
103+
timeout=_CANARY_PROBE_TIMEOUT_SECONDS,
104+
check=False,
105+
)
106+
except subprocess.TimeoutExpired as exc:
107+
_raise_canary_probe_child_process_error(timeout=exc.timeout, stderr=exc.stderr)
108+
109+
if result.returncode != 0:
110+
_raise_canary_probe_child_process_error(returncode=result.returncode, stderr=result.stderr)
79111

80112
# Use the final non-empty line in case earlier output lines are emitted.
81113
lines = [line for line in result.stdout.splitlines() if line.strip()]

cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def find_via_ctk_root_canary(desc: HeaderDescriptor) -> LocatedHeaderDir | None:
115115
"""Try CTK header lookup via CTK-root canary probing.
116116
117117
Skips immediately if the descriptor does not opt in (``use_ctk_root_canary``).
118-
Otherwise, system-loads ``cudart`` in a spawned child process, derives
118+
Otherwise, system-loads ``cudart`` in a dedicated Python subprocess, derives
119119
CTK root from the resolved library path, and searches the expected include
120120
layout under that root.
121121
"""

cuda_pathfinder/docs/source/release/1.4.0-notes.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ Highlights
1212
----------
1313

1414
* Add CTK root canary probing for non-standard-path libraries in
15-
``load_nvidia_dynamic_lib()`` (notably ``nvvm``), including spawned child
16-
process isolation for the canary probe.
15+
``load_nvidia_dynamic_lib()`` (notably ``nvvm``), including dedicated
16+
subprocess isolation for the canary probe.
1717
(`PR #1595 <https://github.com/NVIDIA/cuda-python/pull/1595>`_)
1818

1919
* Restore backward-compatible exception behavior for

cuda_pathfinder/tests/test_ctk_root_discovery.py

Lines changed: 89 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44

5+
import os
6+
import subprocess
7+
import sys
8+
import textwrap
9+
from pathlib import Path
10+
511
import pytest
612

713
from cuda.pathfinder._dynamic_libs import load_nvidia_dynamic_lib as load_mod
@@ -24,6 +30,7 @@
2430

2531
_MODULE = "cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib"
2632
_STEPS_MODULE = "cuda.pathfinder._dynamic_libs.search_steps"
33+
_PACKAGE_ROOT = Path(__file__).resolve().parents[1]
2734

2835

2936
def _ctx(libname: str = "nvvm") -> SearchContext:
@@ -184,53 +191,121 @@ def test_try_via_ctk_root_regular_lib(tmp_path):
184191

185192

186193
def test_subprocess_probe_returns_abs_path_on_string_payload(mocker):
187-
result = mocker.Mock(stdout='"/usr/local/cuda/lib64/libcudart.so.13"\n')
188-
run_mock = mocker.patch(f"{_MODULE}.run_in_spawned_child_process", return_value=result)
194+
result = subprocess.CompletedProcess(
195+
args=[],
196+
returncode=0,
197+
stdout='"/usr/local/cuda/lib64/libcudart.so.13"\n',
198+
stderr="",
199+
)
200+
run_mock = mocker.patch(f"{_MODULE}.subprocess.run", return_value=result)
189201

190202
assert _resolve_system_loaded_abs_path_in_subprocess("cudart") == "/usr/local/cuda/lib64/libcudart.so.13"
191-
assert run_mock.call_args.kwargs.get("rethrow") is True
203+
run_mock.assert_called_once_with(
204+
[sys.executable, "-m", "cuda.pathfinder._dynamic_libs.canary_probe_subprocess", "cudart"],
205+
capture_output=True,
206+
text=True,
207+
timeout=10.0,
208+
check=False,
209+
)
192210

193211

194212
def test_subprocess_probe_returns_none_on_null_payload(mocker):
195-
result = mocker.Mock(stdout="null\n")
196-
mocker.patch(f"{_MODULE}.run_in_spawned_child_process", return_value=result)
213+
result = subprocess.CompletedProcess(args=[], returncode=0, stdout="null\n", stderr="")
214+
mocker.patch(f"{_MODULE}.subprocess.run", return_value=result)
197215

198216
assert _resolve_system_loaded_abs_path_in_subprocess("cudart") is None
199217

200218

201219
def test_subprocess_probe_raises_on_child_failure(mocker):
220+
result = subprocess.CompletedProcess(args=[], returncode=1, stdout="", stderr="child failed\n")
221+
mocker.patch(f"{_MODULE}.subprocess.run", return_value=result)
222+
223+
with pytest.raises(ChildProcessError, match="child failed"):
224+
_resolve_system_loaded_abs_path_in_subprocess("cudart")
225+
226+
227+
def test_subprocess_probe_raises_on_timeout(mocker):
202228
mocker.patch(
203-
f"{_MODULE}.run_in_spawned_child_process",
204-
side_effect=ChildProcessError("child failed"),
229+
f"{_MODULE}.subprocess.run",
230+
side_effect=subprocess.TimeoutExpired(cmd=["python"], timeout=10.0, stderr="probe hung\n"),
205231
)
206-
with pytest.raises(ChildProcessError, match="child failed"):
232+
with pytest.raises(ChildProcessError, match="timed out after 10.0 seconds"):
207233
_resolve_system_loaded_abs_path_in_subprocess("cudart")
208234

209235

210236
def test_subprocess_probe_raises_on_empty_stdout(mocker):
211-
result = mocker.Mock(stdout=" \n \n")
212-
mocker.patch(f"{_MODULE}.run_in_spawned_child_process", return_value=result)
237+
result = subprocess.CompletedProcess(args=[], returncode=0, stdout=" \n \n", stderr="")
238+
mocker.patch(f"{_MODULE}.subprocess.run", return_value=result)
213239

214240
with pytest.raises(RuntimeError, match="produced no stdout payload"):
215241
_resolve_system_loaded_abs_path_in_subprocess("cudart")
216242

217243

218244
def test_subprocess_probe_raises_on_invalid_json_payload(mocker):
219-
result = mocker.Mock(stdout="not-json\n")
220-
mocker.patch(f"{_MODULE}.run_in_spawned_child_process", return_value=result)
245+
result = subprocess.CompletedProcess(args=[], returncode=0, stdout="not-json\n", stderr="")
246+
mocker.patch(f"{_MODULE}.subprocess.run", return_value=result)
221247

222248
with pytest.raises(RuntimeError, match="invalid JSON payload"):
223249
_resolve_system_loaded_abs_path_in_subprocess("cudart")
224250

225251

226252
def test_subprocess_probe_raises_on_unexpected_json_payload(mocker):
227-
result = mocker.Mock(stdout='{"path": "/usr/local/cuda/lib64/libcudart.so.13"}\n')
228-
mocker.patch(f"{_MODULE}.run_in_spawned_child_process", return_value=result)
253+
result = subprocess.CompletedProcess(
254+
args=[],
255+
returncode=0,
256+
stdout='{"path": "/usr/local/cuda/lib64/libcudart.so.13"}\n',
257+
stderr="",
258+
)
259+
mocker.patch(f"{_MODULE}.subprocess.run", return_value=result)
229260

230261
with pytest.raises(RuntimeError, match="unexpected payload"):
231262
_resolve_system_loaded_abs_path_in_subprocess("cudart")
232263

233264

265+
def test_subprocess_probe_does_not_reenter_calling_script(tmp_path):
    """Running the probe from a script must not execute that script again.

    A throwaway script increments a counter file every time it executes and
    then calls the probe. After running the script once, the counter must
    read ``1``.
    """
    run_count_path = tmp_path / "run_count.txt"
    script_path = tmp_path / "call_probe.py"
    script_source = textwrap.dedent(
        f"""
        from pathlib import Path

        from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
            _resolve_system_loaded_abs_path_in_subprocess,
        )

        marker_path = Path({str(run_count_path)!r})
        run_count = int(marker_path.read_text()) if marker_path.exists() else 0
        marker_path.write_text(str(run_count + 1))

        try:
            _resolve_system_loaded_abs_path_in_subprocess("not_a_real_lib")
        except Exception:
            pass
        """
    )
    script_path.write_text(script_source, encoding="utf-8")

    # Put _PACKAGE_ROOT first on PYTHONPATH, keeping any inherited entries
    # after it (presumably so the child imports this checkout's package).
    child_env = os.environ.copy()
    pythonpath_entries = [str(_PACKAGE_ROOT)]
    inherited_pythonpath = child_env.get("PYTHONPATH")
    if inherited_pythonpath:
        pythonpath_entries.append(inherited_pythonpath)
    child_env["PYTHONPATH"] = os.pathsep.join(pythonpath_entries)

    completed = subprocess.run(
        [sys.executable, str(script_path)],
        capture_output=True,
        text=True,
        check=False,
        env=child_env,
    )

    assert completed.returncode == 0, completed.stderr
    assert run_count_path.read_text(encoding="utf-8") == "1"
307+
308+
234309
# ---------------------------------------------------------------------------
235310
# _try_ctk_root_canary
236311
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)