diff --git a/CLAUDE.md b/CLAUDE.md index ef8d92f..c17c1b0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -39,6 +39,19 @@ coverage report coverage html && open htmlcov/index.html ``` +### Documentation + +```bash +# Build HTML documentation +cd docs && make html + +# View documentation +open docs/build/html/index.html + +# Clean build artifacts +cd docs && make clean +``` + ### Linting and Pre-commit ```bash diff --git a/dependencies.yaml b/dependencies.yaml index 63e2900..fdda2a2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -75,6 +75,13 @@ dependencies: - output_types: [pyproject, requirements] packages: - importlib-metadata >= 4.13.0; python_version < '3.12' + docs: + common: + - output_types: [conda, requirements, pyproject] + packages: + - pydata-sphinx-theme + - sphinx + - sphinx-copybutton test_python: common: - output_types: [conda, requirements, pyproject] diff --git a/dependency-injection-refactoring.md b/dependency-injection-refactoring.md new file mode 100644 index 0000000..697642c --- /dev/null +++ b/dependency-injection-refactoring.md @@ -0,0 +1,152 @@ +# Dependency Injection Refactoring + +## Context + +The check modules (`gpu.py`, `cuda_driver.py`, `memory.py`, `nvlink.py`) +and `debug.py` previously called `pynvml`, `psutil`, and `cuda.pathfinder` +directly. This forced tests to use 50+ `mock.patch` calls with deeply +nested context managers and `MagicMock` objects to simulate hardware +configurations. A thin abstraction layer was introduced so tests can +construct plain dataclasses instead of mocking low-level library internals. 
+ +## Approach: Default Parameter Injection with Provider Dataclasses + +A single new file `rapids_cli/hardware.py` was created containing: + +- **`DeviceInfo`** dataclass -- holds per-GPU data + (index, compute capability, memory, nvlink states) +- **`GpuInfoProvider`** protocol -- read-only interface for GPU info + (`device_count`, `devices`, `cuda_driver_version`, `driver_version`) +- **`SystemInfoProvider`** protocol -- read-only interface for system info + (`total_memory_bytes`, `cuda_runtime_path`) +- **`NvmlGpuInfo`** -- real implementation backed by pynvml + (lazy-loads on first property access, caches results) +- **`DefaultSystemInfo`** -- real implementation backed by + psutil + cuda.pathfinder (lazy-loads per property) +- **`FakeGpuInfo`** / **`FakeSystemInfo`** -- test fakes + (plain dataclasses, no hardware dependency) +- **`FailingGpuInfo`** / **`FailingSystemInfo`** -- test fakes that + raise `ValueError` on access (simulates missing hardware) + +Check functions gained an optional keyword parameter with `None` default: + +```python +def gpu_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): + if gpu_info is None: # pragma: no cover + gpu_info = NvmlGpuInfo() +``` + +The orchestrator (`doctor.py`) creates a shared `NvmlGpuInfo()` instance +and passes it to all checks via `check_fn(verbose=verbose, gpu_info=gpu_info)`. +Third-party plugins safely ignore the extra keyword argument via their +own `**kwargs`. 
+ +## Files Changed + +### New file: `rapids_cli/hardware.py` + +Contains all provider abstractions: + +- `DeviceInfo` dataclass with fields: `index`, `compute_capability`, + `memory_total_bytes`, `nvlink_states` +- `GpuInfoProvider` and `SystemInfoProvider` protocols + (runtime-checkable) +- `NvmlGpuInfo` -- calls `nvmlInit()` once on first property access, + queries all device info (count, compute capability, memory, + NVLink states), and caches everything +- `DefaultSystemInfo` -- lazily loads system memory via psutil and + CUDA path via cuda.pathfinder (each cached independently) +- `FakeGpuInfo`, `FakeSystemInfo` -- `@dataclass` test fakes with + pre-set data +- `FailingGpuInfo`, `FailingSystemInfo` -- test fakes that raise + `ValueError` on any property access + +### Modified: `rapids_cli/doctor/checks/gpu.py` + +- Removed `import pynvml` +- Added `gpu_info: GpuInfoProvider | None = None` parameter and + `**kwargs` to both `gpu_check()` and `check_gpu_compute_capability()` +- Replaced direct `pynvml` calls with `gpu_info.device_count` and + iteration over `gpu_info.devices` + +### Modified: `rapids_cli/doctor/checks/cuda_driver.py` + +- Removed `import pynvml` +- Added `gpu_info` parameter and `**kwargs` to `cuda_check()` +- Replaced nested try/except with `gpu_info.cuda_driver_version` + +### Modified: `rapids_cli/doctor/checks/memory.py` + +- Removed `import pynvml` and `import psutil` +- Added `system_info` parameter to `get_system_memory()` +- Added `gpu_info` parameter to `get_gpu_memory()` +- Added both `gpu_info` and `system_info` parameters to + `check_memory_to_gpu_ratio()` +- `get_system_memory()` reads `system_info.total_memory_bytes` +- `get_gpu_memory()` sums `dev.memory_total_bytes` from + `gpu_info.devices` +- `check_memory_to_gpu_ratio()` passes injected providers down + to helpers + +### Modified: `rapids_cli/doctor/checks/nvlink.py` + +- Removed `import pynvml` +- Added `gpu_info` parameter and `**kwargs` to `check_nvlink_status()` +- 
Iterates `dev.nvlink_states` instead of calling + `nvmlDeviceGetNvLinkState` +- **Side-fix**: the original code always passed `0` instead of + `nvlink_id` to `nvmlDeviceGetNvLinkState`; the refactored + `NvmlGpuInfo` queries each link by its actual index + +### Modified: `rapids_cli/debug/debug.py` + +- Removed `import pynvml` and `import cuda.pathfinder` +- Added `gpu_info` parameter to `gather_cuda_version()` +- Added `gpu_info` and `system_info` parameters to `run_debug()` +- Replaced direct pynvml/cuda.pathfinder calls with provider + property accesses + +### Modified: `rapids_cli/doctor/doctor.py` + +- Imports `NvmlGpuInfo` from `rapids_cli.hardware` +- Creates a shared `NvmlGpuInfo()` instance before the check loop +- Passes it via `check_fn(verbose=verbose, gpu_info=gpu_info)` + +### Rewritten tests + +`test_gpu.py`, `test_cuda.py`, `test_memory.py`, `test_nvlink.py`, +`test_debug.py`: + +- Replaced all `patch("pynvml.*")` / `patch("psutil.*")` / + `patch("cuda.pathfinder.*")` with `FakeGpuInfo` / `FakeSystemInfo` / + `FailingGpuInfo` construction +- Tests for `debug.py` still use patches for non-hardware concerns + (subprocess, pathlib, gather_tools) + +### New file: `rapids_cli/tests/test_hardware.py` + +- Unit tests for `NvmlGpuInfo` + (init failure, loads once, device data, NVLink states, no NVLink) +- Unit tests for `DefaultSystemInfo` + (total memory, CUDA runtime path, caching) +- Tests for `FakeGpuInfo` / `FakeSystemInfo` + (defaults, custom values, protocol satisfaction) +- Tests for `FailingGpuInfo` / `FailingSystemInfo` + (all properties raise) + +## Impact + +| Metric | Before | After | +| --------------------------------------------- | ------- | --------------------------------- | +| Hardware library patches in check/debug tests | ~51 | 0 (moved to test_hardware.py) | +| import pynvml in check/debug modules | 5 files | 1 file (hardware.py) | +| MagicMock objects for hardware | ~11 | 0 | +| pynvml.nvmlInit() calls in production | 7 | 1 (in 
NvmlGpuInfo._ensure_loaded) | +| Total tests | 53 | 72 (+19 hardware tests) | +| Coverage | 95%+ | 97.72% | + +## Verification + +1. `pytest` -- all 72 tests pass +2. `pytest --cov-fail-under=95` -- coverage at 97.72%, above threshold +3. `pre-commit run --all-files` -- all checks pass diff --git a/docs/source/api/checks.rst b/docs/source/api/checks.rst new file mode 100644 index 0000000..363e26f --- /dev/null +++ b/docs/source/api/checks.rst @@ -0,0 +1,42 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Health Checks +============= + +Built-in health check modules registered via the ``rapids_doctor_check`` +entry point group in ``pyproject.toml``. + +All check functions follow the contract described in :doc:`../plugin_development`. + +GPU Checks +---------- + +.. automodule:: rapids_cli.doctor.checks.gpu + :members: + :undoc-members: + :show-inheritance: + +CUDA Driver Checks +------------------ + +.. automodule:: rapids_cli.doctor.checks.cuda_driver + :members: + :undoc-members: + :show-inheritance: + +Memory Checks +------------- + +.. automodule:: rapids_cli.doctor.checks.memory + :members: + :undoc-members: + :show-inheritance: + +NVLink Checks +------------- + +.. automodule:: rapids_cli.doctor.checks.nvlink + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/cli.rst b/docs/source/api/cli.rst new file mode 100644 index 0000000..1580d51 --- /dev/null +++ b/docs/source/api/cli.rst @@ -0,0 +1,16 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +CLI Module +========== + +The ``rapids_cli.cli`` module defines the main CLI entry point and subcommands +using `rich-click `_. + +The CLI is registered as a console script called ``rapids`` via the +``[project.scripts]`` entry in ``pyproject.toml``. + +.. 
automodule:: rapids_cli.cli + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/debug.rst b/docs/source/api/debug.rst new file mode 100644 index 0000000..9c50567 --- /dev/null +++ b/docs/source/api/debug.rst @@ -0,0 +1,29 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Debug Module +============ + +The ``rapids_cli.debug.debug`` module gathers system and environment information +for troubleshooting RAPIDS installations. + +:func:`run_debug` is the main entry point. It collects: + +- Platform and OS details (from ``platform`` and ``/etc/os-release``) +- NVIDIA driver and CUDA versions (via ``pynvml``) +- CUDA runtime path (via ``cuda-pathfinder``) +- System CUDA toolkit locations (globbing ``/usr/local/cuda*``) +- Python version and hash info +- All installed package versions +- pip freeze and conda list output +- Tool versions: pip, conda, uv, pixi, g++, cmake, nvcc + +Output is either a Rich-formatted console table or JSON (``--json``). + +API +--- + +.. automodule:: rapids_cli.debug.debug + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/doctor.rst b/docs/source/api/doctor.rst new file mode 100644 index 0000000..f4bd73d --- /dev/null +++ b/docs/source/api/doctor.rst @@ -0,0 +1,38 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Doctor Module +============= + +The ``rapids_cli.doctor.doctor`` module orchestrates health check discovery +and execution. + +Checks are discovered via Python entry points in the ``rapids_doctor_check`` +group. Each check function is called with ``verbose`` as a keyword argument. +Results are collected into :class:`CheckResult` objects that track pass/fail +status, return values, errors, and warnings. + +Check Execution Flow +-------------------- + +1. 
**Discovery**: Scan ``rapids_doctor_check`` entry points and load check + functions. ``ImportError`` and ``AttributeError`` during loading are + silently suppressed via ``contextlib.suppress``. + +2. **Filtering**: If filter arguments are provided, only checks whose + ``ep.value`` contains a filter substring are kept. + +3. **Execution**: Each check runs inside ``warnings.catch_warnings(record=True)`` + so warnings are captured. Exceptions are caught and stored rather than + propagated. + +4. **Reporting**: Warnings are printed, verbose output is shown for passing + checks, and failed checks are listed with their error messages. + +API +--- + +.. automodule:: rapids_cli.doctor.doctor + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py index 3b59adc..b4df851 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,44 +1,81 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # Configuration file for the Sphinx documentation builder. 
# # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html -# -- Project information ----------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information - - +import datetime import os import sys -sys.path.insert(0, os.path.abspath("../rapids_cli")) +sys.path.insert(0, os.path.abspath("../../")) +# -- Project information ----------------------------------------------------- project = "RAPIDS CLI" -copyright = "2024, NVIDIA RAPIDS" -author = "NVIDIA RAPIDS" -release = "2024" +html_title = "RAPIDS CLI" +copyright = f"{datetime.date.today().year}, NVIDIA" +author = "NVIDIA" # -- General configuration --------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = [] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx_copybutton", +] templates_path = ["_templates"] exclude_patterns = [] +copybutton_prompt_text = r">>> |\.\.\. 
|\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_is_regexp = True + +# Napoleon settings for Google-style docstrings +napoleon_google_docstring = True +napoleon_numpy_docstring = False + +# Autodoc settings +autodoc_default_options = { + "members": True, + "member-order": "bysource", + "undoc-members": True, +} + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), +} # -- Options for HTML output ------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = "alabaster" +html_theme = "pydata_sphinx_theme" + +html_theme_options = { + "header_links_before_dropdown": 7, + "icon_links": [], + "logo": { + "link": "https://docs.rapids.ai/", + }, + "github_url": "https://github.com/rapidsai/rapids-cli", + "show_toc_level": 1, + "navbar_align": "right", +} + +html_sidebars = { + "**": ["sidebar-nav-bs", "sidebar-ethical-ads"], +} + html_static_path = ["_static"] -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.viewcode", - "sphinx.ext.napoleon", # For Google and NumPy style docstrings -] + +def setup(app): + app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") + app.add_css_file( + "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.9.0/css/all.min.css" + ) + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) diff --git a/docs/source/cuda_driver.rst b/docs/source/cuda_driver.rst deleted file mode 100644 index a4d69d6..0000000 --- a/docs/source/cuda_driver.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. _cuda_driver: - -CUDA Driver Checks -================== - -This module provides functions to check the availability of CUDA, retrieve CUDA version, -and verify compatibility between the CUDA toolkit and NVIDIA drivers. - -Functions ---------- -.. 
autofunction:: rapids_cli.doctor.checks.cuda_driver.cuda_check - - Checks if CUDA is available on the system by initializing the NVML and retrieving the CUDA driver version. - - :return: True if CUDA is available, False otherwise. - -.. autofunction:: rapids_cli.doctor.checks.cuda_driver.get_cuda_version - - Retrieves the version of the installed CUDA toolkit. - - :return: A string representing the CUDA version or None if CUDA is not found. - -.. autofunction:: rapids_cli.doctor.checks.cuda_driver.get_driver_version - - Retrieves the installed NVIDIA driver version. - - :return: A string representing the NVIDIA driver version or None if the driver is not found. - -.. autofunction:: rapids_cli.doctor.checks.cuda_driver.check_driver_compatibility - - Checks the compatibility between the installed CUDA version and the NVIDIA driver version. - - This function prints whether the installed versions are compatible with RAPIDS. - - :return: None diff --git a/docs/source/doctor.rst b/docs/source/doctor.rst deleted file mode 100644 index 33be8d8..0000000 --- a/docs/source/doctor.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. _doctor: - -Doctor -========= - -Overview of Doctor. - -.. automodule:: doctor - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -.. automodule:: doctor.checks - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/index.rst b/docs/source/index.rst index 94df6bc..474bf39 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,26 +1,51 @@ -.. RAPIDS CLI documentation master file, created by - sphinx-quickstart on Fri Oct 25 10:50:48 2024. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. 
SPDX-License-Identifier: Apache-2.0 -RAPIDS CLI documentation +RAPIDS CLI Documentation ======================== -Add your content using ``reStructuredText`` syntax. See the -`reStructuredText `_ -documentation for details. +The RAPIDS CLI is a command-line tool for performing common RAPIDS operations, +primarily focused on health checks (``rapids doctor``) and debugging (``rapids debug``). +It uses a plugin system that allows RAPIDS libraries to register their own health checks +via Python entry points. +Quick Start +----------- + +.. code-block:: bash + + pip install rapids-cli + + # Run health checks + rapids doctor + + # Gather system info for debugging + rapids debug --json .. toctree:: :maxdepth: 2 - :caption: Contents: + :caption: User Guide + + user_guide + troubleshooting - doctor - cuda_driver +.. toctree:: + :maxdepth: 2 + :caption: Developer Guide + + plugin_development + +.. toctree:: + :maxdepth: 2 + :caption: API Reference + api/cli + api/doctor + api/debug + api/checks Indices and tables -=================== +================== * :ref:`genindex` * :ref:`modindex` diff --git a/docs/source/plugin_development.rst b/docs/source/plugin_development.rst new file mode 100644 index 0000000..f93b069 --- /dev/null +++ b/docs/source/plugin_development.rst @@ -0,0 +1,176 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Plugin Development +================== + +Any package can add checks to ``rapids doctor`` by exposing a function via a +Python entry point in the ``rapids_doctor_check`` group. + +Quick Start +----------- + +1. Create a check function: + + .. code-block:: python + + # my_package/health_checks.py + + + def my_check(verbose=False, **kwargs): + """Check that my_package is working correctly.""" + try: + import my_package + except ImportError as e: + raise ImportError( + "my_package not found. 
Install with: pip install my_package" + ) from e + + if verbose: + return f"my_package {my_package.__version__} is available" + +2. Register it in ``pyproject.toml``: + + .. code-block:: toml + + [project.entry-points.rapids_doctor_check] + my_check = "my_package.health_checks:my_check" + +3. Install and verify: + + .. code-block:: bash + + pip install -e . + rapids doctor --verbose --dry-run + +Check Function Contract +----------------------- + +Signature +^^^^^^^^^ + +.. code-block:: python + + def my_check(verbose=False, **kwargs): + """First line of docstring is shown in output.""" + ... + +- Accept ``verbose`` (bool) and ``**kwargs`` for forward compatibility. +- The first line of the docstring is used as the check description in output. +- New keyword arguments may be added in the future but will never be removed, + so ``**kwargs`` ensures your check won't break. + +Return Values +^^^^^^^^^^^^^ + +- **Pass**: Return any value. Returning a string provides extra info shown in + ``--verbose`` mode. +- **Fail**: Raise an exception. The message should tell the user how to fix it. +- **Warn**: Call ``warnings.warn("message", stacklevel=2)`` for non-fatal issues. + Warnings are captured and displayed but do not cause the check to fail. + +Examples +-------- + +GPU memory requirement check: + +.. code-block:: python + + import pynvml + + + def gpu_memory_check(verbose=False, **kwargs): + """Check that GPU has at least 8GB memory.""" + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + mem = pynvml.nvmlDeviceGetMemoryInfo(handle) + available_gb = mem.total / (1024**3) + + if available_gb < 8: + raise ValueError( + f"Insufficient GPU memory: {available_gb:.1f}GB available, 8GB required" + ) + + if verbose: + return f"GPU memory: {available_gb:.1f}GB" + +Non-fatal warning: + +.. 
code-block:: python + + import warnings + + + def config_check(verbose=False, **kwargs): + """Check optional configuration.""" + if not optimal_condition(): + warnings.warn( + "Suboptimal configuration detected. Performance may be degraded.", + stacklevel=2, + ) + +Multiple checks from one package: + +.. code-block:: toml + + [project.entry-points.rapids_doctor_check] + my_pkg_import = "my_package.checks:import_check" + my_pkg_gpu = "my_package.checks:gpu_check" + my_pkg_functional = "my_package.checks:functional_check" + +Testing Your Plugin +------------------- + +Verify discovery: + +.. code-block:: bash + + rapids doctor --verbose --dry-run | grep my_check + +Run only your checks: + +.. code-block:: bash + + rapids doctor --verbose my_package + +Unit test with mocks (following the pattern in ``rapids_cli/tests/``): + +.. code-block:: python + + from unittest.mock import patch + + import pytest + + from my_package.health_checks import my_check + + + def test_my_check_success(): + result = my_check(verbose=True) + assert result is not None + + + def test_my_check_failure(): + with pytest.raises(ValueError, match="expected error"): + my_check(verbose=False) + +Troubleshooting +--------------- + +**Check not discovered**: Verify the entry point name is in the output of: + +.. code-block:: bash + + python -c "from importlib.metadata import entry_points; \ + print([ep.name for ep in entry_points(group='rapids_doctor_check')])" + +If missing, reinstall with ``pip install -e . --force-reinstall --no-deps``. + +**Import errors are silent**: The doctor module uses ``contextlib.suppress`` +to skip checks that fail to import. Test your import directly: + +.. code-block:: bash + + python -c "from my_package.health_checks import my_check" + +See the built-in checks in ``rapids_cli/doctor/checks/`` for reference +implementations. 
diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst new file mode 100644 index 0000000..5da7f2c --- /dev/null +++ b/docs/source/troubleshooting.rst @@ -0,0 +1,112 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Troubleshooting +=============== + +No GPUs Detected +---------------- + +``rapids doctor`` reports "No available GPUs detected". + +1. Verify NVIDIA drivers are installed: + + .. code-block:: bash + + nvidia-smi + +2. Check that GPU is visible from Python: + + .. code-block:: bash + + python -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())" + +3. If running in a container, ensure GPU passthrough is enabled: + + .. code-block:: bash + + docker run --gpus all ... + +Insufficient Compute Capability +-------------------------------- + +"GPU requires compute capability 7 or higher". + +RAPIDS requires Volta-generation GPUs or newer (compute capability 7.0+). +Supported GPUs include V100, A100, H100, and RTX 20xx/30xx/40xx series. +See https://developer.nvidia.com/cuda-gpus for a full list. + +CUDA Version Issues +------------------- + +"Unable to look up CUDA version". + +1. Check your CUDA driver version: + + .. code-block:: bash + + nvidia-smi | grep "CUDA Version" + +2. Ensure RAPIDS packages match your CUDA version: + + .. code-block:: bash + + # For CUDA 12.x + pip install cudf-cu12 + + # For CUDA 11.x + pip install cudf-cu11 + +Low Memory Warning +------------------ + +"System Memory to total GPU Memory ratio not at least 2:1 ratio." + +This is a warning, not a failure. RAPIDS recommends system RAM be at least +twice total GPU memory for optimal performance, particularly with Dask. +RAPIDS will still function with a lower ratio. + +Custom Checks Not Discovered +----------------------------- + +If ``rapids doctor --verbose`` doesn't show your custom check: + +1. 
Verify the entry point is registered: + + .. code-block:: bash + + python -c "from importlib.metadata import entry_points; \ + print([ep.name for ep in entry_points(group='rapids_doctor_check')])" + +2. Reinstall the package that provides the check: + + .. code-block:: bash + + pip install -e . --force-reinstall --no-deps + +3. Check for import errors by importing the check function directly: + + .. code-block:: bash + + python -c "from my_package.checks import my_check" + + Import errors during discovery are silently suppressed + (see ``contextlib.suppress`` in ``doctor.py``). + +General Debugging Steps +----------------------- + +1. Run with verbose output: + + .. code-block:: bash + + rapids doctor --verbose + +2. Gather full environment information: + + .. code-block:: bash + + rapids debug --json > debug_info.json + +3. Report issues at https://github.com/rapidsai/rapids-cli/issues with the + debug output attached. diff --git a/docs/source/user_guide.rst b/docs/source/user_guide.rst new file mode 100644 index 0000000..8656999 --- /dev/null +++ b/docs/source/user_guide.rst @@ -0,0 +1,121 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +User Guide +========== + +The RAPIDS CLI provides two commands: ``rapids doctor`` for health checks and +``rapids debug`` for gathering system information. + +rapids doctor +------------- + +The ``doctor`` command performs health checks to ensure your RAPIDS environment +is properly configured. + +.. code-block:: bash + + rapids doctor + +Built-in checks verify: + +- GPU availability and compute capability (7.0+) +- CUDA driver version +- System memory to GPU memory ratio (recommends 2:1 for Dask) +- NVLink status (multi-GPU systems) + +Any installed RAPIDS library can register additional checks via the plugin system +(see :doc:`plugin_development`). 
+ +Verbose Output +^^^^^^^^^^^^^^ + +The ``--verbose`` flag shows check discovery details and per-check output: + +.. code-block:: bash + + $ rapids doctor --verbose + Discovering checks + Found check 'gpu' provided by 'rapids_cli.doctor.checks.gpu:gpu_check' + ... + Discovered 5 checks + Running checks + gpu_check: GPU(s) detected: 2 + All checks passed! + +Dry Run +^^^^^^^ + +The ``--dry-run`` flag discovers checks without executing them, useful for +verifying plugin registration: + +.. code-block:: bash + + rapids doctor --dry-run + +Filtering +^^^^^^^^^ + +Pass filter arguments to run only matching checks. Filters match against +the check's module path: + +.. code-block:: bash + + # Run only cuDF-related checks + rapids doctor cudf + + # Run checks from multiple packages + rapids doctor cudf cuml + +Exit Codes +^^^^^^^^^^ + +- ``0``: All checks passed +- ``1``: One or more checks failed + +This makes ``rapids doctor`` suitable for scripting: + +.. code-block:: bash + + rapids doctor || exit 1 + +rapids debug +------------ + +The ``debug`` command gathers comprehensive system information for troubleshooting. + +.. code-block:: bash + + rapids debug + +Output includes: platform, NVIDIA driver version, CUDA version, CUDA runtime +path, system CTK locations, Python version, all installed package versions, +pip/conda package lists, available tools (pip, conda, uv, pixi, g++, cmake, +nvcc), and OS information. + +JSON Output +^^^^^^^^^^^ + +The ``--json`` flag produces machine-readable output: + +.. code-block:: bash + + rapids debug --json > debug_info.json + +This is useful for attaching to bug reports or comparing environments. + +CI/CD Integration +----------------- + +Example GitHub Actions usage: + +.. 
code-block:: yaml + + - name: Verify RAPIDS Environment + run: | + pip install rapids-cli + rapids doctor --verbose || exit 1 + + - name: Save Debug Info on Failure + if: failure() + run: rapids debug --json > debug.json diff --git a/rapids_cli/debug/debug.py b/rapids_cli/debug/debug.py index fca4d1d..b4afde5 100644 --- a/rapids_cli/debug/debug.py +++ b/rapids_cli/debug/debug.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 """This module contains the debug subcommand for the Rapids CLI.""" +from __future__ import annotations + import json import platform import subprocess @@ -9,22 +11,29 @@ from datetime import datetime from importlib.metadata import distributions, version from pathlib import Path +from typing import TYPE_CHECKING -import cuda.pathfinder -import pynvml from rich.console import Console from rich.table import Table +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider + console = Console() -def gather_cuda_version(): +def gather_cuda_version(*, gpu_info: GpuInfoProvider | None = None): """Return CUDA driver version as a string, similar to nvidia-smi output.""" - version = pynvml.nvmlSystemGetCudaDriverVersion() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + + ver = gpu_info.cuda_driver_version # pynvml returns an int like 12040 for 12.4, so format as string - major = version // 1000 - minor = (version % 1000) // 10 - patch = version % 10 + major = ver // 1000 + minor = (ver % 1000) // 10 + patch = ver % 10 if patch == 0: return f"{major}.{minor}" else: @@ -67,18 +76,31 @@ def gather_tools(): } -def run_debug(output_format="console"): +def run_debug( + output_format="console", + *, + gpu_info: GpuInfoProvider | None = None, + system_info: SystemInfoProvider | None = None, +): """Run debug.""" - pynvml.nvmlInit() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + if 
system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + debug_info = { "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "platform": platform.platform(), "nvidia_smi_output": gather_command_output( ["nvidia-smi"], "Nvidia-smi not installed" ), - "driver_version": pynvml.nvmlSystemGetDriverVersion(), - "cuda_version": gather_cuda_version(), - "cuda_runtime_path": cuda.pathfinder.find_nvidia_header_directory("cudart"), + "driver_version": gpu_info.driver_version, + "cuda_version": gather_cuda_version(gpu_info=gpu_info), + "cuda_runtime_path": system_info.cuda_runtime_path, "system_ctk": sorted( [str(p) for p in Path("/usr/local").glob("cuda*") if p.is_dir()] ), diff --git a/rapids_cli/doctor/checks/cuda_driver.py b/rapids_cli/doctor/checks/cuda_driver.py index 252dd47..6275c1a 100644 --- a/rapids_cli/doctor/checks/cuda_driver.py +++ b/rapids_cli/doctor/checks/cuda_driver.py @@ -2,17 +2,22 @@ # SPDX-License-Identifier: Apache-2.0 """Check for CUDA and driver compatibility.""" -import pynvml +from __future__ import annotations +from typing import TYPE_CHECKING -def cuda_check(verbose=False): +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider + + +def cuda_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Check CUDA availability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - try: - cuda_version = pynvml.nvmlSystemGetCudaDriverVersion() - return cuda_version - except pynvml.NVMLError as e: - raise ValueError("Unable to look up CUDA version") from e - except pynvml.NVMLError as e: + return gpu_info.cuda_driver_version + except ValueError as e: raise ValueError("Unable to look up CUDA version") from e diff --git a/rapids_cli/doctor/checks/gpu.py b/rapids_cli/doctor/checks/gpu.py index 77e6ca6..d8e1a45 100644 --- a/rapids_cli/doctor/checks/gpu.py 
+++ b/rapids_cli/doctor/checks/gpu.py @@ -2,38 +2,52 @@ # SPDX-License-Identifier: Apache-2.0 """GPU checks for the doctor command.""" -import pynvml +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider REQUIRED_COMPUTE_CAPABILITY = 7 -def gpu_check(verbose=False): +def gpu_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Check GPU availability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - num_gpus = pynvml.nvmlDeviceGetCount() - except pynvml.NVMLError as e: + num_gpus = gpu_info.device_count + except ValueError as e: raise ValueError("No available GPUs detected") from e assert num_gpus > 0, "No GPUs detected" return f"GPU(s) detected: {num_gpus}" -def check_gpu_compute_capability(verbose): +def check_gpu_compute_capability( + verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs +): """Check the system for GPU Compute Capability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + devices = gpu_info.devices + except ValueError as e: raise ValueError("No GPU - cannot determine GPU Compute Capability") from e - for i in range(pynvml.nvmlDeviceGetCount()): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) - if major >= REQUIRED_COMPUTE_CAPABILITY: + for dev in devices: + if dev.compute_capability[0] >= REQUIRED_COMPUTE_CAPABILITY: continue else: raise ValueError( - f"GPU {i} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} " - f"or higher but only has {major}.{minor}." + f"GPU {dev.index} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} " + f"or higher but only has {dev.compute_capability[0]}.{dev.compute_capability[1]}." 
"See https://developer.nvidia.com/cuda-gpus for more information." ) return True diff --git a/rapids_cli/doctor/checks/memory.py b/rapids_cli/doctor/checks/memory.py index cb1fcb5..f1d8231 100644 --- a/rapids_cli/doctor/checks/memory.py +++ b/rapids_cli/doctor/checks/memory.py @@ -2,46 +2,71 @@ # SPDX-License-Identifier: Apache-2.0 """Memory checks.""" +from __future__ import annotations + import warnings +from typing import TYPE_CHECKING -import psutil -import pynvml +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider -def get_system_memory(verbose=False): +def get_system_memory( + verbose=False, *, system_info: SystemInfoProvider | None = None, **kwargs +): """Get the total system memory.""" - virtual_memory = psutil.virtual_memory() - total_memory = virtual_memory.total / (1024**3) # converts bytes to gigabytes + if system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + + total_memory = system_info.total_memory_bytes / ( + 1024**3 + ) # converts bytes to gigabytes return total_memory -def get_gpu_memory(verbose=False): +def get_gpu_memory(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Get the total GPU memory.""" - pynvml.nvmlInit() - gpus = pynvml.nvmlDeviceGetCount() - gpu_memory_total = 0 - for i in range(gpus): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - gpu_memory_total += memory_info.total / (1024**3) # converts to gigabytes - - pynvml.nvmlShutdown() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + + gpu_memory_total = sum(dev.memory_total_bytes for dev in gpu_info.devices) / ( + 1024**3 + ) # converts to gigabytes return gpu_memory_total -def check_memory_to_gpu_ratio(verbose=True): +def check_memory_to_gpu_ratio( + verbose=True, + *, + gpu_info: GpuInfoProvider | None = None, + 
system_info: SystemInfoProvider | None = None, + **kwargs, +): """Check the system for a 2:1 ratio of system Memory to total GPU Memory. This is especially useful for Dask. """ + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + if system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + _ = gpu_info.device_count + except ValueError as e: raise ValueError("GPU not found. Please ensure GPUs are installed.") from e - system_memory = get_system_memory(verbose) - gpu_memory = get_gpu_memory(verbose) + system_memory = get_system_memory(verbose, system_info=system_info) + gpu_memory = get_gpu_memory(verbose, gpu_info=gpu_info) ratio = system_memory / gpu_memory if ratio < 1.8: warnings.warn( diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py index 22bbdd1..715a8fd 100644 --- a/rapids_cli/doctor/checks/nvlink.py +++ b/rapids_cli/doctor/checks/nvlink.py @@ -2,25 +2,33 @@ # SPDX-License-Identifier: Apache-2.0 """Check for NVLink status.""" -import pynvml +from __future__ import annotations +from typing import TYPE_CHECKING -def check_nvlink_status(verbose=True): +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider + + +def check_nvlink_status( + verbose=True, *, gpu_info: GpuInfoProvider | None = None, **kwargs +): """Check the system for NVLink with 2 or more GPUs.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + device_count = gpu_info.device_count + except ValueError as e: raise ValueError("GPU not found. 
Please ensure GPUs are installed.") from e - device_count = pynvml.nvmlDeviceGetCount() if device_count < 2: return False - for i in range(device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - for nvlink_id in range(pynvml.NVML_NVLINK_MAX_LINKS): - try: - pynvml.nvmlDeviceGetNvLinkState(handle, 0) - return True - except pynvml.NVMLError as e: - raise ValueError(f"NVLink {nvlink_id} Status Check Failed") from e + for dev in gpu_info.devices: + if any(dev.nvlink_states): + return True + + return False diff --git a/rapids_cli/doctor/doctor.py b/rapids_cli/doctor/doctor.py index 0fdff86..c497300 100644 --- a/rapids_cli/doctor/doctor.py +++ b/rapids_cli/doctor/doctor.py @@ -10,6 +10,7 @@ from rapids_cli._compatibility import entry_points from rapids_cli.constants import DOCTOR_SYMBOL +from rapids_cli.hardware import NvmlGpuInfo console = Console() @@ -34,26 +35,22 @@ def doctor_check( If specific subcommands are given, it validates them against valid subcommands and executes corresponding checks. - Parameters: - ---------- - filters : list (optional) - A list of filters to run specific checks. + Args: + verbose: Whether to print verbose output. + dry_run: Whether to skip running checks. + filters: A list of filters to run specific checks. Raises: - ------- - ValueError: - If an invalid subcommand is provided. + ValueError: If an invalid subcommand is provided. - Notes: - ----- - The function discovers and loads check functions defined in entry points - under the 'rapids_doctor_check' group. It also checks specific - configurations related to a corresponding subcommand if given. + Note: + The function discovers and loads check functions defined in entry points + under the ``rapids_doctor_check`` group. It also checks specific + configurations related to a corresponding subcommand if given. 
Example: - -------- - > doctor_check([]) # Run all health checks - > doctor_check(['cudf']) # Run 'cudf' specific checks + >>> doctor_check(verbose=False, dry_run=False) + >>> doctor_check(verbose=False, dry_run=False, filters=['cudf']) """ filters = [] if not filters else filters console.print( @@ -78,6 +75,8 @@ def doctor_check( console.print("Dry run, skipping checks") return True + gpu_info = NvmlGpuInfo() + results: list[CheckResult] = [] with console.status("[bold green]Running checks...") as ui_status: for i, check_fn in enumerate(checks): @@ -89,7 +88,7 @@ def doctor_check( with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") status = True - value = check_fn(verbose=verbose) + value = check_fn(verbose=verbose, gpu_info=gpu_info) caught_warnings = w except Exception as e: diff --git a/rapids_cli/hardware.py b/rapids_cli/hardware.py new file mode 100644 index 0000000..94aab52 --- /dev/null +++ b/rapids_cli/hardware.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Hardware abstraction layer for GPU and system information.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Protocol, runtime_checkable + + +@dataclass +class DeviceInfo: + """Per-GPU device information.""" + + index: int + compute_capability: tuple[int, int] + memory_total_bytes: int + nvlink_states: list[bool] = field(default_factory=list) + + +@runtime_checkable +class GpuInfoProvider(Protocol): + """Read-only interface for GPU information.""" + + @property + def device_count(self) -> int: + """Return number of GPU devices.""" + ... + + @property + def devices(self) -> list[DeviceInfo]: + """Return list of device information.""" + ... + + @property + def cuda_driver_version(self) -> int: + """Return CUDA driver version as integer.""" + ... 
+ + @property + def driver_version(self) -> str: + """Return driver version string.""" + ... + + +@runtime_checkable +class SystemInfoProvider(Protocol): + """Read-only interface for system information.""" + + @property + def total_memory_bytes(self) -> int: + """Return total system memory in bytes.""" + ... + + @property + def cuda_runtime_path(self) -> str | None: + """Return path to CUDA runtime headers.""" + ... + + +class NvmlGpuInfo: + """Real GPU info provider backed by pynvml. + + Lazily loads all device information on first property access and caches results. + """ + + def __init__(self) -> None: + """Initialize with empty cached state.""" + self._loaded = False + self._device_count = 0 + self._devices: list[DeviceInfo] = [] + self._cuda_driver_version = 0 + self._driver_version = "" + + def _ensure_loaded(self) -> None: + if self._loaded: + return + + import pynvml + + try: + pynvml.nvmlInit() + except pynvml.NVMLError as e: + raise ValueError("Unable to initialize GPU driver (NVML)") from e + + self._device_count = pynvml.nvmlDeviceGetCount() + self._cuda_driver_version = pynvml.nvmlSystemGetCudaDriverVersion() + self._driver_version = pynvml.nvmlSystemGetDriverVersion() + + self._devices = [] + for i in range(self._device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) + memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + + nvlink_states: list[bool] = [] + for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS): + try: + state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id) + nvlink_states.append(bool(state)) + except pynvml.NVMLError: + break + + self._devices.append( + DeviceInfo( + index=i, + compute_capability=(major, minor), + memory_total_bytes=memory_info.total, + nvlink_states=nvlink_states, + ) + ) + + self._loaded = True + + @property + def device_count(self) -> int: + """Return number of GPU devices.""" + self._ensure_loaded() + return self._device_count + + @property 
+ def devices(self) -> list[DeviceInfo]: + """Return list of device information.""" + self._ensure_loaded() + return self._devices + + @property + def cuda_driver_version(self) -> int: + """Return CUDA driver version as integer (e.g. 12040).""" + self._ensure_loaded() + return self._cuda_driver_version + + @property + def driver_version(self) -> str: + """Return driver version string.""" + self._ensure_loaded() + return self._driver_version + + +class DefaultSystemInfo: + """Real system info provider backed by psutil and cuda.pathfinder. + + Lazily loads each piece of information on first access. + """ + + def __init__(self) -> None: + """Initialize with empty cached state.""" + self._memory_loaded = False + self._total_memory_bytes = 0 + self._cuda_path_loaded = False + self._cuda_runtime_path: str | None = None + + @property + def total_memory_bytes(self) -> int: + """Return total system memory in bytes.""" + if not self._memory_loaded: + import psutil + + self._total_memory_bytes = psutil.virtual_memory().total + self._memory_loaded = True + return self._total_memory_bytes + + @property + def cuda_runtime_path(self) -> str | None: + """Return path to CUDA runtime headers.""" + if not self._cuda_path_loaded: + import cuda.pathfinder + + self._cuda_runtime_path = cuda.pathfinder.find_nvidia_header_directory( + "cudart" + ) + self._cuda_path_loaded = True + return self._cuda_runtime_path + + +@dataclass +class FakeGpuInfo: + """Test fake for GPU information with pre-set data.""" + + device_count: int = 0 + devices: list[DeviceInfo] = field(default_factory=list) + cuda_driver_version: int = 0 + driver_version: str = "" + + +@dataclass +class FakeSystemInfo: + """Test fake for system information with pre-set data.""" + + total_memory_bytes: int = 0 + cuda_runtime_path: str | None = None + + +class FailingGpuInfo: + """Test fake that raises ValueError on any property access.""" + + @property + def device_count(self) -> int: + """Raise ValueError.""" + raise 
ValueError("No GPU available") + + @property + def devices(self) -> list[DeviceInfo]: + """Raise ValueError.""" + raise ValueError("No GPU available") + + @property + def cuda_driver_version(self) -> int: + """Raise ValueError.""" + raise ValueError("No GPU available") + + @property + def driver_version(self) -> str: + """Raise ValueError.""" + raise ValueError("No GPU available") + + +class FailingSystemInfo: + """Test fake that raises ValueError on any property access.""" + + @property + def total_memory_bytes(self) -> int: + """Raise ValueError.""" + raise ValueError("System info unavailable") + + @property + def cuda_runtime_path(self) -> str | None: + """Raise ValueError.""" + raise ValueError("System info unavailable") diff --git a/rapids_cli/tests/test_cuda.py b/rapids_cli/tests/test_cuda.py index 70097b2..de4fd99 100644 --- a/rapids_cli/tests/test_cuda.py +++ b/rapids_cli/tests/test_cuda.py @@ -1,26 +1,17 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch +import pytest from rapids_cli.doctor.checks.cuda_driver import cuda_check +from rapids_cli.hardware import FailingGpuInfo, FakeGpuInfo -def mock_cuda_version(): - return 12050 +def test_cuda_check_success(): + gpu_info = FakeGpuInfo(cuda_driver_version=12050) + assert cuda_check(verbose=True, gpu_info=gpu_info) == 12050 -def test_get_cuda_version_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), - ): - version = mock_cuda_version() - assert version - - -def test_cuda_check_success(capfd): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), - ): - assert cuda_check(verbose=True) +def test_cuda_check_no_gpu(): + gpu_info = FailingGpuInfo() + with pytest.raises(ValueError, match="Unable to look up CUDA version"): + cuda_check(verbose=False, gpu_info=gpu_info) diff --git a/rapids_cli/tests/test_debug.py b/rapids_cli/tests/test_debug.py index 91c330c..79b9db5 100644 --- a/rapids_cli/tests/test_debug.py +++ b/rapids_cli/tests/test_debug.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import json -from unittest.mock import MagicMock, patch +from unittest.mock import patch from rapids_cli.debug.debug import ( gather_command_output, @@ -10,24 +10,22 @@ gather_tools, run_debug, ) +from rapids_cli.hardware import FakeGpuInfo, FakeSystemInfo def test_gather_cuda_version(): - """Test CUDA version gathering.""" - with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040): - result = gather_cuda_version() - assert result == "12.4" + gpu_info = FakeGpuInfo(cuda_driver_version=12040) + result = gather_cuda_version(gpu_info=gpu_info) + assert result == "12.4" def test_gather_cuda_version_with_patch(): - """Test CUDA version with patch number.""" - with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12345): - result = gather_cuda_version() - assert result == "12.34.5" + gpu_info = FakeGpuInfo(cuda_driver_version=12345) + result = gather_cuda_version(gpu_info=gpu_info) + assert result == "12.34.5" def test_gather_package_versions(): - """Test package version gathering.""" result = gather_package_versions() assert isinstance(result, dict) assert len(result) > 0 @@ -36,25 +34,21 @@ def test_gather_package_versions(): def test_gather_command_output_success(): - """Test successful command output gathering.""" result = gather_command_output(["echo", "test"]) assert result == "test" def test_gather_command_output_with_fallback(): - """Test command output with fallback.""" result = gather_command_output(["nonexistent_command"], fallback_output="fallback") assert result == "fallback" def test_gather_command_output_no_fallback(): - """Test command output without fallback.""" result = gather_command_output(["nonexistent_command"]) assert result is None def test_gather_tools(): - """Test tools gathering.""" with ( patch( "rapids_cli.debug.debug.gather_command_output", @@ -69,40 +63,41 @@ def test_gather_tools(): def test_run_debug_console(capsys): - """Test run_debug with console output.""" - mock_vm = 
MagicMock() - mock_vm.total = 32 * 1024**3 + gpu_info = FakeGpuInfo( + device_count=1, + cuda_driver_version=12040, + driver_version="550.54.15", + ) + system_info = FakeSystemInfo( + total_memory_bytes=32 * 1024**3, + cuda_runtime_path="/usr/local/cuda/include", + ) with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040), - patch( - "cuda.pathfinder.find_nvidia_header_directory", - return_value="/usr/local/cuda/include", - ), patch("pathlib.Path.glob", return_value=[]), patch("rapids_cli.debug.debug.gather_package_versions", return_value={}), patch("rapids_cli.debug.debug.gather_command_output", return_value=None), patch("rapids_cli.debug.debug.gather_tools", return_value={}), patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), ): - run_debug(output_format="console") + run_debug(output_format="console", gpu_info=gpu_info, system_info=system_info) captured = capsys.readouterr() assert "RAPIDS Debug Information" in captured.out def test_run_debug_json(capsys): - """Test run_debug with JSON output.""" + gpu_info = FakeGpuInfo( + device_count=1, + cuda_driver_version=12040, + driver_version="550.54.15", + ) + system_info = FakeSystemInfo( + total_memory_bytes=32 * 1024**3, + cuda_runtime_path="/usr/local/cuda/include", + ) + with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040), - patch( - "cuda.pathfinder.find_nvidia_header_directory", - return_value="/usr/local/cuda/include", - ), patch("pathlib.Path.glob", return_value=[]), patch( "rapids_cli.debug.debug.gather_package_versions", @@ -114,7 +109,7 @@ def test_run_debug_json(capsys): patch("rapids_cli.debug.debug.gather_tools", return_value={"pip": "pip 23.0"}), patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), ): - 
run_debug(output_format="json") + run_debug(output_format="json", gpu_info=gpu_info, system_info=system_info) captured = capsys.readouterr() output = json.loads(captured.out) diff --git a/rapids_cli/tests/test_gpu.py b/rapids_cli/tests/test_gpu.py index a895bc2..f9fdf28 100644 --- a/rapids_cli/tests/test_gpu.py +++ b/rapids_cli/tests/test_gpu.py @@ -1,7 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - import pytest from rapids_cli.doctor.checks.gpu import ( @@ -9,67 +7,60 @@ check_gpu_compute_capability, gpu_check, ) +from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo def test_gpu_check_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - ): - result = gpu_check(verbose=True) - assert result == "GPU(s) detected: 2" + gpu_info = FakeGpuInfo(device_count=2) + result = gpu_check(verbose=True, gpu_info=gpu_info) + assert result == "GPU(s) detected: 2" def test_gpu_check_no_gpus(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=0), - ): - with pytest.raises(AssertionError, match="No GPUs detected"): - gpu_check(verbose=False) + gpu_info = FakeGpuInfo(device_count=0) + with pytest.raises(AssertionError, match="No GPUs detected"): + gpu_check(verbose=False, gpu_info=gpu_info) def test_gpu_check_nvml_error(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises(ValueError, match="No available GPUs detected"): - gpu_check(verbose=False) + gpu_info = FailingGpuInfo() + with pytest.raises(ValueError, match="No available GPUs detected"): + gpu_check(verbose=False, gpu_info=gpu_info) def test_check_gpu_compute_capability_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex"), - patch( - 
"pynvml.nvmlDeviceGetCudaComputeCapability", - return_value=(REQUIRED_COMPUTE_CAPABILITY, 5), + devices = [ + DeviceInfo( + index=0, + compute_capability=(REQUIRED_COMPUTE_CAPABILITY, 5), + memory_total_bytes=0, ), - ): - result = check_gpu_compute_capability(verbose=True) - assert result is True + DeviceInfo( + index=1, + compute_capability=(REQUIRED_COMPUTE_CAPABILITY, 5), + memory_total_bytes=0, + ), + ] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + result = check_gpu_compute_capability(verbose=True, gpu_info=gpu_info) + assert result is True def test_check_gpu_compute_capability_insufficient(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - patch("pynvml.nvmlDeviceGetHandleByIndex"), - patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(6, 0)), + devices = [ + DeviceInfo(index=0, compute_capability=(6, 0), memory_total_bytes=0), + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + with pytest.raises( + ValueError, + match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}", ): - with pytest.raises( - ValueError, - match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}", - ): - check_gpu_compute_capability(verbose=False) + check_gpu_compute_capability(verbose=False, gpu_info=gpu_info) def test_check_gpu_compute_capability_no_gpu(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises( - ValueError, match="No GPU - cannot determine GPU Compute Capability" - ): - check_gpu_compute_capability(verbose=False) + gpu_info = FailingGpuInfo() + with pytest.raises( + ValueError, match="No GPU - cannot determine GPU Compute Capability" + ): + check_gpu_compute_capability(verbose=False, gpu_info=gpu_info) diff --git a/rapids_cli/tests/test_hardware.py b/rapids_cli/tests/test_hardware.py new file mode 100644 index 0000000..1236e0f --- /dev/null +++ b/rapids_cli/tests/test_hardware.py @@ -0,0 +1,235 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import MagicMock, patch + +import pynvml +import pytest + +from rapids_cli.hardware import ( + DefaultSystemInfo, + DeviceInfo, + FailingGpuInfo, + FailingSystemInfo, + FakeGpuInfo, + FakeSystemInfo, + GpuInfoProvider, + NvmlGpuInfo, + SystemInfoProvider, +) + +# --- NvmlGpuInfo tests --- + + +def test_nvml_gpu_info_init_failure(): + with patch( + "pynvml.nvmlInit", + side_effect=pynvml.NVMLError(pynvml.NVML_ERROR_DRIVER_NOT_LOADED), + ): + gpu_info = NvmlGpuInfo() + with pytest.raises(ValueError, match="Unable to initialize GPU driver"): + _ = gpu_info.device_count + + +def test_nvml_gpu_info_loads_once(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + with ( + patch("pynvml.nvmlInit") as mock_init, + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch( + "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported + ), + ): + gpu_info = NvmlGpuInfo() + # Access multiple properties to verify caching + _ = gpu_info.device_count + _ = gpu_info.devices + _ = gpu_info.cuda_driver_version + _ = gpu_info.driver_version + # nvmlInit should be called exactly once + mock_init.assert_called_once() + + +def test_nvml_gpu_info_device_data(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 24 * 1024**3 + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=2), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12060), + 
patch("pynvml.nvmlSystemGetDriverVersion", return_value="560.10"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(9, 0)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.device_count == 2 + assert len(gpu_info.devices) == 2 + assert gpu_info.devices[0].compute_capability == (9, 0) + assert gpu_info.devices[0].memory_total_bytes == 24 * 1024**3 + assert gpu_info.cuda_driver_version == 12060 + assert gpu_info.driver_version == "560.10" + + +def test_nvml_gpu_info_nvlink_states(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + def nvlink_side_effect(handle, link_id): + if link_id < 2: + return 1 + raise pynvml.NVMLError_NotSupported() + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=nvlink_side_effect), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.devices[0].nvlink_states == [True, True] + + +def test_nvml_gpu_info_no_nvlink(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + 
patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch( + "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported + ), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.devices[0].nvlink_states == [] + + +# --- DefaultSystemInfo tests --- + + +def test_default_system_info_total_memory(): + mock_vm = MagicMock() + mock_vm.total = 64 * 1024**3 + with patch("psutil.virtual_memory", return_value=mock_vm): + sys_info = DefaultSystemInfo() + assert sys_info.total_memory_bytes == 64 * 1024**3 + + +def test_default_system_info_cuda_runtime_path(): + with patch( + "cuda.pathfinder.find_nvidia_header_directory", + return_value="/usr/local/cuda/include", + ): + sys_info = DefaultSystemInfo() + assert sys_info.cuda_runtime_path == "/usr/local/cuda/include" + + +def test_default_system_info_caches(): + mock_vm = MagicMock() + mock_vm.total = 64 * 1024**3 + with patch("psutil.virtual_memory", return_value=mock_vm) as mock_psutil: + sys_info = DefaultSystemInfo() + _ = sys_info.total_memory_bytes + _ = sys_info.total_memory_bytes + mock_psutil.assert_called_once() + + +# --- FakeGpuInfo tests --- + + +def test_fake_gpu_info_defaults(): + fake = FakeGpuInfo() + assert fake.device_count == 0 + assert fake.devices == [] + assert fake.cuda_driver_version == 0 + assert fake.driver_version == "" + + +def test_fake_gpu_info_custom(): + devices = [ + DeviceInfo(index=0, compute_capability=(8, 0), memory_total_bytes=32 * 1024**3) + ] + fake = FakeGpuInfo( + device_count=1, + devices=devices, + cuda_driver_version=12040, + driver_version="550.0", + ) + assert fake.device_count == 1 + assert len(fake.devices) == 1 + assert fake.cuda_driver_version == 12040 + + +def test_fake_gpu_info_satisfies_protocol(): + assert isinstance(FakeGpuInfo(), GpuInfoProvider) + + +# --- FakeSystemInfo tests --- + + +def test_fake_system_info_defaults(): + fake = FakeSystemInfo() + assert 
fake.total_memory_bytes == 0 + assert fake.cuda_runtime_path is None + + +def test_fake_system_info_satisfies_protocol(): + assert isinstance(FakeSystemInfo(), SystemInfoProvider) + + +# --- FailingGpuInfo tests --- + + +def test_failing_gpu_info_device_count(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().device_count + + +def test_failing_gpu_info_devices(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().devices + + +def test_failing_gpu_info_cuda_driver_version(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().cuda_driver_version + + +def test_failing_gpu_info_driver_version(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().driver_version + + +# --- FailingSystemInfo tests --- + + +def test_failing_system_info_total_memory(): + with pytest.raises(ValueError, match="System info unavailable"): + _ = FailingSystemInfo().total_memory_bytes + + +def test_failing_system_info_cuda_runtime_path(): + with pytest.raises(ValueError, match="System info unavailable"): + _ = FailingSystemInfo().cuda_runtime_path diff --git a/rapids_cli/tests/test_memory.py b/rapids_cli/tests/test_memory.py index 572df33..183d7ff 100644 --- a/rapids_cli/tests/test_memory.py +++ b/rapids_cli/tests/test_memory.py @@ -1,7 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import pytest

from rapids_cli.doctor.checks.memory import (
    check_memory_to_gpu_ratio,
    get_gpu_memory,
    get_system_memory,
)
from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo, FakeSystemInfo


def test_get_system_memory():
    """32 GiB of fake system memory is reported as 32.0 GB."""
    fake_system = FakeSystemInfo(total_memory_bytes=32 * 1024**3)
    assert get_system_memory(verbose=False, system_info=fake_system) == 32.0


def test_get_gpu_memory_single_gpu():
    """A single 16 GiB device yields 16.0 GB total GPU memory."""
    device = DeviceInfo(
        index=0, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3
    )
    fake_gpus = FakeGpuInfo(device_count=1, devices=[device])
    assert get_gpu_memory(verbose=False, gpu_info=fake_gpus) == 16.0


def test_get_gpu_memory_multiple_gpus():
    """Four 16 GiB devices sum to 64.0 GB total GPU memory."""
    fake_gpus = FakeGpuInfo(
        device_count=4,
        devices=[
            DeviceInfo(
                index=idx, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3
            )
            for idx in range(4)
        ],
    )
    assert get_gpu_memory(verbose=False, gpu_info=fake_gpus) == 64.0  # 16 GB * 4 GPUs


def test_check_memory_to_gpu_ratio_good_ratio():
    """A 2:1 system-to-GPU memory ratio passes without warning."""
    device = DeviceInfo(
        index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3
    )
    outcome = check_memory_to_gpu_ratio(
        verbose=True,
        gpu_info=FakeGpuInfo(device_count=1, devices=[device]),
        system_info=FakeSystemInfo(total_memory_bytes=64 * 1024**3),
    )
    assert outcome is True


def test_check_memory_to_gpu_ratio_warning():
    """A 1:1 system-to-GPU memory ratio still passes but emits a UserWarning."""
    device = DeviceInfo(
        index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3
    )
    with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"):
        outcome = check_memory_to_gpu_ratio(
            verbose=True,
            gpu_info=FakeGpuInfo(device_count=1, devices=[device]),
            system_info=FakeSystemInfo(total_memory_bytes=32 * 1024**3),
        )
    assert outcome is True
def test_check_memory_to_gpu_ratio_no_gpu():
    """A failing GPU provider surfaces as a 'GPU not found' ValueError."""
    with pytest.raises(
        ValueError, match="GPU not found. Please ensure GPUs are installed."
    ):
        check_memory_to_gpu_ratio(verbose=False, gpu_info=FailingGpuInfo())


# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import pytest

from rapids_cli.doctor.checks.nvlink import check_nvlink_status
from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo


def test_check_nvlink_status_success():
    """Two devices that each report an active NVLink pass the check."""
    linked_pair = [
        DeviceInfo(
            index=idx,
            compute_capability=(7, 0),
            memory_total_bytes=0,
            nvlink_states=[True],
        )
        for idx in range(2)
    ]
    fake_gpus = FakeGpuInfo(device_count=2, devices=linked_pair)
    assert check_nvlink_status(verbose=True, gpu_info=fake_gpus) is True


def test_check_nvlink_status_single_gpu():
    """With only one GPU there is nothing to link, so the check reports False."""
    assert check_nvlink_status(verbose=False, gpu_info=FakeGpuInfo(device_count=1)) is False


def test_check_nvlink_status_no_gpu():
    """A failing GPU provider surfaces as a 'GPU not found' ValueError."""
    with pytest.raises(
        ValueError, match="GPU not found. Please ensure GPUs are installed."
    ):
        check_nvlink_status(verbose=False, gpu_info=FailingGpuInfo())


def test_check_nvlink_status_no_nvlink():
    """Two devices with empty nvlink_states mean no links exist, so the check is False."""
    unlinked_pair = [
        DeviceInfo(
            index=idx,
            compute_capability=(7, 0),
            memory_total_bytes=0,
            nvlink_states=[],
        )
        for idx in range(2)
    ]
    fake_gpus = FakeGpuInfo(device_count=2, devices=unlinked_pair)
    assert check_nvlink_status(verbose=True, gpu_info=fake_gpus) is False