From a97c196c3ec8a9b33fc221c5e6d6ffd2a4874809 Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Wed, 11 Feb 2026 16:05:24 -0500 Subject: [PATCH 1/3] Updated docs --- CLAUDE.md | 13 + dependencies.yaml | 6 + docs/source/api/checks.rst | 265 ++++++++++++++++ docs/source/api/cli.rst | 62 ++++ docs/source/api/debug.rst | 233 ++++++++++++++ docs/source/api/doctor.rst | 141 +++++++++ docs/source/conf.py | 66 +++- docs/source/contributing.rst | 363 ++++++++++++++++++++++ docs/source/cuda_driver.rst | 35 --- docs/source/doctor.rst | 19 -- docs/source/index.rst | 70 ++++- docs/source/installation.rst | 106 +++++++ docs/source/plugin_development.rst | 477 +++++++++++++++++++++++++++++ docs/source/troubleshooting.rst | 395 ++++++++++++++++++++++++ docs/source/user_guide.rst | 293 ++++++++++++++++++ 15 files changed, 2465 insertions(+), 79 deletions(-) create mode 100644 docs/source/api/checks.rst create mode 100644 docs/source/api/cli.rst create mode 100644 docs/source/api/debug.rst create mode 100644 docs/source/api/doctor.rst create mode 100644 docs/source/contributing.rst delete mode 100644 docs/source/cuda_driver.rst delete mode 100644 docs/source/doctor.rst create mode 100644 docs/source/installation.rst create mode 100644 docs/source/plugin_development.rst create mode 100644 docs/source/troubleshooting.rst create mode 100644 docs/source/user_guide.rst diff --git a/CLAUDE.md b/CLAUDE.md index ef8d92f..c17c1b0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -39,6 +39,19 @@ coverage report coverage html && open htmlcov/index.html ``` +### Documentation + +```bash +# Build HTML documentation +cd docs && make html + +# View documentation +open docs/build/html/index.html + +# Clean build artifacts +cd docs && make clean +``` + ### Linting and Pre-commit ```bash diff --git a/dependencies.yaml b/dependencies.yaml index 63e2900..12cb729 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -75,6 +75,12 @@ dependencies: - output_types: [pyproject, requirements] packages: - 
importlib-metadata >= 4.13.0; python_version < '3.12' + docs: + common: + - output_types: [conda, requirements, pyproject] + packages: + - sphinx + - sphinx-rtd-theme test_python: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/source/api/checks.rst b/docs/source/api/checks.rst new file mode 100644 index 0000000..acf8775 --- /dev/null +++ b/docs/source/api/checks.rst @@ -0,0 +1,265 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Health Checks +============= + +Built-in health check modules for verifying RAPIDS installation requirements. + +GPU Checks +---------- + +.. automodule:: rapids_cli.doctor.checks.gpu + :members: + :undoc-members: + :show-inheritance: + +gpu_check +^^^^^^^^^ + +.. autofunction:: rapids_cli.doctor.checks.gpu.gpu_check + +Verifies that NVIDIA GPUs are available and accessible. + +**Parameters:** + +- ``verbose`` (bool): Enable detailed output + +**Returns:** + +- str: Message indicating number of GPUs detected + +**Raises:** + +- ValueError: If no GPUs are detected +- AssertionError: If GPU count is zero + +**Example:** + +.. code-block:: python + + >>> gpu_check(verbose=True) + 'GPU(s) detected: 2' + +check_gpu_compute_capability +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.doctor.checks.gpu.check_gpu_compute_capability + +Verifies that all GPUs meet minimum compute capability requirements. + +**Parameters:** + +- ``verbose`` (bool): Enable detailed output + +**Returns:** + +- bool: True if all GPUs meet requirements + +**Raises:** + +- ValueError: If any GPU has insufficient compute capability + +**Required Compute Capability:** + +- Minimum: 7.0 (Volta architecture or newer) +- Supported GPUs: V100, A100, H100, RTX 20xx/30xx/40xx series + +CUDA Driver Checks +------------------ + +.. 
automodule:: rapids_cli.doctor.checks.cuda_driver + :members: + :undoc-members: + :show-inheritance: + +cuda_check +^^^^^^^^^^ + +.. autofunction:: rapids_cli.doctor.checks.cuda_driver.cuda_check + +Verifies CUDA driver availability and retrieves version. + +**Parameters:** + +- ``verbose`` (bool): Enable detailed output + +**Returns:** + +- int: CUDA driver version code (e.g., 12040 for CUDA 12.4) + +**Raises:** + +- ValueError: If CUDA driver version cannot be determined + +**Example:** + +.. code-block:: python + + >>> cuda_check(verbose=True) + 12040 + +Memory Checks +------------- + +.. automodule:: rapids_cli.doctor.checks.memory + :members: + :undoc-members: + :show-inheritance: + +get_system_memory +^^^^^^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.doctor.checks.memory.get_system_memory + +Retrieves total system memory in gigabytes. + +**Parameters:** + +- ``verbose`` (bool): Unused, kept for consistency + +**Returns:** + +- float: Total system memory in GB + +get_gpu_memory +^^^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.doctor.checks.memory.get_gpu_memory + +Calculates total GPU memory across all GPUs in gigabytes. + +**Parameters:** + +- ``verbose`` (bool): Unused, kept for consistency + +**Returns:** + +- float: Total GPU memory in GB + +check_memory_to_gpu_ratio +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.doctor.checks.memory.check_memory_to_gpu_ratio + +Verifies system-to-GPU memory ratio meets recommendations. + +**Parameters:** + +- ``verbose`` (bool): Enable detailed output + +**Returns:** + +- bool: Always returns True (issues warnings instead of failing) + +**Warnings:** + +Issues warning if ratio is less than 1.8:1 (below recommended 2:1) + +**Recommendation:** + +For optimal performance, especially with Dask: + +- System memory should be at least 2x total GPU memory +- Example: 64GB RAM for 32GB total GPU memory (2x 16GB GPUs) + +NVLink Checks +------------- + +.. 
automodule:: rapids_cli.doctor.checks.nvlink + :members: + :undoc-members: + :show-inheritance: + +check_nvlink_status +^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.doctor.checks.nvlink.check_nvlink_status + +Checks for NVLink availability on multi-GPU systems. + +**Parameters:** + +- ``verbose`` (bool): Enable detailed output + +**Returns:** + +- bool: False if fewer than 2 GPUs, True if NVLink detected + +**Raises:** + +- ValueError: If NVLink status check fails on multi-GPU system + +**NVLink Benefits:** + +- High-bandwidth GPU-to-GPU communication +- Essential for multi-GPU training and processing +- Significantly faster than PCIe transfers + +**Note:** + +Only relevant for multi-GPU systems with NVLink-capable GPUs. + +Check Function Contract +----------------------- + +All built-in checks follow these conventions: + +Function Signature +^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + def check_function(verbose=False, **kwargs): + """Brief description of what this check verifies.""" + pass + +**Parameters:** + +- ``verbose`` (bool): Whether to provide detailed output +- ``**kwargs``: Reserved for future compatibility + +Return Values +^^^^^^^^^^^^^ + +**Success:** + +- Return any value (often True or a status string) +- Returning a string provides information for verbose output + +**Failure:** + +- Raise an exception with descriptive error message +- Use ValueError for failed checks +- Provide actionable guidance in error message + +**Warnings:** + +- Use ``warnings.warn()`` for non-fatal issues +- Always set ``stacklevel=2`` for correct source location + +Usage in Custom Checks +----------------------- + +Reference these built-in checks when creating custom checks: + +.. 
code-block:: python + + # Example: Custom memory check based on built-in pattern + from rapids_cli.doctor.checks.memory import get_gpu_memory + + + def my_memory_check(verbose=False, **kwargs): + """Check if GPU has enough memory for my workload.""" + gpu_memory = get_gpu_memory() + + required_gb = 16 + if gpu_memory < required_gb: + raise ValueError( + f"Insufficient GPU memory: {gpu_memory:.1f}GB available, " + f"{required_gb}GB required" + ) + + if verbose: + return f"GPU memory check passed: {gpu_memory:.1f}GB available" + return True diff --git a/docs/source/api/cli.rst b/docs/source/api/cli.rst new file mode 100644 index 0000000..bd916ff --- /dev/null +++ b/docs/source/api/cli.rst @@ -0,0 +1,62 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +CLI Module +========== + +The CLI module provides the main command-line interface for RAPIDS CLI using Click. + +.. automodule:: rapids_cli.cli + :members: + :undoc-members: + :show-inheritance: + +Main Commands +------------- + +rapids +^^^^^^ + +.. autofunction:: rapids_cli.cli.rapids + +The main CLI entry point. Provides access to all subcommands. + +doctor +^^^^^^ + +.. autofunction:: rapids_cli.cli.doctor + +Run health checks to verify RAPIDS installation. + +**Options:** + +- ``--verbose``: Enable detailed output +- ``--dry-run``: Show which checks would run without executing them +- ``filters``: Optional filters to run specific checks + +**Exit Codes:** + +- 0: All checks passed +- 1: One or more checks failed + +debug +^^^^^ + +.. autofunction:: rapids_cli.cli.debug + +Gather comprehensive debugging information. 
+ +**Options:** + +- ``--json``: Output in JSON format for machine parsing + +**Output:** + +Returns detailed system information including: + +- Platform and OS details +- GPU and driver information +- CUDA version +- Python configuration +- Installed packages +- Available tools diff --git a/docs/source/api/debug.rst b/docs/source/api/debug.rst new file mode 100644 index 0000000..b4ce4b4 --- /dev/null +++ b/docs/source/api/debug.rst @@ -0,0 +1,233 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Debug Module +============ + +The debug module gathers comprehensive system information for troubleshooting. + +.. automodule:: rapids_cli.debug.debug + :members: + :undoc-members: + :show-inheritance: + +Core Functions +-------------- + +run_debug +^^^^^^^^^ + +.. autofunction:: rapids_cli.debug.debug.run_debug + +Main function for gathering and displaying debug information. + +**Parameters:** + +- ``output_format`` (str): Output format, either "console" or "json" + +**Collected Information:** + +- Date and time +- Platform information +- nvidia-smi output +- NVIDIA driver version +- CUDA version +- CUDA runtime path +- System CUDA toolkit locations +- Python version (full and short) +- Python hash info +- All installed package versions +- pip freeze output +- conda list output (if available) +- conda info output (if available) +- Available development tools +- OS information from /etc/os-release + +gather_cuda_version +^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.debug.debug.gather_cuda_version + +Retrieves and formats CUDA driver version from pynvml. + +**Returns:** + +- str: CUDA version in format "Major.Minor" or "Major.Minor.Patch" + +**Example:** + +.. code-block:: python + + >>> gather_cuda_version() + '12.4' + +gather_package_versions +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
autofunction:: rapids_cli.debug.debug.gather_package_versions + +Collects versions of all installed Python packages. + +**Returns:** + +- dict: Mapping of package names to version strings + +**Example:** + +.. code-block:: python + + >>> versions = gather_package_versions() + >>> versions['rapids-cli'] + '0.1.0' + +gather_command_output +^^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.debug.debug.gather_command_output + +Executes a command and returns its output, with optional fallback. + +**Parameters:** + +- ``command`` (list[str]): Command and arguments to execute +- ``fallback_output`` (str | None): Value to return if command fails + +**Returns:** + +- str | None: Command output or fallback value + +**Example:** + +.. code-block:: python + + >>> gather_command_output(['pip', '--version']) + 'pip 24.0 from /usr/local/lib/python3.10/site-packages/pip (python 3.10)' + + >>> gather_command_output(['nonexistent'], fallback_output='Not installed') + 'Not installed' + +gather_tools +^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.debug.debug.gather_tools + +Gathers version information for common development tools. + +**Returns:** + +- dict: Tool names mapped to version strings or None + +**Checked Tools:** + +- pip +- conda +- uv +- pixi +- g++ +- cmake +- nvcc + +Output Formats +-------------- + +Console Format +^^^^^^^^^^^^^^ + +Human-readable output with Rich formatting: + +.. code-block:: text + + RAPIDS Debug Information + + Date + 2025-02-11 15:30:00 + + Platform + Linux-6.8.0-94-generic-x86_64 + + Driver Version + 550.54.15 + + Cuda Version + 12.4 + + Package Versions + ┌─────────────────┬──────────┐ + │ rapids-cli │ 0.1.0 │ + │ cudf │ 25.02.0 │ + └─────────────────┴──────────┘ + +JSON Format +^^^^^^^^^^^ + +Machine-readable output for automation: + +.. 
code-block:: json + + { + "date": "2025-02-11 15:30:00", + "platform": "Linux-6.8.0-94-generic-x86_64", + "nvidia_smi_output": "...", + "driver_version": "550.54.15", + "cuda_version": "12.4", + "cuda_runtime_path": "/usr/local/cuda/include", + "system_ctk": ["/usr/local/cuda-12.4"], + "python_version_full": "3.13.12 (main, ...)", + "python_version": "3.13.12", + "python_hash_info": "sys.hash_info(...)", + "package_versions": { + "rapids-cli": "0.1.0" + }, + "pip_packages": "...", + "conda_packages": "...", + "conda_info": "...", + "tools": { + "pip": "pip 24.0", + "conda": "conda 24.1.0" + }, + "os_info": { + "NAME": "Ubuntu", + "VERSION": "22.04" + } + } + +Usage Examples +-------------- + +Console Output +^^^^^^^^^^^^^^ + +.. code-block:: python + + from rapids_cli.debug.debug import run_debug + + # Display debug info in console + run_debug(output_format="console") + +JSON Output +^^^^^^^^^^^ + +.. code-block:: python + + import json + from rapids_cli.debug.debug import run_debug + + # Get JSON output + run_debug(output_format="json") + + # Can be captured with redirection + # rapids debug --json > debug.json + +Programmatic Access +^^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + from rapids_cli.debug.debug import gather_package_versions, gather_cuda_version + + # Get specific information + cuda_ver = gather_cuda_version() + packages = gather_package_versions() + + print(f"CUDA: {cuda_ver}") + print(f"Installed packages: {len(packages)}") diff --git a/docs/source/api/doctor.rst b/docs/source/api/doctor.rst new file mode 100644 index 0000000..16ab6ad --- /dev/null +++ b/docs/source/api/doctor.rst @@ -0,0 +1,141 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Doctor Module +============= + +The doctor module orchestrates health check execution and plugin discovery. + +.. 
automodule:: rapids_cli.doctor.doctor + :members: + :undoc-members: + :show-inheritance: + +Core Functions +-------------- + +doctor_check +^^^^^^^^^^^^ + +.. autofunction:: rapids_cli.doctor.doctor.doctor_check + +The main orchestration function for running health checks. + +**Parameters:** + +- ``verbose`` (bool): Enable detailed output +- ``dry_run`` (bool): Discover checks without executing them +- ``filters`` (list[str] | None): Optional filters to match check paths + +**Returns:** + +- bool: True if all checks passed, False if any failed + +**Process:** + +1. Discovers all registered checks via entry points +2. Filters checks based on provided filters +3. Executes each check and captures results +4. Collects warnings from checks +5. Displays results and returns success status + +CheckResult +^^^^^^^^^^^ + +.. autoclass:: rapids_cli.doctor.doctor.CheckResult + :members: + +Data class representing the result of a single check execution. + +**Attributes:** + +- ``name`` (str): Name of the check function +- ``description`` (str): First line of check's docstring +- ``status`` (bool): True if check passed, False if failed +- ``value`` (str | None): Optional return value for verbose output +- ``error`` (Exception | None): Exception if check failed +- ``warnings`` (list[WarningMessage] | None): Any warnings issued during check + +Plugin Discovery +---------------- + +The doctor module discovers plugins using Python entry points: + +.. code-block:: python + + from importlib.metadata import entry_points + + for ep in entry_points(group="rapids_doctor_check"): + check_fn = ep.load() + # Execute check + +Entry Point Group +^^^^^^^^^^^^^^^^^ + +Plugins register in the ``rapids_doctor_check`` group: + +.. code-block:: toml + + [project.entry-points.rapids_doctor_check] + my_check = "my_package.checks:my_check_function" + +Check Execution Flow +-------------------- + +1. 
**Discovery Phase** + + - Scan entry points for ``rapids_doctor_check`` group + - Load check functions + - Apply filters if specified + +2. **Execution Phase** + + - Run each check with ``verbose`` parameter + - Capture warnings using ``warnings.catch_warnings()`` + - Catch exceptions for failed checks + - Store results in CheckResult objects + +3. **Reporting Phase** + + - Display warnings + - Show verbose output if requested + - List failed checks with error messages + - Return overall success status + +Error Handling +-------------- + +The doctor module handles several error scenarios: + +**Import Errors** + +Failed imports during discovery are suppressed with ``contextlib.suppress``: + +.. code-block:: python + + with contextlib.suppress(AttributeError, ImportError): + check_fn = ep.load() + +**Check Exceptions** + +Exceptions raised by checks are caught and stored: + +.. code-block:: python + + try: + value = check_fn(verbose=verbose) + status = True + except Exception as e: + error = e + status = False + +**Warnings** + +Python warnings are captured and displayed: + +.. code-block:: python + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + value = check_fn(verbose=verbose) + caught_warnings = w diff --git a/docs/source/conf.py b/docs/source/conf.py index 3b59adc..2303b4a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,5 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # Configuration file for the Sphinx documentation builder. 
# @@ -13,32 +12,73 @@ import os import sys -sys.path.insert(0, os.path.abspath("../rapids_cli")) +sys.path.insert(0, os.path.abspath("../../")) project = "RAPIDS CLI" -copyright = "2024, NVIDIA RAPIDS" +copyright = "2025-2026, NVIDIA CORPORATION & AFFILIATES" author = "NVIDIA RAPIDS" -release = "2024" + +# The short X.Y version +version = "0.1" +# The full version, including alpha/beta/rc tags +release = "0.1.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = [] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", # For Google and NumPy style docstrings + "sphinx.ext.intersphinx", # Link to other project docs + "sphinx.ext.todo", # Support for todo items +] templates_path = ["_templates"] exclude_patterns = [] +# Napoleon settings for Google-style docstrings +napoleon_google_docstring = True +napoleon_numpy_docstring = False +napoleon_include_init_with_doc = True +napoleon_include_private_with_doc = False +napoleon_include_special_with_doc = True +napoleon_use_admonition_for_examples = True +napoleon_use_admonition_for_notes = True +napoleon_use_admonition_for_references = True +napoleon_use_ivar = False +napoleon_use_param = True +napoleon_use_rtype = True + +# Autodoc settings +autodoc_default_options = { + "members": True, + "member-order": "bysource", + "special-members": "__init__", + "undoc-members": True, + "exclude-members": "__weakref__", +} + +# Intersphinx mapping +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), +} # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = "alabaster" +html_theme = "sphinx_rtd_theme" html_static_path = ["_static"] -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - 
"sphinx.ext.viewcode", - "sphinx.ext.napoleon", # For Google and NumPy style docstrings -] +html_theme_options = { + "navigation_depth": 4, + "collapse_navigation": False, + "sticky_navigation": True, + "includehidden": True, +} + +# Add any paths that contain custom static files (such as style sheets) +html_logo = None +html_favicon = None diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst new file mode 100644 index 0000000..9d721f3 --- /dev/null +++ b/docs/source/contributing.rst @@ -0,0 +1,363 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Contributing Guide +================== + +Thank you for your interest in contributing to RAPIDS CLI! This guide will help you get started. + +Getting Started +--------------- + +Prerequisites +^^^^^^^^^^^^^ + +- Python 3.10 or later +- Git +- NVIDIA GPU (for testing) +- NVIDIA drivers and CUDA toolkit + +Development Setup +^^^^^^^^^^^^^^^^^ + +1. Fork and clone the repository: + + .. code-block:: bash + + git clone https://github.com/YOUR_USERNAME/rapids-cli.git + cd rapids-cli + +2. Create a development environment: + + .. code-block:: bash + + # Using conda (recommended) + conda create -n rapids-cli-dev python=3.10 + conda activate rapids-cli-dev + + # Or using venv + python -m venv venv + source venv/bin/activate + +3. Install in editable mode with test dependencies: + + .. code-block:: bash + + pip install -e .[test] + +4. Install pre-commit hooks: + + .. code-block:: bash + + pre-commit install + +Development Workflow +-------------------- + +Making Changes +^^^^^^^^^^^^^^ + +1. Create a feature branch: + + .. code-block:: bash + + git checkout -b feature/your-feature-name + +2. Make your changes following the code style guidelines + +3. Add tests for your changes + +4. Run tests locally: + + .. code-block:: bash + + pytest + +5. Run linting checks: + + .. 
code-block:: bash + + pre-commit run --all-files + +Code Style +---------- + +The project uses several linting tools to maintain code quality: + +Formatting +^^^^^^^^^^ + +- **Black**: Code formatting (120 char line length) +- **isort**: Import sorting + +Linting +^^^^^^^ + +- **Ruff**: Fast Python linter (replaces flake8, pylint, etc.) +- **mypy**: Static type checking + +Run formatters and linters: + +.. code-block:: bash + + # Format code + black . + + # Check with ruff + ruff check --fix . + + # Type check + mypy rapids_cli/ + +Docstrings +^^^^^^^^^^ + +Use Google-style docstrings: + +.. code-block:: python + + def my_function(param1: str, param2: int) -> bool: + """Brief description of the function. + + Longer description if needed. + + Args: + param1: Description of param1. + param2: Description of param2. + + Returns: + Description of return value. + + Raises: + ValueError: Description of when this is raised. + + Example: + >>> my_function("test", 42) + True + """ + pass + +Testing +------- + +Writing Tests +^^^^^^^^^^^^^ + +- Place tests in ``rapids_cli/tests/`` +- Use pytest for testing +- Mock external dependencies (pynvml, subprocess calls, etc.) +- Aim for high coverage (95%+ required) + +Test Structure: + +.. code-block:: python + + # rapids_cli/tests/test_my_feature.py + from unittest.mock import patch, MagicMock + import pytest + + from rapids_cli.my_module import my_function + + + def test_my_function_success(): + """Test that my_function works in normal case.""" + result = my_function("input") + assert result == "expected" + + + def test_my_function_failure(): + """Test that my_function handles errors correctly.""" + with pytest.raises(ValueError, match="error message"): + my_function("invalid") + + + def test_my_function_with_mock(): + """Test my_function with mocked dependencies.""" + with patch("pynvml.nvmlInit") as mock_init: + result = my_function() + mock_init.assert_called_once() + +Running Tests +^^^^^^^^^^^^^ + +.. 
code-block:: bash + + # Run all tests + pytest + + # Run specific test file + pytest rapids_cli/tests/test_doctor.py + + # Run with coverage + pytest --cov=rapids_cli + + # Run specific test + pytest rapids_cli/tests/test_doctor.py::test_doctor_check_all_pass + +Pull Request Process +-------------------- + +1. Ensure all tests pass and coverage is maintained + +2. Update documentation if needed + +3. Sign your commits: + + .. code-block:: bash + + git commit -s -m "Your commit message" + +4. Push to your fork: + + .. code-block:: bash + + git push origin feature/your-feature-name + +5. Create a pull request on GitHub + +6. Address review feedback + +Commit Messages +^^^^^^^^^^^^^^^ + +Follow conventional commit format: + +.. code-block:: text + + : + + + + Signed-off-by: Your Name + +Types: + +- ``feat``: New feature +- ``fix``: Bug fix +- ``docs``: Documentation changes +- ``test``: Adding or updating tests +- ``refactor``: Code refactoring +- ``ci``: CI/CD changes +- ``chore``: Maintenance tasks + +Example: + +.. code-block:: text + + feat: add support for filtering checks by package name + + This allows users to run only specific checks by providing + filter arguments to the doctor command. + + Signed-off-by: Jane Doe + +Documentation +------------- + +Building Documentation +^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd docs + make html + + # View in browser + open build/html/index.html + +Documentation lives in ``docs/source/`` and uses Sphinx with reStructuredText. + +Adding New Documentation +^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Create ``.rst`` file in ``docs/source/`` + +2. Add to table of contents in ``index.rst`` + +3. Build and verify: + + .. code-block:: bash + + cd docs + make html + +Reporting Issues +---------------- + +When reporting bugs: + +1. Check if issue already exists + +2. Provide minimal reproduction example + +3. Include debug output: + + .. code-block:: bash + + rapids debug --json > debug_info.json + +4. 
Include: + + - RAPIDS CLI version + - Python version + - OS and driver versions + - Expected vs actual behavior + +Feature Requests +^^^^^^^^^^^^^^^^ + +For feature requests: + +1. Describe the use case + +2. Explain why existing features don't work + +3. Provide example usage + +4. Consider contributing the feature! + +Code Review Guidelines +---------------------- + +For Reviewers +^^^^^^^^^^^^^ + +- Check that tests cover new functionality +- Verify documentation is updated +- Ensure code style is consistent +- Look for potential edge cases +- Validate error messages are helpful + +For Contributors +^^^^^^^^^^^^^^^^ + +- Respond to feedback promptly +- Ask questions if feedback is unclear +- Keep PRs focused on single concern +- Update based on reviews + +Release Process +--------------- + +Releases are managed by maintainers: + +1. Version is managed via git tags +2. CI automatically builds packages +3. Packages published to PyPI and conda-forge + +Community +--------- + +- GitHub Discussions: Q&A and ideas +- Slack: Real-time chat at rapids.ai/community +- Issues: Bug reports and features + +License +------- + +By contributing, you agree that your contributions will be licensed under the Apache 2.0 License. + +Thank You! +---------- + +Every contribution helps make RAPIDS CLI better. Thank you for your time and effort! diff --git a/docs/source/cuda_driver.rst b/docs/source/cuda_driver.rst deleted file mode 100644 index a4d69d6..0000000 --- a/docs/source/cuda_driver.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. _cuda_driver: - -CUDA Driver Checks -================== - -This module provides functions to check the availability of CUDA, retrieve CUDA version, -and verify compatibility between the CUDA toolkit and NVIDIA drivers. - -Functions ---------- -.. autofunction:: rapids_cli.doctor.checks.cuda_driver.cuda_check - - Checks if CUDA is available on the system by initializing the NVML and retrieving the CUDA driver version. 
- - :return: True if CUDA is available, False otherwise. - -.. autofunction:: rapids_cli.doctor.checks.cuda_driver.get_cuda_version - - Retrieves the version of the installed CUDA toolkit. - - :return: A string representing the CUDA version or None if CUDA is not found. - -.. autofunction:: rapids_cli.doctor.checks.cuda_driver.get_driver_version - - Retrieves the installed NVIDIA driver version. - - :return: A string representing the NVIDIA driver version or None if the driver is not found. - -.. autofunction:: rapids_cli.doctor.checks.cuda_driver.check_driver_compatibility - - Checks the compatibility between the installed CUDA version and the NVIDIA driver version. - - This function prints whether the installed versions are compatible with RAPIDS. - - :return: None diff --git a/docs/source/doctor.rst b/docs/source/doctor.rst deleted file mode 100644 index 33be8d8..0000000 --- a/docs/source/doctor.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. _doctor: - -Doctor -========= - -Overview of Doctor. - -.. automodule:: doctor - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -.. automodule:: doctor.checks - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/index.rst b/docs/source/index.rst index 94df6bc..9c27bfa 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,26 +1,72 @@ -.. RAPIDS CLI documentation master file, created by - sphinx-quickstart on Fri Oct 25 10:50:48 2024. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 -RAPIDS CLI documentation +RAPIDS CLI Documentation ======================== -Add your content using ``reStructuredText`` syntax. See the -`reStructuredText `_ -documentation for details. 
+The RAPIDS CLI is a command-line tool for performing common RAPIDS operations, +primarily focused on health checks (``rapids doctor``) and debugging (``rapids debug``). +It uses a plugin system that allows RAPIDS libraries to register their own health checks +via Python entry points. +.. image:: https://img.shields.io/badge/python-3.10+-blue.svg + :target: https://www.python.org/downloads/ + :alt: Python Version + +.. image:: https://img.shields.io/badge/license-Apache%202.0-blue.svg + :target: https://github.com/rapidsai/rapids-cli/blob/main/LICENSE + :alt: License + +Quick Start +----------- + +Install the RAPIDS CLI: + +.. code-block:: bash + + pip install rapids-cli + +Run health checks: + +.. code-block:: bash + + rapids doctor + +Gather debugging information: + +.. code-block:: bash + + rapids debug --json + +Documentation Contents +---------------------- .. toctree:: :maxdepth: 2 - :caption: Contents: + :caption: User Guide + + installation + user_guide + troubleshooting - doctor - cuda_driver +.. toctree:: + :maxdepth: 2 + :caption: Developer Guide + + plugin_development + contributing + +.. toctree:: + :maxdepth: 2 + :caption: API Reference + api/cli + api/doctor + api/debug + api/checks Indices and tables -=================== +================== * :ref:`genindex` * :ref:`modindex` diff --git a/docs/source/installation.rst b/docs/source/installation.rst new file mode 100644 index 0000000..213d474 --- /dev/null +++ b/docs/source/installation.rst @@ -0,0 +1,106 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Installation +============ + +Requirements +------------ + +- Python 3.10 or later +- NVIDIA GPU (for running health checks) +- NVIDIA drivers installed +- CUDA toolkit (optional, for full functionality) + +Installation Methods +-------------------- + +From PyPI +^^^^^^^^^ + +The simplest way to install RAPIDS CLI is via pip: + +.. 
code-block:: bash + + pip install rapids-cli + +From Conda +^^^^^^^^^^ + +You can also install via conda: + +.. code-block:: bash + + conda install -c rapidsai -c conda-forge rapids-cli + +From Source +^^^^^^^^^^^ + +For development or to get the latest features: + +.. code-block:: bash + + git clone https://github.com/rapidsai/rapids-cli.git + cd rapids-cli + pip install -e . + +With Test Dependencies +^^^^^^^^^^^^^^^^^^^^^^ + +To run tests locally: + +.. code-block:: bash + + pip install -e .[test] + +Verification +------------ + +Verify the installation by running: + +.. code-block:: bash + + rapids --help + +You should see the RAPIDS CLI help message with available commands. + +Quick Test +^^^^^^^^^^ + +Run a quick health check to verify everything is working: + +.. code-block:: bash + + rapids doctor --verbose + +This will check your GPU availability, CUDA installation, and system configuration. + +Upgrading +--------- + +To upgrade to the latest version: + +.. code-block:: bash + + pip install --upgrade rapids-cli + +Or with conda: + +.. code-block:: bash + + conda update rapids-cli + +Uninstalling +------------ + +To uninstall RAPIDS CLI: + +.. code-block:: bash + + pip uninstall rapids-cli + +Or with conda: + +.. code-block:: bash + + conda remove rapids-cli diff --git a/docs/source/plugin_development.rst b/docs/source/plugin_development.rst new file mode 100644 index 0000000..f4aa2ae --- /dev/null +++ b/docs/source/plugin_development.rst @@ -0,0 +1,477 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Plugin Development Guide +======================== + +The RAPIDS CLI uses a plugin system based on Python entry points to allow external packages +to register their own health checks. This guide shows you how to create plugins for your +RAPIDS library. 
+ +Overview +-------- + +Plugins are discovered automatically through Python entry points in the ``rapids_doctor_check`` +group. When ``rapids doctor`` runs, it discovers all registered checks and executes them. + +Quick Start +----------- + +Here's a minimal example of adding a check to your RAPIDS package: + +1. Create a check function in your package: + + .. code-block:: python + + # my_rapids_package/health_checks.py + + + def my_package_check(verbose=False, **kwargs): + """Check that my_rapids_package is working correctly.""" + import my_rapids_package + + # Perform your check + result = my_rapids_package.test_function() + + if not result: + raise ValueError("my_rapids_package self-test failed") + + return "my_rapids_package is working correctly" + +2. Register the check in your ``pyproject.toml``: + + .. code-block:: toml + + [project.entry-points.rapids_doctor_check] + my_package_check = "my_rapids_package.health_checks:my_package_check" + +3. Install your package and test: + + .. code-block:: bash + + pip install -e . + rapids doctor --verbose + +Check Function Contract +----------------------- + +Your check function must follow these conventions: + +Function Signature +^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + def my_check(verbose=False, **kwargs): + """Check description goes here.""" + pass + +- Accept ``verbose`` parameter (boolean, default False) +- Accept ``**kwargs`` for forward compatibility +- Provide a clear docstring (first line is used in output) + +Return Values +^^^^^^^^^^^^^ + +**Success**: Return successfully (any return value) + +.. code-block:: python + + def check_success(verbose=False, **kwargs): + """This check always passes.""" + # Option 1: Return None (implicit) + return + + + def check_with_info(verbose=False, **kwargs): + """This check passes with info.""" + # Option 2: Return a string for verbose output + return "GPU 0: Tesla V100, 32GB memory" + +**Failure**: Raise an exception with a helpful message + +.. 
code-block:: python + + def check_failure(verbose=False, **kwargs): + """This check fails with helpful message.""" + if not some_condition(): + raise ValueError( + "Check failed: XYZ is not configured correctly. " + "To fix this, run: sudo apt-get install xyz" + ) + +**Warnings**: Use ``warnings.warn()`` for non-fatal issues + +.. code-block:: python + + import warnings + + + def check_with_warning(verbose=False, **kwargs): + """This check passes but issues a warning.""" + if not optimal_condition(): + warnings.warn( + "Suboptimal configuration detected. " "Performance may be degraded.", + stacklevel=2, + ) + return True + +Examples +-------- + +Example 1: Basic Import Check +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Check that your package can be imported: + +.. code-block:: python + + def import_check(verbose=False, **kwargs): + """Check that my_package can be imported.""" + try: + import my_package + except ImportError as e: + raise ImportError( + "my_package not found. Install with: pip install my_package" + ) from e + + if verbose: + return f"my_package version {my_package.__version__}" + return True + +Example 2: GPU Memory Check +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Check GPU memory requirements: + +.. code-block:: python + + import pynvml + + + def gpu_memory_check(verbose=False, **kwargs): + """Check that GPU has sufficient memory for my_package.""" + pynvml.nvmlInit() + + required_memory_gb = 8 + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + available_gb = memory_info.total / (1024**3) + + if available_gb < required_memory_gb: + raise ValueError( + f"Insufficient GPU memory: {available_gb:.1f}GB available, " + f"{required_memory_gb}GB required" + ) + + if verbose: + return f"GPU memory: {available_gb:.1f}GB available" + return True + +Example 3: Dependency Version Check +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Check that dependencies meet version requirements: + +.. 
code-block:: python + + import warnings + from packaging import version + + + def dependency_version_check(verbose=False, **kwargs): + """Check that dependencies meet minimum version requirements.""" + import numpy + import pandas + + min_numpy = "1.20.0" + min_pandas = "1.3.0" + + if version.parse(numpy.__version__) < version.parse(min_numpy): + raise ValueError( + f"NumPy {min_numpy}+ required, found {numpy.__version__}. " + f"Upgrade with: pip install 'numpy>={min_numpy}'" + ) + + if version.parse(pandas.__version__) < version.parse(min_pandas): + warnings.warn( + f"Pandas {min_pandas}+ recommended for best performance. " + f"Found {pandas.__version__}.", + stacklevel=2, + ) + + if verbose: + return f"NumPy {numpy.__version__}, Pandas {pandas.__version__}" + return True + +Example 4: Functional Test +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Run a simple functional test: + +.. code-block:: python + + def functional_check(verbose=False, **kwargs): + """Run a simple functional test.""" + import my_package + import numpy as np + + try: + # Create test data + data = np.random.rand(100, 10) + + # Run simple operation + result = my_package.process(data) + + # Verify result + assert result.shape == (100, 10), "Unexpected output shape" + assert not np.isnan(result).any(), "NaN values in output" + + except Exception as e: + raise RuntimeError( + f"Functional test failed: {e}. " "This may indicate a GPU or driver issue." + ) from e + + if verbose: + return "Functional test passed: basic operations working" + return True + +Best Practices +-------------- + +Clear Error Messages +^^^^^^^^^^^^^^^^^^^^ + +Always provide actionable error messages: + +.. code-block:: python + + # Bad: Unclear what to do + raise ValueError("Check failed") + + # Good: Clear action to fix + raise ValueError( + "CUDA 11.2+ required but CUDA 10.2 found. " + "Upgrade CUDA: https://developer.nvidia.com/cuda-downloads" + ) + +Performance +^^^^^^^^^^^ + +Keep checks fast (< 1 second each): + +.. 
code-block:: python + + # Bad: Slow check + def slow_check(verbose=False, **kwargs): + """This check is too slow.""" + result = expensive_computation() # Takes 30 seconds + return result + + + # Good: Fast check + def fast_check(verbose=False, **kwargs): + """This check is appropriately fast.""" + # Just verify configuration, don't run full workload + config = load_config() + validate_config(config) + return True + +Verbose Output +^^^^^^^^^^^^^^ + +Provide useful information in verbose mode: + +.. code-block:: python + + def informative_check(verbose=False, **kwargs): + """Check with informative output.""" + gpu_count = get_gpu_count() + gpu_memory = get_total_gpu_memory() + + if gpu_count == 0: + raise ValueError("No GPUs found") + + if verbose: + return f"Found {gpu_count} GPU(s) " f"with {gpu_memory:.1f}GB total memory" + return True + +Graceful Degradation +^^^^^^^^^^^^^^^^^^^^ + +Handle optional dependencies gracefully: + +.. code-block:: python + + def optional_dependency_check(verbose=False, **kwargs): + """Check that works with optional dependencies.""" + try: + import optional_package + + has_optional = True + except ImportError: + has_optional = False + + if not has_optional: + import warnings + + warnings.warn( + "optional_package not found. " "Some features will be disabled.", + stacklevel=2, + ) + + # Continue with check anyway + return True + +Testing Your Plugin +------------------- + +Test Plugin Discovery +^^^^^^^^^^^^^^^^^^^^^ + +Verify your check is discovered: + +.. code-block:: bash + + rapids doctor --verbose --dry-run | grep my_check + +Test Plugin Execution +^^^^^^^^^^^^^^^^^^^^^ + +Run your check: + +.. code-block:: bash + + rapids doctor --verbose my_package + +Unit Testing +^^^^^^^^^^^^ + +Test your check function directly: + +.. 
code-block:: python + + # test_health_checks.py + import pytest + from my_package.health_checks import my_check, my_check_with_bad_config + + + def test_my_check_success(): + """Test that check passes in normal conditions.""" + result = my_check(verbose=True) + assert result is not None + + + def test_my_check_failure(): + """Test that check fails appropriately.""" + with pytest.raises(ValueError, match="expected error"): + my_check_with_bad_config(verbose=False) + +Advanced Topics +--------------- + +Multiple Checks per Package +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Register multiple checks: + +.. code-block:: toml + + [project.entry-points.rapids_doctor_check] + my_pkg_import = "my_package.checks:import_check" + my_pkg_gpu = "my_package.checks:gpu_check" + my_pkg_functional = "my_package.checks:functional_check" + +Check Dependencies +^^^^^^^^^^^^^^^^^^ + +If checks have dependencies, handle them gracefully: + +.. code-block:: python + + import warnings + + + def dependent_check(verbose=False, **kwargs): + """This check depends on GPU check passing.""" + # Don't fail if dependencies aren't met + try: + import pynvml + + pynvml.nvmlInit() + except Exception: + warnings.warn("GPU not available, skipping dependent check", stacklevel=2) + return True + + # Rest of check + return True + +Environment-Specific Checks +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Adapt checks to different environments: + +.. code-block:: python + + import os + + + def environment_aware_check(verbose=False, **kwargs): + """Check that adapts to environment.""" + is_ci = os.environ.get("CI") == "true" + + if is_ci: + # Skip expensive checks in CI + return "Skipped in CI environment" + + # Run full check + run_expensive_validation() + return True + +Troubleshooting +--------------- + +Check Not Discovered +^^^^^^^^^^^^^^^^^^^^ + +If your check isn't showing up: + +1. Verify entry point is correct: + + .. code-block:: bash + + python -c "from importlib.metadata import entry_points; print([ep for ep in entry_points(group='rapids_doctor_check')])" + +2.
Reinstall your package: + + .. code-block:: bash + + pip install -e . --force-reinstall --no-deps + +3. Check for import errors: + + .. code-block:: python + + python -c "from my_package.checks import my_check" + +Check Always Fails +^^^^^^^^^^^^^^^^^^ + +Debug the check directly: + +.. code-block:: python + + from my_package.checks import my_check + + try: + result = my_check(verbose=True) + print(f"Success: {result}") + except Exception as e: + print(f"Failed: {e}") + import traceback + + traceback.print_exc() + +Resources +--------- + +- Entry points documentation: https://packaging.python.org/specifications/entry-points/ +- RAPIDS CLI repository: https://github.com/rapidsai/rapids-cli +- Example plugins: See built-in checks in ``rapids_cli/doctor/checks/`` diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst new file mode 100644 index 0000000..014f39f --- /dev/null +++ b/docs/source/troubleshooting.rst @@ -0,0 +1,395 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Troubleshooting +=============== + +This guide helps you resolve common issues with the RAPIDS CLI. + +Common Issues +------------- + +No GPUs Detected +^^^^^^^^^^^^^^^^ + +**Symptom**: ``rapids doctor`` reports "No available GPUs detected" + +**Solutions**: + +1. Verify NVIDIA drivers are installed: + + .. code-block:: bash + + nvidia-smi + + If this fails, install NVIDIA drivers: + + .. code-block:: bash + + # Ubuntu/Debian + sudo apt-get install nvidia-driver-550 + +2. Check that GPU is visible to Python: + + .. code-block:: bash + + python -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())" + +3. Verify you're not in a container without GPU access: + + .. code-block:: bash + + # Docker needs --gpus all flag + docker run --gpus all ... 
+ +CUDA Version Mismatch +^^^^^^^^^^^^^^^^^^^^^ + +**Symptom**: ``rapids doctor`` reports CUDA version incompatibility + +**Solutions**: + +1. Check your CUDA driver version: + + .. code-block:: bash + + nvidia-smi | grep "CUDA Version" + +2. Install compatible RAPIDS packages: + + .. code-block:: bash + + # For CUDA 11.x + pip install cudf-cu11 cuml-cu11 + + # For CUDA 12.x + pip install cudf-cu12 cuml-cu12 + +3. Update NVIDIA drivers if needed: + + .. code-block:: bash + + # Check https://docs.rapids.ai/install for requirements + sudo apt-get update && sudo apt-get upgrade nvidia-driver + +Low Memory Warning +^^^^^^^^^^^^^^^^^^ + +**Symptom**: Warning about system memory to GPU memory ratio + +**Context**: RAPIDS recommends 2:1 ratio of system RAM to GPU memory for optimal performance + +**Solutions**: + +1. This is a warning, not an error. RAPIDS will still work. + +2. For better performance, consider: + + - Adding more system RAM + - Using data chunking strategies + - Processing smaller batches + +3. For Dask workloads, adjust worker memory limits: + + .. code-block:: python + + from dask_cuda import LocalCUDACluster + + cluster = LocalCUDACluster( + device_memory_limit="8GB", # Limit per worker + memory_limit="16GB", # System memory per worker + ) + +NVLink Not Found +^^^^^^^^^^^^^^^^ + +**Symptom**: ``rapids doctor`` reports NVLink is not available + +**Context**: NVLink is only available on multi-GPU systems with NVLink-capable GPUs + +**Solutions**: + +1. If you have only one GPU, this is expected. NVLink is not needed. + +2. For multi-GPU systems without NVLink: + + - RAPIDS will work but inter-GPU transfers will be slower + - Consider PCIe topology optimization + +3. Verify NVLink status: + + .. 
code-block:: bash + + nvidia-smi nvlink --status + +Insufficient Compute Capability +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Symptom**: "GPU requires compute capability 7.0 or higher" + +**Context**: RAPIDS requires GPU compute capability 7.0+ (Volta architecture or newer) + +**Solutions**: + +1. Check your GPU compute capability: + + .. code-block:: bash + + rapids debug | grep "GPU" + +2. Supported GPUs include: + + - Tesla V100, A100, H100 + - RTX 20xx, 30xx, 40xx series + - GTX 1660 and above + +3. If your GPU is too old, you'll need to upgrade hardware. + +Check Discovery Issues +^^^^^^^^^^^^^^^^^^^^^^ + +**Symptom**: Custom checks not discovered by ``rapids doctor`` + +**Solutions**: + +1. Verify entry point registration: + + .. code-block:: bash + + python -c "from importlib.metadata import entry_points; \ + print([ep.name for ep in entry_points(group='rapids_doctor_check')])" + +2. Reinstall package with entry points: + + .. code-block:: bash + + pip install -e . --force-reinstall + +3. Check for import errors: + + .. code-block:: bash + + rapids doctor --verbose + + Look for "Failed to import" messages. + +Import Errors +^^^^^^^^^^^^^ + +**Symptom**: "ModuleNotFoundError" when running checks + +**Solutions**: + +1. Verify package is installed: + + .. code-block:: bash + + pip list | grep rapids + +2. Check Python environment: + + .. code-block:: bash + + which python + python --version + +3. Ensure you're in the correct virtual environment: + + .. code-block:: bash + + # Conda + conda activate rapids-env + + # venv + source venv/bin/activate + +Permission Errors +^^^^^^^^^^^^^^^^^ + +**Symptom**: "Permission denied" when accessing GPU + +**Solutions**: + +1. Add user to video/render groups: + + .. code-block:: bash + + sudo usermod -a -G video $USER + sudo usermod -a -G render $USER + + # Log out and back in for changes to take effect + +2. Check device permissions: + + .. code-block:: bash + + ls -l /dev/nvidia* + +3. 
For containers, ensure proper device mounting: + + .. code-block:: bash + + docker run --gpus all --device=/dev/nvidia0 ... + +Debugging Tips +-------------- + +Enable Verbose Mode +^^^^^^^^^^^^^^^^^^^ + +Always start with verbose output: + +.. code-block:: bash + + rapids doctor --verbose + +This shows: + +- Which checks are discovered +- Detailed error messages +- Stack traces for failures + +Gather Debug Information +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Collect comprehensive system information: + +.. code-block:: bash + + rapids debug --json > debug_info.json + +Share this file when reporting issues. + +Test Individual Components +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Test NVIDIA stack components: + +.. code-block:: bash + + # Test nvidia-smi + nvidia-smi + + # Test pynvml (Python binding) + python -c "import pynvml; pynvml.nvmlInit(); print('OK')" + + # Test CUDA (cuda-python binding) + python -c "import cuda; print(cuda.__version__)" + +Check Environment Variables +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Verify CUDA-related environment variables: + +.. code-block:: bash + + echo $CUDA_HOME + echo $LD_LIBRARY_PATH + echo $PATH + +Run in Isolation +^^^^^^^^^^^^^^^^ + +Test in a clean environment: + +.. code-block:: bash + + # Create fresh environment + conda create -n test-rapids python=3.10 + conda activate test-rapids + + # Install only RAPIDS CLI + pip install rapids-cli + + # Test + rapids doctor + +Enable Python Warnings +^^^^^^^^^^^^^^^^^^^^^^ + +See all warnings: + +.. code-block:: bash + + python -W always -m rapids_cli.cli doctor + +Performance Issues +------------------ + +Slow Check Execution +^^^^^^^^^^^^^^^^^^^^ + +If checks are slow: + +1. Use ``--dry-run`` to verify discovery without execution: + + .. code-block:: bash + + rapids doctor --dry-run + +2. Profile individual checks: + + .. code-block:: python + + import time + from my_package.checks import my_check + + start = time.time() + my_check(verbose=True) + print(f"Check took {time.time() - start:.2f}s") + +3.
Optimize slow checks (keep under 1 second each) + +High Memory Usage +^^^^^^^^^^^^^^^^^ + +If ``rapids doctor`` uses too much memory: + +1. This is unexpected - report as a bug + +2. Workaround: Run checks individually: + + .. code-block:: bash + + rapids doctor package1 + rapids doctor package2 + +Reporting Issues +---------------- + +When reporting issues, include: + +1. Output of ``rapids debug --json`` + +2. Complete error messages from ``rapids doctor --verbose`` + +3. Steps to reproduce + +4. Expected vs actual behavior + +5. Environment details: + + .. code-block:: bash + + rapids debug > environment.txt + python --version + pip list > packages.txt + +Submit issues at: https://github.com/rapidsai/rapids-cli/issues + +Getting Help +------------ + +- GitHub Issues: https://github.com/rapidsai/rapids-cli/issues +- RAPIDS Slack: https://rapids.ai/community +- Documentation: https://docs.rapids.ai +- Stack Overflow: Tag questions with ``rapids`` and ``rapids-cli`` + +Known Limitations +----------------- + +- Windows support is experimental +- WSL2 requires special GPU setup +- Some checks require sudo access +- Docker containers need ``--gpus all`` flag +- Remote GPU monitoring not supported diff --git a/docs/source/user_guide.rst b/docs/source/user_guide.rst new file mode 100644 index 0000000..11da9ad --- /dev/null +++ b/docs/source/user_guide.rst @@ -0,0 +1,293 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +User Guide +========== + +This guide provides detailed information on using the RAPIDS CLI. + +Overview +-------- + +The RAPIDS CLI provides two main commands: + +- ``rapids doctor`` - Health checks for your RAPIDS installation +- ``rapids debug`` - Gather debugging information about your system + +rapids doctor +------------- + +The ``doctor`` command performs health checks to ensure your RAPIDS environment is properly configured. 
+ +Basic Usage +^^^^^^^^^^^ + +Run all health checks: + +.. code-block:: bash + + rapids doctor + +This will check: + +- GPU availability and compatibility +- CUDA driver version +- System memory to GPU memory ratio +- NVLink status (for multi-GPU systems) +- Any checks registered by installed RAPIDS packages + +Verbose Output +^^^^^^^^^^^^^^ + +Get detailed information about each check: + +.. code-block:: bash + + rapids doctor --verbose + +This shows: + +- Which checks are discovered +- Detailed output from each check +- Additional diagnostic information + +Dry Run +^^^^^^^ + +See which checks would run without actually executing them: + +.. code-block:: bash + + rapids doctor --dry-run + +This is useful for: + +- Verifying plugin discovery +- Debugging check registration issues +- Understanding what will be checked + +Filtering Checks +^^^^^^^^^^^^^^^^ + +Run only specific checks by filtering: + +.. code-block:: bash + + # Run only cuDF-related checks + rapids doctor cudf + + # Run multiple filtered checks + rapids doctor cudf cuml + +The filter matches any part of the check's module path. + +Exit Codes +^^^^^^^^^^ + +The ``doctor`` command returns: + +- ``0`` - All checks passed +- ``1`` - One or more checks failed + +This makes it suitable for use in scripts and CI/CD pipelines: + +.. code-block:: bash + + if rapids doctor; then + echo "Environment is ready!" + else + echo "Environment has issues!" + exit 1 + fi + +rapids debug +------------ + +The ``debug`` command gathers comprehensive information about your system for troubleshooting. + +Basic Usage +^^^^^^^^^^^ + +Generate a debug report: + +.. code-block:: bash + + rapids debug + +This displays: + +- Platform information +- NVIDIA driver version +- CUDA version +- Python version and configuration +- Installed package versions +- System tools (pip, conda, cmake, etc.) +- OS information + +JSON Output +^^^^^^^^^^^ + +Get machine-readable output: + +.. 
code-block:: bash + + rapids debug --json + +This is useful for: + +- Automated debugging scripts +- Parsing in other tools +- Sharing debug information programmatically + +The JSON output includes all information in a structured format: + +.. code-block:: json + + { + "date": "2025-02-11 15:30:00", + "platform": "Linux-6.8.0-94-generic-x86_64", + "driver_version": "550.54.15", + "cuda_version": "12.4", + "python_version": "3.13.12", + "package_versions": { + "rapids-cli": "0.1.0", + ... + }, + ... + } + +Saving Debug Output +^^^^^^^^^^^^^^^^^^^ + +Save debug information to a file: + +.. code-block:: bash + + rapids debug --json > debug_info.json + +This file can be: + +- Shared with support teams +- Attached to bug reports +- Used for comparison across environments + +Common Workflows +---------------- + +Pre-Installation Check +^^^^^^^^^^^^^^^^^^^^^^ + +Before installing RAPIDS, verify your system meets requirements: + +.. code-block:: bash + + # Install just the CLI first + pip install rapids-cli + + # Check system compatibility + rapids doctor --verbose + +The checks will tell you if your GPU, drivers, and CUDA are suitable for RAPIDS. + +Post-Installation Verification +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +After installing RAPIDS packages, verify everything works: + +.. code-block:: bash + + # Install RAPIDS + pip install cudf-cu12 cuml-cu12 + + # Verify installation + rapids doctor + + # If issues occur, gather debug info + rapids debug --json > debug_info.json + +CI/CD Integration +^^^^^^^^^^^^^^^^^ + +Use RAPIDS CLI in your CI/CD pipelines: + +.. code-block:: yaml + + # GitHub Actions example + - name: Verify RAPIDS Environment + run: | + pip install rapids-cli + rapids doctor --verbose || exit 1 + + - name: Save Debug Info on Failure + if: failure() + run: rapids debug --json > ${{ github.workspace }}/debug.json + +Troubleshooting Workflow +^^^^^^^^^^^^^^^^^^^^^^^^^ + +When encountering issues: + +1. Run verbose health check: + + .. 
code-block:: bash + + rapids doctor --verbose + +2. Review warning messages and failures + +3. Gather full debug information: + + .. code-block:: bash + + rapids debug > debug_output.txt + +4. Check troubleshooting guide (see :doc:`troubleshooting`) + +5. Report issues with debug output + +Best Practices +-------------- + +Regular Health Checks +^^^^^^^^^^^^^^^^^^^^^ + +Run ``rapids doctor`` regularly to catch configuration drift: + +.. code-block:: bash + + # Add to your shell profile + alias rapids-check='rapids doctor && echo "✓ RAPIDS environment healthy"' + +Environment Documentation +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Document your environment with debug output: + +.. code-block:: bash + + # Save baseline configuration + rapids debug --json > baseline_env.json + + # Later, compare environments + rapids debug --json > current_env.json + diff baseline_env.json current_env.json + +Automated Monitoring +^^^^^^^^^^^^^^^^^^^^ + +Monitor RAPIDS environments automatically: + +.. code-block:: bash + + #!/bin/bash + # daily_rapids_check.sh + + if ! rapids doctor; then + rapids debug --json | mail -s "RAPIDS Health Check Failed" admin@example.com + fi + +Add to cron: + +.. 
code-block:: bash + + 0 9 * * * /path/to/daily_rapids_check.sh From 005391c1e14b7d7cd0f3f44ef09986c1f477dc20 Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Wed, 11 Feb 2026 16:20:12 -0500 Subject: [PATCH 2/3] updated docs to match the deployment docs --- dependencies.yaml | 3 +- docs/source/api/checks.rst | 231 +-------------- docs/source/api/cli.rst | 56 +--- docs/source/api/debug.rst | 238 ++-------------- docs/source/api/doctor.rst | 147 ++-------- docs/source/conf.py | 73 +++-- docs/source/contributing.rst | 363 ------------------------ docs/source/index.rst | 25 +- docs/source/installation.rst | 106 ------- docs/source/plugin_development.rst | 439 +++++------------------------ docs/source/troubleshooting.rst | 369 +++--------------------- docs/source/user_guide.rst | 254 +++-------------- rapids_cli/doctor/doctor.py | 26 +- 13 files changed, 251 insertions(+), 2079 deletions(-) delete mode 100644 docs/source/contributing.rst delete mode 100644 docs/source/installation.rst diff --git a/dependencies.yaml b/dependencies.yaml index 12cb729..fdda2a2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -79,8 +79,9 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: + - pydata-sphinx-theme - sphinx - - sphinx-rtd-theme + - sphinx-copybutton test_python: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/source/api/checks.rst b/docs/source/api/checks.rst index acf8775..363e26f 100644 --- a/docs/source/api/checks.rst +++ b/docs/source/api/checks.rst @@ -4,7 +4,10 @@ Health Checks ============= -Built-in health check modules for verifying RAPIDS installation requirements. +Built-in health check modules registered via the ``rapids_doctor_check`` +entry point group in ``pyproject.toml``. + +All check functions follow the contract described in :doc:`../plugin_development`. GPU Checks ---------- @@ -14,57 +17,6 @@ GPU Checks :undoc-members: :show-inheritance: -gpu_check -^^^^^^^^^ - -.. 
autofunction:: rapids_cli.doctor.checks.gpu.gpu_check - -Verifies that NVIDIA GPUs are available and accessible. - -**Parameters:** - -- ``verbose`` (bool): Enable detailed output - -**Returns:** - -- str: Message indicating number of GPUs detected - -**Raises:** - -- ValueError: If no GPUs are detected -- AssertionError: If GPU count is zero - -**Example:** - -.. code-block:: python - - >>> gpu_check(verbose=True) - 'GPU(s) detected: 2' - -check_gpu_compute_capability -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.doctor.checks.gpu.check_gpu_compute_capability - -Verifies that all GPUs meet minimum compute capability requirements. - -**Parameters:** - -- ``verbose`` (bool): Enable detailed output - -**Returns:** - -- bool: True if all GPUs meet requirements - -**Raises:** - -- ValueError: If any GPU has insufficient compute capability - -**Required Compute Capability:** - -- Minimum: 7.0 (Volta architecture or newer) -- Supported GPUs: V100, A100, H100, RTX 20xx/30xx/40xx series - CUDA Driver Checks ------------------ @@ -73,32 +25,6 @@ CUDA Driver Checks :undoc-members: :show-inheritance: -cuda_check -^^^^^^^^^^ - -.. autofunction:: rapids_cli.doctor.checks.cuda_driver.cuda_check - -Verifies CUDA driver availability and retrieves version. - -**Parameters:** - -- ``verbose`` (bool): Enable detailed output - -**Returns:** - -- int: CUDA driver version code (e.g., 12040 for CUDA 12.4) - -**Raises:** - -- ValueError: If CUDA driver version cannot be determined - -**Example:** - -.. code-block:: python - - >>> cuda_check(verbose=True) - 12040 - Memory Checks ------------- @@ -107,62 +33,6 @@ Memory Checks :undoc-members: :show-inheritance: -get_system_memory -^^^^^^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.doctor.checks.memory.get_system_memory - -Retrieves total system memory in gigabytes. 
- -**Parameters:** - -- ``verbose`` (bool): Unused, kept for consistency - -**Returns:** - -- float: Total system memory in GB - -get_gpu_memory -^^^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.doctor.checks.memory.get_gpu_memory - -Calculates total GPU memory across all GPUs in gigabytes. - -**Parameters:** - -- ``verbose`` (bool): Unused, kept for consistency - -**Returns:** - -- float: Total GPU memory in GB - -check_memory_to_gpu_ratio -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.doctor.checks.memory.check_memory_to_gpu_ratio - -Verifies system-to-GPU memory ratio meets recommendations. - -**Parameters:** - -- ``verbose`` (bool): Enable detailed output - -**Returns:** - -- bool: Always returns True (issues warnings instead of failing) - -**Warnings:** - -Issues warning if ratio is less than 1.8:1 (below recommended 2:1) - -**Recommendation:** - -For optimal performance, especially with Dask: - -- System memory should be at least 2x total GPU memory -- Example: 64GB RAM for 32GB total GPU memory (2x 16GB GPUs) - NVLink Checks ------------- @@ -170,96 +40,3 @@ NVLink Checks :members: :undoc-members: :show-inheritance: - -check_nvlink_status -^^^^^^^^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.doctor.checks.nvlink.check_nvlink_status - -Checks for NVLink availability on multi-GPU systems. - -**Parameters:** - -- ``verbose`` (bool): Enable detailed output - -**Returns:** - -- bool: False if fewer than 2 GPUs, True if NVLink detected - -**Raises:** - -- ValueError: If NVLink status check fails on multi-GPU system - -**NVLink Benefits:** - -- High-bandwidth GPU-to-GPU communication -- Essential for multi-GPU training and processing -- Significantly faster than PCIe transfers - -**Note:** - -Only relevant for multi-GPU systems with NVLink-capable GPUs. - -Check Function Contract ------------------------ - -All built-in checks follow these conventions: - -Function Signature -^^^^^^^^^^^^^^^^^^ - -.. 
code-block:: python - - def check_function(verbose=False, **kwargs): - """Brief description of what this check verifies.""" - pass - -**Parameters:** - -- ``verbose`` (bool): Whether to provide detailed output -- ``**kwargs``: Reserved for future compatibility - -Return Values -^^^^^^^^^^^^^ - -**Success:** - -- Return any value (often True or a status string) -- Returning a string provides information for verbose output - -**Failure:** - -- Raise an exception with descriptive error message -- Use ValueError for failed checks -- Provide actionable guidance in error message - -**Warnings:** - -- Use ``warnings.warn()`` for non-fatal issues -- Always set ``stacklevel=2`` for correct source location - -Usage in Custom Checks ------------------------ - -Reference these built-in checks when creating custom checks: - -.. code-block:: python - - # Example: Custom memory check based on built-in pattern - from rapids_cli.doctor.checks.memory import get_gpu_memory - - - def my_memory_check(verbose=False, **kwargs): - """Check if GPU has enough memory for my workload.""" - gpu_memory = get_gpu_memory() - - required_gb = 16 - if gpu_memory < required_gb: - raise ValueError( - f"Insufficient GPU memory: {gpu_memory:.1f}GB available, " - f"{required_gb}GB required" - ) - - if verbose: - return f"GPU memory check passed: {gpu_memory:.1f}GB available" - return True diff --git a/docs/source/api/cli.rst b/docs/source/api/cli.rst index bd916ff..1580d51 100644 --- a/docs/source/api/cli.rst +++ b/docs/source/api/cli.rst @@ -4,59 +4,13 @@ CLI Module ========== -The CLI module provides the main command-line interface for RAPIDS CLI using Click. +The ``rapids_cli.cli`` module defines the main CLI entry point and subcommands +using `rich-click `_. + +The CLI is registered as a console script called ``rapids`` via the +``[project.scripts]`` entry in ``pyproject.toml``. .. 
automodule:: rapids_cli.cli :members: :undoc-members: :show-inheritance: - -Main Commands -------------- - -rapids -^^^^^^ - -.. autofunction:: rapids_cli.cli.rapids - -The main CLI entry point. Provides access to all subcommands. - -doctor -^^^^^^ - -.. autofunction:: rapids_cli.cli.doctor - -Run health checks to verify RAPIDS installation. - -**Options:** - -- ``--verbose``: Enable detailed output -- ``--dry-run``: Show which checks would run without executing them -- ``filters``: Optional filters to run specific checks - -**Exit Codes:** - -- 0: All checks passed -- 1: One or more checks failed - -debug -^^^^^ - -.. autofunction:: rapids_cli.cli.debug - -Gather comprehensive debugging information. - -**Options:** - -- ``--json``: Output in JSON format for machine parsing - -**Output:** - -Returns detailed system information including: - -- Platform and OS details -- GPU and driver information -- CUDA version -- Python configuration -- Installed packages -- Available tools diff --git a/docs/source/api/debug.rst b/docs/source/api/debug.rst index b4ce4b4..9c50567 100644 --- a/docs/source/api/debug.rst +++ b/docs/source/api/debug.rst @@ -4,230 +4,26 @@ Debug Module ============ -The debug module gathers comprehensive system information for troubleshooting. +The ``rapids_cli.debug.debug`` module gathers system and environment information +for troubleshooting RAPIDS installations. -.. automodule:: rapids_cli.debug.debug - :members: - :undoc-members: - :show-inheritance: - -Core Functions --------------- - -run_debug -^^^^^^^^^ - -.. autofunction:: rapids_cli.debug.debug.run_debug - -Main function for gathering and displaying debug information. - -**Parameters:** - -- ``output_format`` (str): Output format, either "console" or "json" +:func:`run_debug` is the main entry point. 
It collects: -**Collected Information:** - -- Date and time -- Platform information -- nvidia-smi output -- NVIDIA driver version -- CUDA version -- CUDA runtime path -- System CUDA toolkit locations -- Python version (full and short) -- Python hash info +- Platform and OS details (from ``platform`` and ``/etc/os-release``) +- NVIDIA driver and CUDA versions (via ``pynvml``) +- CUDA runtime path (via ``cuda-pathfinder``) +- System CUDA toolkit locations (globbing ``/usr/local/cuda*``) +- Python version and hash info - All installed package versions -- pip freeze output -- conda list output (if available) -- conda info output (if available) -- Available development tools -- OS information from /etc/os-release - -gather_cuda_version -^^^^^^^^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.debug.debug.gather_cuda_version - -Retrieves and formats CUDA driver version from pynvml. - -**Returns:** - -- str: CUDA version in format "Major.Minor" or "Major.Minor.Patch" - -**Example:** - -.. code-block:: python - - >>> gather_cuda_version() - '12.4' - -gather_package_versions -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.debug.debug.gather_package_versions - -Collects versions of all installed Python packages. - -**Returns:** - -- dict: Mapping of package names to version strings - -**Example:** - -.. code-block:: python - - >>> versions = gather_package_versions() - >>> versions['rapids-cli'] - '0.1.0' - -gather_command_output -^^^^^^^^^^^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.debug.debug.gather_command_output - -Executes a command and returns its output, with optional fallback. - -**Parameters:** - -- ``command`` (list[str]): Command and arguments to execute -- ``fallback_output`` (str | None): Value to return if command fails - -**Returns:** - -- str | None: Command output or fallback value - -**Example:** - -.. 
code-block:: python - - >>> gather_command_output(['pip', '--version']) - 'pip 24.0 from /usr/local/lib/python3.10/site-packages/pip (python 3.10)' - - >>> gather_command_output(['nonexistent'], fallback_output='Not installed') - 'Not installed' +- pip freeze and conda list output +- Tool versions: pip, conda, uv, pixi, g++, cmake, nvcc -gather_tools -^^^^^^^^^^^^ +Output is either a Rich-formatted console table or JSON (``--json``). -.. autofunction:: rapids_cli.debug.debug.gather_tools +API +--- -Gathers version information for common development tools. - -**Returns:** - -- dict: Tool names mapped to version strings or None - -**Checked Tools:** - -- pip -- conda -- uv -- pixi -- g++ -- cmake -- nvcc - -Output Formats --------------- - -Console Format -^^^^^^^^^^^^^^ - -Human-readable output with Rich formatting: - -.. code-block:: text - - RAPIDS Debug Information - - Date - 2025-02-11 15:30:00 - - Platform - Linux-6.8.0-94-generic-x86_64 - - Driver Version - 550.54.15 - - Cuda Version - 12.4 - - Package Versions - ┌─────────────────┬──────────┐ - │ rapids-cli │ 0.1.0 │ - │ cudf │ 25.02.0 │ - └─────────────────┴──────────┘ - -JSON Format -^^^^^^^^^^^ - -Machine-readable output for automation: - -.. code-block:: json - - { - "date": "2025-02-11 15:30:00", - "platform": "Linux-6.8.0-94-generic-x86_64", - "nvidia_smi_output": "...", - "driver_version": "550.54.15", - "cuda_version": "12.4", - "cuda_runtime_path": "/usr/local/cuda/include", - "system_ctk": ["/usr/local/cuda-12.4"], - "python_version_full": "3.13.12 (main, ...)", - "python_version": "3.13.12", - "python_hash_info": "sys.hash_info(...)", - "package_versions": { - "rapids-cli": "0.1.0" - }, - "pip_packages": "...", - "conda_packages": "...", - "conda_info": "...", - "tools": { - "pip": "pip 24.0", - "conda": "conda 24.1.0" - }, - "os_info": { - "NAME": "Ubuntu", - "VERSION": "22.04" - } - } - -Usage Examples --------------- - -Console Output -^^^^^^^^^^^^^^ - -.. 
code-block:: python - - from rapids_cli.debug.debug import run_debug - - # Display debug info in console - run_debug(output_format="console") - -JSON Output -^^^^^^^^^^^ - -.. code-block:: python - - import json - from rapids_cli.debug.debug import run_debug - - # Get JSON output - run_debug(output_format="json") - - # Can be captured with redirection - # rapids debug --json > debug.json - -Programmatic Access -^^^^^^^^^^^^^^^^^^^ - -.. code-block:: python - - from rapids_cli.debug.debug import gather_package_versions, gather_cuda_version - - # Get specific information - cuda_ver = gather_cuda_version() - packages = gather_package_versions() - - print(f"CUDA: {cuda_ver}") - print(f"Installed packages: {len(packages)}") +.. automodule:: rapids_cli.debug.debug + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/doctor.rst b/docs/source/api/doctor.rst index 16ab6ad..f4bd73d 100644 --- a/docs/source/api/doctor.rst +++ b/docs/source/api/doctor.rst @@ -4,138 +4,35 @@ Doctor Module ============= -The doctor module orchestrates health check execution and plugin discovery. +The ``rapids_cli.doctor.doctor`` module orchestrates health check discovery +and execution. -.. automodule:: rapids_cli.doctor.doctor - :members: - :undoc-members: - :show-inheritance: - -Core Functions --------------- - -doctor_check -^^^^^^^^^^^^ - -.. autofunction:: rapids_cli.doctor.doctor.doctor_check - -The main orchestration function for running health checks. - -**Parameters:** - -- ``verbose`` (bool): Enable detailed output -- ``dry_run`` (bool): Discover checks without executing them -- ``filters`` (list[str] | None): Optional filters to match check paths - -**Returns:** - -- bool: True if all checks passed, False if any failed - -**Process:** - -1. Discovers all registered checks via entry points -2. Filters checks based on provided filters -3. Executes each check and captures results -4. Collects warnings from checks -5. 
Displays results and returns success status - -CheckResult -^^^^^^^^^^^ - -.. autoclass:: rapids_cli.doctor.doctor.CheckResult - :members: - -Data class representing the result of a single check execution. - -**Attributes:** - -- ``name`` (str): Name of the check function -- ``description`` (str): First line of check's docstring -- ``status`` (bool): True if check passed, False if failed -- ``value`` (str | None): Optional return value for verbose output -- ``error`` (Exception | None): Exception if check failed -- ``warnings`` (list[WarningMessage] | None): Any warnings issued during check - -Plugin Discovery ----------------- - -The doctor module discovers plugins using Python entry points: - -.. code-block:: python - - from importlib.metadata import entry_points - - for ep in entry_points(group="rapids_doctor_check"): - check_fn = ep.load() - # Execute check - -Entry Point Group -^^^^^^^^^^^^^^^^^ - -Plugins register in the ``rapids_doctor_check`` group: - -.. code-block:: toml - - [project.entry-points.rapids_doctor_check] - my_check = "my_package.checks:my_check_function" +Checks are discovered via Python entry points in the ``rapids_doctor_check`` +group. Each check function is called with ``verbose`` as a keyword argument. +Results are collected into :class:`CheckResult` objects that track pass/fail +status, return values, errors, and warnings. Check Execution Flow -------------------- -1. **Discovery Phase** - - - Scan entry points for ``rapids_doctor_check`` group - - Load check functions - - Apply filters if specified - -2. **Execution Phase** - - - Run each check with ``verbose`` parameter - - Capture warnings using ``warnings.catch_warnings()`` - - Catch exceptions for failed checks - - Store results in CheckResult objects - -3. 
**Reporting Phase** - - - Display warnings - - Show verbose output if requested - - List failed checks with error messages - - Return overall success status - -Error Handling --------------- - -The doctor module handles several error scenarios: +1. **Discovery**: Scan ``rapids_doctor_check`` entry points and load check + functions. ``ImportError`` and ``AttributeError`` during loading are + silently suppressed via ``contextlib.suppress``. -**Import Errors** +2. **Filtering**: If filter arguments are provided, only checks whose + ``ep.value`` contains a filter substring are kept. -Failed imports during discovery are suppressed with ``contextlib.suppress``: +3. **Execution**: Each check runs inside ``warnings.catch_warnings(record=True)`` + so warnings are captured. Exceptions are caught and stored rather than + propagated. -.. code-block:: python +4. **Reporting**: Warnings are printed, verbose output is shown for passing + checks, and failed checks are listed with their error messages. - with contextlib.suppress(AttributeError, ImportError): - check_fn = ep.load() +API +--- -**Check Exceptions** - -Exceptions raised by checks are caught and stored: - -.. code-block:: python - - try: - value = check_fn(verbose=verbose) - status = True - except Exception as e: - error = e - status = False - -**Warnings** - -Python warnings are captured and displayed: - -.. code-block:: python - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - value = check_fn(verbose=verbose) - caught_warnings = w +.. 
automodule:: rapids_cli.doctor.doctor + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py index 2303b4a..b4df851 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,80 +5,77 @@ # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html -# -- Project information ----------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information - - +import datetime import os import sys sys.path.insert(0, os.path.abspath("../../")) +# -- Project information ----------------------------------------------------- project = "RAPIDS CLI" -copyright = "2025-2026, NVIDIA CORPORATION & AFFILIATES" -author = "NVIDIA RAPIDS" - -# The short X.Y version -version = "0.1" -# The full version, including alpha/beta/rc tags -release = "0.1.0" +html_title = "RAPIDS CLI" +copyright = f"{datetime.date.today().year}, NVIDIA" +author = "NVIDIA" # -- General configuration --------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ "sphinx.ext.autodoc", - "sphinx.ext.autosummary", "sphinx.ext.viewcode", - "sphinx.ext.napoleon", # For Google and NumPy style docstrings - "sphinx.ext.intersphinx", # Link to other project docs - "sphinx.ext.todo", # Support for todo items + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx_copybutton", ] templates_path = ["_templates"] exclude_patterns = [] +copybutton_prompt_text = r">>> |\.\.\. 
|\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_is_regexp = True + # Napoleon settings for Google-style docstrings napoleon_google_docstring = True napoleon_numpy_docstring = False -napoleon_include_init_with_doc = True -napoleon_include_private_with_doc = False -napoleon_include_special_with_doc = True -napoleon_use_admonition_for_examples = True -napoleon_use_admonition_for_notes = True -napoleon_use_admonition_for_references = True -napoleon_use_ivar = False -napoleon_use_param = True -napoleon_use_rtype = True # Autodoc settings autodoc_default_options = { "members": True, "member-order": "bysource", - "special-members": "__init__", "undoc-members": True, - "exclude-members": "__weakref__", } -# Intersphinx mapping intersphinx_mapping = { "python": ("https://docs.python.org/3", None), } # -- Options for HTML output ------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = "sphinx_rtd_theme" -html_static_path = ["_static"] +html_theme = "pydata_sphinx_theme" html_theme_options = { - "navigation_depth": 4, - "collapse_navigation": False, - "sticky_navigation": True, - "includehidden": True, + "header_links_before_dropdown": 7, + "icon_links": [], + "logo": { + "link": "https://docs.rapids.ai/", + }, + "github_url": "https://github.com/rapidsai/rapids-cli", + "show_toc_level": 1, + "navbar_align": "right", +} + +html_sidebars = { + "**": ["sidebar-nav-bs", "sidebar-ethical-ads"], } -# Add any paths that contain custom static files (such as style sheets) -html_logo = None -html_favicon = None +html_static_path = ["_static"] + + +def setup(app): + app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") + app.add_css_file( + "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.9.0/css/all.min.css" + ) + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) diff --git a/docs/source/contributing.rst 
b/docs/source/contributing.rst deleted file mode 100644 index 9d721f3..0000000 --- a/docs/source/contributing.rst +++ /dev/null @@ -1,363 +0,0 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -.. SPDX-License-Identifier: Apache-2.0 - -Contributing Guide -================== - -Thank you for your interest in contributing to RAPIDS CLI! This guide will help you get started. - -Getting Started ---------------- - -Prerequisites -^^^^^^^^^^^^^ - -- Python 3.10 or later -- Git -- NVIDIA GPU (for testing) -- NVIDIA drivers and CUDA toolkit - -Development Setup -^^^^^^^^^^^^^^^^^ - -1. Fork and clone the repository: - - .. code-block:: bash - - git clone https://github.com/YOUR_USERNAME/rapids-cli.git - cd rapids-cli - -2. Create a development environment: - - .. code-block:: bash - - # Using conda (recommended) - conda create -n rapids-cli-dev python=3.10 - conda activate rapids-cli-dev - - # Or using venv - python -m venv venv - source venv/bin/activate - -3. Install in editable mode with test dependencies: - - .. code-block:: bash - - pip install -e .[test] - -4. Install pre-commit hooks: - - .. code-block:: bash - - pre-commit install - -Development Workflow --------------------- - -Making Changes -^^^^^^^^^^^^^^ - -1. Create a feature branch: - - .. code-block:: bash - - git checkout -b feature/your-feature-name - -2. Make your changes following the code style guidelines - -3. Add tests for your changes - -4. Run tests locally: - - .. code-block:: bash - - pytest - -5. Run linting checks: - - .. code-block:: bash - - pre-commit run --all-files - -Code Style ----------- - -The project uses several linting tools to maintain code quality: - -Formatting -^^^^^^^^^^ - -- **Black**: Code formatting (120 char line length) -- **isort**: Import sorting - -Linting -^^^^^^^ - -- **Ruff**: Fast Python linter (replaces flake8, pylint, etc.) -- **mypy**: Static type checking - -Run formatters and linters: - -.. 
code-block:: bash - - # Format code - black . - - # Check with ruff - ruff check --fix . - - # Type check - mypy rapids_cli/ - -Docstrings -^^^^^^^^^^ - -Use Google-style docstrings: - -.. code-block:: python - - def my_function(param1: str, param2: int) -> bool: - """Brief description of the function. - - Longer description if needed. - - Args: - param1: Description of param1. - param2: Description of param2. - - Returns: - Description of return value. - - Raises: - ValueError: Description of when this is raised. - - Example: - >>> my_function("test", 42) - True - """ - pass - -Testing -------- - -Writing Tests -^^^^^^^^^^^^^ - -- Place tests in ``rapids_cli/tests/`` -- Use pytest for testing -- Mock external dependencies (pynvml, subprocess calls, etc.) -- Aim for high coverage (95%+ required) - -Test Structure: - -.. code-block:: python - - # rapids_cli/tests/test_my_feature.py - from unittest.mock import patch, MagicMock - import pytest - - from rapids_cli.my_module import my_function - - - def test_my_function_success(): - """Test that my_function works in normal case.""" - result = my_function("input") - assert result == "expected" - - - def test_my_function_failure(): - """Test that my_function handles errors correctly.""" - with pytest.raises(ValueError, match="error message"): - my_function("invalid") - - - def test_my_function_with_mock(): - """Test my_function with mocked dependencies.""" - with patch("pynvml.nvmlInit") as mock_init: - result = my_function() - mock_init.assert_called_once() - -Running Tests -^^^^^^^^^^^^^ - -.. code-block:: bash - - # Run all tests - pytest - - # Run specific test file - pytest rapids_cli/tests/test_doctor.py - - # Run with coverage - pytest --cov=rapids_cli - - # Run specific test - pytest rapids_cli/tests/test_doctor.py::test_doctor_check_all_pass - -Pull Request Process --------------------- - -1. Ensure all tests pass and coverage is maintained - -2. Update documentation if needed - -3. Sign your commits: - - .. 
code-block:: bash - - git commit -s -m "Your commit message" - -4. Push to your fork: - - .. code-block:: bash - - git push origin feature/your-feature-name - -5. Create a pull request on GitHub - -6. Address review feedback - -Commit Messages -^^^^^^^^^^^^^^^ - -Follow conventional commit format: - -.. code-block:: text - - : - - - - Signed-off-by: Your Name - -Types: - -- ``feat``: New feature -- ``fix``: Bug fix -- ``docs``: Documentation changes -- ``test``: Adding or updating tests -- ``refactor``: Code refactoring -- ``ci``: CI/CD changes -- ``chore``: Maintenance tasks - -Example: - -.. code-block:: text - - feat: add support for filtering checks by package name - - This allows users to run only specific checks by providing - filter arguments to the doctor command. - - Signed-off-by: Jane Doe - -Documentation -------------- - -Building Documentation -^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: bash - - cd docs - make html - - # View in browser - open build/html/index.html - -Documentation lives in ``docs/source/`` and uses Sphinx with reStructuredText. - -Adding New Documentation -^^^^^^^^^^^^^^^^^^^^^^^^ - -1. Create ``.rst`` file in ``docs/source/`` - -2. Add to table of contents in ``index.rst`` - -3. Build and verify: - - .. code-block:: bash - - cd docs - make html - -Reporting Issues ----------------- - -When reporting bugs: - -1. Check if issue already exists - -2. Provide minimal reproduction example - -3. Include debug output: - - .. code-block:: bash - - rapids debug --json > debug_info.json - -4. Include: - - - RAPIDS CLI version - - Python version - - OS and driver versions - - Expected vs actual behavior - -Feature Requests -^^^^^^^^^^^^^^^^ - -For feature requests: - -1. Describe the use case - -2. Explain why existing features don't work - -3. Provide example usage - -4. Consider contributing the feature! 
- -Code Review Guidelines ----------------------- - -For Reviewers -^^^^^^^^^^^^^ - -- Check that tests cover new functionality -- Verify documentation is updated -- Ensure code style is consistent -- Look for potential edge cases -- Validate error messages are helpful - -For Contributors -^^^^^^^^^^^^^^^^ - -- Respond to feedback promptly -- Ask questions if feedback is unclear -- Keep PRs focused on single concern -- Update based on reviews - -Release Process ---------------- - -Releases are managed by maintainers: - -1. Version is managed via git tags -2. CI automatically builds packages -3. Packages published to PyPI and conda-forge - -Community ---------- - -- GitHub Discussions: Q&A and ideas -- Slack: Real-time chat at rapids.ai/community -- Issues: Bug reports and features - -License -------- - -By contributing, you agree that your contributions will be licensed under the Apache 2.0 License. - -Thank You! ----------- - -Every contribution helps make RAPIDS CLI better. Thank you for your time and effort! diff --git a/docs/source/index.rst b/docs/source/index.rst index 9c27bfa..474bf39 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -9,43 +9,23 @@ primarily focused on health checks (``rapids doctor``) and debugging (``rapids d It uses a plugin system that allows RAPIDS libraries to register their own health checks via Python entry points. -.. image:: https://img.shields.io/badge/python-3.10+-blue.svg - :target: https://www.python.org/downloads/ - :alt: Python Version - -.. image:: https://img.shields.io/badge/license-Apache%202.0-blue.svg - :target: https://github.com/rapidsai/rapids-cli/blob/main/LICENSE - :alt: License - Quick Start ----------- -Install the RAPIDS CLI: - .. code-block:: bash pip install rapids-cli -Run health checks: - -.. code-block:: bash - + # Run health checks rapids doctor -Gather debugging information: - -.. 
code-block:: bash - + # Gather system info for debugging rapids debug --json -Documentation Contents ----------------------- - .. toctree:: :maxdepth: 2 :caption: User Guide - installation user_guide troubleshooting @@ -54,7 +34,6 @@ Documentation Contents :caption: Developer Guide plugin_development - contributing .. toctree:: :maxdepth: 2 diff --git a/docs/source/installation.rst b/docs/source/installation.rst deleted file mode 100644 index 213d474..0000000 --- a/docs/source/installation.rst +++ /dev/null @@ -1,106 +0,0 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -.. SPDX-License-Identifier: Apache-2.0 - -Installation -============ - -Requirements ------------- - -- Python 3.10 or later -- NVIDIA GPU (for running health checks) -- NVIDIA drivers installed -- CUDA toolkit (optional, for full functionality) - -Installation Methods --------------------- - -From PyPI -^^^^^^^^^ - -The simplest way to install RAPIDS CLI is via pip: - -.. code-block:: bash - - pip install rapids-cli - -From Conda -^^^^^^^^^^ - -You can also install via conda: - -.. code-block:: bash - - conda install -c rapidsai -c conda-forge rapids-cli - -From Source -^^^^^^^^^^^ - -For development or to get the latest features: - -.. code-block:: bash - - git clone https://github.com/rapidsai/rapids-cli.git - cd rapids-cli - pip install -e . - -With Test Dependencies -^^^^^^^^^^^^^^^^^^^^^^ - -To run tests locally: - -.. code-block:: bash - - pip install -e .[test] - -Verification ------------- - -Verify the installation by running: - -.. code-block:: bash - - rapids --help - -You should see the RAPIDS CLI help message with available commands. - -Quick Test -^^^^^^^^^^ - -Run a quick health check to verify everything is working: - -.. code-block:: bash - - rapids doctor --verbose - -This will check your GPU availability, CUDA installation, and system configuration. - -Upgrading ---------- - -To upgrade to the latest version: - -.. 
code-block:: bash - - pip install --upgrade rapids-cli - -Or with conda: - -.. code-block:: bash - - conda update rapids-cli - -Uninstalling ------------- - -To uninstall RAPIDS CLI: - -.. code-block:: bash - - pip uninstall rapids-cli - -Or with conda: - -.. code-block:: bash - - conda remove rapids-cli diff --git a/docs/source/plugin_development.rst b/docs/source/plugin_development.rst index f4aa2ae..f93b069 100644 --- a/docs/source/plugin_development.rst +++ b/docs/source/plugin_development.rst @@ -1,148 +1,78 @@ .. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 -Plugin Development Guide -======================== +Plugin Development +================== -The RAPIDS CLI uses a plugin system based on Python entry points to allow external packages -to register their own health checks. This guide shows you how to create plugins for your -RAPIDS library. - -Overview --------- - -Plugins are discovered automatically through Python entry points in the ``rapids_doctor_check`` -group. When ``rapids doctor`` runs, it discovers all registered checks and executes them. +Any package can add checks to ``rapids doctor`` by exposing a function via a +Python entry point in the ``rapids_doctor_check`` group. Quick Start ----------- -Here's a minimal example of adding a check to your RAPIDS package: - -1. Create a check function in your package: +1. Create a check function: .. code-block:: python - # my_rapids_package/health_checks.py - + # my_package/health_checks.py - def my_package_check(verbose=False, **kwargs): - """Check that my_rapids_package is working correctly.""" - import my_rapids_package - # Perform your check - result = my_rapids_package.test_function() + def my_check(verbose=False, **kwargs): + """Check that my_package is working correctly.""" + try: + import my_package + except ImportError as e: + raise ImportError( + "my_package not found. 
Install with: pip install my_package" + ) from e - if not result: - raise ValueError("my_rapids_package self-test failed") + if verbose: + return f"my_package {my_package.__version__} is available" - return "my_rapids_package is working correctly" - -2. Register the check in your ``pyproject.toml``: +2. Register it in ``pyproject.toml``: .. code-block:: toml [project.entry-points.rapids_doctor_check] - my_package_check = "my_rapids_package.health_checks:my_package_check" + my_check = "my_package.health_checks:my_check" -3. Install your package and test: +3. Install and verify: .. code-block:: bash pip install -e . - rapids doctor --verbose + rapids doctor --verbose --dry-run Check Function Contract ----------------------- -Your check function must follow these conventions: - -Function Signature -^^^^^^^^^^^^^^^^^^ +Signature +^^^^^^^^^ .. code-block:: python def my_check(verbose=False, **kwargs): - """Check description goes here.""" - pass + """First line of docstring is shown in output.""" + ... -- Accept ``verbose`` parameter (boolean, default False) -- Accept ``**kwargs`` for forward compatibility -- Provide a clear docstring (first line is used in output) +- Accept ``verbose`` (bool) and ``**kwargs`` for forward compatibility. +- The first line of the docstring is used as the check description in output. +- New keyword arguments may be added in the future but will never be removed, + so ``**kwargs`` ensures your check won't break. Return Values ^^^^^^^^^^^^^ -**Success**: Return successfully (any return value) - -.. code-block:: python - - def check_success(verbose=False, **kwargs): - """This check always passes.""" - # Option 1: Return None (implicit) - return - - - def check_with_info(verbose=False, **kwargs): - """This check passes with info.""" - # Option 2: Return a string for verbose output - return "GPU 0: Tesla V100, 32GB memory" - -**Failure**: Raise an exception with a helpful message - -.. 
code-block:: python - - def check_failure(verbose=False, **kwargs): - """This check fails with helpful message.""" - if not some_condition(): - raise ValueError( - "Check failed: XYZ is not configured correctly. " - "To fix this, run: sudo apt-get install xyz" - ) - -**Warnings**: Use ``warnings.warn()`` for non-fatal issues - -.. code-block:: python - - import warnings - - - def check_with_warning(verbose=False, **kwargs): - """This check passes but issues a warning.""" - if not optimal_condition(): - warnings.warn( - "Suboptimal configuration detected. " "Performance may be degraded.", - stacklevel=2, - ) - return True +- **Pass**: Return any value. Returning a string provides extra info shown in + ``--verbose`` mode. +- **Fail**: Raise an exception. The message should tell the user how to fix it. +- **Warn**: Call ``warnings.warn("message", stacklevel=2)`` for non-fatal issues. + Warnings are captured and displayed but do not cause the check to fail. Examples -------- -Example 1: Basic Import Check -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Check that your package can be imported: - -.. code-block:: python - - def import_check(verbose=False, **kwargs): - """Check that my_package can be imported.""" - try: - import my_package - except ImportError as e: - raise ImportError( - "my_package not found. Install with: pip install my_package" - ) from e - - if verbose: - return f"my_package version {my_package.__version__}" - return True - -Example 2: GPU Memory Check -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Check GPU memory requirements: +GPU memory requirement check: .. 
code-block:: python @@ -150,328 +80,97 @@ Check GPU memory requirements: def gpu_memory_check(verbose=False, **kwargs): - """Check that GPU has sufficient memory for my_package.""" + """Check that GPU has at least 8GB memory.""" pynvml.nvmlInit() - - required_memory_gb = 8 handle = pynvml.nvmlDeviceGetHandleByIndex(0) - memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - available_gb = memory_info.total / (1024**3) + mem = pynvml.nvmlDeviceGetMemoryInfo(handle) + available_gb = mem.total / (1024**3) - if available_gb < required_memory_gb: + if available_gb < 8: raise ValueError( - f"Insufficient GPU memory: {available_gb:.1f}GB available, " - f"{required_memory_gb}GB required" + f"Insufficient GPU memory: {available_gb:.1f}GB available, 8GB required" ) if verbose: - return f"GPU memory: {available_gb:.1f}GB available" - return True - -Example 3: Dependency Version Check -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + return f"GPU memory: {available_gb:.1f}GB" -Check that dependencies meet version requirements: +Non-fatal warning: .. code-block:: python import warnings - from packaging import version - - - def dependency_version_check(verbose=False, **kwargs): - """Check that dependencies meet minimum version requirements.""" - import numpy - import pandas - min_numpy = "1.20.0" - min_pandas = "1.3.0" - if version.parse(numpy.__version__) < version.parse(min_numpy): - raise ValueError( - f"NumPy {min_numpy}+ required, found {numpy.__version__}. " - f"Upgrade with: pip install 'numpy>={min_numpy}'" - ) - - if version.parse(pandas.__version__) < version.parse(min_pandas): + def config_check(verbose=False, **kwargs): + """Check optional configuration.""" + if not optimal_condition(): warnings.warn( - f"Pandas {min_pandas}+ recommended for best performance. " - f"Found {pandas.__version__}.", + "Suboptimal configuration detected. 
Performance may be degraded.", stacklevel=2, ) - if verbose: - return f"NumPy {numpy.__version__}, Pandas {pandas.__version__}" - return True - -Example 4: Functional Test -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Run a simple functional test: - -.. code-block:: python - - def functional_check(verbose=False, **kwargs): - """Run a simple functional test.""" - import my_package - import numpy as np - - try: - # Create test data - data = np.random.rand(100, 10) - - # Run simple operation - result = my_package.process(data) - - # Verify result - assert result.shape == (100, 10), "Unexpected output shape" - assert not np.isnan(result).any(), "NaN values in output" - - except Exception as e: - raise RuntimeError( - f"Functional test failed: {e}. " "This may indicate a GPU or driver issue." - ) from e - - if verbose: - return "Functional test passed: basic operations working" - return True - -Best Practices --------------- - -Clear Error Messages -^^^^^^^^^^^^^^^^^^^^ - -Always provide actionable error messages: - -.. code-block:: python - - # Bad: Unclear what to do - raise ValueError("Check failed") - - # Good: Clear action to fix - raise ValueError( - "CUDA 11.2+ required but CUDA 10.2 found. " - "Upgrade CUDA: https://developer.nvidia.com/cuda-downloads" - ) - -Performance -^^^^^^^^^^^ - -Keep checks fast (< 1 second each): - -.. code-block:: python - - # Bad: Slow check - def slow_check(verbose=False, **kwargs): - """This check is too slow.""" - result = expensive_computation() # Takes 30 seconds - return result - - - # Good: Fast check - def fast_check(verbose=False, **kwargs): - """This check is appropriately fast.""" - # Just verify configuration, don't run full workload - config = load_config() - validate_config(config) - return True - -Verbose Output -^^^^^^^^^^^^^^ +Multiple checks from one package: -Provide useful information in verbose mode: - -.. 
code-block:: python - - def informative_check(verbose=False, **kwargs): - """Check with informative output.""" - gpu_count = get_gpu_count() - gpu_memory = get_total_gpu_memory() - - if gpu_count == 0: - raise ValueError("No GPUs found") - - if verbose: - return f"Found {gpu_count} GPU(s) " f"with {gpu_memory:.1f}GB total memory" - return True - -Graceful Degradation -^^^^^^^^^^^^^^^^^^^^ - -Handle optional dependencies gracefully: - -.. code-block:: python - - def optional_dependency_check(verbose=False, **kwargs): - """Check that works with optional dependencies.""" - try: - import optional_package - - has_optional = True - except ImportError: - has_optional = False - - if not has_optional: - import warnings - - warnings.warn( - "optional_package not found. " "Some features will be disabled.", - stacklevel=2, - ) +.. code-block:: toml - # Continue with check anyway - return True + [project.entry-points.rapids_doctor_check] + my_pkg_import = "my_package.checks:import_check" + my_pkg_gpu = "my_package.checks:gpu_check" + my_pkg_functional = "my_package.checks:functional_check" Testing Your Plugin ------------------- -Test Plugin Discovery -^^^^^^^^^^^^^^^^^^^^^ - -Verify your check is discovered: +Verify discovery: .. code-block:: bash rapids doctor --verbose --dry-run | grep my_check -Test Plugin Execution -^^^^^^^^^^^^^^^^^^^^^ - -Run your check: +Run only your checks: .. code-block:: bash rapids doctor --verbose my_package -Unit Testing -^^^^^^^^^^^^ - -Test your check function directly: +Unit test with mocks (following the pattern in ``rapids_cli/tests/``): .. 
code-block:: python - # test_health_checks.py + from unittest.mock import patch + import pytest + from my_package.health_checks import my_check def test_my_check_success(): - """Test that check passes in normal conditions.""" result = my_check(verbose=True) assert result is not None def test_my_check_failure(): - """Test that check fails appropriately.""" with pytest.raises(ValueError, match="expected error"): - my_check_with_bad_config(verbose=False) - -Advanced Topics ---------------- - -Multiple Checks per Package -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Register multiple checks: - -.. code-block:: toml - - [project.entry-points.rapids_doctor_check] - my_pkg_import = "my_package.checks:import_check" - my_pkg_gpu = "my_package.checks:gpu_check" - my_pkg_functional = "my_package.checks:functional_check" - -Check Dependencies -^^^^^^^^^^^^^^^^^^ - -If checks have dependencies, handle them gracefully: - -.. code-block:: python - - def dependent_check(verbose=False, **kwargs): - """This check depends on GPU check passing.""" - # Don't fail if dependencies aren't met - try: - import pynvml - - pynvml.nvmlInit() - except Exception: - warnings.warn("GPU not available, skipping dependent check", stacklevel=2) - return True - - # Rest of check - return True - -Environment-Specific Checks -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Adapt checks to different environments: - -.. code-block:: python - - import os - - - def environment_aware_check(verbose=False, **kwargs): - """Check that adapts to environment.""" - is_ci = os.environ.get("CI") == "true" - - if is_ci: - # Skip expensive checks in CI - return "Skipped in CI environment" - - # Run full check - run_expensive_validation() - return True + my_check(verbose=False) Troubleshooting --------------- -Check Not Discovered -^^^^^^^^^^^^^^^^^^^^ +**Check not discovered**: Verify the entry point name is in the output of: -If your check isn't showing up: - -1. Verify entry point is correct: - - .. 
code-block:: bash - - python -c "from importlib.metadata import entry_points; print([ep for ep in entry_points(group='rapids_doctor_check')])" - -2. Reinstall your package: - - .. code-block:: bash - - pip install -e . --force-reinstall --no-deps - -3. Check for import errors: - - .. code-block:: python - - python -c "from my_package.checks import my_check" - -Check Always Fails -^^^^^^^^^^^^^^^^^^ - -Debug the check directly: +.. code-block:: bash -.. code-block:: python + python -c "from importlib.metadata import entry_points; \ + print([ep.name for ep in entry_points(group='rapids_doctor_check')])" - from my_package.checks import my_check +If missing, reinstall with ``pip install -e . --force-reinstall --no-deps``. - try: - result = my_check(verbose=True) - print(f"Success: {result}") - except Exception as e: - print(f"Failed: {e}") - import traceback +**Import errors are silent**: The doctor module uses ``contextlib.suppress`` +to skip checks that fail to import. Test your import directly: - traceback.print_exc() +.. code-block:: bash -Resources ---------- + python -c "from my_package.health_checks import my_check" -- Entry points documentation: https://packaging.python.org/specifications/entry-points/ -- RAPIDS CLI repository: https://github.com/rapidsai/rapids-cli -- Example plugins: See built-in checks in ``rapids_cli/doctor/checks/`` +See the built-in checks in ``rapids_cli/doctor/checks/`` for reference +implementations. diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index 014f39f..5da7f2c 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -4,17 +4,10 @@ Troubleshooting =============== -This guide helps you resolve common issues with the RAPIDS CLI. - -Common Issues -------------- - No GPUs Detected -^^^^^^^^^^^^^^^^ - -**Symptom**: ``rapids doctor`` reports "No available GPUs detected" +---------------- -**Solutions**: +``rapids doctor`` reports "No available GPUs detected". 1. 
Verify NVIDIA drivers are installed: @@ -22,32 +15,31 @@ No GPUs Detected nvidia-smi - If this fails, install NVIDIA drivers: - - .. code-block:: bash - - # Ubuntu/Debian - sudo apt-get install nvidia-driver-550 - -2. Check that GPU is visible to Python: +2. Check that GPU is visible from Python: .. code-block:: bash python -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())" -3. Verify you're not in a container without GPU access: +3. If running in a container, ensure GPU passthrough is enabled: .. code-block:: bash - # Docker needs --gpus all flag docker run --gpus all ... -CUDA Version Mismatch -^^^^^^^^^^^^^^^^^^^^^ +Insufficient Compute Capability +-------------------------------- + +"GPU requires compute capability 7 or higher". + +RAPIDS requires Volta-generation GPUs or newer (compute capability 7.0+). +Supported GPUs include V100, A100, H100, and RTX 20xx/30xx/40xx series. +See https://developer.nvidia.com/cuda-gpus for a full list. -**Symptom**: ``rapids doctor`` reports CUDA version incompatibility +CUDA Version Issues +------------------- -**Solutions**: +"Unable to look up CUDA version". 1. Check your CUDA driver version: @@ -55,341 +47,66 @@ CUDA Version Mismatch nvidia-smi | grep "CUDA Version" -2. Install compatible RAPIDS packages: +2. Ensure RAPIDS packages match your CUDA version: .. code-block:: bash - # For CUDA 11.x - pip install cudf-cu11 cuml-cu11 - # For CUDA 12.x - pip install cudf-cu12 cuml-cu12 - -3. Update NVIDIA drivers if needed: + pip install cudf-cu12 - .. code-block:: bash - - # Check https://docs.rapids.ai/install for requirements - sudo apt-get update && sudo apt-get upgrade nvidia-driver + # For CUDA 11.x + pip install cudf-cu11 Low Memory Warning -^^^^^^^^^^^^^^^^^^ - -**Symptom**: Warning about system memory to GPU memory ratio - -**Context**: RAPIDS recommends 2:1 ratio of system RAM to GPU memory for optimal performance - -**Solutions**: - -1. This is a warning, not an error. RAPIDS will still work. 
- -2. For better performance, consider: - - - Adding more system RAM - - Using data chunking strategies - - Processing smaller batches - -3. For Dask workloads, adjust worker memory limits: - - .. code-block:: python - - from dask_cuda import LocalCUDACluster - - cluster = LocalCUDACluster( - device_memory_limit="8GB", # Limit per worker - memory_limit="16GB", # System memory per worker - ) - -NVLink Not Found -^^^^^^^^^^^^^^^^ - -**Symptom**: ``rapids doctor`` reports NVLink is not available - -**Context**: NVLink is only available on multi-GPU systems with NVLink-capable GPUs - -**Solutions**: - -1. If you have only one GPU, this is expected. NVLink is not needed. - -2. For multi-GPU systems without NVLink: - - - RAPIDS will work but inter-GPU transfers will be slower - - Consider PCIe topology optimization - -3. Verify NVLink status: - - .. code-block:: bash - - nvidia-smi nvlink --status - -Insufficient Compute Capability -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -**Symptom**: "GPU requires compute capability 7.0 or higher" - -**Context**: RAPIDS requires GPU compute capability 7.0+ (Volta architecture or newer) - -**Solutions**: - -1. Check your GPU compute capability: - - .. code-block:: bash - - rapids debug | grep "GPU" - -2. Supported GPUs include: - - - Tesla V100, A100, H100 - - RTX 20xx, 30xx, 40xx series - - GTX 1660 and above +------------------ -3. If your GPU is too old, you'll need to upgrade hardware. +"System Memory to total GPU Memory ratio not at least 2:1 ratio." -Check Discovery Issues -^^^^^^^^^^^^^^^^^^^^^^ +This is a warning, not a failure. RAPIDS recommends system RAM be at least +twice total GPU memory for optimal performance, particularly with Dask. +RAPIDS will still function with a lower ratio. -**Symptom**: Custom checks not discovered by ``rapids doctor`` +Custom Checks Not Discovered +----------------------------- -**Solutions**: +If ``rapids doctor --verbose`` doesn't show your custom check: -1. Verify entry point registration: +1. 
Verify the entry point is registered: .. code-block:: bash python -c "from importlib.metadata import entry_points; \ - print([ep.name for ep in entry_points(group='rapids_doctor_check')])" - -2. Reinstall package with entry points: - - .. code-block:: bash - - pip install -e . --force-reinstall - -3. Check for import errors: - - .. code-block:: bash - - rapids doctor --verbose - - Look for "Failed to import" messages. - -Import Errors -^^^^^^^^^^^^^ - -**Symptom**: "ModuleNotFoundError" when running checks - -**Solutions**: - -1. Verify package is installed: - - .. code-block:: bash - - pip list | grep rapids - -2. Check Python environment: - - .. code-block:: bash - - which python - python --version + print([ep.name for ep in entry_points(group='rapids_doctor_check')])" -3. Ensure you're in the correct virtual environment: +2. Reinstall the package that provides the check: .. code-block:: bash - # Conda - conda activate rapids-env + pip install -e . --force-reinstall --no-deps - # venv - source venv/bin/activate - -Permission Errors -^^^^^^^^^^^^^^^^^ - -**Symptom**: "Permission denied" when accessing GPU - -**Solutions**: - -1. Add user to video/render groups: +3. Check for import errors by importing the check function directly: .. code-block:: bash - sudo usermod -a -G video $USER - sudo usermod -a -G render $USER + python -c "from my_package.checks import my_check" - # Log out and back in for changes to take effect + Import errors during discovery are silently suppressed + (see ``contextlib.suppress`` in ``doctor.py``). -2. Check device permissions: +General Debugging Steps +----------------------- - .. code-block:: bash - - ls -l /dev/nvidia* - -3. For containers, ensure proper device mounting: +1. Run with verbose output: .. code-block:: bash - docker run --gpus all --device=/dev/nvidia0 ... - -Debugging Tips --------------- - -Enable Verbose Mode -^^^^^^^^^^^^^^^^^^^ - -Always start with verbose output: - -.. 
code-block:: bash - - rapids doctor --verbose - -This shows: - -- Which checks are discovered -- Detailed error messages -- Stack traces for failures - -Gather Debug Information -^^^^^^^^^^^^^^^^^^^^^^^^^ - -Collect comprehensive system information: - -.. code-block:: bash - - rapids debug --json > debug_info.json - -Share this file when reporting issues. - -Test Individual Components -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Test NVIDIA stack components: - -.. code-block:: bash - - # Test nvidia-smi - nvidia-smi - - # Test pynvml (Python binding) - python -c "import pynvml; pynvml.nvmlInit(); print('OK')" - - # Test CUDA - python -c "import cuda; print(cuda.cudaroot)" - -Check Environment Variables -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Verify CUDA-related environment variables: - -.. code-block:: bash - - echo $CUDA_HOME - echo $LD_LIBRARY_PATH - echo $PATH - -Run in Isolation -^^^^^^^^^^^^^^^^ - -Test in a clean environment: - -.. code-block:: bash - - # Create fresh environment - conda create -n test-rapids python=3.10 - conda activate test-rapids - - # Install only RAPIDS CLI - pip install rapids-cli - - # Test - rapids doctor - -Enable Python Warnings -^^^^^^^^^^^^^^^^^^^^^^ - -See all warnings: - -.. code-block:: bash - - python -W all -m rapids_cli.cli doctor - -Performance Issues ------------------- - -Slow Check Execution -^^^^^^^^^^^^^^^^^^^^ - -If checks are slow: - -1. Use ``--dry-run`` to verify discovery without execution: - - .. code-block:: bash - - rapids doctor --dry-run - -2. Profile individual checks: - - .. code-block:: python - - import time - from my_package.checks import my_check - - start = time.time() - my_check(verbose=True) - print(f"Check took {time.time() - start:.2f}s") - -3. Optimize slow checks (keep under 1 second each) - -High Memory Usage -^^^^^^^^^^^^^^^^^ - -If ``rapids doctor`` uses too much memory: - -1. This is unexpected - report as a bug - -2. Workaround: Run checks individually: - - .. 
code-block:: bash - - rapids doctor package1 - rapids doctor package2 - -Reporting Issues ----------------- - -When reporting issues, include: - -1. Output of ``rapids debug --json`` - -2. Complete error messages from ``rapids doctor --verbose`` - -3. Steps to reproduce - -4. Expected vs actual behavior + rapids doctor --verbose -5. Environment details: +2. Gather full environment information: .. code-block:: bash - rapids debug > environment.txt - python --version - pip list > packages.txt - -Submit issues at: https://github.com/rapidsai/rapids-cli/issues - -Getting Help ------------- - -- GitHub Issues: https://github.com/rapidsai/rapids-cli/issues -- RAPIDS Slack: https://rapids.ai/community -- Documentation: https://docs.rapids.ai -- Stack Overflow: Tag questions with ``rapids`` and ``rapids-cli`` - -Known Limitations ------------------ + rapids debug --json > debug_info.json -- Windows support is experimental -- WSL2 requires special GPU setup -- Some checks require sudo access -- Docker containers need ``--gpus all`` flag -- Remote GPU monitoring not supported +3. Report issues at https://github.com/rapidsai/rapids-cli/issues with the + debug output attached. diff --git a/docs/source/user_guide.rst b/docs/source/user_guide.rst index 11da9ad..8656999 100644 --- a/docs/source/user_guide.rst +++ b/docs/source/user_guide.rst @@ -4,215 +4,113 @@ User Guide ========== -This guide provides detailed information on using the RAPIDS CLI. - -Overview --------- - -The RAPIDS CLI provides two main commands: - -- ``rapids doctor`` - Health checks for your RAPIDS installation -- ``rapids debug`` - Gather debugging information about your system +The RAPIDS CLI provides two commands: ``rapids doctor`` for health checks and +``rapids debug`` for gathering system information. rapids doctor ------------- -The ``doctor`` command performs health checks to ensure your RAPIDS environment is properly configured. 
- -Basic Usage -^^^^^^^^^^^ - -Run all health checks: +The ``doctor`` command performs health checks to ensure your RAPIDS environment +is properly configured. .. code-block:: bash rapids doctor -This will check: +Built-in checks verify: -- GPU availability and compatibility +- GPU availability and compute capability (7.0+) - CUDA driver version -- System memory to GPU memory ratio -- NVLink status (for multi-GPU systems) -- Any checks registered by installed RAPIDS packages +- System memory to GPU memory ratio (recommends 2:1 for Dask) +- NVLink status (multi-GPU systems) + +Any installed RAPIDS library can register additional checks via the plugin system +(see :doc:`plugin_development`). Verbose Output ^^^^^^^^^^^^^^ -Get detailed information about each check: +The ``--verbose`` flag shows check discovery details and per-check output: .. code-block:: bash - rapids doctor --verbose - -This shows: - -- Which checks are discovered -- Detailed output from each check -- Additional diagnostic information + $ rapids doctor --verbose + Discovering checks + Found check 'gpu' provided by 'rapids_cli.doctor.checks.gpu:gpu_check' + ... + Discovered 5 checks + Running checks + gpu_check: GPU(s) detected: 2 + All checks passed! Dry Run ^^^^^^^ -See which checks would run without actually executing them: +The ``--dry-run`` flag discovers checks without executing them, useful for +verifying plugin registration: .. code-block:: bash rapids doctor --dry-run -This is useful for: - -- Verifying plugin discovery -- Debugging check registration issues -- Understanding what will be checked +Filtering +^^^^^^^^^ -Filtering Checks -^^^^^^^^^^^^^^^^ - -Run only specific checks by filtering: +Pass filter arguments to run only matching checks. Filters match against +the check's module path: .. 
code-block:: bash # Run only cuDF-related checks rapids doctor cudf - # Run multiple filtered checks + # Run checks from multiple packages rapids doctor cudf cuml -The filter matches any part of the check's module path. - Exit Codes ^^^^^^^^^^ -The ``doctor`` command returns: - -- ``0`` - All checks passed -- ``1`` - One or more checks failed +- ``0``: All checks passed +- ``1``: One or more checks failed -This makes it suitable for use in scripts and CI/CD pipelines: +This makes ``rapids doctor`` suitable for scripting: .. code-block:: bash - if rapids doctor; then - echo "Environment is ready!" - else - echo "Environment has issues!" - exit 1 - fi + rapids doctor || exit 1 rapids debug ------------ -The ``debug`` command gathers comprehensive information about your system for troubleshooting. - -Basic Usage -^^^^^^^^^^^ - -Generate a debug report: +The ``debug`` command gathers comprehensive system information for troubleshooting. .. code-block:: bash rapids debug -This displays: - -- Platform information -- NVIDIA driver version -- CUDA version -- Python version and configuration -- Installed package versions -- System tools (pip, conda, cmake, etc.) -- OS information +Output includes: platform, NVIDIA driver version, CUDA version, CUDA runtime +path, system CTK locations, Python version, all installed package versions, +pip/conda package lists, available tools (pip, conda, uv, pixi, g++, cmake, +nvcc), and OS information. JSON Output ^^^^^^^^^^^ -Get machine-readable output: - -.. code-block:: bash - - rapids debug --json - -This is useful for: - -- Automated debugging scripts -- Parsing in other tools -- Sharing debug information programmatically - -The JSON output includes all information in a structured format: - -.. 
code-block:: json - - { - "date": "2025-02-11 15:30:00", - "platform": "Linux-6.8.0-94-generic-x86_64", - "driver_version": "550.54.15", - "cuda_version": "12.4", - "python_version": "3.13.12", - "package_versions": { - "rapids-cli": "0.1.0", - ... - }, - ... - } - -Saving Debug Output -^^^^^^^^^^^^^^^^^^^ - -Save debug information to a file: +The ``--json`` flag produces machine-readable output: .. code-block:: bash rapids debug --json > debug_info.json -This file can be: - -- Shared with support teams -- Attached to bug reports -- Used for comparison across environments - -Common Workflows ----------------- - -Pre-Installation Check -^^^^^^^^^^^^^^^^^^^^^^ - -Before installing RAPIDS, verify your system meets requirements: - -.. code-block:: bash - - # Install just the CLI first - pip install rapids-cli - - # Check system compatibility - rapids doctor --verbose - -The checks will tell you if your GPU, drivers, and CUDA are suitable for RAPIDS. - -Post-Installation Verification -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -After installing RAPIDS packages, verify everything works: - -.. code-block:: bash - - # Install RAPIDS - pip install cudf-cu12 cuml-cu12 - - # Verify installation - rapids doctor - - # If issues occur, gather debug info - rapids debug --json > debug_info.json +This is useful for attaching to bug reports or comparing environments. CI/CD Integration -^^^^^^^^^^^^^^^^^ +----------------- -Use RAPIDS CLI in your CI/CD pipelines: +Example GitHub Actions usage: .. code-block:: yaml - # GitHub Actions example - name: Verify RAPIDS Environment run: | pip install rapids-cli @@ -220,74 +118,4 @@ Use RAPIDS CLI in your CI/CD pipelines: - name: Save Debug Info on Failure if: failure() - run: rapids debug --json > ${{ github.workspace }}/debug.json - -Troubleshooting Workflow -^^^^^^^^^^^^^^^^^^^^^^^^^ - -When encountering issues: - -1. Run verbose health check: - - .. code-block:: bash - - rapids doctor --verbose - -2. Review warning messages and failures - -3. 
Gather full debug information: - - .. code-block:: bash - - rapids debug > debug_output.txt - -4. Check troubleshooting guide (see :doc:`troubleshooting`) - -5. Report issues with debug output - -Best Practices --------------- - -Regular Health Checks -^^^^^^^^^^^^^^^^^^^^^ - -Run ``rapids doctor`` regularly to catch configuration drift: - -.. code-block:: bash - - # Add to your shell profile - alias rapids-check='rapids doctor && echo "✓ RAPIDS environment healthy"' - -Environment Documentation -^^^^^^^^^^^^^^^^^^^^^^^^^ - -Document your environment with debug output: - -.. code-block:: bash - - # Save baseline configuration - rapids debug --json > baseline_env.json - - # Later, compare environments - rapids debug --json > current_env.json - diff baseline_env.json current_env.json - -Automated Monitoring -^^^^^^^^^^^^^^^^^^^^ - -Monitor RAPIDS environments automatically: - -.. code-block:: bash - - #!/bin/bash - # daily_rapids_check.sh - - if ! rapids doctor; then - rapids debug --json | mail -s "RAPIDS Health Check Failed" admin@example.com - fi - -Add to cron: - -.. code-block:: bash - - 0 9 * * * /path/to/daily_rapids_check.sh + run: rapids debug --json > debug.json diff --git a/rapids_cli/doctor/doctor.py b/rapids_cli/doctor/doctor.py index 0fdff86..0e8ce5d 100644 --- a/rapids_cli/doctor/doctor.py +++ b/rapids_cli/doctor/doctor.py @@ -34,26 +34,22 @@ def doctor_check( If specific subcommands are given, it validates them against valid subcommands and executes corresponding checks. - Parameters: - ---------- - filters : list (optional) - A list of filters to run specific checks. + Args: + verbose: Whether to print verbose output. + dry_run: Whether to skip running checks. + filters: A list of filters to run specific checks. Raises: - ------- - ValueError: - If an invalid subcommand is provided. + ValueError: If an invalid subcommand is provided. 
- Notes: - ----- - The function discovers and loads check functions defined in entry points - under the 'rapids_doctor_check' group. It also checks specific - configurations related to a corresponding subcommand if given. + Note: + The function discovers and loads check functions defined in entry points + under the ``rapids_doctor_check`` group. It also checks specific + configurations related to a corresponding subcommand if given. Example: - -------- - > doctor_check([]) # Run all health checks - > doctor_check(['cudf']) # Run 'cudf' specific checks + >>> doctor_check(verbose=False, dry_run=False) + >>> doctor_check(verbose=False, dry_run=False, filters=['cudf']) """ filters = [] if not filters else filters console.print( From 126bca09a1ae803a3feeabd904c69f5d8af5b4de Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Wed, 11 Feb 2026 17:06:36 -0500 Subject: [PATCH 3/3] refactored to use dependency injection for interfacing with hardware --- dependency-injection-refactoring.md | 152 +++++++++++++++ rapids_cli/debug/debug.py | 46 +++-- rapids_cli/doctor/checks/cuda_driver.py | 23 ++- rapids_cli/doctor/checks/gpu.py | 42 +++-- rapids_cli/doctor/checks/memory.py | 65 +++++-- rapids_cli/doctor/checks/nvlink.py | 34 ++-- rapids_cli/doctor/doctor.py | 5 +- rapids_cli/hardware.py | 229 +++++++++++++++++++++++ rapids_cli/tests/test_cuda.py | 27 +-- rapids_cli/tests/test_debug.py | 63 +++---- rapids_cli/tests/test_gpu.py | 83 ++++----- rapids_cli/tests/test_hardware.py | 235 ++++++++++++++++++++++++ rapids_cli/tests/test_memory.py | 96 +++++----- rapids_cli/tests/test_nvlink.py | 74 ++++---- 14 files changed, 915 insertions(+), 259 deletions(-) create mode 100644 dependency-injection-refactoring.md create mode 100644 rapids_cli/hardware.py create mode 100644 rapids_cli/tests/test_hardware.py diff --git a/dependency-injection-refactoring.md b/dependency-injection-refactoring.md new file mode 100644 index 0000000..697642c --- /dev/null +++ 
b/dependency-injection-refactoring.md @@ -0,0 +1,152 @@ +# Dependency Injection Refactoring + +## Context + +The check modules (`gpu.py`, `cuda_driver.py`, `memory.py`, `nvlink.py`) +and `debug.py` previously called `pynvml`, `psutil`, and `cuda.pathfinder` +directly. This forced tests to use 50+ `mock.patch` calls with deeply +nested context managers and `MagicMock` objects to simulate hardware +configurations. A thin abstraction layer was introduced so tests can +construct plain dataclasses instead of mocking low-level library internals. + +## Approach: Default Parameter Injection with Provider Dataclasses + +A single new file `rapids_cli/hardware.py` was created containing: + +- **`DeviceInfo`** dataclass -- holds per-GPU data + (index, compute capability, memory, nvlink states) +- **`GpuInfoProvider`** protocol -- read-only interface for GPU info + (`device_count`, `devices`, `cuda_driver_version`, `driver_version`) +- **`SystemInfoProvider`** protocol -- read-only interface for system info + (`total_memory_bytes`, `cuda_runtime_path`) +- **`NvmlGpuInfo`** -- real implementation backed by pynvml + (lazy-loads on first property access, caches results) +- **`DefaultSystemInfo`** -- real implementation backed by + psutil + cuda.pathfinder (lazy-loads per property) +- **`FakeGpuInfo`** / **`FakeSystemInfo`** -- test fakes + (plain dataclasses, no hardware dependency) +- **`FailingGpuInfo`** / **`FailingSystemInfo`** -- test fakes that + raise `ValueError` on access (simulates missing hardware) + +Check functions gained an optional keyword parameter with `None` default: + +```python +def gpu_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): + if gpu_info is None: # pragma: no cover + gpu_info = NvmlGpuInfo() +``` + +The orchestrator (`doctor.py`) creates a shared `NvmlGpuInfo()` instance +and passes it to all checks via `check_fn(verbose=verbose, gpu_info=gpu_info)`. 
+Third-party plugins safely ignore the extra keyword argument via their +own `**kwargs`. + +## Files Changed + +### New file: `rapids_cli/hardware.py` + +Contains all provider abstractions: + +- `DeviceInfo` dataclass with fields: `index`, `compute_capability`, + `memory_total_bytes`, `nvlink_states` +- `GpuInfoProvider` and `SystemInfoProvider` protocols + (runtime-checkable) +- `NvmlGpuInfo` -- calls `nvmlInit()` once on first property access, + queries all device info (count, compute capability, memory, + NVLink states), and caches everything +- `DefaultSystemInfo` -- lazily loads system memory via psutil and + CUDA path via cuda.pathfinder (each cached independently) +- `FakeGpuInfo`, `FakeSystemInfo` -- `@dataclass` test fakes with + pre-set data +- `FailingGpuInfo`, `FailingSystemInfo` -- test fakes that raise + `ValueError` on any property access + +### Modified: `rapids_cli/doctor/checks/gpu.py` + +- Removed `import pynvml` +- Added `gpu_info: GpuInfoProvider | None = None` parameter and + `**kwargs` to both `gpu_check()` and `check_gpu_compute_capability()` +- Replaced direct `pynvml` calls with `gpu_info.device_count` and + iteration over `gpu_info.devices` + +### Modified: `rapids_cli/doctor/checks/cuda_driver.py` + +- Removed `import pynvml` +- Added `gpu_info` parameter and `**kwargs` to `cuda_check()` +- Replaced nested try/except with `gpu_info.cuda_driver_version` + +### Modified: `rapids_cli/doctor/checks/memory.py` + +- Removed `import pynvml` and `import psutil` +- Added `system_info` parameter to `get_system_memory()` +- Added `gpu_info` parameter to `get_gpu_memory()` +- Added both `gpu_info` and `system_info` parameters to + `check_memory_to_gpu_ratio()` +- `get_system_memory()` reads `system_info.total_memory_bytes` +- `get_gpu_memory()` sums `dev.memory_total_bytes` from + `gpu_info.devices` +- `check_memory_to_gpu_ratio()` passes injected providers down + to helpers + +### Modified: `rapids_cli/doctor/checks/nvlink.py` + +- Removed `import 
pynvml` +- Added `gpu_info` parameter and `**kwargs` to `check_nvlink_status()` +- Iterates `dev.nvlink_states` instead of calling + `nvmlDeviceGetNvLinkState` +- **Side-fix**: the original code always passed `0` instead of + `nvlink_id` to `nvmlDeviceGetNvLinkState`; the refactored + `NvmlGpuInfo` queries each link by its actual index + +### Modified: `rapids_cli/debug/debug.py` + +- Removed `import pynvml` and `import cuda.pathfinder` +- Added `gpu_info` parameter to `gather_cuda_version()` +- Added `gpu_info` and `system_info` parameters to `run_debug()` +- Replaced direct pynvml/cuda.pathfinder calls with provider + property accesses + +### Modified: `rapids_cli/doctor/doctor.py` + +- Imports `NvmlGpuInfo` from `rapids_cli.hardware` +- Creates a shared `NvmlGpuInfo()` instance before the check loop +- Passes it via `check_fn(verbose=verbose, gpu_info=gpu_info)` + +### Rewritten tests + +`test_gpu.py`, `test_cuda.py`, `test_memory.py`, `test_nvlink.py`, +`test_debug.py`: + +- Replaced all `patch("pynvml.*")` / `patch("psutil.*")` / + `patch("cuda.pathfinder.*")` with `FakeGpuInfo` / `FakeSystemInfo` / + `FailingGpuInfo` construction +- Tests for `debug.py` still use patches for non-hardware concerns + (subprocess, pathlib, gather_tools) + +### New file: `rapids_cli/tests/test_hardware.py` + +- Unit tests for `NvmlGpuInfo` + (init failure, loads once, device data, NVLink states, no NVLink) +- Unit tests for `DefaultSystemInfo` + (total memory, CUDA runtime path, caching) +- Tests for `FakeGpuInfo` / `FakeSystemInfo` + (defaults, custom values, protocol satisfaction) +- Tests for `FailingGpuInfo` / `FailingSystemInfo` + (all properties raise) + +## Impact + +| Metric | Before | After | +| --------------------------------------------- | ------- | --------------------------------- | +| Hardware library patches in check/debug tests | ~51 | 0 (moved to test_hardware.py) | +| import pynvml in check/debug modules | 5 files | 1 file (hardware.py) | +| MagicMock objects 
for hardware | ~11 | 0 | +| pynvml.nvmlInit() calls in production | 7 | 1 (in NvmlGpuInfo._ensure_loaded) | +| Total tests | 53 | 72 (+19 hardware tests) | +| Coverage | 95%+ | 97.72% | + +## Verification + +1. `pytest` -- all 72 tests pass +2. `pytest --cov-fail-under=95` -- coverage at 97.72%, above threshold +3. `pre-commit run --all-files` -- all checks pass diff --git a/rapids_cli/debug/debug.py b/rapids_cli/debug/debug.py index fca4d1d..b4afde5 100644 --- a/rapids_cli/debug/debug.py +++ b/rapids_cli/debug/debug.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 """This module contains the debug subcommand for the Rapids CLI.""" +from __future__ import annotations + import json import platform import subprocess @@ -9,22 +11,29 @@ from datetime import datetime from importlib.metadata import distributions, version from pathlib import Path +from typing import TYPE_CHECKING -import cuda.pathfinder -import pynvml from rich.console import Console from rich.table import Table +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider + console = Console() -def gather_cuda_version(): +def gather_cuda_version(*, gpu_info: GpuInfoProvider | None = None): """Return CUDA driver version as a string, similar to nvidia-smi output.""" - version = pynvml.nvmlSystemGetCudaDriverVersion() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + + ver = gpu_info.cuda_driver_version # pynvml returns an int like 12040 for 12.4, so format as string - major = version // 1000 - minor = (version % 1000) // 10 - patch = version % 10 + major = ver // 1000 + minor = (ver % 1000) // 10 + patch = ver % 10 if patch == 0: return f"{major}.{minor}" else: @@ -67,18 +76,31 @@ def gather_tools(): } -def run_debug(output_format="console"): +def run_debug( + output_format="console", + *, + gpu_info: GpuInfoProvider | None = None, + system_info: SystemInfoProvider | None = None, +): """Run debug.""" - 
pynvml.nvmlInit() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + if system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + debug_info = { "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "platform": platform.platform(), "nvidia_smi_output": gather_command_output( ["nvidia-smi"], "Nvidia-smi not installed" ), - "driver_version": pynvml.nvmlSystemGetDriverVersion(), - "cuda_version": gather_cuda_version(), - "cuda_runtime_path": cuda.pathfinder.find_nvidia_header_directory("cudart"), + "driver_version": gpu_info.driver_version, + "cuda_version": gather_cuda_version(gpu_info=gpu_info), + "cuda_runtime_path": system_info.cuda_runtime_path, "system_ctk": sorted( [str(p) for p in Path("/usr/local").glob("cuda*") if p.is_dir()] ), diff --git a/rapids_cli/doctor/checks/cuda_driver.py b/rapids_cli/doctor/checks/cuda_driver.py index 252dd47..6275c1a 100644 --- a/rapids_cli/doctor/checks/cuda_driver.py +++ b/rapids_cli/doctor/checks/cuda_driver.py @@ -2,17 +2,22 @@ # SPDX-License-Identifier: Apache-2.0 """Check for CUDA and driver compatibility.""" -import pynvml +from __future__ import annotations +from typing import TYPE_CHECKING -def cuda_check(verbose=False): +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider + + +def cuda_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Check CUDA availability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - try: - cuda_version = pynvml.nvmlSystemGetCudaDriverVersion() - return cuda_version - except pynvml.NVMLError as e: - raise ValueError("Unable to look up CUDA version") from e - except pynvml.NVMLError as e: + return gpu_info.cuda_driver_version + except ValueError as e: raise ValueError("Unable to look up CUDA version") from e diff 
--git a/rapids_cli/doctor/checks/gpu.py b/rapids_cli/doctor/checks/gpu.py index 77e6ca6..d8e1a45 100644 --- a/rapids_cli/doctor/checks/gpu.py +++ b/rapids_cli/doctor/checks/gpu.py @@ -2,38 +2,52 @@ # SPDX-License-Identifier: Apache-2.0 """GPU checks for the doctor command.""" -import pynvml +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider REQUIRED_COMPUTE_CAPABILITY = 7 -def gpu_check(verbose=False): +def gpu_check(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Check GPU availability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - num_gpus = pynvml.nvmlDeviceGetCount() - except pynvml.NVMLError as e: + num_gpus = gpu_info.device_count + except ValueError as e: raise ValueError("No available GPUs detected") from e assert num_gpus > 0, "No GPUs detected" return f"GPU(s) detected: {num_gpus}" -def check_gpu_compute_capability(verbose): +def check_gpu_compute_capability( + verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs +): """Check the system for GPU Compute Capability.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + devices = gpu_info.devices + except ValueError as e: raise ValueError("No GPU - cannot determine GPU Compute Capability") from e - for i in range(pynvml.nvmlDeviceGetCount()): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) - if major >= REQUIRED_COMPUTE_CAPABILITY: + for dev in devices: + if dev.compute_capability[0] >= REQUIRED_COMPUTE_CAPABILITY: continue else: raise ValueError( - f"GPU {i} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} " - f"or higher but only has {major}.{minor}." 
+ f"GPU {dev.index} requires compute capability {REQUIRED_COMPUTE_CAPABILITY} " + f"or higher but only has {dev.compute_capability[0]}.{dev.compute_capability[1]}." "See https://developer.nvidia.com/cuda-gpus for more information." ) return True diff --git a/rapids_cli/doctor/checks/memory.py b/rapids_cli/doctor/checks/memory.py index cb1fcb5..f1d8231 100644 --- a/rapids_cli/doctor/checks/memory.py +++ b/rapids_cli/doctor/checks/memory.py @@ -2,46 +2,71 @@ # SPDX-License-Identifier: Apache-2.0 """Memory checks.""" +from __future__ import annotations + import warnings +from typing import TYPE_CHECKING -import psutil -import pynvml +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider, SystemInfoProvider -def get_system_memory(verbose=False): +def get_system_memory( + verbose=False, *, system_info: SystemInfoProvider | None = None, **kwargs +): """Get the total system memory.""" - virtual_memory = psutil.virtual_memory() - total_memory = virtual_memory.total / (1024**3) # converts bytes to gigabytes + if system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + + total_memory = system_info.total_memory_bytes / ( + 1024**3 + ) # converts bytes to gigabytes return total_memory -def get_gpu_memory(verbose=False): +def get_gpu_memory(verbose=False, *, gpu_info: GpuInfoProvider | None = None, **kwargs): """Get the total GPU memory.""" - pynvml.nvmlInit() - gpus = pynvml.nvmlDeviceGetCount() - gpu_memory_total = 0 - for i in range(gpus): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - gpu_memory_total += memory_info.total / (1024**3) # converts to gigabytes - - pynvml.nvmlShutdown() + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + + gpu_memory_total = sum(dev.memory_total_bytes for dev in gpu_info.devices) / ( + 1024**3 + ) # converts to gigabytes return 
gpu_memory_total -def check_memory_to_gpu_ratio(verbose=True): +def check_memory_to_gpu_ratio( + verbose=True, + *, + gpu_info: GpuInfoProvider | None = None, + system_info: SystemInfoProvider | None = None, + **kwargs, +): """Check the system for a 2:1 ratio of system Memory to total GPU Memory. This is especially useful for Dask. """ + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + if system_info is None: # pragma: no cover + from rapids_cli.hardware import DefaultSystemInfo + + system_info = DefaultSystemInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + _ = gpu_info.device_count + except ValueError as e: raise ValueError("GPU not found. Please ensure GPUs are installed.") from e - system_memory = get_system_memory(verbose) - gpu_memory = get_gpu_memory(verbose) + system_memory = get_system_memory(verbose, system_info=system_info) + gpu_memory = get_gpu_memory(verbose, gpu_info=gpu_info) ratio = system_memory / gpu_memory if ratio < 1.8: warnings.warn( diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py index 22bbdd1..715a8fd 100644 --- a/rapids_cli/doctor/checks/nvlink.py +++ b/rapids_cli/doctor/checks/nvlink.py @@ -2,25 +2,33 @@ # SPDX-License-Identifier: Apache-2.0 """Check for NVLink status.""" -import pynvml +from __future__ import annotations +from typing import TYPE_CHECKING -def check_nvlink_status(verbose=True): +if TYPE_CHECKING: + from rapids_cli.hardware import GpuInfoProvider + + +def check_nvlink_status( + verbose=True, *, gpu_info: GpuInfoProvider | None = None, **kwargs +): """Check the system for NVLink with 2 or more GPUs.""" + if gpu_info is None: # pragma: no cover + from rapids_cli.hardware import NvmlGpuInfo + + gpu_info = NvmlGpuInfo() + try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + device_count = gpu_info.device_count + except ValueError as e: raise ValueError("GPU not found. 
Please ensure GPUs are installed.") from e - device_count = pynvml.nvmlDeviceGetCount() if device_count < 2: return False - for i in range(device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - for nvlink_id in range(pynvml.NVML_NVLINK_MAX_LINKS): - try: - pynvml.nvmlDeviceGetNvLinkState(handle, 0) - return True - except pynvml.NVMLError as e: - raise ValueError(f"NVLink {nvlink_id} Status Check Failed") from e + for dev in gpu_info.devices: + if any(dev.nvlink_states): + return True + + return False diff --git a/rapids_cli/doctor/doctor.py b/rapids_cli/doctor/doctor.py index 0e8ce5d..c497300 100644 --- a/rapids_cli/doctor/doctor.py +++ b/rapids_cli/doctor/doctor.py @@ -10,6 +10,7 @@ from rapids_cli._compatibility import entry_points from rapids_cli.constants import DOCTOR_SYMBOL +from rapids_cli.hardware import NvmlGpuInfo console = Console() @@ -74,6 +75,8 @@ def doctor_check( console.print("Dry run, skipping checks") return True + gpu_info = NvmlGpuInfo() + results: list[CheckResult] = [] with console.status("[bold green]Running checks...") as ui_status: for i, check_fn in enumerate(checks): @@ -85,7 +88,7 @@ def doctor_check( with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") status = True - value = check_fn(verbose=verbose) + value = check_fn(verbose=verbose, gpu_info=gpu_info) caught_warnings = w except Exception as e: diff --git a/rapids_cli/hardware.py b/rapids_cli/hardware.py new file mode 100644 index 0000000..94aab52 --- /dev/null +++ b/rapids_cli/hardware.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Hardware abstraction layer for GPU and system information.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Protocol, runtime_checkable + + +@dataclass +class DeviceInfo: + """Per-GPU device information.""" + + index: int + compute_capability: tuple[int, int] + memory_total_bytes: int + nvlink_states: list[bool] = field(default_factory=list) + + +@runtime_checkable +class GpuInfoProvider(Protocol): + """Read-only interface for GPU information.""" + + @property + def device_count(self) -> int: + """Return number of GPU devices.""" + ... + + @property + def devices(self) -> list[DeviceInfo]: + """Return list of device information.""" + ... + + @property + def cuda_driver_version(self) -> int: + """Return CUDA driver version as integer.""" + ... + + @property + def driver_version(self) -> str: + """Return driver version string.""" + ... + + +@runtime_checkable +class SystemInfoProvider(Protocol): + """Read-only interface for system information.""" + + @property + def total_memory_bytes(self) -> int: + """Return total system memory in bytes.""" + ... + + @property + def cuda_runtime_path(self) -> str | None: + """Return path to CUDA runtime headers.""" + ... + + +class NvmlGpuInfo: + """Real GPU info provider backed by pynvml. + + Lazily loads all device information on first property access and caches results. 
+ """ + + def __init__(self) -> None: + """Initialize with empty cached state.""" + self._loaded = False + self._device_count = 0 + self._devices: list[DeviceInfo] = [] + self._cuda_driver_version = 0 + self._driver_version = "" + + def _ensure_loaded(self) -> None: + if self._loaded: + return + + import pynvml + + try: + pynvml.nvmlInit() + except pynvml.NVMLError as e: + raise ValueError("Unable to initialize GPU driver (NVML)") from e + + self._device_count = pynvml.nvmlDeviceGetCount() + self._cuda_driver_version = pynvml.nvmlSystemGetCudaDriverVersion() + self._driver_version = pynvml.nvmlSystemGetDriverVersion() + + self._devices = [] + for i in range(self._device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) + memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + + nvlink_states: list[bool] = [] + for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS): + try: + state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id) + nvlink_states.append(bool(state)) + except pynvml.NVMLError: + break + + self._devices.append( + DeviceInfo( + index=i, + compute_capability=(major, minor), + memory_total_bytes=memory_info.total, + nvlink_states=nvlink_states, + ) + ) + + self._loaded = True + + @property + def device_count(self) -> int: + """Return number of GPU devices.""" + self._ensure_loaded() + return self._device_count + + @property + def devices(self) -> list[DeviceInfo]: + """Return list of device information.""" + self._ensure_loaded() + return self._devices + + @property + def cuda_driver_version(self) -> int: + """Return CUDA driver version as integer (e.g. 12040).""" + self._ensure_loaded() + return self._cuda_driver_version + + @property + def driver_version(self) -> str: + """Return driver version string.""" + self._ensure_loaded() + return self._driver_version + + +class DefaultSystemInfo: + """Real system info provider backed by psutil and cuda.pathfinder. 
+ + Lazily loads each piece of information on first access. + """ + + def __init__(self) -> None: + """Initialize with empty cached state.""" + self._memory_loaded = False + self._total_memory_bytes = 0 + self._cuda_path_loaded = False + self._cuda_runtime_path: str | None = None + + @property + def total_memory_bytes(self) -> int: + """Return total system memory in bytes.""" + if not self._memory_loaded: + import psutil + + self._total_memory_bytes = psutil.virtual_memory().total + self._memory_loaded = True + return self._total_memory_bytes + + @property + def cuda_runtime_path(self) -> str | None: + """Return path to CUDA runtime headers.""" + if not self._cuda_path_loaded: + import cuda.pathfinder + + self._cuda_runtime_path = cuda.pathfinder.find_nvidia_header_directory( + "cudart" + ) + self._cuda_path_loaded = True + return self._cuda_runtime_path + + +@dataclass +class FakeGpuInfo: + """Test fake for GPU information with pre-set data.""" + + device_count: int = 0 + devices: list[DeviceInfo] = field(default_factory=list) + cuda_driver_version: int = 0 + driver_version: str = "" + + +@dataclass +class FakeSystemInfo: + """Test fake for system information with pre-set data.""" + + total_memory_bytes: int = 0 + cuda_runtime_path: str | None = None + + +class FailingGpuInfo: + """Test fake that raises ValueError on any property access.""" + + @property + def device_count(self) -> int: + """Raise ValueError.""" + raise ValueError("No GPU available") + + @property + def devices(self) -> list[DeviceInfo]: + """Raise ValueError.""" + raise ValueError("No GPU available") + + @property + def cuda_driver_version(self) -> int: + """Raise ValueError.""" + raise ValueError("No GPU available") + + @property + def driver_version(self) -> str: + """Raise ValueError.""" + raise ValueError("No GPU available") + + +class FailingSystemInfo: + """Test fake that raises ValueError on any property access.""" + + @property + def total_memory_bytes(self) -> int: + """Raise 
ValueError.""" + raise ValueError("System info unavailable") + + @property + def cuda_runtime_path(self) -> str | None: + """Raise ValueError.""" + raise ValueError("System info unavailable") diff --git a/rapids_cli/tests/test_cuda.py b/rapids_cli/tests/test_cuda.py index 70097b2..de4fd99 100644 --- a/rapids_cli/tests/test_cuda.py +++ b/rapids_cli/tests/test_cuda.py @@ -1,26 +1,17 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch +import pytest from rapids_cli.doctor.checks.cuda_driver import cuda_check +from rapids_cli.hardware import FailingGpuInfo, FakeGpuInfo -def mock_cuda_version(): - return 12050 +def test_cuda_check_success(): + gpu_info = FakeGpuInfo(cuda_driver_version=12050) + assert cuda_check(verbose=True, gpu_info=gpu_info) == 12050 -def test_get_cuda_version_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), - ): - version = mock_cuda_version() - assert version - - -def test_cuda_check_success(capfd): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), - ): - assert cuda_check(verbose=True) +def test_cuda_check_no_gpu(): + gpu_info = FailingGpuInfo() + with pytest.raises(ValueError, match="Unable to look up CUDA version"): + cuda_check(verbose=False, gpu_info=gpu_info) diff --git a/rapids_cli/tests/test_debug.py b/rapids_cli/tests/test_debug.py index 91c330c..79b9db5 100644 --- a/rapids_cli/tests/test_debug.py +++ b/rapids_cli/tests/test_debug.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import json -from unittest.mock import MagicMock, patch +from unittest.mock import patch from rapids_cli.debug.debug import ( gather_command_output, @@ -10,24 +10,22 @@ gather_tools, run_debug, ) +from rapids_cli.hardware import FakeGpuInfo, FakeSystemInfo def test_gather_cuda_version(): - """Test CUDA version gathering.""" - with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040): - result = gather_cuda_version() - assert result == "12.4" + gpu_info = FakeGpuInfo(cuda_driver_version=12040) + result = gather_cuda_version(gpu_info=gpu_info) + assert result == "12.4" def test_gather_cuda_version_with_patch(): - """Test CUDA version with patch number.""" - with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12345): - result = gather_cuda_version() - assert result == "12.34.5" + gpu_info = FakeGpuInfo(cuda_driver_version=12345) + result = gather_cuda_version(gpu_info=gpu_info) + assert result == "12.34.5" def test_gather_package_versions(): - """Test package version gathering.""" result = gather_package_versions() assert isinstance(result, dict) assert len(result) > 0 @@ -36,25 +34,21 @@ def test_gather_package_versions(): def test_gather_command_output_success(): - """Test successful command output gathering.""" result = gather_command_output(["echo", "test"]) assert result == "test" def test_gather_command_output_with_fallback(): - """Test command output with fallback.""" result = gather_command_output(["nonexistent_command"], fallback_output="fallback") assert result == "fallback" def test_gather_command_output_no_fallback(): - """Test command output without fallback.""" result = gather_command_output(["nonexistent_command"]) assert result is None def test_gather_tools(): - """Test tools gathering.""" with ( patch( "rapids_cli.debug.debug.gather_command_output", @@ -69,40 +63,41 @@ def test_gather_tools(): def test_run_debug_console(capsys): - """Test run_debug with console output.""" - mock_vm = 
MagicMock() - mock_vm.total = 32 * 1024**3 + gpu_info = FakeGpuInfo( + device_count=1, + cuda_driver_version=12040, + driver_version="550.54.15", + ) + system_info = FakeSystemInfo( + total_memory_bytes=32 * 1024**3, + cuda_runtime_path="/usr/local/cuda/include", + ) with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040), - patch( - "cuda.pathfinder.find_nvidia_header_directory", - return_value="/usr/local/cuda/include", - ), patch("pathlib.Path.glob", return_value=[]), patch("rapids_cli.debug.debug.gather_package_versions", return_value={}), patch("rapids_cli.debug.debug.gather_command_output", return_value=None), patch("rapids_cli.debug.debug.gather_tools", return_value={}), patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), ): - run_debug(output_format="console") + run_debug(output_format="console", gpu_info=gpu_info, system_info=system_info) captured = capsys.readouterr() assert "RAPIDS Debug Information" in captured.out def test_run_debug_json(capsys): - """Test run_debug with JSON output.""" + gpu_info = FakeGpuInfo( + device_count=1, + cuda_driver_version=12040, + driver_version="550.54.15", + ) + system_info = FakeSystemInfo( + total_memory_bytes=32 * 1024**3, + cuda_runtime_path="/usr/local/cuda/include", + ) + with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040), - patch( - "cuda.pathfinder.find_nvidia_header_directory", - return_value="/usr/local/cuda/include", - ), patch("pathlib.Path.glob", return_value=[]), patch( "rapids_cli.debug.debug.gather_package_versions", @@ -114,7 +109,7 @@ def test_run_debug_json(capsys): patch("rapids_cli.debug.debug.gather_tools", return_value={"pip": "pip 23.0"}), patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), ): - 
run_debug(output_format="json") + run_debug(output_format="json", gpu_info=gpu_info, system_info=system_info) captured = capsys.readouterr() output = json.loads(captured.out) diff --git a/rapids_cli/tests/test_gpu.py b/rapids_cli/tests/test_gpu.py index a895bc2..f9fdf28 100644 --- a/rapids_cli/tests/test_gpu.py +++ b/rapids_cli/tests/test_gpu.py @@ -1,7 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - import pytest from rapids_cli.doctor.checks.gpu import ( @@ -9,67 +7,60 @@ check_gpu_compute_capability, gpu_check, ) +from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo def test_gpu_check_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - ): - result = gpu_check(verbose=True) - assert result == "GPU(s) detected: 2" + gpu_info = FakeGpuInfo(device_count=2) + result = gpu_check(verbose=True, gpu_info=gpu_info) + assert result == "GPU(s) detected: 2" def test_gpu_check_no_gpus(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=0), - ): - with pytest.raises(AssertionError, match="No GPUs detected"): - gpu_check(verbose=False) + gpu_info = FakeGpuInfo(device_count=0) + with pytest.raises(AssertionError, match="No GPUs detected"): + gpu_check(verbose=False, gpu_info=gpu_info) def test_gpu_check_nvml_error(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises(ValueError, match="No available GPUs detected"): - gpu_check(verbose=False) + gpu_info = FailingGpuInfo() + with pytest.raises(ValueError, match="No available GPUs detected"): + gpu_check(verbose=False, gpu_info=gpu_info) def test_check_gpu_compute_capability_success(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex"), - patch( - 
"pynvml.nvmlDeviceGetCudaComputeCapability", - return_value=(REQUIRED_COMPUTE_CAPABILITY, 5), + devices = [ + DeviceInfo( + index=0, + compute_capability=(REQUIRED_COMPUTE_CAPABILITY, 5), + memory_total_bytes=0, ), - ): - result = check_gpu_compute_capability(verbose=True) - assert result is True + DeviceInfo( + index=1, + compute_capability=(REQUIRED_COMPUTE_CAPABILITY, 5), + memory_total_bytes=0, + ), + ] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + result = check_gpu_compute_capability(verbose=True, gpu_info=gpu_info) + assert result is True def test_check_gpu_compute_capability_insufficient(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - patch("pynvml.nvmlDeviceGetHandleByIndex"), - patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(6, 0)), + devices = [ + DeviceInfo(index=0, compute_capability=(6, 0), memory_total_bytes=0), + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + with pytest.raises( + ValueError, + match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}", ): - with pytest.raises( - ValueError, - match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}", - ): - check_gpu_compute_capability(verbose=False) + check_gpu_compute_capability(verbose=False, gpu_info=gpu_info) def test_check_gpu_compute_capability_no_gpu(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises( - ValueError, match="No GPU - cannot determine GPU Compute Capability" - ): - check_gpu_compute_capability(verbose=False) + gpu_info = FailingGpuInfo() + with pytest.raises( + ValueError, match="No GPU - cannot determine GPU Compute Capability" + ): + check_gpu_compute_capability(verbose=False, gpu_info=gpu_info) diff --git a/rapids_cli/tests/test_hardware.py b/rapids_cli/tests/test_hardware.py new file mode 100644 index 0000000..1236e0f --- /dev/null +++ b/rapids_cli/tests/test_hardware.py @@ -0,0 +1,235 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import MagicMock, patch + +import pynvml +import pytest + +from rapids_cli.hardware import ( + DefaultSystemInfo, + DeviceInfo, + FailingGpuInfo, + FailingSystemInfo, + FakeGpuInfo, + FakeSystemInfo, + GpuInfoProvider, + NvmlGpuInfo, + SystemInfoProvider, +) + +# --- NvmlGpuInfo tests --- + + +def test_nvml_gpu_info_init_failure(): + with patch( + "pynvml.nvmlInit", + side_effect=pynvml.NVMLError(pynvml.NVML_ERROR_DRIVER_NOT_LOADED), + ): + gpu_info = NvmlGpuInfo() + with pytest.raises(ValueError, match="Unable to initialize GPU driver"): + _ = gpu_info.device_count + + +def test_nvml_gpu_info_loads_once(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + with ( + patch("pynvml.nvmlInit") as mock_init, + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch( + "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported + ), + ): + gpu_info = NvmlGpuInfo() + # Access multiple properties to verify caching + _ = gpu_info.device_count + _ = gpu_info.devices + _ = gpu_info.cuda_driver_version + _ = gpu_info.driver_version + # nvmlInit should be called exactly once + mock_init.assert_called_once() + + +def test_nvml_gpu_info_device_data(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 24 * 1024**3 + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=2), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12060), + 
patch("pynvml.nvmlSystemGetDriverVersion", return_value="560.10"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(9, 0)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.device_count == 2 + assert len(gpu_info.devices) == 2 + assert gpu_info.devices[0].compute_capability == (9, 0) + assert gpu_info.devices[0].memory_total_bytes == 24 * 1024**3 + assert gpu_info.cuda_driver_version == 12060 + assert gpu_info.driver_version == "560.10" + + +def test_nvml_gpu_info_nvlink_states(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + def nvlink_side_effect(handle, link_id): + if link_id < 2: + return 1 + raise pynvml.NVMLError_NotSupported() + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=nvlink_side_effect), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.devices[0].nvlink_states == [True, True] + + +def test_nvml_gpu_info_no_nvlink(): + mock_handle = MagicMock() + mock_memory = MagicMock() + mock_memory.total = 16 * 1024**3 + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + 
patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), + patch( + "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported + ), + ): + gpu_info = NvmlGpuInfo() + assert gpu_info.devices[0].nvlink_states == [] + + +# --- DefaultSystemInfo tests --- + + +def test_default_system_info_total_memory(): + mock_vm = MagicMock() + mock_vm.total = 64 * 1024**3 + with patch("psutil.virtual_memory", return_value=mock_vm): + sys_info = DefaultSystemInfo() + assert sys_info.total_memory_bytes == 64 * 1024**3 + + +def test_default_system_info_cuda_runtime_path(): + with patch( + "cuda.pathfinder.find_nvidia_header_directory", + return_value="/usr/local/cuda/include", + ): + sys_info = DefaultSystemInfo() + assert sys_info.cuda_runtime_path == "/usr/local/cuda/include" + + +def test_default_system_info_caches(): + mock_vm = MagicMock() + mock_vm.total = 64 * 1024**3 + with patch("psutil.virtual_memory", return_value=mock_vm) as mock_psutil: + sys_info = DefaultSystemInfo() + _ = sys_info.total_memory_bytes + _ = sys_info.total_memory_bytes + mock_psutil.assert_called_once() + + +# --- FakeGpuInfo tests --- + + +def test_fake_gpu_info_defaults(): + fake = FakeGpuInfo() + assert fake.device_count == 0 + assert fake.devices == [] + assert fake.cuda_driver_version == 0 + assert fake.driver_version == "" + + +def test_fake_gpu_info_custom(): + devices = [ + DeviceInfo(index=0, compute_capability=(8, 0), memory_total_bytes=32 * 1024**3) + ] + fake = FakeGpuInfo( + device_count=1, + devices=devices, + cuda_driver_version=12040, + driver_version="550.0", + ) + assert fake.device_count == 1 + assert len(fake.devices) == 1 + assert fake.cuda_driver_version == 12040 + + +def test_fake_gpu_info_satisfies_protocol(): + assert isinstance(FakeGpuInfo(), GpuInfoProvider) + + +# --- FakeSystemInfo tests --- + + +def test_fake_system_info_defaults(): + fake = FakeSystemInfo() + assert 
fake.total_memory_bytes == 0 + assert fake.cuda_runtime_path is None + + +def test_fake_system_info_satisfies_protocol(): + assert isinstance(FakeSystemInfo(), SystemInfoProvider) + + +# --- FailingGpuInfo tests --- + + +def test_failing_gpu_info_device_count(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().device_count + + +def test_failing_gpu_info_devices(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().devices + + +def test_failing_gpu_info_cuda_driver_version(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().cuda_driver_version + + +def test_failing_gpu_info_driver_version(): + with pytest.raises(ValueError, match="No GPU available"): + _ = FailingGpuInfo().driver_version + + +# --- FailingSystemInfo tests --- + + +def test_failing_system_info_total_memory(): + with pytest.raises(ValueError, match="System info unavailable"): + _ = FailingSystemInfo().total_memory_bytes + + +def test_failing_system_info_cuda_runtime_path(): + with pytest.raises(ValueError, match="System info unavailable"): + _ = FailingSystemInfo().cuda_runtime_path diff --git a/rapids_cli/tests/test_memory.py b/rapids_cli/tests/test_memory.py index 572df33..183d7ff 100644 --- a/rapids_cli/tests/test_memory.py +++ b/rapids_cli/tests/test_memory.py @@ -1,7 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from unittest.mock import MagicMock, patch - import pytest from rapids_cli.doctor.checks.memory import ( @@ -9,74 +7,62 @@ get_gpu_memory, get_system_memory, ) +from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo, FakeSystemInfo def test_get_system_memory(): - mock_vm = MagicMock() - mock_vm.total = 32 * 1024**3 # 32 GB in bytes - with patch("psutil.virtual_memory", return_value=mock_vm): - result = get_system_memory(verbose=False) - assert result == 32.0 + system_info = FakeSystemInfo(total_memory_bytes=32 * 1024**3) + result = get_system_memory(verbose=False, system_info=system_info) + assert result == 32.0 def test_get_gpu_memory_single_gpu(): - mock_handle = MagicMock() - mock_memory_info = MagicMock() - mock_memory_info.total = 16 * 1024**3 # 16 GB in bytes - - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info), - patch("pynvml.nvmlShutdown"), - ): - result = get_gpu_memory(verbose=False) - assert result == 16.0 + devices = [ + DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=16 * 1024**3) + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + result = get_gpu_memory(verbose=False, gpu_info=gpu_info) + assert result == 16.0 def test_get_gpu_memory_multiple_gpus(): - mock_handle = MagicMock() - mock_memory_info = MagicMock() - mock_memory_info.total = 16 * 1024**3 # 16 GB per GPU - - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=4), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info), - patch("pynvml.nvmlShutdown"), - ): - result = get_gpu_memory(verbose=False) - assert result == 64.0 # 16 GB * 4 GPUs + devices = [ + DeviceInfo(index=i, compute_capability=(7, 0), 
memory_total_bytes=16 * 1024**3) + for i in range(4) + ] + gpu_info = FakeGpuInfo(device_count=4, devices=devices) + result = get_gpu_memory(verbose=False, gpu_info=gpu_info) + assert result == 64.0 # 16 GB * 4 GPUs def test_check_memory_to_gpu_ratio_good_ratio(): - with ( - patch("pynvml.nvmlInit"), - patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=64.0), - patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0), - ): - result = check_memory_to_gpu_ratio(verbose=True) - assert result is True + devices = [ + DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3) + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + system_info = FakeSystemInfo(total_memory_bytes=64 * 1024**3) + result = check_memory_to_gpu_ratio( + verbose=True, gpu_info=gpu_info, system_info=system_info + ) + assert result is True def test_check_memory_to_gpu_ratio_warning(): - with ( - patch("pynvml.nvmlInit"), - patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=32.0), - patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0), - ): - with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"): - result = check_memory_to_gpu_ratio(verbose=True) - assert result is True + devices = [ + DeviceInfo(index=0, compute_capability=(7, 0), memory_total_bytes=32 * 1024**3) + ] + gpu_info = FakeGpuInfo(device_count=1, devices=devices) + system_info = FakeSystemInfo(total_memory_bytes=32 * 1024**3) + with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"): + result = check_memory_to_gpu_ratio( + verbose=True, gpu_info=gpu_info, system_info=system_info + ) + assert result is True def test_check_memory_to_gpu_ratio_no_gpu(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with pytest.raises( - ValueError, match="GPU not found. Please ensure GPUs are installed." 
- ): - check_memory_to_gpu_ratio(verbose=False) + gpu_info = FailingGpuInfo() + with pytest.raises( + ValueError, match="GPU not found. Please ensure GPUs are installed." + ): + check_memory_to_gpu_ratio(verbose=False, gpu_info=gpu_info) diff --git a/rapids_cli/tests/test_nvlink.py b/rapids_cli/tests/test_nvlink.py index e2d82c7..a849ed0 100644 --- a/rapids_cli/tests/test_nvlink.py +++ b/rapids_cli/tests/test_nvlink.py @@ -1,54 +1,54 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import MagicMock, patch - import pytest from rapids_cli.doctor.checks.nvlink import check_nvlink_status +from rapids_cli.hardware import DeviceInfo, FailingGpuInfo, FakeGpuInfo def test_check_nvlink_status_success(): - mock_handle = MagicMock() - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1), - ): - result = check_nvlink_status(verbose=True) - assert result is True + devices = [ + DeviceInfo( + index=0, + compute_capability=(7, 0), + memory_total_bytes=0, + nvlink_states=[True], + ), + DeviceInfo( + index=1, + compute_capability=(7, 0), + memory_total_bytes=0, + nvlink_states=[True], + ), + ] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + result = check_nvlink_status(verbose=True, gpu_info=gpu_info) + assert result is True def test_check_nvlink_status_single_gpu(): - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - ): - result = check_nvlink_status(verbose=False) - assert result is False + gpu_info = FakeGpuInfo(device_count=1) + result = check_nvlink_status(verbose=False, gpu_info=gpu_info) + assert result is False def test_check_nvlink_status_no_gpu(): - import pynvml - - with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): - with 
pytest.raises( - ValueError, match="GPU not found. Please ensure GPUs are installed." - ): - check_nvlink_status(verbose=False) - + gpu_info = FailingGpuInfo() + with pytest.raises( + ValueError, match="GPU not found. Please ensure GPUs are installed." + ): + check_nvlink_status(verbose=False, gpu_info=gpu_info) -def test_check_nvlink_status_nvml_error(): - import pynvml - mock_handle = MagicMock() - with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch( - "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported +def test_check_nvlink_status_no_nvlink(): + devices = [ + DeviceInfo( + index=0, compute_capability=(7, 0), memory_total_bytes=0, nvlink_states=[] ), - ): - with pytest.raises(ValueError, match="NVLink 0 Status Check Failed"): - check_nvlink_status(verbose=False) + DeviceInfo( + index=1, compute_capability=(7, 0), memory_total_bytes=0, nvlink_states=[] + ), + ] + gpu_info = FakeGpuInfo(device_count=2, devices=devices) + result = check_nvlink_status(verbose=True, gpu_info=gpu_info) + assert result is False