diff --git a/.github/workflows/build-cpu.yml b/.github/workflows/build-cpu.yml index 6ae2a3db1..98060175f 100644 --- a/.github/workflows/build-cpu.yml +++ b/.github/workflows/build-cpu.yml @@ -34,4 +34,4 @@ jobs: pip install -r build-requirements.txt # Build monarch (No tensor engine, CPU version) - USE_TENSOR_ENGINE=0 python setup.py bdist_wheel + MONARCH_FEATURES=core python setup.py bdist_wheel -v diff --git a/.github/workflows/build-cuda.yml b/.github/workflows/build-cuda.yml index 088599020..09e23c3dd 100644 --- a/.github/workflows/build-cuda.yml +++ b/.github/workflows/build-cuda.yml @@ -43,5 +43,5 @@ jobs: export CUDA_LIB_DIR=/usr/lib64 - # Build monarch (CUDA version) - python setup.py bdist_wheel + # Build monarch (CUDA version with all features) + MONARCH_FEATURES=full python setup.py bdist_wheel -v diff --git a/.github/workflows/doc_build.yml b/.github/workflows/doc_build.yml index f0bb0ad8d..04649675e 100644 --- a/.github/workflows/doc_build.yml +++ b/.github/workflows/doc_build.yml @@ -43,7 +43,6 @@ jobs: # Set environment variables for CUDA build export USE_CUDA=1 - export USE_TENSOR_ENGINE=1 export RUSTFLAGS="-Zthreads=16 ${RUSTFLAGS:-}" export _GLIBCXX_USE_CXX11_ABI=1 export CUDA_LIB_DIR=/usr/lib64 diff --git a/.github/workflows/publish_release.yml b/.github/workflows/publish_release.yml index 269ae1aee..6c8807233 100644 --- a/.github/workflows/publish_release.yml +++ b/.github/workflows/publish_release.yml @@ -50,7 +50,7 @@ jobs: export MONARCH_PACKAGE_NAME="torchmonarch" export CUDA_LIB_DIR=/usr/lib64 export MONARCH_VERSION="${{ github.event.inputs.version }}" - python setup.py bdist_wheel + MONARCH_FEATURES=full python setup.py bdist_wheel -v # hacky until the right distribution wheel can be made... 
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \; diff --git a/.github/workflows/test-cpu-python.yml b/.github/workflows/test-cpu-python.yml index d2f63ffc5..f8a6e3c01 100644 --- a/.github/workflows/test-cpu-python.yml +++ b/.github/workflows/test-cpu-python.yml @@ -28,9 +28,6 @@ jobs: # Setup test environment setup_conda_environment - # Disable tensor engine - export USE_TENSOR_ENGINE=0 - # Install PyTorch nightly pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ebf1c89a5..27fb82d8e 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -51,7 +51,7 @@ jobs: export MONARCH_VERSION=$(date +'%Y.%m.%d') export CUDA_LIB_DIR=/usr/lib64 - python setup.py bdist_wheel + MONARCH_FEATURES=full python setup.py bdist_wheel -v # hacky until the right distribution wheel can be made... find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \; diff --git a/.gitignore b/.gitignore index d87c0a063..42b420182 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ python/monarch.egg-info/* *.egg build/* dist/* -monarch.egg-info/* +torchmonarch.egg-info/* .ipynb_checkpoints .monarch diff --git a/README.md b/README.md index 772625fee..c08a6f396 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ sudo dnf install clang-devel libnccl-devel conda install -c conda-forge clangdev nccl conda update -n monarchenv --all -c conda-forge -y -# If you are building with RDMA support, build monarch with `USE_TENSOR_ENGINE=1 pip install --no-build-isolation .` and dnf install the following packages +# If you are building with RDMA support, build monarch with `MONARCH_FEATURES=rdma pip install --no-build-isolation .` and dnf install the following packages sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel 
rdma-core-devel # Install build dependencies @@ -147,12 +147,12 @@ pip install -r build-requirements.txt # Install test dependencies pip install -r python/tests/requirements.txt -# Build and install Monarch (with tensor engine support) +# Build and install Monarch (with all features) pip install --no-build-isolation . # or -# Build and install Monarch (without tensor engine support) -USE_TENSOR_ENGINE=0 pip install --no-build-isolation . +# Build and install Monarch (core only, no RDMA or tensor engine) +MONARCH_FEATURES=core pip install --no-build-isolation . # or setup for development pip install --no-build-isolation -e . @@ -185,10 +185,10 @@ pip install -r build-requirements.txt # Install test dependencies pip install -r python/tests/requirements.txt -# Build and install Monarch -USE_TENSOR_ENGINE=0 pip install --no-build-isolation . +# Build and install Monarch (core only, no RDMA or tensor engine) +MONARCH_FEATURES=core pip install --no-build-isolation . # or setup for development -USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e . +MONARCH_FEATURES=core pip install --no-build-isolation -e . 
# Verify installation pip list | grep monarch diff --git a/monarch_extension/Cargo.toml b/monarch_extension/Cargo.toml index a0390cfd0..580300220 100644 --- a/monarch_extension/Cargo.toml +++ b/monarch_extension/Cargo.toml @@ -40,5 +40,8 @@ torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda", optional = tru tracing = { version = "0.1.41", features = ["attributes", "valuable"] } [features] -default = ["tensor_engine"] -tensor_engine = ["dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys", "dep:torch-sys-cuda"] +core = [] +default = ["full"] +full = ["rdma", "tensor_engine"] +rdma = ["dep:monarch_rdma_extension", "dep:rdmaxcel-sys"] +tensor_engine = ["dep:monarch_messages", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:torch-sys", "dep:torch-sys-cuda"] diff --git a/monarch_extension/src/lib.rs b/monarch_extension/src/lib.rs index f98f2db31..e91d3271b 100644 --- a/monarch_extension/src/lib.rs +++ b/monarch_extension/src/lib.rs @@ -122,6 +122,10 @@ pub fn mod_init(module: &Bound<'_, PyModule>) -> PyResult<()> { module, "monarch_extension.mesh_controller", )?)?; + } + + #[cfg(feature = "rdma")] + { monarch_rdma_extension::register_python_bindings(&get_or_add_new_module(module, "rdma")?)?; } simulation_tools::register_python_bindings(&get_or_add_new_module( diff --git a/python/monarch/_src/actor/proc_mesh.py b/python/monarch/_src/actor/proc_mesh.py index e3cc151ac..7c1bf69b2 100644 --- a/python/monarch/_src/actor/proc_mesh.py +++ b/python/monarch/_src/actor/proc_mesh.py @@ -478,7 +478,8 @@ def _spawn_nonblocking_on( def _device_mesh(self) -> "DeviceMesh": if not _has_tensor_engine(): raise RuntimeError( - "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)" + "DeviceMesh is not available because tensor_engine was not compiled.\n" + "Build with: MONARCH_FEATURES=tensor_engine pip install ." 
) # type: ignore[21] diff --git a/python/monarch/_src/actor/v1/proc_mesh.py b/python/monarch/_src/actor/v1/proc_mesh.py index 9e3b65eb8..ddc256dd5 100644 --- a/python/monarch/_src/actor/v1/proc_mesh.py +++ b/python/monarch/_src/actor/v1/proc_mesh.py @@ -332,7 +332,8 @@ def _device_mesh(self) -> "DeviceMesh": if not _has_tensor_engine(): raise RuntimeError( - "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)" + "DeviceMesh is not available because tensor_engine was not compiled.\n" + "Build with: MONARCH_FEATURES=tensor_engine pip install ." ) # type: ignore[21] diff --git a/scripts/build_monarch_for_docs.sh b/scripts/build_monarch_for_docs.sh index 6e749cd83..769514564 100755 --- a/scripts/build_monarch_for_docs.sh +++ b/scripts/build_monarch_for_docs.sh @@ -15,7 +15,7 @@ echo "=========================================" export CI=true # BUILD MONARCH COMPLETELY - This is critical for API documentation echo "Building Monarch with Rust bindings..." -python -m pip install -e . --no-build-isolation +python -m pip install -e . --no-build-isolation -v # Verify Monarch installation and imports echo "Verifying Monarch installation..." diff --git a/setup.py b/setup.py index 1f1ec13d3..2f6b6b9f2 100644 --- a/setup.py +++ b/setup.py @@ -25,61 +25,92 @@ ) USE_CUDA = CUDA_HOME is not None -USE_TENSOR_ENGINE = os.environ.get("USE_TENSOR_ENGINE", "1") == "1" -monarch_cpp_src = ["python/monarch/common/init.cpp"] -if USE_CUDA: - monarch_cpp_src.append("python/monarch/common/mock_cuda.cpp") +# Feature detection for building torchmonarch-* variants. +def get_rust_features(): + """ + Determine which Rust features to build. 
-common_C = CppExtension( - "monarch.common._C", - monarch_cpp_src, - extra_compile_args=["-g", "-O3"], - libraries=["dl"], - include_dirs=[ - os.path.dirname(os.path.abspath(__file__)), - sysconfig.get_config_var("INCLUDEDIR"), - ], -) + Environment variable: + - MONARCH_FEATURES: "core", "tensor_engine", "rdma", "full" (comma-separated) + - Default: "full" (all features) + Returns: + list: features to enable + """ + features_str = os.environ.get("MONARCH_FEATURES", "").strip() -controller_C = CppExtension( - "monarch.gradient._gradient_generator", - ["python/monarch/gradient/_gradient_generator.cpp"], - extra_compile_args=["-g", "-O3"], - include_dirs=[ - os.path.dirname(os.path.abspath(__file__)), - sysconfig.get_config_var("INCLUDEDIR"), - ], -) + if features_str: + return [f.strip() for f in features_str.split(",") if f.strip()] + else: + # Use the full build by default. + return ["full"] -ENABLE_MSG_LOGGING = ( - "--cfg=enable_hyperactor_message_logging" - if os.environ.get("ENABLE_MESSAGE_LOGGING") - else "" -) -ENABLE_TRACING_UNSTABLE = "--cfg=tracing_unstable" +# Get features for this build +RUST_FEATURES = get_rust_features() + + +def has_feature(feature): + """ + Check if a feature is enabled. + + Args: + feature: Feature name to check (e.g., "rdma", "tensor_engine", "core") + + Returns: + bool: True if the feature is explicitly listed or "full" is enabled + """ + return feature in RUST_FEATURES or "full" in RUST_FEATURES -os.environ.update( - { + +# Print build configuration +package_version = os.environ.get("MONARCH_VERSION", "0.0.1") +package_name = os.environ.get("MONARCH_PACKAGE_NAME", "torchmonarch") +print(f"Building {package_name} v{package_version} with features: {RUST_FEATURES}") + + +def setup_build_environment(): + """ + Configure environment variables for Rust and C++ builds. + + Sets up compiler flags, PyTorch library paths, and feature-specific configuration. 
+ """ + enable_msg_logging = ( + "--cfg=enable_hyperactor_message_logging" + if os.environ.get("ENABLE_MESSAGE_LOGGING") + else "" + ) + enable_tracing_unstable = "--cfg=tracing_unstable" + + # RDMA requires PyTorch CUDA libraries (torch_cuda, c10_cuda) for GPUDirect support + # So we only enable TORCH_SYS_USE_PYTORCH_APIS when building with RDMA + use_pytorch_apis = "1" if has_feature("rdma") else "0" + + env_updates = { "CXXFLAGS": f"-D_GLIBCXX_USE_CXX11_ABI={int(torch._C._GLIBCXX_USE_CXX11_ABI)}", "RUSTFLAGS": " ".join( - ["-Zthreads=16", ENABLE_MSG_LOGGING, ENABLE_TRACING_UNSTABLE] + ["-Zthreads=16", enable_msg_logging, enable_tracing_unstable] ), "LIBTORCH_LIB": TORCH_LIB_PATH, "LIBTORCH_INCLUDE": ":".join(torch_include_paths()), "_GLIBCXX_USE_CXX11_ABI": str(int(torch._C._GLIBCXX_USE_CXX11_ABI)), - "TORCH_SYS_USE_PYTORCH_APIS": "0", + "TORCH_SYS_USE_PYTORCH_APIS": use_pytorch_apis, } -) -if USE_CUDA: - os.environ.update( - { - "CUDA_HOME": CUDA_HOME, - } - ) + + if USE_CUDA: + env_updates["CUDA_HOME"] = CUDA_HOME + + print("Setting environment variables:") + for k, v in env_updates.items(): + print(f" {k}={v}") + + os.environ.update(env_updates) + + +# Setup build environment +setup_build_environment() class Clean(Command): @@ -181,19 +212,60 @@ def run(self): ) # Main extension (always built) -rust_extensions.append( - RustExtension( - "monarch._rust_bindings", - binding=Binding.PyO3, - path="monarch_extension/Cargo.toml", - debug=False, - features=["tensor_engine"] if USE_TENSOR_ENGINE else [], - args=[] if USE_TENSOR_ENGINE else ["--no-default-features"], +rust_ext = RustExtension( + "monarch._rust_bindings", + binding=Binding.PyO3, + path="monarch_extension/Cargo.toml", + debug=False, + features=RUST_FEATURES, + args=["--no-default-features"], +) + +print(f" Rust extension features: {RUST_FEATURES}") +print(f" Rust extension args: {rust_ext.args}") + +rust_extensions.append(rust_ext) + +# Build C++ extensions conditionally based on features 
+cpp_ext_modules = [] + +# common_C is always needed +monarch_cpp_src = ["python/monarch/common/init.cpp"] +if USE_CUDA: + monarch_cpp_src.append("python/monarch/common/mock_cuda.cpp") + +cpp_ext_modules.append( + CppExtension( + "monarch.common._C", + monarch_cpp_src, + extra_compile_args=["-g", "-O3"], + libraries=["dl"], + include_dirs=[ + os.path.dirname(os.path.abspath(__file__)), + sysconfig.get_config_var("INCLUDEDIR"), + ], ) ) +print(" Building common._C C++ extension") + +# Only build gradient_generator if tensor_engine is enabled +if has_feature("tensor_engine"): + cpp_ext_modules.append( + CppExtension( + "monarch.gradient._gradient_generator", + ["python/monarch/gradient/_gradient_generator.cpp"], + extra_compile_args=["-g", "-O3"], + include_dirs=[ + os.path.dirname(os.path.abspath(__file__)), + sysconfig.get_config_var("INCLUDEDIR"), + ], + ) + ) + print(" Building gradient_generator C++ extension") +else: + print(" Skipping gradient_generator C++ extension (tensor_engine not enabled)") -package_name = os.environ.get("MONARCH_PACKAGE_NAME", "monarch") -package_version = os.environ.get("MONARCH_VERSION", "0.0.1") +print(f" C++ extensions: {[ext.name for ext in cpp_ext_modules]}") setup( name=package_name, @@ -217,10 +289,7 @@ def run(self): description="Monarch: Single controller library", long_description=readme, long_description_content_type="text/markdown", - ext_modules=[ - controller_C, - common_C, - ], + ext_modules=cpp_ext_modules, entry_points={ "console_scripts": [ "monarch=monarch.tools.cli:main",