Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ jobs:
pip install -r build-requirements.txt

# Build monarch (core features only: no tensor engine or RDMA; CPU version)
USE_TENSOR_ENGINE=0 python setup.py bdist_wheel
MONARCH_FEATURES=core python setup.py bdist_wheel -v
4 changes: 2 additions & 2 deletions .github/workflows/build-cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ jobs:

export CUDA_LIB_DIR=/usr/lib64

# Build monarch (CUDA version)
python setup.py bdist_wheel
# Build monarch (CUDA version with all features)
MONARCH_FEATURES=full python setup.py bdist_wheel -v
1 change: 0 additions & 1 deletion .github/workflows/doc_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ jobs:

# Set environment variables for CUDA build
export USE_CUDA=1
export USE_TENSOR_ENGINE=1
export RUSTFLAGS="-Zthreads=16 ${RUSTFLAGS:-}"
export _GLIBCXX_USE_CXX11_ABI=1
export CUDA_LIB_DIR=/usr/lib64
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
export MONARCH_PACKAGE_NAME="torchmonarch"
export CUDA_LIB_DIR=/usr/lib64
export MONARCH_VERSION="${{ github.event.inputs.version }}"
python setup.py bdist_wheel
MONARCH_FEATURES=full python setup.py bdist_wheel -v

# hacky until the right distribution wheel can be made...
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/test-cpu-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@ jobs:
# Setup test environment
setup_conda_environment

# Disable tensor engine
export USE_TENSOR_ENGINE=0

# Install PyTorch nightly
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
export MONARCH_VERSION=$(date +'%Y.%m.%d')
export CUDA_LIB_DIR=/usr/lib64

python setup.py bdist_wheel
MONARCH_FEATURES=full python setup.py bdist_wheel -v

# hacky until the right distribution wheel can be made...
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ python/monarch.egg-info/*
*.egg
build/*
dist/*
monarch.egg-info/*
torchmonarch.egg-info/*

.ipynb_checkpoints
.monarch
Expand Down
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ sudo dnf install clang-devel libnccl-devel
conda install -c conda-forge clangdev nccl
conda update -n monarchenv --all -c conda-forge -y

# If you are building with RDMA support, build monarch with `USE_TENSOR_ENGINE=1 pip install --no-build-isolation .` and dnf install the following packages
# If you are building with RDMA support, build monarch with `MONARCH_FEATURES=rdma pip install --no-build-isolation .` and dnf install the following packages
sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel

# Install build dependencies
Expand Down Expand Up @@ -147,12 +147,12 @@ pip install -r build-requirements.txt
# Install test dependencies
pip install -r python/tests/requirements.txt

# Build and install Monarch (with tensor engine support)
# Build and install Monarch (with all features)
pip install --no-build-isolation .

# or
# Build and install Monarch (core only, without RDMA or tensor engine support)
USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
MONARCH_FEATURES=core pip install --no-build-isolation .

# or setup for development
pip install --no-build-isolation -e .
Expand Down Expand Up @@ -185,10 +185,10 @@ pip install -r build-requirements.txt
# Install test dependencies
pip install -r python/tests/requirements.txt

# Build and install Monarch
USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
# Build and install Monarch (core only, no RDMA or tensor engine)
MONARCH_FEATURES=core pip install --no-build-isolation .
# or setup for development
USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e .
MONARCH_FEATURES=core pip install --no-build-isolation -e .

# Verify installation
pip list | grep monarch
Expand Down
7 changes: 5 additions & 2 deletions monarch_extension/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,8 @@ torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda", optional = tru
tracing = { version = "0.1.41", features = ["attributes", "valuable"] }

[features]
default = ["tensor_engine"]
tensor_engine = ["dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys", "dep:torch-sys-cuda"]
core = []
default = ["full"]
full = ["rdma", "tensor_engine"]
rdma = ["dep:monarch_rdma_extension", "dep:rdmaxcel-sys"]
tensor_engine = ["dep:monarch_messages", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:torch-sys", "dep:torch-sys-cuda"]
4 changes: 4 additions & 0 deletions monarch_extension/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ pub fn mod_init(module: &Bound<'_, PyModule>) -> PyResult<()> {
module,
"monarch_extension.mesh_controller",
)?)?;
}

#[cfg(feature = "rdma")]
{
monarch_rdma_extension::register_python_bindings(&get_or_add_new_module(module, "rdma")?)?;
}
simulation_tools::register_python_bindings(&get_or_add_new_module(
Expand Down
3 changes: 2 additions & 1 deletion python/monarch/_src/actor/proc_mesh.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,8 @@ def _spawn_nonblocking_on(
def _device_mesh(self) -> "DeviceMesh":
if not _has_tensor_engine():
raise RuntimeError(
"DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
"DeviceMesh is not available because tensor_engine was not compiled.\n"
"Build with: MONARCH_FEATURES=tensor_engine pip install ."
)

# type: ignore[21]
Expand Down
3 changes: 2 additions & 1 deletion python/monarch/_src/actor/v1/proc_mesh.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,8 @@ def _device_mesh(self) -> "DeviceMesh":

if not _has_tensor_engine():
raise RuntimeError(
"DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
"DeviceMesh is not available because tensor_engine was not compiled.\n"
"Build with: MONARCH_FEATURES=tensor_engine pip install ."
)

# type: ignore[21]
Expand Down
2 changes: 1 addition & 1 deletion scripts/build_monarch_for_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ echo "========================================="
export CI=true
# BUILD MONARCH COMPLETELY - This is critical for API documentation
echo "Building Monarch with Rust bindings..."
python -m pip install -e . --no-build-isolation
python -m pip install -e . --no-build-isolation -v

# Verify Monarch installation and imports
echo "Verifying Monarch installation..."
Expand Down
177 changes: 123 additions & 54 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,61 +25,92 @@
)

USE_CUDA = CUDA_HOME is not None
USE_TENSOR_ENGINE = os.environ.get("USE_TENSOR_ENGINE", "1") == "1"

monarch_cpp_src = ["python/monarch/common/init.cpp"]

if USE_CUDA:
monarch_cpp_src.append("python/monarch/common/mock_cuda.cpp")
# Feature detection for building torchmonarch-* variants.
def get_rust_features():
    """
    Determine which Rust features to build.

    Environment variable:
        - MONARCH_FEATURES: comma-separated list drawn from "core",
          "tensor_engine", "rdma", "full"
        - Default: "full" (all features)

    Whitespace around each entry is stripped and empty entries are dropped.
    If nothing usable remains (variable unset, blank, or made only of
    separators such as ","), the default is used.

    Returns:
        list: features to enable (never empty)
    """
    raw = os.environ.get("MONARCH_FEATURES", "")
    features = [name.strip() for name in raw.split(",") if name.strip()]

    # Fall back to the full build instead of returning an empty list: the
    # Rust extension is built with --no-default-features, so an empty feature
    # list would silently produce a core-only build.
    return features or ["full"]

ENABLE_MSG_LOGGING = (
"--cfg=enable_hyperactor_message_logging"
if os.environ.get("ENABLE_MESSAGE_LOGGING")
else ""
)

ENABLE_TRACING_UNSTABLE = "--cfg=tracing_unstable"
# Resolve the feature list for this build once, at module level; has_feature()
# and the RustExtension definition both read this value.
RUST_FEATURES = get_rust_features()


def has_feature(feature):
    """
    Report whether a build feature is active.

    Args:
        feature: Name of the feature to look up (e.g., "rdma", "tensor_engine", "core")

    Returns:
        bool: True when "full" is among the selected features or the
        feature itself is listed; False otherwise
    """
    # "full" acts as a wildcard that turns every feature on.
    if "full" in RUST_FEATURES:
        return True
    return feature in RUST_FEATURES

os.environ.update(
{

# Print build configuration.
# The version and distribution name can be overridden through the
# MONARCH_VERSION and MONARCH_PACKAGE_NAME environment variables; otherwise
# they default to "0.0.1" and "torchmonarch".
package_version = os.environ.get("MONARCH_VERSION", "0.0.1")
package_name = os.environ.get("MONARCH_PACKAGE_NAME", "torchmonarch")
print(f"Building {package_name} v{package_version} with features: {RUST_FEATURES}")


def setup_build_environment():
    """
    Export the environment variables consumed by the Rust and C++ builds.

    Assembles compiler flags, PyTorch library/include paths, and the
    feature-dependent TORCH_SYS_USE_PYTORCH_APIS switch, prints each value,
    and then writes them all into os.environ.
    """
    # Optional message-logging cfg flag; left as an empty string when the
    # ENABLE_MESSAGE_LOGGING environment variable is unset or empty.
    if os.environ.get("ENABLE_MESSAGE_LOGGING"):
        msg_logging_flag = "--cfg=enable_hyperactor_message_logging"
    else:
        msg_logging_flag = ""
    rustflags = f"-Zthreads=16 {msg_logging_flag} --cfg=tracing_unstable"

    # RDMA requires PyTorch CUDA libraries (torch_cuda, c10_cuda) for GPUDirect support,
    # so TORCH_SYS_USE_PYTORCH_APIS is only enabled when building with RDMA.
    if has_feature("rdma"):
        pytorch_apis_flag = "1"
    else:
        pytorch_apis_flag = "0"

    # Use the same C++11 ABI setting that the installed torch reports.
    cxx11_abi = int(torch._C._GLIBCXX_USE_CXX11_ABI)

    env_updates = {}
    env_updates["CXXFLAGS"] = f"-D_GLIBCXX_USE_CXX11_ABI={cxx11_abi}"
    env_updates["RUSTFLAGS"] = rustflags
    env_updates["LIBTORCH_LIB"] = TORCH_LIB_PATH
    env_updates["LIBTORCH_INCLUDE"] = ":".join(torch_include_paths())
    env_updates["_GLIBCXX_USE_CXX11_ABI"] = str(cxx11_abi)
    env_updates["TORCH_SYS_USE_PYTORCH_APIS"] = pytorch_apis_flag
    if USE_CUDA:
        env_updates["CUDA_HOME"] = CUDA_HOME

    # Log what is about to be exported, then apply it in one step.
    print("Setting environment variables:")
    for name, value in env_updates.items():
        print(f" {name}={value}")

    os.environ.update(env_updates)


# Set up the build environment (runs at module level, i.e. whenever setup.py is executed)
setup_build_environment()


class Clean(Command):
Expand Down Expand Up @@ -181,19 +212,60 @@ def run(self):
)

# Main extension (always built)
rust_extensions.append(
RustExtension(
"monarch._rust_bindings",
binding=Binding.PyO3,
path="monarch_extension/Cargo.toml",
debug=False,
features=["tensor_engine"] if USE_TENSOR_ENGINE else [],
args=[] if USE_TENSOR_ENGINE else ["--no-default-features"],
rust_ext = RustExtension(
"monarch._rust_bindings",
binding=Binding.PyO3,
path="monarch_extension/Cargo.toml",
debug=False,
features=RUST_FEATURES,
args=["--no-default-features"],
)

print(f" Rust extension features: {RUST_FEATURES}")
print(f" Rust extension args: {rust_ext.args}")

rust_extensions.append(rust_ext)

# Build C++ extensions conditionally based on features
cpp_ext_modules = []

# common_C is always needed
monarch_cpp_src = ["python/monarch/common/init.cpp"]
if USE_CUDA:
monarch_cpp_src.append("python/monarch/common/mock_cuda.cpp")

cpp_ext_modules.append(
CppExtension(
"monarch.common._C",
monarch_cpp_src,
extra_compile_args=["-g", "-O3"],
libraries=["dl"],
include_dirs=[
os.path.dirname(os.path.abspath(__file__)),
sysconfig.get_config_var("INCLUDEDIR"),
],
)
)
print(" Building common._C C++ extension")

# Only build gradient_generator if tensor_engine is enabled
if has_feature("tensor_engine"):
cpp_ext_modules.append(
CppExtension(
"monarch.gradient._gradient_generator",
["python/monarch/gradient/_gradient_generator.cpp"],
extra_compile_args=["-g", "-O3"],
include_dirs=[
os.path.dirname(os.path.abspath(__file__)),
sysconfig.get_config_var("INCLUDEDIR"),
],
)
)
print(" Building gradient_generator C++ extension")
else:
print(" Skipping gradient_generator C++ extension (tensor_engine not enabled)")

package_name = os.environ.get("MONARCH_PACKAGE_NAME", "monarch")
package_version = os.environ.get("MONARCH_VERSION", "0.0.1")
print(f" C++ extensions: {[ext.name for ext in cpp_ext_modules]}")

setup(
name=package_name,
Expand All @@ -217,10 +289,7 @@ def run(self):
description="Monarch: Single controller library",
long_description=readme,
long_description_content_type="text/markdown",
ext_modules=[
controller_C,
common_C,
],
ext_modules=cpp_ext_modules,
entry_points={
"console_scripts": [
"monarch=monarch.tools.cli:main",
Expand Down