Skip to content

Commit e4c269c

Browse files
allenwang28 authored and facebook-github-bot committed
Enable Monarch extra builds
Summary: This is the first step toward supporting individual installs for Monarch capabilities, i.e. core/rdma/tensor_engine.
Changes:
- Modifies monarch_extension to use the core, rdma, tensor_engine, and full feature sets.
- Switches setup.py to a MONARCH_FEATURES variable that accepts feature-name strings, replacing the boolean-only (0/1) USE_TENSOR_ENGINE (and makes all the downstream changes needed to reflect it).
- Renames the default wheel name from `monarch` to `torchmonarch`, to better align with what we're officially publishing to PyPI.
- Slightly cleans up the setup.py workflow.
Differential Revision: D87884695
1 parent 8e64e9b commit e4c269c

File tree

14 files changed

+149
-75
lines changed

14 files changed

+149
-75
lines changed

.github/workflows/build-cpu.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,4 @@ jobs:
3434
pip install -r build-requirements.txt
3535
3636
# Build monarch (No tensor engine, CPU version)
37-
USE_TENSOR_ENGINE=0 python setup.py bdist_wheel
37+
MONARCH_FEATURES=core python setup.py bdist_wheel -v

.github/workflows/build-cuda.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,5 @@ jobs:
4343
4444
export CUDA_LIB_DIR=/usr/lib64
4545
46-
# Build monarch (CUDA version)
47-
python setup.py bdist_wheel
46+
# Build monarch (CUDA version with all features)
47+
MONARCH_FEATURES=full python setup.py bdist_wheel -v

.github/workflows/doc_build.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ jobs:
4343
4444
# Set environment variables for CUDA build
4545
export USE_CUDA=1
46-
export USE_TENSOR_ENGINE=1
4746
export RUSTFLAGS="-Zthreads=16 ${RUSTFLAGS:-}"
4847
export _GLIBCXX_USE_CXX11_ABI=1
4948
export CUDA_LIB_DIR=/usr/lib64

.github/workflows/publish_release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ jobs:
5050
export MONARCH_PACKAGE_NAME="torchmonarch"
5151
export CUDA_LIB_DIR=/usr/lib64
5252
export MONARCH_VERSION="${{ github.event.inputs.version }}"
53-
python setup.py bdist_wheel
53+
python setup.py bdist_wheel -v
5454
5555
# hacky until the right distribution wheel can be made...
5656
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;

.github/workflows/test-cpu-python.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,6 @@ jobs:
2828
# Setup test environment
2929
setup_conda_environment
3030
31-
# Disable tensor engine
32-
export USE_TENSOR_ENGINE=0
33-
3431
# Install PyTorch nightly
3532
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
3633

.github/workflows/wheels.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
export MONARCH_VERSION=$(date +'%Y.%m.%d')
5252
export CUDA_LIB_DIR=/usr/lib64
5353
54-
python setup.py bdist_wheel
54+
MONARCH_FEATURES=full python setup.py bdist_wheel -v
5555
5656
# hacky until the right distribution wheel can be made...
5757
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ python/monarch.egg-info/*
99
*.egg
1010
build/*
1111
dist/*
12-
monarch.egg-info/*
12+
torchmonarch.egg-info/*
1313

1414
.ipynb_checkpoints
1515
.monarch

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ sudo dnf install clang-devel libnccl-devel
9898
conda install -c conda-forge clangdev nccl
9999
conda update -n monarchenv --all -c conda-forge -y
100100

101-
# If you are building with RDMA support, build monarch with `USE_TENSOR_ENGINE=1 pip install --no-build-isolation .` and dnf install the following packages
101+
# If you are building with RDMA support, build monarch with `MONARCH_FEATURES=rdma pip install --no-build-isolation .` and dnf install the following packages
102102
sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
103103

104104
# Install build dependencies
@@ -147,12 +147,12 @@ pip install -r build-requirements.txt
147147
# Install test dependencies
148148
pip install -r python/tests/requirements.txt
149149

150-
# Build and install Monarch (with tensor engine support)
150+
# Build and install Monarch (with all features)
151151
pip install --no-build-isolation .
152152

153153
# or
154154
# Build and install Monarch (without tensor engine support)
155-
USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
155+
MONARCH_FEATURES=core pip install --no-build-isolation .
156156

157157
# or setup for development
158158
pip install --no-build-isolation -e .
@@ -185,10 +185,10 @@ pip install -r build-requirements.txt
185185
# Install test dependencies
186186
pip install -r python/tests/requirements.txt
187187

188-
# Build and install Monarch
189-
USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
188+
# Build and install Monarch (core only, no RDMA or tensor engine)
189+
MONARCH_FEATURES=core pip install --no-build-isolation .
190190
# or setup for development
191-
USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e .
191+
MONARCH_FEATURES=core pip install --no-build-isolation -e .
192192

193193
# Verify installation
194194
pip list | grep monarch

monarch_extension/Cargo.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,8 @@ torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda", optional = tru
4040
tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
4141

4242
[features]
43-
default = ["tensor_engine"]
44-
tensor_engine = ["dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys", "dep:torch-sys-cuda"]
43+
core = []
44+
default = ["full"]
45+
full = ["rdma", "tensor_engine"]
46+
rdma = ["dep:monarch_rdma_extension", "dep:rdmaxcel-sys"]
47+
tensor_engine = ["dep:monarch_messages", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:torch-sys", "dep:torch-sys-cuda"]

monarch_extension/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ pub fn mod_init(module: &Bound<'_, PyModule>) -> PyResult<()> {
122122
module,
123123
"monarch_extension.mesh_controller",
124124
)?)?;
125+
}
126+
127+
#[cfg(feature = "rdma")]
128+
{
125129
monarch_rdma_extension::register_python_bindings(&get_or_add_new_module(module, "rdma")?)?;
126130
}
127131
simulation_tools::register_python_bindings(&get_or_add_new_module(

0 commit comments

Comments
 (0)