diff --git a/.github/workflows/build-cuda.yml b/.github/workflows/build-cuda.yml index 088599020..96fd34e8f 100644 --- a/.github/workflows/build-cuda.yml +++ b/.github/workflows/build-cuda.yml @@ -41,7 +41,5 @@ jobs: # Setup Tensor Engine setup_tensor_engine - export CUDA_LIB_DIR=/usr/lib64 - # Build monarch (CUDA version) python setup.py bdist_wheel diff --git a/.github/workflows/doc_build.yml b/.github/workflows/doc_build.yml index f0bb0ad8d..d536ad1a0 100644 --- a/.github/workflows/doc_build.yml +++ b/.github/workflows/doc_build.yml @@ -46,7 +46,6 @@ jobs: export USE_TENSOR_ENGINE=1 export RUSTFLAGS="-Zthreads=16 ${RUSTFLAGS:-}" export _GLIBCXX_USE_CXX11_ABI=1 - export CUDA_LIB_DIR=/usr/lib64 # Build Monarch completely for documentation - use dedicated script ./scripts/build_monarch_for_docs.sh diff --git a/.github/workflows/publish_release.yml b/.github/workflows/publish_release.yml index 269ae1aee..03662d1b7 100644 --- a/.github/workflows/publish_release.yml +++ b/.github/workflows/publish_release.yml @@ -48,7 +48,6 @@ jobs: # Build wheel export MONARCH_PACKAGE_NAME="torchmonarch" - export CUDA_LIB_DIR=/usr/lib64 export MONARCH_VERSION="${{ github.event.inputs.version }}" python setup.py bdist_wheel diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ebf1c89a5..495f371f6 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -49,7 +49,6 @@ jobs: # Build wheel export MONARCH_PACKAGE_NAME="torchmonarch-nightly" export MONARCH_VERSION=$(date +'%Y.%m.%d') - export CUDA_LIB_DIR=/usr/lib64 python setup.py bdist_wheel diff --git a/.gitignore b/.gitignore index cd326bbdf..d87c0a063 100644 --- a/.gitignore +++ b/.gitignore @@ -33,4 +33,3 @@ docs/_build/** docs/build/** docs/**/generated/** */sg_execution_times.rst -nccl/** diff --git a/Cargo.toml b/Cargo.toml index 54b409826..3fcb6971e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ members = [ "monarch_rdma", "monarch_tensor_worker", "monarch_types", + 
"monarch_cpp_static_libs", "nccl-sys", "ndslice", "preempt_rwlock", diff --git a/build_utils/Cargo.toml b/build_utils/Cargo.toml index 6eda2534e..8d97eaa46 100644 --- a/build_utils/Cargo.toml +++ b/build_utils/Cargo.toml @@ -12,5 +12,6 @@ test = false doctest = false [dependencies] +cc = "1.2.10" glob = "0.3.2" which = "4.2.4" diff --git a/build_utils/src/lib.rs b/build_utils/src/lib.rs index e210ad04b..3b558052c 100644 --- a/build_utils/src/lib.rs +++ b/build_utils/src/lib.rs @@ -57,9 +57,9 @@ print('PYTHON_LIB_DIR:', sysconfig.get_config_var('LIBDIR')) "; /// Configuration structure for CUDA environment -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub struct CudaConfig { - pub cuda_home: Option, + pub cuda_home: PathBuf, pub include_dirs: Vec, pub lib_dirs: Vec, } @@ -142,11 +142,10 @@ pub fn find_cuda_home() -> Option { /// Discover CUDA configuration including home, include dirs, and lib dirs pub fn discover_cuda_config() -> Result { - let cuda_home = find_cuda_home().ok_or(BuildError::CudaNotFound)?; - let cuda_home_path = PathBuf::from(&cuda_home); + let cuda_home_path = PathBuf::from(find_cuda_home().ok_or(BuildError::CudaNotFound)?); let mut config = CudaConfig { - cuda_home: Some(cuda_home_path.clone()), + cuda_home: cuda_home_path.clone(), include_dirs: Vec::new(), lib_dirs: Vec::new(), }; @@ -176,11 +175,10 @@ pub fn discover_cuda_config() -> Result { /// Validate CUDA installation exists and is complete pub fn validate_cuda_installation() -> Result { let cuda_config = discover_cuda_config()?; - let cuda_home = cuda_config.cuda_home.ok_or(BuildError::CudaNotFound)?; - let cuda_home_str = cuda_home.to_string_lossy().to_string(); + let cuda_home_str = cuda_config.cuda_home.to_string_lossy().to_string(); // Verify CUDA include directory exists - let cuda_include_path = cuda_home.join("include"); + let cuda_include_path = cuda_config.cuda_home.join("include"); if !cuda_include_path.exists() { return Err(BuildError::PathNotFound(format!( "CUDA 
include directory at {}", @@ -192,27 +190,61 @@ pub fn validate_cuda_installation() -> Result { } /// Get CUDA library directory -pub fn get_cuda_lib_dir() -> Result { - // Check if user explicitly set CUDA_LIB_DIR +/// +/// Searches for the directory containing libcudart_static.a in the CUDA installation. +/// Panics with a helpful error message if not found. +pub fn get_cuda_lib_dir() -> String { + // Check if user explicitly set CUDA_LIB_DIR and verify it contains the library if let Ok(cuda_lib_dir) = env::var("CUDA_LIB_DIR") { - return Ok(cuda_lib_dir); + let lib_path = PathBuf::from(&cuda_lib_dir); + let cudart_static = lib_path.join("libcudart_static.a"); + if !cudart_static.exists() { + panic!( + "CUDA_LIB_DIR is set to '{}' but libcudart_static.a not found at {}", + cuda_lib_dir, + cudart_static.display() + ); + } + return cuda_lib_dir; } // Try to deduce from CUDA configuration - let cuda_config = discover_cuda_config()?; - if let Some(cuda_home) = cuda_config.cuda_home { - // Check both old-style and new-style CUDA library paths - for lib_subdir in &["lib64", "lib", "targets/x86_64-linux/lib"] { - let lib_path = cuda_home.join(lib_subdir); - if lib_path.exists() { - return Ok(lib_path.to_string_lossy().to_string()); - } + let cuda_config = match discover_cuda_config() { + Ok(config) => config, + Err(_) => { + eprintln!("Error: CUDA installation not found!"); + eprintln!("Please ensure CUDA is installed and one of the following is true:"); + eprintln!( + " 1. Set CUDA_HOME environment variable to your CUDA installation directory" + ); + eprintln!( + " 2. Set CUDA_PATH environment variable to your CUDA installation directory" + ); + eprintln!(" 3. Ensure 'nvcc' is in your PATH"); + eprintln!(" 4. 
Install CUDA to the default location (/usr/local/cuda on Linux)"); + eprintln!(); + eprintln!("Example: export CUDA_HOME=/usr/local/cuda-12.0"); + panic!("CUDA installation not found"); + } + }; + + let cuda_home = &cuda_config.cuda_home; + // Check both old-style and new-style CUDA library paths + // Look for the actual cudart_static library file to ensure we find the right directory + let libs = &["lib64", "lib", "targets/x86_64-linux/lib"]; + for lib_subdir in libs { + let lib_path = cuda_home.join(lib_subdir); + let cudart_static = lib_path.join("libcudart_static.a"); + if cudart_static.exists() { + return lib_path.to_string_lossy().to_string(); } } - Err(BuildError::PathNotFound( - "CUDA library directory".to_string(), - )) + panic!( + "CUDA library directories {:#?} under {} do not contain libcudart_static.a", + libs, + cuda_home.display() + ); } /// Discover Python environment directories using sysconfig @@ -276,6 +308,100 @@ pub fn print_cuda_lib_error_help() { eprintln!("Or: export CUDA_LIB_DIR=/usr/lib64"); } +/// Emit cargo directives to statically link libstdc++ +/// +/// This finds the GCC library path containing libstdc++.a and emits the +/// appropriate cargo directives to link it statically. This avoids runtime +/// dependency on system libstdc++.so which can cause GLIBCXX version conflicts. +/// +/// Uses the `cc` crate to detect the C++ compiler, ensuring we use the same +/// compiler that `cc::Build` and `cxx_build` would use. 
+pub fn link_libstdcpp_static() { + // Use cc crate to get the C++ compiler, same as cc::Build and cxx_build use + let compiler = cc::Build::new().cpp(true).get_compiler(); + let gcc_lib_path = std::process::Command::new(compiler.path()) + .args(["-print-file-name=libstdc++.a"]) + .output() + .ok() + .and_then(|output| { + if output.status.success() { + String::from_utf8(output.stdout).ok().and_then(|s| { + let path = PathBuf::from(s.trim()); + path.parent().map(|p| p.to_path_buf()) + }) + } else { + None + } + }); + if let Some(gcc_lib_path) = gcc_lib_path { + println!("cargo:rustc-link-search=native={}", gcc_lib_path.display()); + } + println!("cargo:rustc-link-lib=static=stdc++"); +} + +/// Configuration for rdma-core static libraries from monarch_cpp_static_libs. +/// +/// Use `CppStaticLibsConfig::from_env()` to get the paths, then use the include +/// paths for bindgen/cc, and call `emit_link_directives()` to link. +pub struct CppStaticLibsConfig { + pub rdma_include: String, + pub rdma_lib_dir: String, + pub rdma_util_dir: String, +} + +impl CppStaticLibsConfig { + /// Load configuration from DEP_* environment variables set by monarch_cpp_static_libs. + /// + /// The monarch_cpp_static_libs crate must be listed as a build-dependency. + pub fn from_env() -> Self { + Self { + rdma_include: std::env::var("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_INCLUDE") + .expect("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_INCLUDE not set - add monarch_cpp_static_libs as build-dependency"), + rdma_lib_dir: std::env::var("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_LIB_DIR") + .expect("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_LIB_DIR not set - add monarch_cpp_static_libs as build-dependency"), + rdma_util_dir: std::env::var("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_UTIL_DIR") + .expect("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_UTIL_DIR not set - add monarch_cpp_static_libs as build-dependency"), + } + } + + /// Emit all cargo link directives for static linking of rdma-core. 
+ /// + /// This emits search paths and link-lib directives for: + /// - libmlx5.a + /// - libibverbs.a + /// - librdma_util.a + pub fn emit_link_directives(&self) { + // Emit link search paths + println!("cargo::rustc-link-search=native={}", self.rdma_lib_dir); + println!("cargo::rustc-link-search=native={}", self.rdma_util_dir); + + // Use whole-archive for rdma-core static libraries + println!("cargo::rustc-link-arg=-Wl,--whole-archive"); + println!("cargo::rustc-link-lib=static=mlx5"); + println!("cargo::rustc-link-lib=static=ibverbs"); + println!("cargo::rustc-link-arg=-Wl,--no-whole-archive"); + + // rdma_util helper library + println!("cargo::rustc-link-lib=static=rdma_util"); + } +} + +/// Convenience function to set up rdma-core static linking. +/// +/// Returns the config with include paths, and emits all link directives. +/// The monarch_cpp_static_libs crate must be listed as a build-dependency. +/// +/// Example: +/// ```ignore +/// let config = build_utils::setup_cpp_static_libs(); +/// // Use config.rdma_include for bindgen/cc +/// ``` +pub fn setup_cpp_static_libs() -> CppStaticLibsConfig { + let config = CppStaticLibsConfig::from_env(); + config.emit_link_directives(); + config +} + #[cfg(test)] mod tests { use super::*; diff --git a/monarch_cpp_static_libs/Cargo.toml b/monarch_cpp_static_libs/Cargo.toml new file mode 100644 index 000000000..963a732c9 --- /dev/null +++ b/monarch_cpp_static_libs/Cargo.toml @@ -0,0 +1,17 @@ +# @generated by autocargo from //monarch/monarch_cpp_static_libs:monarch_cpp_static_libs + +[package] +name = "monarch_cpp_static_libs" +version = "0.0.0" +authors = ["Meta"] +edition = "2021" +license = "BSD-3-Clause" +build = "build.rs" +links = "monarch_cpp_static_libs" + +[lib] +test = false +doctest = false + +[build-dependencies] +build_utils = { path = "../build_utils" } diff --git a/monarch_cpp_static_libs/build.rs b/monarch_cpp_static_libs/build.rs new file mode 100644 index 000000000..fdce92f75 --- /dev/null +++ 
b/monarch_cpp_static_libs/build.rs @@ -0,0 +1,277 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Static rdma-core build script +//! +//! This build script: +//! 1. Obtains rdma-core source (from MONARCH_RDMA_CORE_SRC or by cloning) +//! 2. Builds rdma-core with static libraries (libibverbs.a, libmlx5.a) +//! 3. Emits link directives for downstream crates + +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +// Repository configuration +const RDMA_CORE_REPO: &str = "https://github.com/linux-rdma/rdma-core"; +const RDMA_CORE_TAG: &str = "224154663a9ad5b1ad5629fb76a0c40c675fb936"; + +#[cfg(not(target_os = "linux"))] +fn main() {} + +#[cfg(target_os = "linux")] +fn main() { + let out_dir = PathBuf::from(std::env::var("OUT_DIR").expect("OUT_DIR not set")); + let vendor_dir = out_dir.join("vendor"); + std::fs::create_dir_all(&vendor_dir).expect("Failed to create vendor directory"); + + let rdma_core_dir = vendor_dir.join("rdma-core"); + + // Get or clone rdma-core source + get_or_clone_rdma_core(&rdma_core_dir); + + // Build rdma-core + let rdma_build_dir = build_rdma_core(&rdma_core_dir); + + // Emit link directives + emit_link_directives(&rdma_build_dir); +} + +/// Get or clone rdma-core source. +/// +/// If MONARCH_RDMA_CORE_SRC is set, copies from that directory. +/// Otherwise, clones from GitHub at the specified tag. 
+fn get_or_clone_rdma_core(target_dir: &Path) { + // Skip if already exists + if target_dir.exists() { + println!( + "cargo:warning=rdma-core source already exists at {}", + target_dir.display() + ); + return; + } + + // Check for MONARCH_RDMA_CORE_SRC environment variable + println!("cargo:rerun-if-env-changed=MONARCH_RDMA_CORE_SRC"); + if let Ok(src_path) = std::env::var("MONARCH_RDMA_CORE_SRC") { + let src_dir = PathBuf::from(src_path); + println!( + "cargo:warning=Using rdma-core source from MONARCH_RDMA_CORE_SRC: {}", + src_dir.display() + ); + copy_dir(&src_dir, target_dir); + } else { + println!( + "cargo:warning=MONARCH_RDMA_CORE_SRC not set, cloning from {} (commit {})", + RDMA_CORE_REPO, RDMA_CORE_TAG + ); + clone_rdma_core(target_dir); + } +} + +/// Clone rdma-core from GitHub at the specified commit. +fn clone_rdma_core(target_dir: &Path) { + // First, clone the repository without checking out + let status = Command::new("git") + .args([ + "clone", + "--no-checkout", + RDMA_CORE_REPO, + target_dir.to_str().unwrap(), + ]) + .status() + .expect("Failed to execute git clone"); + + if !status.success() { + panic!("Failed to clone rdma-core from {}", RDMA_CORE_REPO); + } + + // Then checkout the specific commit + let status = Command::new("git") + .args(["checkout", RDMA_CORE_TAG]) + .current_dir(target_dir) + .status() + .expect("Failed to execute git checkout"); + + if !status.success() { + panic!("Failed to checkout rdma-core commit {}", RDMA_CORE_TAG); + } + + println!( + "cargo:warning=Successfully cloned rdma-core at commit {}", + RDMA_CORE_TAG + ); +} + +fn copy_dir(src_dir: &Path, target_dir: &Path) { + if target_dir.exists() { + println!( + "cargo:warning=Directory already exists at {}", + target_dir.display() + ); + return; + } + + println!( + "cargo:warning=Copying {} to {}", + src_dir.display(), + target_dir.display() + ); + + let status = Command::new("cp") + .args([ + "-r", + src_dir.to_str().unwrap(), + target_dir.to_str().unwrap(), + ]) + 
.status() + .expect("Failed to execute cp"); + + if !status.success() { + panic!( + "Failed to copy from {} to {}", + src_dir.display(), + target_dir.display() + ); + } +} + +fn build_rdma_core(rdma_core_dir: &Path) -> PathBuf { + let build_dir = rdma_core_dir.join("build"); + + // Check if already built + if build_dir.join("lib/statics/libibverbs.a").exists() { + println!("cargo:warning=rdma-core already built"); + return build_dir; + } + + std::fs::create_dir_all(&build_dir).expect("Failed to create rdma-core build directory"); + + println!("cargo:warning=Building rdma-core..."); + + // Detect cmake command + let cmake = if Command::new("cmake3").arg("--version").status().is_ok() { + "cmake3" + } else { + "cmake" + }; + + // Detect ninja + let use_ninja = Command::new("ninja-build") + .arg("--version") + .status() + .is_ok() + || Command::new("ninja").arg("--version").status().is_ok(); + + let ninja_cmd = if Command::new("ninja-build") + .arg("--version") + .status() + .is_ok() + { + "ninja-build" + } else { + "ninja" + }; + + // CMake configuration + // IMPORTANT: -DCMAKE_POSITION_INDEPENDENT_CODE=ON is required for static libs + // that will be linked into a shared object (.so) + let mut cmake_args = vec![ + "-DIN_PLACE=1", + "-DENABLE_STATIC=1", + "-DENABLE_RESOLVE_NEIGH=0", + "-DNO_PYVERBS=1", + "-DNO_MAN_PAGES=1", + "-DCMAKE_POSITION_INDEPENDENT_CODE=ON", + "-DCMAKE_C_FLAGS=-fPIC", + "-DCMAKE_CXX_FLAGS=-fPIC", + ]; + + if use_ninja { + cmake_args.push("-GNinja"); + } + + cmake_args.push(".."); + + let status = Command::new(cmake) + .current_dir(&build_dir) + .args(&cmake_args) + .status() + .expect("Failed to run cmake for rdma-core"); + + if !status.success() { + panic!("Failed to configure rdma-core with cmake"); + } + + // Build only the targets we need: libibverbs.a, libmlx5.a, and librdma_util.a + // We don't need librdmacm which has build issues with long paths + let targets = [ + "lib/statics/libibverbs.a", + "lib/statics/libmlx5.a", + 
"util/librdma_util.a", + ]; + + for target in &targets { + let status = if use_ninja { + Command::new(ninja_cmd) + .current_dir(&build_dir) + .arg(target) + .status() + .expect("Failed to run ninja for rdma-core") + } else { + let num_jobs = std::thread::available_parallelism() + .map(|p| p.get()) + .unwrap_or(4); + Command::new("make") + .current_dir(&build_dir) + .args(["-j", &num_jobs.to_string(), target]) + .status() + .expect("Failed to run make for rdma-core") + }; + + if !status.success() { + panic!("Failed to build rdma-core target: {}", target); + } + } + + println!("cargo:warning=rdma-core build complete"); + build_dir +} + +fn emit_link_directives(rdma_build_dir: &Path) { + let rdma_static_dir = rdma_build_dir.join("lib/statics"); + let rdma_util_dir = rdma_build_dir.join("util"); + + // Emit search paths + println!( + "cargo:rustc-link-search=native={}", + rdma_static_dir.display() + ); + println!("cargo:rustc-link-search=native={}", rdma_util_dir.display()); + + // Static libraries - use whole-archive for rdma-core static libraries + println!("cargo:rustc-link-arg=-Wl,--whole-archive"); + println!("cargo:rustc-link-lib=static=mlx5"); + println!("cargo:rustc-link-lib=static=ibverbs"); + println!("cargo:rustc-link-arg=-Wl,--no-whole-archive"); + + // rdma_util helper library + println!("cargo:rustc-link-lib=static=rdma_util"); + + // Export metadata for dependent crates + // Use cargo:: (double colon) format for proper DEP_MONARCH_CPP_STATIC_LIBS_* env vars + println!( + "cargo::metadata=RDMA_INCLUDE={}", + rdma_build_dir.join("include").display() + ); + println!("cargo::metadata=RDMA_LIB_DIR={}", rdma_static_dir.display()); + println!("cargo::metadata=RDMA_UTIL_DIR={}", rdma_util_dir.display()); + + // Re-run if build scripts change + println!("cargo:rerun-if-changed=build.rs"); +} diff --git a/monarch_cpp_static_libs/src/lib.rs b/monarch_cpp_static_libs/src/lib.rs new file mode 100644 index 000000000..d07ffffd9 --- /dev/null +++ b/monarch_cpp_static_libs/src/lib.rs @@ -0,0
+1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Static C++ libraries for Monarch +//! +//! This crate builds rdma-core (libibverbs, libmlx5, rdma_util) from source +//! as static libraries. Depend on this crate to link against them statically, +//! eliminating runtime dependencies on libibverbs.so and libmlx5.so. +//! +//! This crate does not provide Rust bindings - use `rdmaxcel-sys` +//! for the bindings, and add this crate as a dependency to get static linking. diff --git a/monarch_extension/Cargo.toml b/monarch_extension/Cargo.toml index e362dd43b..26f90b908 100644 --- a/monarch_extension/Cargo.toml +++ b/monarch_extension/Cargo.toml @@ -23,6 +23,7 @@ hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } hyperactor_mesh_macros = { version = "0.0.0", path = "../hyperactor_mesh_macros" } hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } libc = "0.2.139" +monarch_cpp_static_libs = { path = "../monarch_cpp_static_libs", optional = true } monarch_hyperactor = { version = "0.0.0", path = "../monarch_hyperactor" } monarch_messages = { version = "0.0.0", path = "../monarch_messages", optional = true } monarch_rdma_extension = { version = "0.0.0", path = "../monarch_rdma/extension", optional = true } @@ -36,6 +37,10 @@ tokio = { version = "1.47.1", features = ["full", "test-util", "tracing"] } torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda", optional = true } tracing = { version = "0.1.41", features = ["attributes", "valuable"] } +[build-dependencies] +build_utils = { path = "../build_utils" } +monarch_cpp_static_libs = { path = "../monarch_cpp_static_libs", optional = true } + [features] default = ["tensor_engine"] -tensor_engine = ["dep:monarch_messages", "dep:monarch_rdma_extension",
"dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys-cuda"] +tensor_engine = ["dep:monarch_cpp_static_libs", "dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys-cuda"] diff --git a/monarch_extension/build.rs b/monarch_extension/build.rs index 432623496..37d6c8242 100644 --- a/monarch_extension/build.rs +++ b/monarch_extension/build.rs @@ -7,11 +7,10 @@ */ fn main() { - // Only set torch-related rpaths if tensor_engine feature is enabled - #[cfg(feature = "tensor_engine")] - { - if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") { - println!("cargo::rustc-link-arg=-Wl,-rpath,{path}"); - } + // Only set up static linking if tensor_engine feature is enabled + if std::env::var("CARGO_FEATURE_TENSOR_ENGINE").is_ok() { + // Set up static linking for rdma-core + // This emits link directives for libmlx5.a, libibverbs.a, librdma_util.a + let _config = build_utils::setup_cpp_static_libs(); } } diff --git a/nccl-sys/build.rs b/nccl-sys/build.rs index 9db4bb74d..b64bca399 100644 --- a/nccl-sys/build.rs +++ b/nccl-sys/build.rs @@ -117,8 +117,6 @@ fn main() { } if let Some(lib_dir) = &python_config.lib_dir { println!("cargo::rustc-link-search=native={}", lib_dir); - // Set cargo metadata to inform dependent binaries about how to set their - // RPATH (see controller/build.rs for an example). 
println!("cargo::metadata=LIB_PATH={}", lib_dir); } @@ -133,8 +131,16 @@ fn main() { .expect("Couldn't write bindings!"); // We no longer link against nccl directly since we dlopen it - // But we do link against CUDA runtime - println!("cargo::rustc-link-lib=cudart"); + // But we do link against CUDA runtime statically + // Add CUDA library search path first + let cuda_lib_dir = build_utils::get_cuda_lib_dir(); + println!("cargo::rustc-link-search=native={}", cuda_lib_dir); + + println!("cargo::rustc-link-lib=static=cudart_static"); + // cudart_static requires linking against librt, libpthread, and libdl + println!("cargo::rustc-link-lib=rt"); + println!("cargo::rustc-link-lib=pthread"); + println!("cargo::rustc-link-lib=dl"); println!("cargo::rustc-cfg=cargo"); println!("cargo::rustc-check-cfg=cfg(cargo)"); } diff --git a/rdmaxcel-sys/Cargo.toml b/rdmaxcel-sys/Cargo.toml index ee1d34f3b..dd448b145 100644 --- a/rdmaxcel-sys/Cargo.toml +++ b/rdmaxcel-sys/Cargo.toml @@ -9,8 +9,12 @@ links = "rdmaxcel" [dependencies] cxx = "1.0.119" serde = { version = "1.0.185", features = ["derive", "rc"] } +# Depend on monarch_cpp_static_libs to get statically linked rdma-core (libibverbs, libmlx5) +monarch_cpp_static_libs = { path = "../monarch_cpp_static_libs" } [build-dependencies] bindgen = "0.70.1" cc = "1.0" build_utils = { path = "../build_utils" } +# Need monarch_cpp_static_libs as build-dep to get include paths via DEP_* env vars +monarch_cpp_static_libs = { path = "../monarch_cpp_static_libs" } diff --git a/rdmaxcel-sys/build.rs b/rdmaxcel-sys/build.rs index 61ab2229d..725e45bc3 100644 --- a/rdmaxcel-sys/build.rs +++ b/rdmaxcel-sys/build.rs @@ -15,11 +15,9 @@ fn main() {} #[cfg(not(target_os = "macos"))] fn main() { - // Link against the ibverbs library - println!("cargo:rustc-link-lib=ibverbs"); - - // Link against the mlx5 library - println!("cargo:rustc-link-lib=mlx5"); + // Get rdma-core config from cpp_static_libs (includes are used, links emitted by 
monarch_extension) + let cpp_static_libs_config = build_utils::CppStaticLibsConfig::from_env(); + let rdma_include = &cpp_static_libs_config.rdma_include; // Link against dl for dynamic loading println!("cargo:rustc-link-lib=dl"); @@ -140,6 +138,9 @@ fn main() { println!("cargo:rustc-env=CUDA_INCLUDE_PATH={}", cuda_include_path); builder = builder.clang_arg(format!("-I{}", cuda_include_path)); + // Add rdma-core include path from monarch_cpp_static_libs + builder = builder.clang_arg(format!("-I{}", rdma_include)); + // Include headers and libs from the active environment. let python_config = match build_utils::python_env_dirs_with_interpreter("python3") { Ok(config) => config, @@ -161,17 +162,15 @@ fn main() { } // Get CUDA library directory and emit link directives - let cuda_lib_dir = match build_utils::get_cuda_lib_dir() { - Ok(dir) => dir, - Err(_) => { - build_utils::print_cuda_lib_error_help(); - std::process::exit(1); - } - }; + let cuda_lib_dir = build_utils::get_cuda_lib_dir(); println!("cargo:rustc-link-search=native={}", cuda_lib_dir); // Note: libcuda is now loaded dynamically via dlopen in driver_api.cpp - // Only link cudart (CUDA Runtime API) - println!("cargo:rustc-link-lib=cudart"); + // Link cudart statically (CUDA Runtime API) + println!("cargo:rustc-link-lib=static=cudart_static"); + // cudart_static requires linking against librt, libpthread, and libdl + println!("cargo:rustc-link-lib=rt"); + println!("cargo:rustc-link-lib=pthread"); + println!("cargo:rustc-link-lib=dl"); // Note: We no longer link against libtorch/c10 since segment scanning // is now done via a callback registered from the extension crate.
@@ -201,6 +200,7 @@ fn main() { build .file(&c_source_path) .include(format!("{}/src", manifest_dir)) + .include(&rdma_include) .flag("-fPIC"); // Add CUDA include paths - reuse the paths we already found for bindgen @@ -220,6 +220,7 @@ fn main() { .file(&cpp_source_path) .file(&driver_api_cpp_path) .include(format!("{}/src", manifest_dir)) + .include(&rdma_include) .flag("-fPIC") .cpp(true) .flag("-std=c++14"); @@ -233,6 +234,9 @@ fn main() { } cpp_build.compile("rdmaxcel_cpp"); + + // Statically link libstdc++ to avoid runtime dependency on system libstdc++ + build_utils::link_libstdcpp_static(); } else { if !Path::new(&cpp_source_path).exists() { panic!("C++ source file not found at {}", cpp_source_path); @@ -273,8 +277,7 @@ fn main() { "-fPIC", &format!("-I{}", cuda_include_path), &format!("-I{}/src", manifest_dir), - &format!("-I/usr/include"), - &format!("-I/usr/include/infiniband"), + &format!("-I{}", rdma_include), ]) .output(); diff --git a/torch-sys-cuda/build.rs b/torch-sys-cuda/build.rs index 5840af8f2..8013b1244 100644 --- a/torch-sys-cuda/build.rs +++ b/torch-sys-cuda/build.rs @@ -12,10 +12,6 @@ #![feature(exit_status_error)] -use std::path::PathBuf; - -use build_utils::find_cuda_home; - #[cfg(target_os = "macos")] fn main() {} @@ -38,14 +34,8 @@ fn main() { println!("cargo::rustc-link-lib={}", lib_name); } - let cuda_home = PathBuf::from(find_cuda_home().expect("CUDA installation not found")); - - // Configure CUDA-specific linking - println!("cargo::rustc-link-lib=cudart"); - println!( - "cargo::rustc-link-search=native={}/lib64", - cuda_home.display() - ); + // Statically link libstdc++ to avoid runtime dependency on system libstdc++ + build_utils::link_libstdcpp_static(); // Add Python library directory to rpath for runtime linking if let Some(python_lib_dir) = &python_lib_dir {