Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/build-cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,5 @@ jobs:
# Setup Tensor Engine
setup_tensor_engine

export CUDA_LIB_DIR=/usr/lib64

# Build monarch (CUDA version)
python setup.py bdist_wheel
1 change: 0 additions & 1 deletion .github/workflows/doc_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ jobs:
export USE_TENSOR_ENGINE=1
export RUSTFLAGS="-Zthreads=16 ${RUSTFLAGS:-}"
export _GLIBCXX_USE_CXX11_ABI=1
export CUDA_LIB_DIR=/usr/lib64

# Build Monarch completely for documentation - use dedicated script
./scripts/build_monarch_for_docs.sh
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/publish_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ jobs:

# Build wheel
export MONARCH_PACKAGE_NAME="torchmonarch"
export CUDA_LIB_DIR=/usr/lib64
export MONARCH_VERSION="${{ github.event.inputs.version }}"
python setup.py bdist_wheel

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ jobs:
# Build wheel
export MONARCH_PACKAGE_NAME="torchmonarch-nightly"
export MONARCH_VERSION=$(date +'%Y.%m.%d')
export CUDA_LIB_DIR=/usr/lib64

python setup.py bdist_wheel

Expand Down
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,3 @@ docs/_build/**
docs/build/**
docs/**/generated/**
*/sg_execution_times.rst
nccl/**
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ members = [
"monarch_rdma",
"monarch_tensor_worker",
"monarch_types",
"monarch_cpp_static_libs",
"nccl-sys",
"ndslice",
"preempt_rwlock",
Expand Down
1 change: 1 addition & 0 deletions build_utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ test = false
doctest = false

[dependencies]
cc = "1.2.10"
glob = "0.3.2"
which = "4.2.4"
170 changes: 148 additions & 22 deletions build_utils/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ print('PYTHON_LIB_DIR:', sysconfig.get_config_var('LIBDIR'))
";

/// Configuration structure for CUDA environment
#[derive(Debug, Clone, Default)]
#[derive(Debug, Clone)]
pub struct CudaConfig {
pub cuda_home: Option<PathBuf>,
pub cuda_home: PathBuf,
pub include_dirs: Vec<PathBuf>,
pub lib_dirs: Vec<PathBuf>,
}
Expand Down Expand Up @@ -142,11 +142,10 @@ pub fn find_cuda_home() -> Option<String> {

/// Discover CUDA configuration including home, include dirs, and lib dirs
pub fn discover_cuda_config() -> Result<CudaConfig, BuildError> {
let cuda_home = find_cuda_home().ok_or(BuildError::CudaNotFound)?;
let cuda_home_path = PathBuf::from(&cuda_home);
let cuda_home_path = PathBuf::from(find_cuda_home().ok_or(BuildError::CudaNotFound)?);

let mut config = CudaConfig {
cuda_home: Some(cuda_home_path.clone()),
cuda_home: cuda_home_path.clone(),
include_dirs: Vec::new(),
lib_dirs: Vec::new(),
};
Expand Down Expand Up @@ -176,11 +175,10 @@ pub fn discover_cuda_config() -> Result<CudaConfig, BuildError> {
/// Validate CUDA installation exists and is complete
pub fn validate_cuda_installation() -> Result<String, BuildError> {
let cuda_config = discover_cuda_config()?;
let cuda_home = cuda_config.cuda_home.ok_or(BuildError::CudaNotFound)?;
let cuda_home_str = cuda_home.to_string_lossy().to_string();
let cuda_home_str = cuda_config.cuda_home.to_string_lossy().to_string();

// Verify CUDA include directory exists
let cuda_include_path = cuda_home.join("include");
let cuda_include_path = cuda_config.cuda_home.join("include");
if !cuda_include_path.exists() {
return Err(BuildError::PathNotFound(format!(
"CUDA include directory at {}",
Expand All @@ -192,27 +190,61 @@ pub fn validate_cuda_installation() -> Result<String, BuildError> {
}

/// Get CUDA library directory
pub fn get_cuda_lib_dir() -> Result<String, BuildError> {
// Check if user explicitly set CUDA_LIB_DIR
///
/// Searches for the directory containing libcudart_static.a in the CUDA installation.
/// Panics with a helpful error message if not found.
pub fn get_cuda_lib_dir() -> String {
// Check if user explicitly set CUDA_LIB_DIR and verify it contains the library
if let Ok(cuda_lib_dir) = env::var("CUDA_LIB_DIR") {
return Ok(cuda_lib_dir);
let lib_path = PathBuf::from(&cuda_lib_dir);
let cudart_static = lib_path.join("libcudart_static.a");
if !cudart_static.exists() {
panic!(
"CUDA_LIB_DIR is set to '{}' but libcudart_static.a not found at {}",
cuda_lib_dir,
cudart_static.display()
);
}
return cuda_lib_dir;
}

// Try to deduce from CUDA configuration
let cuda_config = discover_cuda_config()?;
if let Some(cuda_home) = cuda_config.cuda_home {
// Check both old-style and new-style CUDA library paths
for lib_subdir in &["lib64", "lib", "targets/x86_64-linux/lib"] {
let lib_path = cuda_home.join(lib_subdir);
if lib_path.exists() {
return Ok(lib_path.to_string_lossy().to_string());
}
let cuda_config = match discover_cuda_config() {
Ok(config) => config,
Err(_) => {
eprintln!("Error: CUDA installation not found!");
eprintln!("Please ensure CUDA is installed and one of the following is true:");
eprintln!(
" 1. Set CUDA_HOME environment variable to your CUDA installation directory"
);
eprintln!(
" 2. Set CUDA_PATH environment variable to your CUDA installation directory"
);
eprintln!(" 3. Ensure 'nvcc' is in your PATH");
eprintln!(" 4. Install CUDA to the default location (/usr/local/cuda on Linux)");
eprintln!();
eprintln!("Example: export CUDA_HOME=/usr/local/cuda-12.0");
panic!("CUDA installation not found");
}
};

let cuda_home = &cuda_config.cuda_home;
// Check both old-style and new-style CUDA library paths
// Look for the actual cudart_static library file to ensure we find the right directory
let libs = &["lib64", "lib", "targets/x86_64-linux/lib"];
for lib_subdir in libs {
let lib_path = cuda_home.join(lib_subdir);
let cudart_static = lib_path.join("libcudart_static.a");
if cudart_static.exists() {
return lib_path.to_string_lossy().to_string();
}
}

Err(BuildError::PathNotFound(
"CUDA library directory".to_string(),
))
panic!(
"CUDA library directories {:#?} under {} do not contain libcudart_static.a",
libs,
cuda_home.display()
);
}

/// Discover Python environment directories using sysconfig
Expand Down Expand Up @@ -276,6 +308,100 @@ pub fn print_cuda_lib_error_help() {
eprintln!("Or: export CUDA_LIB_DIR=/usr/lib64");
}

/// Emit cargo directives to statically link libstdc++.
///
/// Runs the C++ compiler selected by the `cc` crate (the same compiler that
/// `cc::Build` and `cxx_build` would use) with `-print-file-name=libstdc++.a`
/// to locate the static archive, adds the directory containing it as a native
/// link search path, and then requests a static link of `stdc++`. Linking
/// statically avoids a runtime dependency on the system `libstdc++.so`, which
/// can cause GLIBCXX version conflicts.
///
/// Locating the archive is best-effort: if the compiler cannot be run, exits
/// unsuccessfully, prints non-UTF-8 output, or does not report a directory,
/// no search path is emitted and only the `static=stdc++` directive is
/// printed.
pub fn link_libstdcpp_static() {
    // Use cc crate to get the C++ compiler, same as cc::Build and cxx_build use
    let compiler = cc::Build::new().cpp(true).get_compiler();
    let gcc_lib_dir = std::process::Command::new(compiler.path())
        .args(["-print-file-name=libstdc++.a"])
        .output()
        .ok()
        .filter(|output| output.status.success())
        .and_then(|output| String::from_utf8(output.stdout).ok())
        .and_then(|stdout| {
            let archive = PathBuf::from(stdout.trim());
            // GCC-style drivers echo the bare file name back when they cannot
            // find the archive. The parent of a bare name is the empty path,
            // which would produce a meaningless `native=` search directive,
            // so treat that case as "not found".
            archive
                .parent()
                .filter(|dir| !dir.as_os_str().is_empty())
                .map(|dir| dir.to_path_buf())
        });
    if let Some(gcc_lib_dir) = gcc_lib_dir {
        println!("cargo:rustc-link-search=native={}", gcc_lib_dir.display());
    }
    println!("cargo:rustc-link-lib=static=stdc++");
}

/// Configuration for rdma-core static libraries from monarch_cpp_static_libs.
///
/// Use `CppStaticLibsConfig::from_env()` to get the paths, then use the include
/// path for bindgen/cc, and call `emit_link_directives()` to link.
///
/// All three fields hold the raw string values of the corresponding
/// `DEP_MONARCH_CPP_STATIC_LIBS_*` environment variables.
#[derive(Debug, Clone)]
pub struct CppStaticLibsConfig {
    /// Value of `DEP_MONARCH_CPP_STATIC_LIBS_RDMA_INCLUDE`.
    pub rdma_include: String,
    /// Value of `DEP_MONARCH_CPP_STATIC_LIBS_RDMA_LIB_DIR`; emitted as a link search path.
    pub rdma_lib_dir: String,
    /// Value of `DEP_MONARCH_CPP_STATIC_LIBS_RDMA_UTIL_DIR`; emitted as a link search path.
    pub rdma_util_dir: String,
}

impl CppStaticLibsConfig {
    /// Load configuration from DEP_* environment variables set by monarch_cpp_static_libs.
    ///
    /// The monarch_cpp_static_libs crate must be listed as a build-dependency.
    ///
    /// # Panics
    ///
    /// Panics if any of the three `DEP_MONARCH_CPP_STATIC_LIBS_*` variables is
    /// missing or not valid Unicode.
    pub fn from_env() -> Self {
        Self {
            rdma_include: Self::required_dep_var("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_INCLUDE"),
            rdma_lib_dir: Self::required_dep_var("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_LIB_DIR"),
            rdma_util_dir: Self::required_dep_var("DEP_MONARCH_CPP_STATIC_LIBS_RDMA_UTIL_DIR"),
        }
    }

    /// Read one required environment variable, panicking with a hint about the
    /// missing build-dependency when it cannot be read.
    fn required_dep_var(name: &str) -> String {
        std::env::var(name).unwrap_or_else(|err| {
            panic!(
                "{} not set - add monarch_cpp_static_libs as build-dependency: {:?}",
                name, err
            )
        })
    }

    /// Emit all cargo link directives for static linking of rdma-core.
    ///
    /// This emits search paths and link-lib directives for:
    /// - libmlx5.a
    /// - libibverbs.a
    /// - librdma_util.a
    pub fn emit_link_directives(&self) {
        // Emit link search paths
        println!("cargo::rustc-link-search=native={}", self.rdma_lib_dir);
        println!("cargo::rustc-link-search=native={}", self.rdma_util_dir);

        // Use whole-archive for rdma-core static libraries
        // NOTE(review): the position of `rustc-link-arg` flags relative to
        // `rustc-link-lib` entries on the final linker command line is not
        // established here - confirm these markers really bracket mlx5 and
        // ibverbs. The per-library `static:+whole-archive=NAME` modifier is
        // the documented alternative if they do not.
        println!("cargo::rustc-link-arg=-Wl,--whole-archive");
        println!("cargo::rustc-link-lib=static=mlx5");
        println!("cargo::rustc-link-lib=static=ibverbs");
        println!("cargo::rustc-link-arg=-Wl,--no-whole-archive");

        // rdma_util helper library
        println!("cargo::rustc-link-lib=static=rdma_util");
    }
}

/// One-call setup for statically linking rdma-core.
///
/// Loads the [`CppStaticLibsConfig`] via `CppStaticLibsConfig::from_env()`,
/// emits its link directives, and hands the config back so the caller can use
/// the include path. The monarch_cpp_static_libs crate must be listed as a
/// build-dependency.
///
/// Example:
/// ```ignore
/// let config = build_utils::setup_cpp_static_libs();
/// // Use config.rdma_include for bindgen/cc
/// ```
pub fn setup_cpp_static_libs() -> CppStaticLibsConfig {
    let static_libs = CppStaticLibsConfig::from_env();
    // Link directives are printed before the config is returned to the caller.
    CppStaticLibsConfig::emit_link_directives(&static_libs);
    static_libs
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
17 changes: 17 additions & 0 deletions monarch_cpp_static_libs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# @generated by autocargo from //monarch/monarch_cpp_static_libs:monarch_cpp_static_libs

[package]
name = "monarch_cpp_static_libs"
version = "0.0.0"
authors = ["Meta"]
edition = "2021"
license = "BSD-3-Clause"
build = "build.rs"
links = "monarch_cpp_static_libs"

[lib]
test = false
doctest = false

[build-dependencies]
build_utils = { path = "../build_utils" }
Loading