Skip to content

Commit c43db5e

Browse files
Add disk space guard, shared cache toggle, binary rename
- Add ZS_NO_SHARED_CACHE env var to skip the shared wheel cache
- Check available disk space before populating the shared cache (skip when <2GB is free)
- Add --no-deps to uv pip install for daemon-managed wheels
- Rename output binary to zerostart-linux-x86_64
- Add libc dep for statvfs disk space check
- Add DMTCP checkpoint experiment scripts (deferred feature)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f299a9d commit c43db5e

File tree

6 files changed

+415
-30
lines changed

6 files changed

+415
-30
lines changed

crates/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/zs-fast-wheel/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ hex = "0.4"
5757
toml = "0.8"
5858
pyo3 = { version = "0.28", features = ["extension-module"], optional = true }
5959

60+
[target.'cfg(unix)'.dependencies]
61+
libc = "0.2"
62+
6063
[features]
6164
default = []
6265
iouring = ["io-uring"]

crates/zs-fast-wheel/src/main.rs

Lines changed: 72 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,7 @@ fn uv_install(venv: &std::path::Path, specs: &[String]) -> Result<()> {
411411
let mut args = vec![
412412
"pip".to_string(),
413413
"install".to_string(),
414+
"--no-deps".to_string(),
414415
"--python".to_string(),
415416
python.display().to_string(),
416417
];
@@ -776,31 +777,38 @@ async fn main() -> Result<()> {
776777
};
777778

778779
if !plan.daemon_wheels.is_empty() {
780+
let no_shared_cache = std::env::var("ZS_NO_SHARED_CACHE").is_ok();
781+
779782
// Check shared cache in parallel — restore cached wheels via hardlinks
780-
let sp_for_cache = site_packages.clone();
781-
let wheels_for_cache = plan.daemon_wheels.clone();
782-
let cache_results: Vec<bool> = tokio::task::spawn_blocking(move || {
783-
use rayon::prelude::*;
784-
wheels_for_cache
785-
.par_iter()
786-
.map(|spec| restore_from_shared_cache(spec, &sp_for_cache))
787-
.collect()
788-
})
789-
.await?;
790-
791-
let mut uncached_wheels = Vec::new();
792-
let mut cached_count = 0u32;
793-
794-
for (spec, was_cached) in plan.daemon_wheels.iter().zip(cache_results.iter()) {
795-
if *was_cached {
796-
cached_count += 1;
797-
if verbose {
798-
eprintln!(" {} (shared cache hit)", spec.distribution);
783+
let (uncached_wheels, cached_count) = if no_shared_cache {
784+
(plan.daemon_wheels.clone(), 0u32)
785+
} else {
786+
let sp_for_cache = site_packages.clone();
787+
let wheels_for_cache = plan.daemon_wheels.clone();
788+
let cache_results: Vec<bool> = tokio::task::spawn_blocking(move || {
789+
use rayon::prelude::*;
790+
wheels_for_cache
791+
.par_iter()
792+
.map(|spec| restore_from_shared_cache(spec, &sp_for_cache))
793+
.collect()
794+
})
795+
.await?;
796+
797+
let mut uncached = Vec::new();
798+
let mut count = 0u32;
799+
800+
for (spec, was_cached) in plan.daemon_wheels.iter().zip(cache_results.iter()) {
801+
if *was_cached {
802+
count += 1;
803+
if verbose {
804+
eprintln!(" {} (shared cache hit)", spec.distribution);
805+
}
806+
} else {
807+
uncached.push(spec.clone());
799808
}
800-
} else {
801-
uncached_wheels.push(spec.clone());
802809
}
803-
}
810+
(uncached, count)
811+
};
804812

805813
if verbose {
806814
if cached_count > 0 {
@@ -843,13 +851,27 @@ async fn main() -> Result<()> {
843851
}
844852

845853
// Populate shared cache for newly extracted wheels
846-
let sp_for_cache = site_packages.clone();
847-
tokio::task::spawn_blocking(move || {
848-
for spec in &wheels_to_cache {
849-
populate_shared_cache(spec, &sp_for_cache);
850-
}
851-
})
852-
.await?;
854+
// Skip if ZS_NO_SHARED_CACHE=1 or disk is tight (<2GB free)
855+
if std::env::var("ZS_NO_SHARED_CACHE").is_ok() {
856+
tracing::info!("Shared cache disabled via ZS_NO_SHARED_CACHE");
857+
} else {
858+
let sp_for_cache = site_packages.clone();
859+
tokio::task::spawn_blocking(move || {
860+
if let Ok(avail) = available_disk_mb(&sp_for_cache) {
861+
if avail < 2048 {
862+
tracing::warn!(
863+
"Skipping shared cache — only {}MB free",
864+
avail
865+
);
866+
return;
867+
}
868+
}
869+
for spec in &wheels_to_cache {
870+
populate_shared_cache(spec, &sp_for_cache);
871+
}
872+
})
873+
.await?;
874+
}
853875
}
854876
}
855877

@@ -998,6 +1020,27 @@ async fn main() -> Result<()> {
9981020
}
9991021
}
10001022

1023+
/// Check available disk space in MB for the filesystem containing `path`.
1024+
fn available_disk_mb(path: &Path) -> Result<u64> {
1025+
#[cfg(unix)]
1026+
{
1027+
use std::ffi::CString;
1028+
let c_path = CString::new(path.to_string_lossy().as_bytes())
1029+
.context("invalid path for statvfs")?;
1030+
let mut stat: libc::statvfs = unsafe { std::mem::zeroed() };
1031+
let ret = unsafe { libc::statvfs(c_path.as_ptr(), &mut stat) };
1032+
if ret != 0 {
1033+
anyhow::bail!("statvfs failed");
1034+
}
1035+
Ok((stat.f_bavail as u64 * stat.f_frsize as u64) / (1024 * 1024))
1036+
}
1037+
#[cfg(not(unix))]
1038+
{
1039+
let _ = path;
1040+
Ok(u64::MAX) // assume unlimited on non-unix
1041+
}
1042+
}
1043+
10011044
/// Recursively compute directory size in bytes.
10021045
fn dir_size(path: &Path) -> u64 {
10031046
if !path.exists() {

scripts/cross-compile.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@ cd crates
2121
cargo zigbuild --target "$TARGET" --release -p zs-fast-wheel
2222
cd ..
2323

24-
cp "crates/target/$TARGET/release/zs-fast-wheel" "$OUTDIR/zs-fast-wheel-linux-x86_64"
24+
cp "crates/target/$TARGET/release/zerostart" "$OUTDIR/zerostart-linux-x86_64"
25+
chmod +x "$OUTDIR/zerostart-linux-x86_64"
26+
# Also copy with old name for backwards compat
27+
cp "crates/target/$TARGET/release/zerostart" "$OUTDIR/zs-fast-wheel-linux-x86_64"
2528
chmod +x "$OUTDIR/zs-fast-wheel-linux-x86_64"
2629

2730
echo ""

tests/dmtcp_debug.sh

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#!/bin/bash
# Manual debug harness for DMTCP checkpointing.
#
# Starts a DMTCP coordinator, launches a sleeping Python process under it,
# requests a checkpoint, then searches a handful of directories to see where
# the checkpoint files were written. Commands are traced with `set -x`.
set -x

PORT=7779
CKPT_DIR=/tmp/ckpt_test
PY_SCRIPT=/tmp/simple.py

# Stop any coordinator left over from a previous run.
pkill -f dmtcp_coordinator 2>/dev/null || true
sleep 0.5

# Start from an empty working directory. Abort if we cannot enter it, so the
# rest of the script never runs in the caller's directory.
rm -rf "$CKPT_DIR"
mkdir -p "$CKPT_DIR"
cd "$CKPT_DIR" || exit 1

# Start coordinator
dmtcp_coordinator --daemon -p "$PORT" --exit-on-last 2>&1
sleep 1

# Launch a simple sleeping Python
cat > "$PY_SCRIPT" << 'EOF'
import time, os
print(f"PID={os.getpid()}, running...")
time.sleep(999)
EOF

dmtcp_launch -p "$PORT" python3 "$PY_SCRIPT" &
DPID=$!
sleep 3

# Status
echo "=== STATUS ==="
dmtcp_command -p "$PORT" -s 2>&1

# Checkpoint
echo "=== CHECKPOINT ==="
dmtcp_command -p "$PORT" -c 2>&1
sleep 2

# Find checkpoint files
echo "=== CHECKPOINT FILES ==="
find /tmp -maxdepth 3 -name "ckpt_*" 2>/dev/null
find "$CKPT_DIR" -type f 2>/dev/null
find /root -maxdepth 2 -name "ckpt_*" 2>/dev/null
find / -maxdepth 3 -name "ckpt_*.dmtcp" 2>/dev/null | head -5
ls -la "$CKPT_DIR"/ 2>/dev/null

# Also check the gpu workspace
find /gpu-cli-workspaces -maxdepth 3 -name "ckpt_*" 2>/dev/null | head -5

# Kill
dmtcp_command -p "$PORT" -k 2>&1 || true
wait "$DPID" 2>/dev/null || true

echo "=== DONE ==="

0 commit comments

Comments
 (0)