Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,22 @@ jobs:
LIBRARY_PATH: ${{ github.workspace }}/libtorch/lib
run: cargo clippy -- -W clippy::all

- name: Doc (strict, fails on warnings)
# Clippy doesn't lint doc comments -- rustdoc is the only validator
# for intra-doc links and HTML tags. Run doc as its own pass with
# warnings promoted to errors so doc-comment regressions surface in CI.
env:
LIBTORCH_PATH: ${{ github.workspace }}/libtorch
LD_LIBRARY_PATH: ${{ github.workspace }}/libtorch/lib
LIBRARY_PATH: ${{ github.workspace }}/libtorch/lib
RUSTDOCFLAGS: "-D warnings"
run: |
cargo doc --no-deps --document-private-items
# Sanity check: build reported success, verify the top-level crate
# index actually got written. Catches silent failures where doc
# generation produces nothing useful.
test -f target/doc/flodl/index.html

cuda-build:
name: CUDA (build + clippy)
runs-on: ubuntu-24.04
Expand Down
14 changes: 12 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/target
target/
benchmarks/target
Cargo.lock
.cargo-cache/
Expand All @@ -14,6 +14,9 @@ Cargo.lock
__pycache__/
benchmarks/rounds/
benchmarks/report*.txt
ddp-bench/runs-*/
ddp-bench/data/
/data/

# Generated guide pages (built from site/_stubs/ + docs/ by site/build_guide.py)
site/guide/*.md
Expand All @@ -23,4 +26,11 @@ site/.jekyll-cache/
site/Gemfile.lock

# libtorch variants (downloaded or built from source)
/libtorch/
/libtorch/

# Local fdl config (fdl.yaml.example is committed; fdl copies it on first run)
fdl.yaml
fdl.yml

mempalace.yaml
entities.json
131 changes: 125 additions & 6 deletions CHANGELOG.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[workspace]
members = ["flodl-sys", "flodl", "flodl-cli"]
exclude = ["benchmarks"]
exclude = ["benchmarks", "ddp-bench"]
resolver = "2"

[workspace.package]
version = "0.3.0"
version = "0.4.0"
edition = "2024"
rust-version = "1.85"
license = "MIT"
Expand Down
234 changes: 11 additions & 223 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,242 +1,30 @@
# flodl development commands
# flodl development -- legacy Makefile
#
# All commands run inside Docker containers via docker compose.
# libtorch is mounted from the host libtorch/ directory (not baked into images).
# All development commands are available via fdl (see fdl.yaml.example).
# This Makefile retains only host-side tasks that fdl can't handle.
#
# Quick start:
# make setup # detect hardware, download libtorch, build Docker image
# make test # run CPU tests
# make cuda-test # run CUDA tests (parallel)
# make cuda-test-all # full suite: parallel + NCCL (isolated) + serial
# fdl setup # detect hardware, download libtorch, build Docker image
# fdl test # run CPU tests
# fdl cuda-test-all # full CUDA suite

COMPOSE = docker compose
COMPOSE = docker compose

# --- libtorch auto-detection ---
# Read the active libtorch variant from libtorch/.active
LIBTORCH_ACTIVE := $(shell cat libtorch/.active 2>/dev/null | tr -d '[:space:]')
LIBTORCH_HOST_PATH := $(if $(LIBTORCH_ACTIVE),./libtorch/$(LIBTORCH_ACTIVE),)
.PHONY: docs-rs site site-stop test-init clean

# Read .arch properties from the active variant
ARCH_FILE := $(if $(LIBTORCH_HOST_PATH),$(LIBTORCH_HOST_PATH)/.arch,)
ARCH_CUDA := $(shell grep '^cuda=' $(ARCH_FILE) 2>/dev/null | cut -d= -f2)

# Determine CUDA version for Docker image. Override: CUDA_VERSION=12.6.0 make cuda-test
ifeq ($(ARCH_CUDA),none)
_CUDA_VER :=
else ifneq ($(ARCH_CUDA),)
_CUDA_VER := $(ARCH_CUDA).0
else
_CUDA_VER := 12.8.0
endif
CUDA_VERSION ?= $(_CUDA_VER)
CUDA_TAG ?= $(shell echo "$(CUDA_VERSION)" | cut -d. -f1,2)

# CPU libtorch is always the precompiled CPU variant
LIBTORCH_CPU_PATH := ./libtorch/precompiled/cpu

export LIBTORCH_HOST_PATH
export LIBTORCH_CPU_PATH
export CUDA_VERSION
export CUDA_TAG

# Docker run shortcuts
RUN = $(COMPOSE) run --rm dev
RUN_GPU = $(COMPOSE) run --rm cuda
RUN_BENCH = $(COMPOSE) run --rm bench

.PHONY: build test test-release check clippy clippy-all doc shell clean \
cuda-build cuda-test cuda-test-nccl cuda-test-serial cuda-test-graph cuda-test-all \
cuda-clippy cuda-clippy-all cuda-shell \
image cuda-image \
test-all setup build-libtorch \
bench-image bench bench-cpu bench-compare bench-publish \
docs-rs site site-stop test-init \
cli \
_require-libtorch _require-libtorch-cuda

# --- libtorch guards ---

_require-libtorch:
@if [ ! -d "$(LIBTORCH_CPU_PATH)/lib" ]; then \
echo ""; \
echo "[flodl] ERROR: No CPU libtorch found at $(LIBTORCH_CPU_PATH)."; \
echo "[flodl] Run: make setup"; \
echo "[flodl] or: fdl libtorch download --cpu"; \
echo ""; \
exit 1; \
fi

_require-libtorch-cuda:
@if [ -z "$(LIBTORCH_HOST_PATH)" ] || [ ! -d "$(LIBTORCH_HOST_PATH)/lib" ]; then \
echo ""; \
echo "[flodl] ERROR: No active CUDA libtorch found."; \
echo "[flodl] Run: make setup (auto-detect and download)"; \
echo "[flodl] or: fdl libtorch download --cuda 12.8"; \
echo ""; \
exit 1; \
fi; \
if [ "$(ARCH_CUDA)" = "none" ]; then \
echo ""; \
echo "[flodl] ERROR: Active libtorch is CPU-only ($(LIBTORCH_HOST_PATH))."; \
echo "[flodl] Run: make setup (to get a CUDA variant)"; \
echo ""; \
exit 1; \
fi

# --- CPU targets ---

# Build the Docker image (skips if already exists)
image:
@mkdir -p .cargo-cache .cargo-git
@if ! docker image inspect flodl-dev:latest >/dev/null 2>&1; then \
$(COMPOSE) build dev; \
fi

# Build the project (debug)
build: image _require-libtorch
$(RUN) cargo build

# Run all tests
test: image _require-libtorch
$(RUN) cargo test -- --nocapture

# Run tests in release mode
test-release: image _require-libtorch
$(RUN) cargo test --release -- --nocapture

# Type check without building
check: image _require-libtorch
$(RUN) cargo check

# Lint
clippy: image _require-libtorch
$(RUN) cargo clippy -- -W clippy::all

# Lint including test code
clippy-all: image _require-libtorch
$(RUN) cargo clippy --tests -- -W clippy::all

# Generate API docs
doc: image _require-libtorch
$(RUN) cargo doc --no-deps --document-private-items

# Interactive shell
shell: image
$(COMPOSE) run --rm dev bash

# --- CUDA targets ---

# Build the CUDA Docker image (skips if already exists)
cuda-image:
@mkdir -p .cargo-cache-cuda .cargo-git-cuda
@if ! docker image inspect flodl-cuda:$(CUDA_TAG) >/dev/null 2>&1; then \
$(COMPOSE) build cuda; \
fi

# Build with CUDA feature
cuda-build: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo build --features cuda

# Run all tests with CUDA (parallel, excludes NCCL/DDP/Graph)
cuda-test: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo test --features cuda -- --nocapture

# Run NCCL/DDP tests (NCCL init poisons CUBLAS -- each group in isolated process)
cuda-test-nccl: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo test --features cuda -- --nocapture --ignored --test-threads=1 nccl
$(RUN_GPU) cargo test --features cuda -- --nocapture --ignored --test-threads=1 graph_distribute

# Run remaining serial tests (Graphs, manual_seed, etc.) -- separate process, no NCCL poison
cuda-test-serial: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo test --features cuda -- --nocapture --ignored --test-threads=1 --skip nccl --skip graph_distribute

# Run CUDA Graph tests only (need exclusive GPU, single-threaded)
cuda-test-graph: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo test --features cuda -- --nocapture --ignored --test-threads=1 cuda_graph

# Full CUDA test suite: parallel + NCCL (isolated) + remaining serial
cuda-test-all: cuda-test cuda-test-nccl cuda-test-serial

# Lint with CUDA feature
cuda-clippy: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo clippy --features cuda -- -W clippy::all

# Lint with CUDA feature including test code
cuda-clippy-all: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo clippy --features cuda --tests -- -W clippy::all

# Interactive shell (CUDA)
cuda-shell: cuda-image
$(COMPOSE) run --rm cuda bash

# --- Combined ---

# Run CPU tests, then CUDA tests if a GPU is available
test-all: test
@if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then \
echo ""; \
echo "=== GPU detected -- running CUDA tests ==="; \
$(MAKE) cuda-test-all; \
else \
echo ""; \
echo "=== No GPU available -- skipping CUDA tests ==="; \
fi

# --- Setup ---

# Detect hardware, download/build libtorch, build Docker image.
setup: cli
./target/release/fdl setup --non-interactive

# Build libtorch from PyTorch source for custom GPU architectures.
# Auto-detects compute capabilities from installed GPUs.
# Takes 2-6 hours. Run overnight: make build-libtorch
build-libtorch: cli
./target/release/fdl libtorch build

# --- CLI (pure Rust, no libtorch needed) ---

cli:
cargo build --release -p flodl-cli

# --- Benchmarks ---

bench-image:
@mkdir -p .cargo-cache-bench .cargo-git-bench
@if ! docker image inspect flodl-bench:latest >/dev/null 2>&1; then \
$(COMPOSE) build bench; \
fi

# Run CUDA benchmarks: flodl vs PyTorch comparison
bench: bench-image _require-libtorch-cuda
$(RUN_BENCH) benchmarks/run.sh $(ARGS)

# Run CPU-only benchmarks
bench-cpu: bench-image _require-libtorch
$(RUN_BENCH) benchmarks/run.sh --cpu $(ARGS)

# Publication benchmarks: interleaved rounds, locked clocks, long warmup.
ROUNDS ?= 10
CLOCK ?= 2407
OUTPUT ?= benchmarks/report.txt
bench-publish: bench-image _require-libtorch-cuda
$(RUN_BENCH) benchmarks/run.sh --rounds $(ROUNDS) --lock-clocks $(CLOCK) --warmup-secs 15 --output $(OUTPUT) $(ARGS)

bench-compare: bench

# --- docs.rs validation ---
# --- docs.rs validation (host-side mkdir + nightly toolchain) ---

docs-rs:
@mkdir -p .cargo-cache-docsrs .cargo-git-docsrs .target-docsrs
$(COMPOSE) run --rm docs-rs bash -c "\
rustup install nightly 2>&1 | tail -1 && \
cargo +nightly rustdoc --lib \
--no-default-features \
--no-default-features --features rng \
--config 'build.rustflags=[\"--cfg\", \"docsrs\"]' \
--config 'build.rustdocflags=[\"--cfg\", \"docsrs\"]' \
-Zrustdoc-scrape-examples"

# --- Site ---
# --- Site (host python + docker compose up/down) ---

site:
@python3 site/build_guide.py
Expand Down
40 changes: 39 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,15 @@ Same GPU kernels as PyTorch. No Python. No GIL. No GC. Just Rust.

<p align="center">
<a href="#if-you-know-pytorch-you-know-flodl">PyTorch Users</a> &bull;
<a href="https://flodl.dev/thesis"><b>Thesis</b></a> &bull;
<a href="#getting-started">Getting Started</a> &bull;
<a href="#the-graph-builder">Graph Builder</a> &bull;
<a href="#graph-tree-hierarchical-composition">Graph Tree</a> &bull;
<a href="#the-training-experience">Training</a> &bull;
<a href="#multi-gpu-training">Multi-GPU</a> &bull;
<a href="#pytorch-parity">Parity</a> &bull;
<a href="#performance">Benchmarks</a> &bull;
<a href="https://github.com/fab2s/floDl/blob/main/ROADMAP.md">Roadmap</a> &bull;
<a href="https://github.com/fab2s/floDl/blob/main/docs/pytorch_migration.md">Migration Guide</a> &bull;
<a href="https://github.com/fab2s/floDl/blob/main/docs/tutorials/13-data-loading.md">Data Loading</a>
</p>
Expand Down Expand Up @@ -426,6 +428,42 @@ See the **[Multi-GPU Tutorial](https://github.com/fab2s/floDl/blob/main/docs/tut
**[Data Loading Tutorial](https://github.com/fab2s/floDl/blob/main/docs/tutorials/13-data-loading.md)**, and
**[DDP Reference](https://github.com/fab2s/floDl/blob/main/docs/ddp.md)**.

### Validation suite — `ddp-bench`

The repo ships with [`ddp-bench/`](https://github.com/fab2s/floDl/tree/main/ddp-bench),
a workspace member that reproduces published training setups (Logistic /
MLP / LeNet-5 / ResNet-20 / Char-RNN / GPT-nano / Conv-AE on MNIST,
CIFAR-10, Shakespeare) to build scientifically valid solo baselines, then
measures DDP/El Che convergence quality against them across all 8
backend × policy combinations:

```bash
fdl ddp-bench --list # list models and modes
fdl ddp-bench quick # 1-epoch smoke test
fdl ddp-bench validate # full sweep vs structured baselines
fdl ddp-bench --model gpt-nano --mode nccl-cadence --epochs 50 --lr-scale 2
fdl ddp-bench --report runs/report.md # convergence report from saved runs
```

Every run produces a high-frequency `Timeline` (CPU/GPU utilization, sync
events, anchor changes, idle gaps) saved as JSON / CSV / interactive HTML
under `runs/<model>/<mode>/`.

### Built-in datasets

The framework ships ready-to-use parsers for common benchmarks (all
implement `BatchDataSet`, plug straight into `DataLoader::builder`):

```rust
use flodl::data::datasets::{Cifar10, Mnist, Shakespeare};

let mnist = Mnist::parse(&images_gz, &labels_gz)?;
let cifar = Cifar10::parse(&[&batch1, &batch2, /* ... */])?;
let text = Shakespeare::parse(&corpus, /*seq_len=*/ 128)?;
```

`ddp-bench` downloads and caches the underlying files on first run.

## PyTorch Parity

floDl covers the modules, losses, and optimizers you actually use:
Expand Down Expand Up @@ -602,7 +640,7 @@ supports. If `nvidia-smi` works, floDl trains on it.
5. **[Graph Builder](https://github.com/fab2s/floDl/blob/main/docs/tutorials/05-graph-builder.md)** — fluent API from simple to complex
6. **[Advanced Graphs](https://github.com/fab2s/floDl/blob/main/docs/tutorials/06-advanced-graphs.md)** — forward refs, loops, gates, switches
7. **[Visualization](https://github.com/fab2s/floDl/blob/main/docs/tutorials/07-visualization.md)** — DOT/SVG, profiling heatmaps
8. **[Utilities](https://github.com/fab2s/floDl/blob/main/docs/tutorials/08-utilities.md)** — checkpoints, clipping, freezing, initialization, scheduling
8. **[Utilities](https://github.com/fab2s/floDl/blob/main/docs/tutorials/08-utilities.md)** — checkpoints, clipping, freezing, initialization, scheduling, verbosity-gated logging
9. **[Training Monitor](https://github.com/fab2s/floDl/blob/main/docs/tutorials/09-monitor.md)** — ETA, resource tracking, live dashboard
10. **[Graph Tree](https://github.com/fab2s/floDl/blob/main/docs/tutorials/10-graph-tree.md)** — hierarchical composition, freeze/thaw, subgraph checkpoints
11. **[Multi-GPU Training](https://github.com/fab2s/floDl/blob/main/docs/tutorials/11-multi-gpu.md)** — Ddp::setup, El Che, auto-balancing, DataLoader integration
Expand Down
Loading
Loading