Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,22 @@ jobs:
LIBRARY_PATH: ${{ github.workspace }}/libtorch/lib
run: cargo clippy -- -W clippy::all

- name: Doc (strict, fails on warnings)
# Clippy doesn't lint doc comments -- rustdoc is the only validator
# for intra-doc links and HTML tags. Run doc as its own pass with
# warnings promoted to errors so doc-comment regressions surface in CI.
env:
LIBTORCH_PATH: ${{ github.workspace }}/libtorch
LD_LIBRARY_PATH: ${{ github.workspace }}/libtorch/lib
LIBRARY_PATH: ${{ github.workspace }}/libtorch/lib
RUSTDOCFLAGS: "-D warnings"
run: |
cargo doc --no-deps --document-private-items
# Sanity check: build reported success, verify the top-level crate
# index actually got written. Catches silent failures where doc
# generation produces nothing useful.
test -f target/doc/flodl/index.html

cuda-build:
name: CUDA (build + clippy)
runs-on: ubuntu-24.04
Expand Down
14 changes: 12 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/target
target/
benchmarks/target
Cargo.lock
.cargo-cache/
Expand All @@ -14,6 +14,9 @@ Cargo.lock
__pycache__/
benchmarks/rounds/
benchmarks/report*.txt
ddp-bench/runs-*/
ddp-bench/data/
/data/

# Generated guide pages (built from site/_stubs/ + docs/ by site/build_guide.py)
site/guide/*.md
Expand All @@ -23,4 +26,11 @@ site/.jekyll-cache/
site/Gemfile.lock

# libtorch variants (downloaded or built from source)
/libtorch/
/libtorch/

# Local fdl config (fdl.yaml.example is committed; fdl copies it on first run)
fdl.yaml
fdl.yml

mempalace.yaml
entities.json
131 changes: 125 additions & 6 deletions CHANGELOG.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[workspace]
members = ["flodl-sys", "flodl", "flodl-cli"]
exclude = ["benchmarks"]
exclude = ["benchmarks", "ddp-bench"]
resolver = "2"

[workspace.package]
version = "0.3.0"
version = "0.4.0"
edition = "2024"
rust-version = "1.85"
license = "MIT"
Expand Down
234 changes: 11 additions & 223 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,242 +1,30 @@
# flodl development commands
# flodl development -- legacy Makefile
#
# All commands run inside Docker containers via docker compose.
# libtorch is mounted from the host libtorch/ directory (not baked into images).
# All development commands are available via fdl (see fdl.yaml.example).
# This Makefile retains only host-side tasks that fdl can't handle.
#
# Quick start:
# make setup # detect hardware, download libtorch, build Docker image
# make test # run CPU tests
# make cuda-test # run CUDA tests (parallel)
# make cuda-test-all # full suite: parallel + NCCL (isolated) + serial
# fdl setup # detect hardware, download libtorch, build Docker image
# fdl test # run CPU tests
# fdl cuda-test-all # full CUDA suite

COMPOSE = docker compose
COMPOSE = docker compose

# --- libtorch auto-detection ---
# Read the active libtorch variant from libtorch/.active
LIBTORCH_ACTIVE := $(shell cat libtorch/.active 2>/dev/null | tr -d '[:space:]')
LIBTORCH_HOST_PATH := $(if $(LIBTORCH_ACTIVE),./libtorch/$(LIBTORCH_ACTIVE),)
.PHONY: docs-rs site site-stop test-init clean

# Read .arch properties from the active variant
ARCH_FILE := $(if $(LIBTORCH_HOST_PATH),$(LIBTORCH_HOST_PATH)/.arch,)
ARCH_CUDA := $(shell grep '^cuda=' $(ARCH_FILE) 2>/dev/null | cut -d= -f2)

# Determine CUDA version for Docker image. Override: CUDA_VERSION=12.6.0 make cuda-test
ifeq ($(ARCH_CUDA),none)
_CUDA_VER :=
else ifneq ($(ARCH_CUDA),)
_CUDA_VER := $(ARCH_CUDA).0
else
_CUDA_VER := 12.8.0
endif
CUDA_VERSION ?= $(_CUDA_VER)
CUDA_TAG ?= $(shell echo "$(CUDA_VERSION)" | cut -d. -f1,2)

# CPU libtorch is always the precompiled CPU variant
LIBTORCH_CPU_PATH := ./libtorch/precompiled/cpu

export LIBTORCH_HOST_PATH
export LIBTORCH_CPU_PATH
export CUDA_VERSION
export CUDA_TAG

# Docker run shortcuts
RUN = $(COMPOSE) run --rm dev
RUN_GPU = $(COMPOSE) run --rm cuda
RUN_BENCH = $(COMPOSE) run --rm bench

.PHONY: build test test-release check clippy clippy-all doc shell clean \
cuda-build cuda-test cuda-test-nccl cuda-test-serial cuda-test-graph cuda-test-all \
cuda-clippy cuda-clippy-all cuda-shell \
image cuda-image \
test-all setup build-libtorch \
bench-image bench bench-cpu bench-compare bench-publish \
docs-rs site site-stop test-init \
cli \
_require-libtorch _require-libtorch-cuda

# --- libtorch guards ---

_require-libtorch:
@if [ ! -d "$(LIBTORCH_CPU_PATH)/lib" ]; then \
echo ""; \
echo "[flodl] ERROR: No CPU libtorch found at $(LIBTORCH_CPU_PATH)."; \
echo "[flodl] Run: make setup"; \
echo "[flodl] or: fdl libtorch download --cpu"; \
echo ""; \
exit 1; \
fi

_require-libtorch-cuda:
@if [ -z "$(LIBTORCH_HOST_PATH)" ] || [ ! -d "$(LIBTORCH_HOST_PATH)/lib" ]; then \
echo ""; \
echo "[flodl] ERROR: No active CUDA libtorch found."; \
echo "[flodl] Run: make setup (auto-detect and download)"; \
echo "[flodl] or: fdl libtorch download --cuda 12.8"; \
echo ""; \
exit 1; \
fi; \
if [ "$(ARCH_CUDA)" = "none" ]; then \
echo ""; \
echo "[flodl] ERROR: Active libtorch is CPU-only ($(LIBTORCH_HOST_PATH))."; \
echo "[flodl] Run: make setup (to get a CUDA variant)"; \
echo ""; \
exit 1; \
fi

# --- CPU targets ---

# Build the Docker image (skips if already exists)
image:
@mkdir -p .cargo-cache .cargo-git
@if ! docker image inspect flodl-dev:latest >/dev/null 2>&1; then \
$(COMPOSE) build dev; \
fi

# Build the project (debug)
build: image _require-libtorch
$(RUN) cargo build

# Run all tests
test: image _require-libtorch
$(RUN) cargo test -- --nocapture

# Run tests in release mode
test-release: image _require-libtorch
$(RUN) cargo test --release -- --nocapture

# Type check without building
check: image _require-libtorch
$(RUN) cargo check

# Lint
clippy: image _require-libtorch
$(RUN) cargo clippy -- -W clippy::all

# Lint including test code
clippy-all: image _require-libtorch
$(RUN) cargo clippy --tests -- -W clippy::all

# Generate API docs
doc: image _require-libtorch
$(RUN) cargo doc --no-deps --document-private-items

# Interactive shell
shell: image
$(COMPOSE) run --rm dev bash

# --- CUDA targets ---

# Build the CUDA Docker image (skips if already exists)
cuda-image:
@mkdir -p .cargo-cache-cuda .cargo-git-cuda
@if ! docker image inspect flodl-cuda:$(CUDA_TAG) >/dev/null 2>&1; then \
$(COMPOSE) build cuda; \
fi

# Build with CUDA feature
cuda-build: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo build --features cuda

# Run all tests with CUDA (parallel, excludes NCCL/DDP/Graph)
cuda-test: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo test --features cuda -- --nocapture

# Run NCCL/DDP tests (NCCL init poisons CUBLAS -- each group in isolated process)
cuda-test-nccl: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo test --features cuda -- --nocapture --ignored --test-threads=1 nccl
$(RUN_GPU) cargo test --features cuda -- --nocapture --ignored --test-threads=1 graph_distribute

# Run remaining serial tests (Graphs, manual_seed, etc.) -- separate process, no NCCL poison
cuda-test-serial: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo test --features cuda -- --nocapture --ignored --test-threads=1 --skip nccl --skip graph_distribute

# Run CUDA Graph tests only (need exclusive GPU, single-threaded)
cuda-test-graph: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo test --features cuda -- --nocapture --ignored --test-threads=1 cuda_graph

# Full CUDA test suite: parallel + NCCL (isolated) + remaining serial
cuda-test-all: cuda-test cuda-test-nccl cuda-test-serial

# Lint with CUDA feature
cuda-clippy: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo clippy --features cuda -- -W clippy::all

# Lint with CUDA feature including test code
cuda-clippy-all: cuda-image _require-libtorch-cuda
$(RUN_GPU) cargo clippy --features cuda --tests -- -W clippy::all

# Interactive shell (CUDA)
cuda-shell: cuda-image
$(COMPOSE) run --rm cuda bash

# --- Combined ---

# Run CPU tests, then CUDA tests if a GPU is available
test-all: test
@if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then \
echo ""; \
echo "=== GPU detected -- running CUDA tests ==="; \
$(MAKE) cuda-test-all; \
else \
echo ""; \
echo "=== No GPU available -- skipping CUDA tests ==="; \
fi

# --- Setup ---

# Detect hardware, download/build libtorch, build Docker image.
setup: cli
./target/release/fdl setup --non-interactive

# Build libtorch from PyTorch source for custom GPU architectures.
# Auto-detects compute capabilities from installed GPUs.
# Takes 2-6 hours. Run overnight: make build-libtorch
build-libtorch: cli
./target/release/fdl libtorch build

# --- CLI (pure Rust, no libtorch needed) ---

cli:
cargo build --release -p flodl-cli

# --- Benchmarks ---

bench-image:
@mkdir -p .cargo-cache-bench .cargo-git-bench
@if ! docker image inspect flodl-bench:latest >/dev/null 2>&1; then \
$(COMPOSE) build bench; \
fi

# Run CUDA benchmarks: flodl vs PyTorch comparison
bench: bench-image _require-libtorch-cuda
$(RUN_BENCH) benchmarks/run.sh $(ARGS)

# Run CPU-only benchmarks
bench-cpu: bench-image _require-libtorch
$(RUN_BENCH) benchmarks/run.sh --cpu $(ARGS)

# Publication benchmarks: interleaved rounds, locked clocks, long warmup.
ROUNDS ?= 10
CLOCK ?= 2407
OUTPUT ?= benchmarks/report.txt
bench-publish: bench-image _require-libtorch-cuda
$(RUN_BENCH) benchmarks/run.sh --rounds $(ROUNDS) --lock-clocks $(CLOCK) --warmup-secs 15 --output $(OUTPUT) $(ARGS)

bench-compare: bench

# --- docs.rs validation ---
# --- docs.rs validation (host-side mkdir + nightly toolchain) ---

docs-rs:
@mkdir -p .cargo-cache-docsrs .cargo-git-docsrs .target-docsrs
$(COMPOSE) run --rm docs-rs bash -c "\
rustup install nightly 2>&1 | tail -1 && \
cargo +nightly rustdoc --lib \
--no-default-features \
--no-default-features --features rng \
--config 'build.rustflags=[\"--cfg\", \"docsrs\"]' \
--config 'build.rustdocflags=[\"--cfg\", \"docsrs\"]' \
-Zrustdoc-scrape-examples"

# --- Site ---
# --- Site (host python + docker compose up/down) ---

site:
@python3 site/build_guide.py
Expand Down
40 changes: 39 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,15 @@ Same GPU kernels as PyTorch. No Python. No GIL. No GC. Just Rust.

<p align="center">
<a href="#if-you-know-pytorch-you-know-flodl">PyTorch Users</a> &bull;
<a href="https://flodl.dev/thesis"><b>Thesis</b></a> &bull;
<a href="#getting-started">Getting Started</a> &bull;
<a href="#the-graph-builder">Graph Builder</a> &bull;
<a href="#graph-tree-hierarchical-composition">Graph Tree</a> &bull;
<a href="#the-training-experience">Training</a> &bull;
<a href="#multi-gpu-training">Multi-GPU</a> &bull;
<a href="#pytorch-parity">Parity</a> &bull;
<a href="#performance">Benchmarks</a> &bull;
<a href="https://github.com/fab2s/floDl/blob/main/ROADMAP.md">Roadmap</a> &bull;
<a href="https://github.com/fab2s/floDl/blob/main/docs/pytorch_migration.md">Migration Guide</a> &bull;
<a href="https://github.com/fab2s/floDl/blob/main/docs/tutorials/13-data-loading.md">Data Loading</a>
</p>
Expand Down Expand Up @@ -426,6 +428,42 @@ See the **[Multi-GPU Tutorial](https://github.com/fab2s/floDl/blob/main/docs/tut
**[Data Loading Tutorial](https://github.com/fab2s/floDl/blob/main/docs/tutorials/13-data-loading.md)**, and
**[DDP Reference](https://github.com/fab2s/floDl/blob/main/docs/ddp.md)**.

### Validation suite — `ddp-bench`

The repo ships with [`ddp-bench/`](https://github.com/fab2s/floDl/tree/main/ddp-bench),
a workspace member that reproduces published training setups (Logistic /
MLP / LeNet-5 / ResNet-20 / Char-RNN / GPT-nano / Conv-AE on MNIST,
CIFAR-10, Shakespeare) to build scientifically valid solo baselines, then
measures DDP/El Che convergence quality against them across all 8
backend × policy combinations:

```bash
fdl ddp-bench --list # list models and modes
fdl ddp-bench quick # 1-epoch smoke test
fdl ddp-bench validate # full sweep vs structured baselines
fdl ddp-bench --model gpt-nano --mode nccl-cadence --epochs 50 --lr-scale 2
fdl ddp-bench --report runs/report.md # convergence report from saved runs
```

Every run produces a high-frequency `Timeline` (CPU/GPU utilization, sync
events, anchor changes, idle gaps) saved as JSON / CSV / interactive HTML
under `runs/<model>/<mode>/`.

### Built-in datasets

The framework ships ready-to-use parsers for common benchmarks (all
implement `BatchDataSet`, plug straight into `DataLoader::builder`):

```rust
use flodl::data::datasets::{Cifar10, Mnist, Shakespeare};

let mnist = Mnist::parse(&images_gz, &labels_gz)?;
let cifar = Cifar10::parse(&[&batch1, &batch2, /* ... */])?;
let text = Shakespeare::parse(&corpus, /*seq_len=*/ 128)?;
```

`ddp-bench` downloads and caches the underlying files on first run.

## PyTorch Parity

floDl covers the modules, losses, and optimizers you actually use:
Expand Down Expand Up @@ -602,7 +640,7 @@ supports. If `nvidia-smi` works, floDl trains on it.
5. **[Graph Builder](https://github.com/fab2s/floDl/blob/main/docs/tutorials/05-graph-builder.md)** — fluent API from simple to complex
6. **[Advanced Graphs](https://github.com/fab2s/floDl/blob/main/docs/tutorials/06-advanced-graphs.md)** — forward refs, loops, gates, switches
7. **[Visualization](https://github.com/fab2s/floDl/blob/main/docs/tutorials/07-visualization.md)** — DOT/SVG, profiling heatmaps
8. **[Utilities](https://github.com/fab2s/floDl/blob/main/docs/tutorials/08-utilities.md)** — checkpoints, clipping, freezing, initialization, scheduling
8. **[Utilities](https://github.com/fab2s/floDl/blob/main/docs/tutorials/08-utilities.md)** — checkpoints, clipping, freezing, initialization, scheduling, verbosity-gated logging
9. **[Training Monitor](https://github.com/fab2s/floDl/blob/main/docs/tutorials/09-monitor.md)** — ETA, resource tracking, live dashboard
10. **[Graph Tree](https://github.com/fab2s/floDl/blob/main/docs/tutorials/10-graph-tree.md)** — hierarchical composition, freeze/thaw, subgraph checkpoints
11. **[Multi-GPU Training](https://github.com/fab2s/floDl/blob/main/docs/tutorials/11-multi-gpu.md)** — Ddp::setup, El Che, auto-balancing, DataLoader integration
Expand Down
Loading
Loading