bihius · bihius · Jun 6, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 6, 2026
diff --git a/.gitignore b/.gitignore
@@ -111,16 +111,20 @@ logs/
 *.mkv
 
 # ===== Benchmark Results =====
+# Flat files (legacy pattern)
 benchmarks/results/*.json
 benchmarks/results/*.html
 benchmarks/results/*.csv
+# Run directories (run-<RUN_ID>/) — all raw output is gitignored
+benchmarks/results/run-*/
 
 # ===== HAProxy / Coraza Runtime =====
 *.sock
 *.pid
 haproxy.stats
 
 # ===== Thesis =====
+# Separate repository — not tracked here.
 thesis/
 
 # ===== Keep Empty Directories =====

diff --git a/Makefile b/Makefile
@@ -2,7 +2,8 @@ COMPOSE_FILE := deploy/docker/docker-compose.yml
 COMPOSE_DEBUG_FILE := deploy/docker/docker-compose.debug.yml
 ENV_FILE := deploy/docker/.env
 
-.PHONY: run dev down clean logs ps seed users coraza-build
+.PHONY: run dev down clean logs ps seed users coraza-build \
+        eval-up eval-down eval-clean eval-ftw eval-zap eval-nuclei eval-load eval-metrics eval-all eval-results
 
 run:
 	docker-compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) up --build -d
@@ -30,3 +31,48 @@ users:
 
 coraza-build:
 	docker build -f deploy/docker/coraza.Dockerfile -t guard-proxy/coraza-spoa:dev .
+
+# ── Evaluation lab (delegates to benchmarks/Makefile) ─────────────────────
+# See benchmarks/Makefile for docs and defaults; RUN_ID (plus TARGET_VHOST where a target takes one) is forwarded only when non-empty.
+
+eval-up:
+	$(MAKE) -C benchmarks lab-up
+
+eval-down:
+	$(MAKE) -C benchmarks lab-down
+
+eval-clean:
+	$(MAKE) -C benchmarks lab-clean
+
+eval-ftw:
+	$(MAKE) -C benchmarks eval-ftw \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-zap:
+	$(MAKE) -C benchmarks eval-zap \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-nuclei:
+	$(MAKE) -C benchmarks eval-nuclei \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-load:
+	$(MAKE) -C benchmarks eval-load \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-metrics:
+	$(MAKE) -C benchmarks eval-metrics \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID))
+
+eval-all:
+	$(MAKE) -C benchmarks eval-all \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID)) \
+	  $(if $(TARGET_VHOST),TARGET_VHOST=$(TARGET_VHOST))
+
+eval-results:
+	$(MAKE) -C benchmarks results \
+	  $(if $(RUN_ID),RUN_ID=$(RUN_ID))
diff --git a/README.testing.md b/README.testing.md
@@ -6,8 +6,8 @@
 |-------|----------|-------|---------------|
 | **Unit** (many, fast) | `src/backend/tests/unit/` | pytest, Vitest | >80% |
 | **Integration** (some) | `src/backend/tests/integration/` | pytest, Docker | Key flows |
-| **Security** | `tests/security/` | sqlmap, OWASP ZAP, custom payloads | OWASP Top 10 |
-| **Performance** | `benchmarks/` | wrk, k6, Locust | <20% WAF overhead |
+| **Security** | `benchmarks/lab/` | OWASP ZAP, Nuclei, go-ftw (CRS corpus) | OWASP Top 10 |
+| **Performance** | `benchmarks/lab/` | wrk (WAF vs direct) | <20% WAF overhead |
 
 ## WAF Testing
 
@@ -54,10 +54,32 @@ uv run pytest -m e2e tests/e2e/test_policy_apply.py
 The test uses the same prerequisites as the smoke test and is wired into the
 nightly smoke workflow. Normal backend pytest runs exclude tests marked `e2e`.
 
+## Evaluation Lab (thesis M6)
+
+Full WAF evaluation with real target apps (WordPress, Juice Shop, DVWA):
+
+```sh
+# Prerequisites
+cp deploy/demo/.env.example deploy/demo/.env
+cp benchmarks/lab/.env.example benchmarks/lab/.env
+git submodule update --init --recursive
+
+# Bring up the lab
+make eval-up
+
+# Run all scenarios (ftw → zap → nuclei → load → metrics)
+make eval-all
+
+# View results
+make eval-results
+```
+
+See `benchmarks/lab/` for scenario configs and `docs/evaluation-plan.md` for methodology.
+
 ## Test Data
 
-- Payloads: `benchmarks/payloads/` (sqli.txt, xss.txt, legitimate.txt)
-- Results: `benchmarks/results/` (timestamped JSON, gitignored)
+- Payloads: `benchmarks/payloads/` (sqli.txt, xss.txt, lfi.txt, legitimate.txt)
+- Results: `benchmarks/results/` (timestamped JSON/CSV; evaluation-lab runs are written to `run-<RUN_ID>/` subdirectories; all gitignored)
 
 ## Commands
 

diff --git a/benchmarks/Makefile b/benchmarks/Makefile
@@ -0,0 +1,118 @@
+# Repo root: ask git first; outside a checkout fall back to this Makefile's parent
+# directory (a plain `pwd` would resolve to benchmarks/ under `make -C benchmarks`).
+REPO_ROOT    := $(or $(shell git rev-parse --show-toplevel 2>/dev/null),$(abspath $(dir $(lastword $(MAKEFILE_LIST)))..))
+DEMO_COMPOSE := $(REPO_ROOT)/deploy/demo/docker-compose.yml
+LAB_COMPOSE  := $(REPO_ROOT)/benchmarks/lab/docker-compose.targets.yml
+DEMO_ENV     := $(REPO_ROOT)/deploy/demo/.env
+LAB_ENV      := $(REPO_ROOT)/benchmarks/lab/.env
+RUNNERS      := $(REPO_ROOT)/benchmarks/lab/runners
+
+# RUN_ID is captured once (:=). A recursive `?= $(shell date ...)` re-runs `date`
+# on every expansion, so the scenarios of a single `make eval-all` would each land
+# in a different run-<timestamp>/ directory and eval-metrics would read yet another.
+# A non-empty RUN_ID from the command line or the environment still wins.
+RUN_ID       := $(or $(RUN_ID),$(shell date +%Y%m%d-%H%M%S))
+TARGET_VHOST ?= juice.local
+DIRECT_HOST  ?= juiceshop
+DIRECT_PORT  ?= 3000
+
+.PHONY: lab-up lab-down lab-clean \
+        eval-ftw eval-zap eval-nuclei eval-load eval-metrics eval-all \
+        results help
+
+# Scenarios share one lab and one run directory: stay strictly sequential even
+# under `make -j`, so eval-all keeps its ftw → zap → nuclei → load → metrics order.
+.NOTPARALLEL:
+
+# ── Lab lifecycle ──────────────────────────────────────────────────────────
+# lab-up passes the demo and lab compose/env files to one `docker compose up`, then runs both setup scripts; teardown is delegated to teardown-lab.sh.
+## Bring up the demo + all lab targets and register vhosts.
+lab-up:
+	@echo "==> Starting guard-proxy demo + lab targets..."
+	docker compose \
+	  -f $(DEMO_COMPOSE) \
+	  -f $(LAB_COMPOSE) \
+	  --env-file $(DEMO_ENV) \
+	  --env-file $(LAB_ENV) \
+	  up -d --build
+	@echo "==> Seeding vhosts..."
+	bash $(REPO_ROOT)/deploy/demo/setup-demo.sh
+	bash $(REPO_ROOT)/benchmarks/lab/setup-lab.sh --skip-compose
+
+## Stop the lab (preserve volumes).
+lab-down:
+	bash $(REPO_ROOT)/benchmarks/lab/teardown-lab.sh
+
+## Stop the lab and remove all volumes.
+lab-clean:
+	bash $(REPO_ROOT)/benchmarks/lab/teardown-lab.sh --clean
+
+# ── Individual scenario runners ────────────────────────────────────────────
+# eval-ftw … eval-load pre-create their output dir under results/run-<RUN_ID>/ and pass RUN_ID/TARGET_VHOST to the runner via the environment; eval-metrics passes RUN_ID only.
+## CRS regression suite (TPR gold standard). Uses go-ftw against the CRS corpus.
+eval-ftw:
+	@mkdir -p $(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/ftw
+	RUN_ID=$(RUN_ID) TARGET_VHOST=$(TARGET_VHOST) \
+	  bash $(RUNNERS)/run-ftw.sh
+
+## ZAP baseline scan (FPR measurement, WordPress is best target for FP).
+eval-zap:
+	@mkdir -p $(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/zap-$(TARGET_VHOST)
+	RUN_ID=$(RUN_ID) TARGET_VHOST=$(TARGET_VHOST) \
+	  bash $(RUNNERS)/run-zap.sh
+
+## Nuclei CVE templates (WAF TPR against real attack payloads).
+eval-nuclei:
+	@mkdir -p $(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/nuclei-$(TARGET_VHOST)
+	RUN_ID=$(RUN_ID) TARGET_VHOST=$(TARGET_VHOST) \
+	  bash $(RUNNERS)/run-nuclei.sh
+
+## Latency + RPS load test (WAF vs direct). Measures overhead.
+eval-load:
+	@mkdir -p $(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/load-$(TARGET_VHOST)
+	RUN_ID=$(RUN_ID) TARGET_VHOST=$(TARGET_VHOST) \
+	  DIRECT_HOST=$(DIRECT_HOST) DIRECT_PORT=$(DIRECT_PORT) \
+	  bash $(RUNNERS)/run-load.sh
+
+## Aggregate all scenario outputs into results.csv + report.json.
+eval-metrics:
+	RUN_ID=$(RUN_ID) bash $(RUNNERS)/collect-metrics.sh
+
+## Run all scenarios (ftw → zap → nuclei → load → metrics) in one pass.
+eval-all: eval-ftw eval-zap eval-nuclei eval-load eval-metrics
+
+# ── Results summary ────────────────────────────────────────────────────────
+# Uses POSIX `[ ]`, not bash-only `[[ ]]`: no SHELL is set here, so make runs this recipe with /bin/sh (dash on Debian/Ubuntu).
+## Print a summary of the most recent run (or RUN_ID= a specific run).
+results:
+	@latest=$$(ls -1t $(REPO_ROOT)/benchmarks/results/ 2>/dev/null \
+	            | grep '^run-' | head -1 | sed 's/^run-//'); \
+	run=$${RUN_ID:-$$latest}; \
+	csv="$(REPO_ROOT)/benchmarks/results/run-$${run}/results.csv"; \
+	if [ -f "$$csv" ]; then \
+	  echo "Run: $${run}"; column -t -s, "$$csv"; \
+	else \
+	  echo "No results found. Run 'make eval-all RUN_ID=<id>' first."; \
+	fi
+
+# ── Help ───────────────────────────────────────────────────────────────────
+# A `## ` line directly above a target is that target's help text; awk prints it next to the target name.
+help:
+	@echo "Guard Proxy Evaluation Lab"
+	@echo ""
+	@echo "Setup:"
+	@echo "  cp deploy/demo/.env.example deploy/demo/.env"
+	@echo "  cp benchmarks/lab/.env.example benchmarks/lab/.env"
+	@echo "  git submodule update --init --recursive"
+	@echo ""
+	@echo "Targets:"
+	@awk '/^## /{d=substr($$0,4);next} d!=""&&/^[A-Za-z0-9_-]+:/{sub(/:.*/,"");printf "  %-13s %s\n",$$0,d} {d=""}' $(MAKEFILE_LIST)
+	@echo ""
+	@echo "Variables:"
+	@echo "  RUN_ID=<id>         Override run ID (default: timestamp)"
+	@echo "  TARGET_VHOST=<host> Target vhost (default: juice.local)"
+	@echo "  DIRECT_HOST=<name>  Direct-access Docker service name"
+	@echo "  DIRECT_PORT=<port>  Direct-access port (bypasses HAProxy)"
+	@echo ""
+	@echo "Example (3 runs for thesis median):"
+	@echo "  for i in 1 2 3; do make eval-all; done"
diff --git a/benchmarks/lab/.env.example b/benchmarks/lab/.env.example
@@ -0,0 +1,34 @@
+# Copy to benchmarks/lab/.env before running the eval lab.
+# These are local-lab defaults only — never use these credentials in production.
+
+# ── DVWA database ─────────────────────────────────────────────────────────
+DVWA_DB_ROOT_PASSWORD=dvwa_root_pw
+DVWA_DB_PASSWORD=dvwa_pw
+
+# ── WordPress database ────────────────────────────────────────────────────
+WP_DB_ROOT_PASSWORD=wp_root_pw
+WP_DB_PASSWORD=wp_pw
+WP_ADMIN_PASSWORD=LabAdmin12345!
+
+# ── Lab policy settings (used by setup-lab.sh) ────────────────────────────
+# Baseline policy: PL1, inbound anomaly threshold 5 (outbound 4), block mode
+LAB_POLICY_NAME=Lab Baseline
+LAB_POLICY_PARANOIA=1
+LAB_POLICY_INBOUND_THRESHOLD=5
+LAB_POLICY_OUTBOUND_THRESHOLD=4
+
+# High-paranoia policy for sweep tests: PL2, inbound and outbound anomaly threshold 3, block mode
+LAB_PL2_POLICY_NAME=Lab PL2
+LAB_PL2_POLICY_PARANOIA=2
+LAB_PL2_POLICY_INBOUND_THRESHOLD=3
+LAB_PL2_POLICY_OUTBOUND_THRESHOLD=3
+
+# ── Vhost domains ─────────────────────────────────────────────────────────
+LAB_JUICESHOP_DOMAIN=juice.local
+LAB_JUICESHOP_BACKEND_URL=http://juiceshop:3000
+
+LAB_DVWA_DOMAIN=dvwa.local
+LAB_DVWA_BACKEND_URL=http://dvwa:80
+
+LAB_WP_DOMAIN=wp.local
+LAB_WP_BACKEND_URL=http://wordpress:80