Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -111,16 +111,20 @@ logs/
*.mkv

# ===== Benchmark Results =====
# Flat files (legacy pattern)
benchmarks/results/*.json
benchmarks/results/*.html
benchmarks/results/*.csv
# Run directories (run-<RUN_ID>/) — all raw output is gitignored
benchmarks/results/run-*/

# ===== HAProxy / Coraza Runtime =====
*.sock
*.pid
haproxy.stats

# ===== Thesis =====
# Separate repository — not tracked here.
thesis/

# ===== Keep Empty Directories =====
Expand Down
48 changes: 47 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ COMPOSE_FILE := deploy/docker/docker-compose.yml
COMPOSE_DEBUG_FILE := deploy/docker/docker-compose.debug.yml
ENV_FILE := deploy/docker/.env

# Command-style targets (they do not produce files of the same name).
# Declared once: the shorter list that used to precede this one was a strict
# subset of it and therefore redundant.
.PHONY: run dev down clean logs ps seed users coraza-build \
        eval-up eval-down eval-clean eval-ftw eval-zap eval-nuclei eval-load eval-metrics eval-all eval-results

run:
docker-compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) up --build -d
Expand Down Expand Up @@ -30,3 +31,48 @@ users:

# Build the Coraza SPOA development image, using the current directory as the
# Docker build context.
coraza-build:
	docker build \
		--file deploy/docker/coraza.Dockerfile \
		--tag guard-proxy/coraza-spoa:dev \
		.

# ── Evaluation lab (delegates to benchmarks/Makefile) ─────────────────────
# See benchmarks/Makefile for full documentation and variable overrides.

# Delegate to the `lab-up` target of benchmarks/Makefile; the sub-target name
# is derived from this target's own name (eval-up -> lab-up).
eval-up:
	$(MAKE) -C benchmarks $(@:eval-%=lab-%)

# Delegate to the `lab-down` target of benchmarks/Makefile; the sub-target name
# is derived from this target's own name (eval-down -> lab-down).
eval-down:
	$(MAKE) -C benchmarks $(@:eval-%=lab-%)

# Delegate to the `lab-clean` target of benchmarks/Makefile; the sub-target
# name is derived from this target's own name (eval-clean -> lab-clean).
eval-clean:
	$(MAKE) -C benchmarks $(@:eval-%=lab-%)

# Run the same-named target in benchmarks/Makefile. RUN_ID and TARGET_VHOST are
# forwarded on the sub-make command line only when they are non-empty here.
eval-ftw:
	$(MAKE) -C benchmarks $@ \
		$(foreach v,RUN_ID TARGET_VHOST,$(if $($(v)),$(v)=$($(v))))

# Run the same-named target in benchmarks/Makefile. RUN_ID and TARGET_VHOST are
# forwarded on the sub-make command line only when they are non-empty here.
eval-zap:
	$(MAKE) -C benchmarks $@ \
		$(foreach v,RUN_ID TARGET_VHOST,$(if $($(v)),$(v)=$($(v))))

# Run the same-named target in benchmarks/Makefile. RUN_ID and TARGET_VHOST are
# forwarded on the sub-make command line only when they are non-empty here.
eval-nuclei:
	$(MAKE) -C benchmarks $@ \
		$(foreach v,RUN_ID TARGET_VHOST,$(if $($(v)),$(v)=$($(v))))

# Run the same-named target in benchmarks/Makefile. RUN_ID and TARGET_VHOST are
# forwarded on the sub-make command line only when they are non-empty here.
eval-load:
	$(MAKE) -C benchmarks $@ \
		$(foreach v,RUN_ID TARGET_VHOST,$(if $($(v)),$(v)=$($(v))))

# Run the same-named target in benchmarks/Makefile, forwarding RUN_ID on the
# sub-make command line only when it is non-empty here.
eval-metrics:
	$(MAKE) -C benchmarks $@ $(foreach v,RUN_ID,$(if $($(v)),$(v)=$($(v))))

# Run the same-named target in benchmarks/Makefile. RUN_ID and TARGET_VHOST are
# forwarded on the sub-make command line only when they are non-empty here.
eval-all:
	$(MAKE) -C benchmarks $@ \
		$(foreach v,RUN_ID TARGET_VHOST,$(if $($(v)),$(v)=$($(v))))

# Delegate to the `results` target of benchmarks/Makefile (this target's name
# with the `eval-` prefix removed), forwarding RUN_ID only when it is non-empty.
eval-results:
	$(MAKE) -C benchmarks $(@:eval-%=%) $(foreach v,RUN_ID,$(if $($(v)),$(v)=$($(v))))
30 changes: 26 additions & 4 deletions README.testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
|-------|----------|-------|---------------|
| **Unit** (many, fast) | `src/backend/tests/unit/` | pytest, Vitest | >80% |
| **Integration** (some) | `src/backend/tests/integration/` | pytest, Docker | Key flows |
| **Security** | `tests/security/` | sqlmap, OWASP ZAP, custom payloads | OWASP Top 10 |
| **Performance** | `benchmarks/` | wrk, k6, Locust | <20% WAF overhead |
| **Security** | `benchmarks/lab/` | OWASP ZAP, Nuclei, go-ftw (CRS corpus) | OWASP Top 10 |
| **Performance** | `benchmarks/lab/` | wrk (WAF vs direct) | <20% WAF overhead |

## WAF Testing

Expand Down Expand Up @@ -54,10 +54,32 @@ uv run pytest -m e2e tests/e2e/test_policy_apply.py
The test uses the same prerequisites as the smoke test and is wired into the
nightly smoke workflow. Normal backend pytest runs exclude tests marked `e2e`.

## Evaluation Lab (thesis M6)

Full WAF evaluation with real target apps (WordPress, Juice Shop, DVWA):

```sh
# Prerequisites
cp deploy/demo/.env.example deploy/demo/.env
cp benchmarks/lab/.env.example benchmarks/lab/.env
git submodule update --init --recursive

# Bring up the lab
make eval-up

# Run all scenarios (ftw → zap → nuclei → load → metrics)
make eval-all

# View results
make eval-results
```

See `benchmarks/lab/` for scenario configs and `docs/evaluation-plan.md` for methodology.

## Test Data

- Payloads: `benchmarks/payloads/` (sqli.txt, xss.txt, legitimate.txt)
- Results: `benchmarks/results/` (timestamped JSON, gitignored)
- Payloads: `benchmarks/payloads/` (sqli.txt, xss.txt, lfi.txt, legitimate.txt)
- Results: `benchmarks/results/` (timestamped JSON/CSV, gitignored)

## Commands

Expand Down
108 changes: 108 additions & 0 deletions benchmarks/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Repository root as reported by git; falls back to the current directory when
# `git rev-parse` fails (git missing or not inside a work tree).
REPO_ROOT    := $(shell git rev-parse --show-toplevel 2>/dev/null || pwd)
DEMO_COMPOSE := $(REPO_ROOT)/deploy/demo/docker-compose.yml
LAB_COMPOSE  := $(REPO_ROOT)/benchmarks/lab/docker-compose.targets.yml
DEMO_ENV     := $(REPO_ROOT)/deploy/demo/.env
LAB_ENV      := $(REPO_ROOT)/benchmarks/lab/.env
RUNNERS      := $(REPO_ROOT)/benchmarks/lab/runners

# Default run ID is a timestamp. `?=` always creates a *recursively* expanded
# variable, so on its own `$(shell date ...)` would be re-run on every
# reference: each scenario in `eval-all` would get its own run directory and
# `eval-metrics` would look in yet another one. Re-assigning with `:=` pins the
# value once at parse time. A RUN_ID given on the command line still wins,
# because command-line variables override makefile assignments.
RUN_ID ?= $(shell date +%Y%m%d-%H%M%S)
RUN_ID := $(RUN_ID)

# User-tunable defaults; override on the command line or via the environment.
TARGET_VHOST ?= juice.local
DIRECT_HOST ?= juiceshop
DIRECT_PORT ?= 3000

.PHONY: lab-up lab-down lab-clean \
        eval-ftw eval-zap eval-nuclei eval-load eval-metrics eval-all \
        results help

# ── Lab lifecycle ──────────────────────────────────────────────────────────

## Bring up the demo + all lab targets and register vhosts.
# Starts both compose files as one project with both env files, then runs the
# demo and lab setup scripts (the lab script is told to skip its own compose
# step). Paths are quoted so a checkout directory containing whitespace works.
lab-up:
	@echo "==> Starting guard-proxy demo + lab targets..."
	docker compose \
		-f "$(DEMO_COMPOSE)" \
		-f "$(LAB_COMPOSE)" \
		--env-file "$(DEMO_ENV)" \
		--env-file "$(LAB_ENV)" \
		up -d --build
	@echo "==> Seeding vhosts..."
	bash "$(REPO_ROOT)/deploy/demo/setup-demo.sh"
	bash "$(REPO_ROOT)/benchmarks/lab/setup-lab.sh" --skip-compose

## Stop the lab (preserve volumes).
# Script path is quoted so a checkout directory containing whitespace works.
lab-down:
	bash "$(REPO_ROOT)/benchmarks/lab/teardown-lab.sh"

## Stop the lab and remove all volumes.
# Script path is quoted so a checkout directory containing whitespace works.
lab-clean:
	bash "$(REPO_ROOT)/benchmarks/lab/teardown-lab.sh" --clean

# ── Individual scenario runners ────────────────────────────────────────────

## CRS regression suite (TPR gold standard). Uses go-ftw against the CRS corpus.
# Creates the run's output directory, then invokes the runner with RUN_ID and
# TARGET_VHOST in its environment. Expansions are quoted so paths or values
# containing whitespace are passed through intact.
eval-ftw:
	@mkdir -p "$(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/ftw"
	RUN_ID="$(RUN_ID)" TARGET_VHOST="$(TARGET_VHOST)" \
		bash "$(RUNNERS)/run-ftw.sh"

## ZAP baseline scan (FPR measurement, WordPress is best target for FP).
# Creates the per-vhost output directory, then invokes the runner with RUN_ID
# and TARGET_VHOST in its environment. Expansions are quoted so paths or values
# containing whitespace are passed through intact.
eval-zap:
	@mkdir -p "$(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/zap-$(TARGET_VHOST)"
	RUN_ID="$(RUN_ID)" TARGET_VHOST="$(TARGET_VHOST)" \
		bash "$(RUNNERS)/run-zap.sh"

## Nuclei CVE templates (WAF TPR against real attack payloads).
# Creates the per-vhost output directory, then invokes the runner with RUN_ID
# and TARGET_VHOST in its environment. Expansions are quoted so paths or values
# containing whitespace are passed through intact.
eval-nuclei:
	@mkdir -p "$(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/nuclei-$(TARGET_VHOST)"
	RUN_ID="$(RUN_ID)" TARGET_VHOST="$(TARGET_VHOST)" \
		bash "$(RUNNERS)/run-nuclei.sh"

## Latency + RPS load test (WAF vs direct). Measures overhead.
# Creates the per-vhost output directory, then invokes the runner with RUN_ID,
# TARGET_VHOST, DIRECT_HOST and DIRECT_PORT in its environment. Expansions are
# quoted so paths or values containing whitespace are passed through intact.
eval-load:
	@mkdir -p "$(REPO_ROOT)/benchmarks/results/run-$(RUN_ID)/load-$(TARGET_VHOST)"
	RUN_ID="$(RUN_ID)" TARGET_VHOST="$(TARGET_VHOST)" \
		DIRECT_HOST="$(DIRECT_HOST)" DIRECT_PORT="$(DIRECT_PORT)" \
		bash "$(RUNNERS)/run-load.sh"

## Aggregate all scenario outputs into results.csv + report.json.
# Invokes the collector with RUN_ID in its environment. Expansions are quoted
# so a path or value containing whitespace is passed through intact.
eval-metrics:
	RUN_ID="$(RUN_ID)" bash "$(RUNNERS)/collect-metrics.sh"

## Run all scenarios (ftw → zap → nuclei → load → metrics) in one pass.
# The steps are run as sequential sub-makes rather than as prerequisites:
#   * prerequisite order is not guaranteed under `make -j`, so metrics could be
#     collected before (or while) the scenarios are still running;
#   * RUN_ID is expanded exactly once here and handed to every step, so all
#     scenarios and the metrics collector share one run directory.
# The loop stops at the first failing step and propagates its exit status.
# `-f` is passed explicitly because make does not forward it via MAKEFLAGS.
eval-all:
	run_id="$(RUN_ID)"; \
	for step in eval-ftw eval-zap eval-nuclei eval-load eval-metrics; do \
		$(MAKE) -f "$(firstword $(MAKEFILE_LIST))" "$$step" RUN_ID="$$run_id" || exit $$?; \
	done

# ── Results summary ────────────────────────────────────────────────────────

## Print a summary of the most recent run (or RUN_ID= a specific run).
# The shell-level $RUN_ID is set only when the caller supplied it (command line
# or environment); otherwise the newest run-* entry under benchmarks/results/
# is used. The file test uses POSIX `[ ... ]` because recipes run under
# /bin/sh by default, where the bash-only `[[ ... ]]` is not available (on dash
# it fails, so the recipe always reported "No results found").
results:
	@latest=$$(ls -1t "$(REPO_ROOT)/benchmarks/results/" 2>/dev/null \
		| grep '^run-' | head -1 | sed 's/^run-//'); \
	run=$${RUN_ID:-$$latest}; \
	csv="$(REPO_ROOT)/benchmarks/results/run-$${run}/results.csv"; \
	if [ -f "$$csv" ]; then \
		echo "Run: $${run}"; column -t -s, "$$csv"; \
	else \
		echo "No results found. Run 'make eval-all RUN_ID=<id>' first."; \
	fi

# ── Help ───────────────────────────────────────────────────────────────────

# Print usage: static setup notes, then every line of the parsed makefile(s)
# that starts with "## " (the per-target descriptions), then the tunable
# variables and an example invocation.
help:
	@printf '%s\n' \
		"Guard Proxy Evaluation Lab" \
		"" \
		"Setup:" \
		" cp deploy/demo/.env.example deploy/demo/.env" \
		" cp benchmarks/lab/.env.example benchmarks/lab/.env" \
		" git submodule update --init --recursive" \
		"" \
		"Targets:"
	@grep -E '^## ' $(MAKEFILE_LIST) | sed 's/^## / /'
	@printf '%s\n' \
		"" \
		"Variables:" \
		" RUN_ID=<id> Override run ID (default: timestamp)" \
		" TARGET_VHOST=<host> Target vhost (default: juice.local)" \
		" DIRECT_HOST=<name> Direct-access Docker service name" \
		" DIRECT_PORT=<port> Direct-access port (bypasses HAProxy)" \
		"" \
		"Example (3 runs for thesis median):" \
		" for i in 1 2 3; do make eval-all; done"
34 changes: 34 additions & 0 deletions benchmarks/lab/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copy to benchmarks/lab/.env before running the eval lab.
# These are local-lab defaults only — never use these credentials in production.

# ── DVWA database ─────────────────────────────────────────────────────────
DVWA_DB_ROOT_PASSWORD=dvwa_root_pw
DVWA_DB_PASSWORD=dvwa_pw

# ── WordPress database ────────────────────────────────────────────────────
WP_DB_ROOT_PASSWORD=wp_root_pw
WP_DB_PASSWORD=wp_pw
WP_ADMIN_PASSWORD=LabAdmin12345!

# ── Lab policy settings (used by setup-lab.sh) ────────────────────────────
# NOTE(review): the two *_POLICY_NAME values below contain an unquoted space.
# That is presumably fine when this file is read via `docker compose --env-file`,
# but would break if setup-lab.sh sources it with a POSIX shell — confirm how
# the script reads this file.
# Baseline policy: PL1, inbound anomaly threshold 5 (outbound 4), block mode
LAB_POLICY_NAME=Lab Baseline
LAB_POLICY_PARANOIA=1
LAB_POLICY_INBOUND_THRESHOLD=5
LAB_POLICY_OUTBOUND_THRESHOLD=4

# High-paranoia policy for sweep tests: PL2, inbound and outbound anomaly
# threshold 3, block mode
LAB_PL2_POLICY_NAME=Lab PL2
LAB_PL2_POLICY_PARANOIA=2
LAB_PL2_POLICY_INBOUND_THRESHOLD=3
LAB_PL2_POLICY_OUTBOUND_THRESHOLD=3

# ── Vhost domains ─────────────────────────────────────────────────────────
LAB_JUICESHOP_DOMAIN=juice.local
LAB_JUICESHOP_BACKEND_URL=http://juiceshop:3000

LAB_DVWA_DOMAIN=dvwa.local
LAB_DVWA_BACKEND_URL=http://dvwa:80

LAB_WP_DOMAIN=wp.local
LAB_WP_BACKEND_URL=http://wordpress:80
Loading
Loading