From 854b1a964af6660b82fd6b39b7fd561699229439 Mon Sep 17 00:00:00 2001
From: Derek Ko <koderek000@gmail.com>
Date: Sun, 17 May 2026 11:39:37 -0700
Subject: [PATCH] docs: expand env example and refresh README setup steps

---
 .env.example | 133 +++++++++++++++++++++++++++++++++++++++++----------
 README.md    |   8 +++-
 2 files changed, 115 insertions(+), 26 deletions(-)
diff --git a/.env.example b/.env.example
index b1cb439..97995ca 100644
--- a/.env.example
+++ b/.env.example
@@ -1,40 +1,125 @@
-DOCKER_IMAGE=wildclawbench-ubuntu:v1.3
-GATEWAY_PORT=18789
-TMP_WORKSPACE=/tmp_workspace
- 
+# Copy this file to .env and fill in the values for the harnesses/tasks you run.
+# Do not commit .env; it is ignored by .gitignore.
+
+# ---------------------------------------------------------------------------
+# Core benchmark settings
+# ---------------------------------------------------------------------------
 TASKS_SUBDIR=tasks
 OUTPUT_SUBDIR=output
- 
-DEFAULT_MODEL=openrouter/xxx
- 
+TMP_WORKSPACE=/tmp_workspace
 DEFAULT_PARALLEL=1
- 
+
+# Model passed to eval/run_batch.py when --model is omitted.
+# OpenClaw/Codex examples use the openrouter/<provider>/<model> form.
+# Claude Code/Hermes examples use <provider>/<model>; their runners add the
+# OpenRouter prefix internally when needed.
+DEFAULT_MODEL=openrouter/anthropic/claude-sonnet-4.6
+
+# ---------------------------------------------------------------------------
+# Provider credentials
+# ---------------------------------------------------------------------------
+OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
+OPENROUTER_API_KEY=
+
+# Used by judge-based grading metrics. Leave empty to use task/code defaults,
+# or set an explicit OpenRouter model such as openai/gpt-5.4.
+JUDGE_MODEL=openai/gpt-5.4
+
+# Required for Search & Retrieval tasks and for harnesses that expose web search.
+BRAVE_API_KEY=
+
+# ---------------------------------------------------------------------------
+# Docker images
+# ---------------------------------------------------------------------------
+# OpenClaw harness image.
+DOCKER_IMAGE=wildclawbench-ubuntu:v1.3
+
+# Claude Code harness image. The runner also accepts CLAUDECODE_DOCKER_IMAGE.
+DOCKER_IMAGE_CLAUDECODE=wildclawbench-claudecode-ubuntu:v0.2
+# CLAUDECODE_DOCKER_IMAGE=wildclawbench-claudecode-ubuntu:v0.2
+
+# Codex harness image.
+DOCKER_IMAGE_CODEX=wildclawbench-codex-ubuntu:v0.0
+
+# Hermes Agent harness image.
+HERMES_DOCKER_IMAGE=wildclawbench-hermes-agent:v0.5
+
+# OpenClaw gateway port on the host.
+GATEWAY_PORT=18789
+
+# ---------------------------------------------------------------------------
+# Network/proxy settings inside benchmark containers
+# ---------------------------------------------------------------------------
 HTTP_PROXY_INNER=
 HTTPS_PROXY_INNER=
 NO_PROXY_INNER=
 
-OPENROUTER_BASE_URL='https://openrouter.ai/api/v1'
-JUDGE_MODEL=openai/gpt-5.4
-OPENROUTER_API_KEY=
-BRAVE_API_KEY=
+# ---------------------------------------------------------------------------
+# OpenClaw and multimodal helper settings
+# ---------------------------------------------------------------------------
+# Optional image model for the OpenClaw image tool. If unset, the OpenClaw
+# runner falls back to the chat model passed with --model.
+OPENCLAW_IMAGE_MODEL=
 
-# Using a Custom Model Endpoint
+# Optional image model for the Codex image helper.
+OPENROUTER_IMAGE_MODEL=
+WILDCLAW_IMAGE_MODEL=
+WILDCLAW_IMAGE_HELPER_CALL_LIMIT=2
+
+# ---------------------------------------------------------------------------
+# Custom OpenClaw model endpoint support
+# ---------------------------------------------------------------------------
+# Required only when a --models-config JSON file contains ${MY_PROXY_API_KEY}.
 # MY_PROXY_API_KEY=
 
-# Lobster profile env keys (add values here for skills that need them)
-# GEMINI_API_KEY=
-# FIRECRAWL_API_KEY=
-# EXA_API_KEY=
+# ---------------------------------------------------------------------------
+# Codex harness settings
+# ---------------------------------------------------------------------------
+CODEX_NPM_PACKAGE=@openai/codex
+CODEX_NPM_VERSION=0.117.0
+CODEX_BOOTSTRAP_RETRIES=2
+CODEX_BOOTSTRAP_RETRY_BASE_DELAY=3
 
-# Task env
-# 01_Productivity_Flow
+# Optional overrides. Leave blank for runner defaults.
+CODEX_REASONING_EFFORT=
+CODEX_WIRE_API=
 
-# 02_Code_Intelligence
+# Optional cost accounting for Codex usage.json. Values are USD per 1M tokens.
+CODEX_INPUT_PRICE_PER_MTOK=0
+CODEX_OUTPUT_PRICE_PER_MTOK=0
+CODEX_CACHE_READ_PRICE_PER_MTOK=0
+CODEX_CACHE_WRITE_PRICE_PER_MTOK=0
 
-# 03_Social_Interaction
+# ---------------------------------------------------------------------------
+# Claude Code harness settings
+# ---------------------------------------------------------------------------
+DISABLE_PROMPT_CACHING=1
+DISABLE_INTERLEAVED_THINKING=1
+CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS=1
+IS_SANDBOX=1
+CLAUDE_CODE_FULL_LOG_PATH=./log
 
-# 04_Search_Retrieval
+# Optional cost accounting for Claude Code usage.json. Values are USD per 1M
+# tokens.
+CLAUDECODE_INPUT_PRICE_PER_MTOK=0
+CLAUDECODE_OUTPUT_PRICE_PER_MTOK=0
+CLAUDECODE_CACHE_READ_PRICE_PER_MTOK=0
+CLAUDECODE_CACHE_WRITE_PRICE_PER_MTOK=0
 
-# 05_Creative_Synthesis
+# ---------------------------------------------------------------------------
+# Personal OpenClaw/Lobster profile env keys
+# ---------------------------------------------------------------------------
+# Uncomment and set the keys whose names you pass through --lobster-env.
+# GEMINI_API_KEY=
+# FIRECRAWL_API_KEY=
+# EXA_API_KEY=
 
-# 06_Safety_Alignment
+# ---------------------------------------------------------------------------
+# Task-specific env
+# ---------------------------------------------------------------------------
+# Most built-in automated checks that call an LLM use:
+#   OPENROUTER_API_KEY
+#   OPENROUTER_BASE_URL
+#   JUDGE_MODEL
+# The "## Env" section of an individual task may request additional keys;
+# define them here and they will be injected into the runtime container.
diff --git a/README.md b/README.md
index e2f71de..6901eab 100644
--- a/README.md
+++ b/README.md
@@ -226,7 +226,7 @@ The script will:
 - Download SAM3 model weights for Code Intelligence tasks
 
 
-Prerequisites: `yt-dlp`, `ffmpeg`, `gdown`.
+Prerequisites: `yt-dlp`, `ffmpeg`, `modelscope`.
 
 > **Note:** YouTube downloads may require authentication. If you encounter a "Sign in to confirm you're not a bot" error, try one of the following:
 > - [Get cookies.txt locally](https://chromewebstore.google.com/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc?pli=1).
@@ -235,9 +235,13 @@ Prerequisites: `yt-dlp`, `ffmpeg`, `gdown`.
 
 ### Run
 
-Set your API keys in the `.env` file:
+Create a local `.env` file from the example, then set your API keys in it:
 
+```bash
+cp .env.example .env
 ```
+
+```dotenv
 OPENROUTER_API_KEY=your_api_key_here
 BRAVE_API_KEY=your_brave_key_here  # required for search tasks
 ```