yourconscience · yourconscience · May 13, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/README.md b/README.md
@@ -58,7 +58,7 @@ uv run llm-quest benchmark --config configs/benchmarks/memory_full_transcript.ya
 uv run llm-quest benchmark-report --benchmark-id <id> --output report.md
 
 # Analyze a single run
-uv run llm-quest analyze-run --run-summary results/<agent>/<quest>/run_<id>/run_summary.json
+uv run llm-quest analyze-run --run-summary results/<harness>/<quest>/run_<id>/run_summary.json
 
 # Play as human in terminal
 uv run llm-quest play --quest quests/Boat.qm
@@ -107,7 +107,8 @@ Provider-specific keys in `.env`:
 
 ## Project Structure
 
-- `llm_quest_benchmark/agents/` - Agent implementations (LLM, planner, tool-augmented)
+- `llm_quest_benchmark/harnesses/` - LLM harness implementations for prompt, memory, tools, and planning experiments
+- `llm_quest_benchmark/players/` - Non-LLM player primitives (`human`, `random_choice`)
 - `llm_quest_benchmark/prompt_templates/` - Jinja2 prompt templates for the public context-scaffold taxonomy
 - `llm_quest_benchmark/executors/` - CLI, benchmark orchestration, TS bridge
 - `configs/benchmarks/` - YAML benchmark configurations

diff --git a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml
@@ -21,49 +21,45 @@ quests:
 agents:
   # 1. Minimal prompt
   - model: gpt-5-mini
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 3
   # 2. Short-context reasoning
   - model: gpt-5-mini
-    template: reasoning
+    harness: reasoning_recent
     temperature: 0.4
     runs: 3
   # 3. Full-history reasoning
   - model: gpt-5-mini
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # 4. Compact memory / memo
   - model: gpt-5-mini
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 5. Prompt hints
   - model: gpt-5-mini
-    template: light_hints
+    harness: hinted_compact
     temperature: 0.4
     runs: 3
   # 6. Tools + compact memory
   - model: gpt-5-mini
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 7. Tools + hints + compact memory
   - model: gpt-5-mini
-    template: tool_augmented_hints
+    harness: tool_hinted
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 8. Planner loop
   - model: gpt-5-mini
-    template: planner
+    harness: planner
     temperature: 0.4
     runs: 3
 debug: false

diff --git a/configs/benchmarks/exp3_no_loop_breaker.yaml b/configs/benchmarks/exp3_no_loop_breaker.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 2
-    memory_mode: full_transcript
 debug: false
 quest_timeout: 600
 max_workers: 2

diff --git a/configs/benchmarks/exp3_stateful_compact.yaml b/configs/benchmarks/exp3_stateful_compact.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp4_compaction_no_memo.yaml b/configs/benchmarks/exp4_compaction_no_memo.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: compaction_no_memo
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp4_memo_cot.yaml b/configs/benchmarks/exp4_memo_cot.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_cot
+    harness: memo_cot
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp4_memo_extended.yaml b/configs/benchmarks/exp4_memo_extended.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_extended
+    harness: memo_extended
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp4_memo_structured.yaml b/configs/benchmarks/exp4_memo_structured.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_structured
+    harness: memo_structured
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp5_stateful_compact_variance.yaml b/configs/benchmarks/exp5_stateful_compact_variance.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 5
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp6_prompt_hints.yaml b/configs/benchmarks/exp6_prompt_hints.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact_hints
+    harness: hinted_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp6_tools.yaml b/configs/benchmarks/exp6_tools.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp6_tools_hints.yaml b/configs/benchmarks/exp6_tools_hints.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented_hints
+    harness: tool_hinted
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp6_unified_tools_screen.yaml b/configs/benchmarks/exp6_unified_tools_screen.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp7_deepseek.yaml b/configs/benchmarks/exp7_deepseek.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:deepseek/deepseek-chat-v3-0324"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp7_haiku.yaml b/configs/benchmarks/exp7_haiku.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "anthropic:claude-3-5-haiku-latest"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp7_llama.yaml b/configs/benchmarks/exp7_llama.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:meta-llama/llama-4-scout"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp7_mistral.yaml b/configs/benchmarks/exp7_mistral.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:mistralai/mistral-small-2603"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp7_qwen.yaml b/configs/benchmarks/exp7_qwen.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:qwen/qwen3-30b-a3b"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/exp7b_model_upgrades.yaml b/configs/benchmarks/exp7b_model_upgrades.yaml
@@ -20,22 +20,19 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:deepseek/deepseek-v4-flash"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
   - model: "openrouter:qwen/qwen3.6-flash"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
-  - model: "claude:claude-haiku-4-5-20251001"
-    template: stateful_compact
+  - model: "anthropic:claude-haiku-4-5-20251001"
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/memory_compaction.yaml b/configs/benchmarks/memory_compaction.yaml
@@ -18,45 +18,39 @@ quests:
 agents:
   # Gemini 3 Flash - compaction interval 10
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # Gemini 3 Flash - compaction interval 20
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
   # GPT-5.4 Mini - compaction interval 10
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # GPT-5.4 Mini - compaction interval 20
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
   # DeepSeek V3.2 - compaction interval 10
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # DeepSeek V3.2 - compaction interval 20
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
 debug: false
 quest_timeout: 600

diff --git a/configs/benchmarks/memory_full_transcript.yaml b/configs/benchmarks/memory_full_transcript.yaml
@@ -18,22 +18,19 @@ quests:
 agents:
   # Gemini 3 Flash - full transcript
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # GPT-5.4 Mini - full transcript
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # DeepSeek V3.2 - full transcript
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
 debug: false
 quest_timeout: 600
 max_workers: 2