diff --git a/README.md b/README.md
index 3ff854c..013fb57 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ uv run llm-quest benchmark --config configs/benchmarks/memory_full_transcript.ya
 uv run llm-quest benchmark-report --benchmark-id <id> --output report.md
 
 # Analyze a single run
-uv run llm-quest analyze-run --run-summary results/<agent>/<quest>/run_<id>/run_summary.json
+uv run llm-quest analyze-run --run-summary results/<harness>/<quest>/run_<id>/run_summary.json
 
 # Play as human in terminal
 uv run llm-quest play --quest quests/Boat.qm
@@ -107,7 +107,8 @@ Provider-specific keys in `.env`:
 
 ## Project Structure
 
-- `llm_quest_benchmark/agents/` - Agent implementations (LLM, planner, tool-augmented)
+- `llm_quest_benchmark/harnesses/` - LLM harness implementations for prompt, memory, tool, and planning experiments
+- `llm_quest_benchmark/players/` - Non-LLM player primitives (`human`, `random_choice`)
 - `llm_quest_benchmark/prompt_templates/` - Jinja2 prompt templates for the public context-scaffold taxonomy
 - `llm_quest_benchmark/executors/` - CLI, benchmark orchestration, TS bridge
 - `configs/benchmarks/` - YAML benchmark configurations
diff --git a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml
index 4ab3e65..812e94a 100644
--- a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml
+++ b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml
@@ -21,49 +21,45 @@ quests:
 agents:
   # 1. Minimal prompt
   - model: gpt-5-mini
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 3
   # 2. Short-context reasoning
   - model: gpt-5-mini
-    template: reasoning
+    harness: reasoning_recent
     temperature: 0.4
     runs: 3
   # 3. Full-history reasoning
   - model: gpt-5-mini
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # 4. Compact memory / memo
   - model: gpt-5-mini
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 5. Prompt hints
   - model: gpt-5-mini
-    template: light_hints
+    harness: hinted_compact
     temperature: 0.4
     runs: 3
   # 6. Tools + compact memory
   - model: gpt-5-mini
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 7. Tools + hints + compact memory
   - model: gpt-5-mini
-    template: tool_augmented_hints
+    harness: tool_hinted
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 8. Planner loop
   - model: gpt-5-mini
-    template: planner
+    harness: planner
     temperature: 0.4
     runs: 3
 debug: false
diff --git a/configs/benchmarks/exp3_no_loop_breaker.yaml b/configs/benchmarks/exp3_no_loop_breaker.yaml
index 64240fe..57e7124 100644
--- a/configs/benchmarks/exp3_no_loop_breaker.yaml
+++ b/configs/benchmarks/exp3_no_loop_breaker.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 2
-    memory_mode: full_transcript
 debug: false
 quest_timeout: 600
 max_workers: 2
diff --git a/configs/benchmarks/exp3_stateful_compact.yaml b/configs/benchmarks/exp3_stateful_compact.yaml
index b43fc6b..bb9973c 100644
--- a/configs/benchmarks/exp3_stateful_compact.yaml
+++ b/configs/benchmarks/exp3_stateful_compact.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp4_compaction_no_memo.yaml b/configs/benchmarks/exp4_compaction_no_memo.yaml
index 5ef4130..4ab63e6 100644
--- a/configs/benchmarks/exp4_compaction_no_memo.yaml
+++ b/configs/benchmarks/exp4_compaction_no_memo.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: compaction_no_memo
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp4_memo_cot.yaml b/configs/benchmarks/exp4_memo_cot.yaml
index fe97bca..320da54 100644
--- a/configs/benchmarks/exp4_memo_cot.yaml
+++ b/configs/benchmarks/exp4_memo_cot.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_cot
+    harness: memo_cot
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp4_memo_extended.yaml b/configs/benchmarks/exp4_memo_extended.yaml
index 66d1bf4..a5d6613 100644
--- a/configs/benchmarks/exp4_memo_extended.yaml
+++ b/configs/benchmarks/exp4_memo_extended.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_extended
+    harness: memo_extended
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp4_memo_structured.yaml b/configs/benchmarks/exp4_memo_structured.yaml
index 83502c7..f70ab81 100644
--- a/configs/benchmarks/exp4_memo_structured.yaml
+++ b/configs/benchmarks/exp4_memo_structured.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_structured
+    harness: memo_structured
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp5_stateful_compact_variance.yaml b/configs/benchmarks/exp5_stateful_compact_variance.yaml
index 6f99f29..89cc80b 100644
--- a/configs/benchmarks/exp5_stateful_compact_variance.yaml
+++ b/configs/benchmarks/exp5_stateful_compact_variance.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 5
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp6_prompt_hints.yaml b/configs/benchmarks/exp6_prompt_hints.yaml
index 098b1db..4c70e61 100644
--- a/configs/benchmarks/exp6_prompt_hints.yaml
+++ b/configs/benchmarks/exp6_prompt_hints.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact_hints
+    harness: hinted_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp6_tools.yaml b/configs/benchmarks/exp6_tools.yaml
index 8630bb0..b254005 100644
--- a/configs/benchmarks/exp6_tools.yaml
+++ b/configs/benchmarks/exp6_tools.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp6_tools_hints.yaml b/configs/benchmarks/exp6_tools_hints.yaml
index b7949fc..0c0c3b6 100644
--- a/configs/benchmarks/exp6_tools_hints.yaml
+++ b/configs/benchmarks/exp6_tools_hints.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented_hints
+    harness: tool_hinted
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp6_unified_tools_screen.yaml b/configs/benchmarks/exp6_unified_tools_screen.yaml
index 0c43290..b80f8c0 100644
--- a/configs/benchmarks/exp6_unified_tools_screen.yaml
+++ b/configs/benchmarks/exp6_unified_tools_screen.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_deepseek.yaml b/configs/benchmarks/exp7_deepseek.yaml
index 1b82664..6971569 100644
--- a/configs/benchmarks/exp7_deepseek.yaml
+++ b/configs/benchmarks/exp7_deepseek.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:deepseek/deepseek-chat-v3-0324"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_haiku.yaml b/configs/benchmarks/exp7_haiku.yaml
index 72cd6c2..8546c80 100644
--- a/configs/benchmarks/exp7_haiku.yaml
+++ b/configs/benchmarks/exp7_haiku.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "anthropic:claude-3-5-haiku-latest"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_llama.yaml b/configs/benchmarks/exp7_llama.yaml
index 27eda5a..61e156c 100644
--- a/configs/benchmarks/exp7_llama.yaml
+++ b/configs/benchmarks/exp7_llama.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:meta-llama/llama-4-scout"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_mistral.yaml b/configs/benchmarks/exp7_mistral.yaml
index 76f1a40..f570882 100644
--- a/configs/benchmarks/exp7_mistral.yaml
+++ b/configs/benchmarks/exp7_mistral.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:mistralai/mistral-small-2603"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_qwen.yaml b/configs/benchmarks/exp7_qwen.yaml
index 572d7a6..27496cc 100644
--- a/configs/benchmarks/exp7_qwen.yaml
+++ b/configs/benchmarks/exp7_qwen.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:qwen/qwen3-30b-a3b"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7b_model_upgrades.yaml b/configs/benchmarks/exp7b_model_upgrades.yaml
index 4c35c8b..80ab53c 100644
--- a/configs/benchmarks/exp7b_model_upgrades.yaml
+++ b/configs/benchmarks/exp7b_model_upgrades.yaml
@@ -20,22 +20,19 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:deepseek/deepseek-v4-flash"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
   - model: "openrouter:qwen/qwen3.6-flash"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
-  - model: "claude:claude-haiku-4-5-20251001"
-    template: stateful_compact
+  - model: "anthropic:claude-haiku-4-5-20251001"
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/memory_compaction.yaml b/configs/benchmarks/memory_compaction.yaml
index 1bb10a8..c403665 100644
--- a/configs/benchmarks/memory_compaction.yaml
+++ b/configs/benchmarks/memory_compaction.yaml
@@ -18,45 +18,39 @@ quests:
 agents:
   # Gemini 3 Flash - compaction interval 10
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # Gemini 3 Flash - compaction interval 20
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
   # GPT-5.4 Mini - compaction interval 10
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # GPT-5.4 Mini - compaction interval 20
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
   # DeepSeek V3.2 - compaction interval 10
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # DeepSeek V3.2 - compaction interval 20
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/memory_full_transcript.yaml b/configs/benchmarks/memory_full_transcript.yaml
index 04ad152..9fc82a4 100644
--- a/configs/benchmarks/memory_full_transcript.yaml
+++ b/configs/benchmarks/memory_full_transcript.yaml
@@ -18,22 +18,19 @@ quests:
 agents:
   # Gemini 3 Flash - full transcript
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # GPT-5.4 Mini - full transcript
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # DeepSeek V3.2 - full transcript
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
 debug: false
 quest_timeout: 600
 max_workers: 2
diff --git a/configs/benchmarks/memory_modes_pilot.yaml b/configs/benchmarks/memory_modes_pilot.yaml
index 2e4d862..db6aa23 100644
--- a/configs/benchmarks/memory_modes_pilot.yaml
+++ b/configs/benchmarks/memory_modes_pilot.yaml
@@ -5,31 +5,27 @@ quests:
 agents:
 # Short-context reasoning - default memory (3 obs, 5 decisions)
 - model: openrouter:google/gemini-3-flash-preview
-  template: reasoning
+  harness: reasoning_recent
   temperature: 0.4
   runs: 3
-  memory_mode: default
 
 # Short-context reasoning - loop-aware template
 - model: openrouter:google/gemini-3-flash-preview
-  template: loop_aware_reasoning
+  harness: reasoning_recent
   temperature: 0.4
   runs: 3
-  memory_mode: default
 
 # Full-history reasoning
 - model: openrouter:google/gemini-3-flash-preview
-  template: reasoning
+  harness: reasoning_full
   temperature: 0.4
   runs: 3
-  memory_mode: full_transcript
 
 # Compact memory / memo (compact every 10 steps)
 - model: openrouter:google/gemini-3-flash-preview
-  template: reasoning
+  harness: memo_compact
   temperature: 0.4
   runs: 3
-  memory_mode: compaction
   compaction_interval: 10
 
 debug: false
diff --git a/configs/benchmarks/openrouter_smoke_test.yaml b/configs/benchmarks/openrouter_smoke_test.yaml
index 6194df3..2fb50be 100644
--- a/configs/benchmarks/openrouter_smoke_test.yaml
+++ b/configs/benchmarks/openrouter_smoke_test.yaml
@@ -3,23 +3,23 @@ quests:
   - quests/Boat.qm
 agents:
   - model: "openrouter:anthropic/claude-sonnet-4-6"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
   - model: "openrouter:google/gemini-2.5-flash"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
   - model: "openrouter:deepseek/deepseek-chat"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
   - model: "openrouter:qwen/qwen3-235b-a22b"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
 debug: false
diff --git a/configs/default.yaml b/configs/default.yaml
index d7dbe67..3159029 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -5,27 +5,27 @@ quests:
 
 agents:
   - model: random_choice
-    template: reasoning.jinja
+    harness: random_choice
     temperature: 0.0
     skip_single: true
 
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
diff --git a/configs/kr1.yaml b/configs/kr1.yaml
index c7771e6..c31cc3b 100644
--- a/configs/kr1.yaml
+++ b/configs/kr1.yaml
@@ -5,22 +5,22 @@ quests:
 
 agents:
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
diff --git a/configs/kr1_micro.yaml b/configs/kr1_micro.yaml
index c19bd1a..ac3df96 100644
--- a/configs/kr1_micro.yaml
+++ b/configs/kr1_micro.yaml
@@ -8,12 +8,12 @@ quests:
 agents:
   # Just 2 agents to validate the process
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
     
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.6
     skip_single: true
 
diff --git a/configs/kr1_test.yaml b/configs/kr1_test.yaml
index fbe843c..bb8ed98 100644
--- a/configs/kr1_test.yaml
+++ b/configs/kr1_test.yaml
@@ -7,12 +7,12 @@ quests:
 agents:
   # Just 2 agents to validate the process
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
     
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.6
     skip_single: true
 
diff --git a/configs/kr2_en_benchmark.yaml b/configs/kr2_en_benchmark.yaml
deleted file mode 100644
index 76b6c21..0000000
--- a/configs/kr2_en_benchmark.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-# Benchmark configuration for Kr2 English quests
-# Using recommended models with optimized temperature settings
-
-quests:
-  - quests/kr2_en
-
-agents:
-  # OpenAI models
-  - model: gpt-4o
-    template: reasoning.jinja
-    temperature: 0.5
-    skip_single: true
-
-  - model: gpt-4o-mini
-    template: reasoning.jinja
-    temperature: 0.7
-    skip_single: true
-
-  # Anthropic models
-  - model: claude-3-7-sonnet-latest
-    template: reasoning.jinja
-    temperature: 0.5
-    skip_single: true
-
-  - model: claude-3-5-sonnet-latest
-    template: reasoning.jinja
-    temperature: 0.6
-    skip_single: true
-
-# Debug mode enables more detailed logging
-debug: true
-
-# Quest timeout in seconds
-quest_timeout: 120
-
-# Output directory for benchmark results
-output_dir: metrics/kr2_en
-
-# Optional name for this benchmark run
-name: kr2_en_benchmark
\ No newline at end of file
diff --git a/configs/kr2_en_test.yaml b/configs/kr2_en_test.yaml
index 7dbe160..94cfaa3 100644
--- a/configs/kr2_en_test.yaml
+++ b/configs/kr2_en_test.yaml
@@ -5,7 +5,7 @@ quests:
 agents:
   - model: random_choice  # Use random agent for speed and reliability
     temperature: 0.5
-    template: reasoning.jinja
+    harness: random_choice
 quest_timeout: 10  # short timeout for testing
 debug: true
 output_dir: results/benchmarks
diff --git a/configs/test/parallel_agents_test.yaml b/configs/test/parallel_agents_test.yaml
index 37bca75..873d3ed 100644
--- a/configs/test/parallel_agents_test.yaml
+++ b/configs/test/parallel_agents_test.yaml
@@ -5,8 +5,9 @@ quests:
   - quests/kr_1_ru/Diamond.qm
 agents:
   - model: random_choice
+    harness: random_choice
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
 debug: true
 # No max_workers setting - we'll use one worker per agent
 output_dir: results/benchmarks
diff --git a/configs/test/temperature_test.yaml b/configs/test/temperature_test.yaml
index 8f8e0cc..d79b705 100644
--- a/configs/test/temperature_test.yaml
+++ b/configs/test/temperature_test.yaml
@@ -7,32 +7,32 @@ quests:
 
 agents:
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.3
     skip_single: true
 
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.3
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
 
diff --git a/configs/test/test_benchmark.yaml b/configs/test/test_benchmark.yaml
index 3c89dab..b3321d9 100644
--- a/configs/test/test_benchmark.yaml
+++ b/configs/test/test_benchmark.yaml
@@ -3,8 +3,9 @@ quests:
   - quests/Boat.qm
 agents:
   - model: random_choice
+    harness: random_choice
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
 debug: true
 quest_timeout: 60
 max_workers: 2
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index c7f556d..2588ee2 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -1,38 +1,75 @@
 # Architecture
 
 ## Overview
-LLM Quest Benchmark evaluates how different agent architectures complete interactive fiction quests (Space Rangers `.qm` format).
+
+LLM Quest Benchmark evaluates how **agent harnesses** complete interactive
+fiction quests in the Space Rangers `.qm` format. The benchmark holds the quest
+environment and result logging constant while varying the harness around the
+model: prompt template, memory strategy, tools, and action loop.
+
 The runtime loop is:
+
 1. Parse or step quest state via the TypeScript engine bridge.
-2. Build an action prompt from current state and available choices.
-3. Get agent choice (human/random/LLM with varying agent modes).
-4. Apply choice, log step, and detect outcome.
+2. Build harness context from current state, available choices, and memory.
+3. Get a choice from a human, random policy, or LLM-backed harness.
+4. Apply the choice, log the step, and detect the terminal outcome.
 5. Persist run metrics and run summaries.
 
+## Harness Engineering Framing
+
+This project treats the **agent harness** as the primary experimental object.
+An agent harness is the wrapper around a model that controls what the model
+sees, what state is carried forward, what external tools are available, and how
+a raw completion is converted into a quest action. In this codebase, harnesses
+are not incidental plumbing: they are the independent variable.
+
+This follows the practical question raised by "How Much Heavy Lifting Can an
+Agent Harness Do?" (arXiv:2604.07236): how much performance comes from the
+surrounding scaffold rather than the base model alone? Space Rangers text
+quests are useful because they are long enough to stress memory, planning, and
+state tracking, but concrete enough to score with terminal success/failure
+outcomes.
+
+The closest text-game benchmarks, such as TextQuests and TALE-Suite, usually
+vary models under a mostly fixed evaluation scaffold. LLM Quest Benchmark can
+instead hold the model fixed and vary the harness, asking which prompt,
+memory, tool, and planning choices change behavior.
+
 ## Main Runtime Layers
 
 ### 1. Quest Engine Layer
-- `space-rangers-quest/`:
-  TypeScript quest parser/player submodule.
-- `llm_quest_benchmark/executors/ts_bridge/consoleplayer.ts`:
-  Node entrypoint for parse/step execution.
-- `llm_quest_benchmark/executors/ts_bridge/bridge.py`:
-  Python subprocess bridge with startup preflight and actionable errors.
+
+- `space-rangers-quest/`: TypeScript quest parser/player submodule.
+- `llm_quest_benchmark/executors/ts_bridge/consoleplayer.ts`: Node entrypoint
+  for parse/step execution.
+- `llm_quest_benchmark/executors/ts_bridge/bridge.py`: Python subprocess
+  bridge with startup preflight and actionable errors.
 
 ### 2. Environment Layer
-- `llm_quest_benchmark/environments/qm.py`:
-  Wraps bridge into Python environment semantics (`reset`, `step`, terminal detection).
 
-### 3. Agent Layer
-- `llm_quest_benchmark/agents/llm_agent.py`: Base LLM agent with template-driven prompts, retry logic, loop-breaking, and safety filters.
-- `llm_quest_benchmark/agents/planner_agent.py`: Planner loop with observation-diff heuristic for re-planning.
-- `llm_quest_benchmark/agents/tool_agent.py`: Tool-using scaffold with quest history tool.
-- `llm_quest_benchmark/agents/agent_factory.py`: Factory that maps Prompt Template choices to agent classes.
-- `llm_quest_benchmark/agents/human_player.py`, `random_agent.py`: Non-LLM agents.
+- `llm_quest_benchmark/environments/qm.py`: Wraps the bridge into Python
+  environment semantics (`reset`, `step`, terminal detection).
+
+### 3. Harness Layer
 
-`LLMAgent` lazily initializes provider clients, so template rendering and agent construction do not require API keys.
+- `llm_quest_benchmark/harnesses/base.py`: `BaseHarness`, the shared
+  LLM-backed `QuestPlayer` implementation for prompt rendering, response
+  parsing, retries, contextual state, and safety filtering.
+- `llm_quest_benchmark/harnesses/memory.py`: `DefaultMemory`,
+  `FullTranscriptMemory`, and `CompactionMemory`.
+- `llm_quest_benchmark/harnesses/tools.py`: Calculator, scratchpad, and quest
+  history helpers used by tool harnesses.
+- `llm_quest_benchmark/harnesses/factory.py`: `create_harness()` and the
+  canonical harness registry.
+- `llm_quest_benchmark/players/human.py`,
+  `llm_quest_benchmark/players/random.py`: Non-LLM `QuestPlayer`
+  implementations preserved for interactive and random baselines.
+
+Harnesses initialize provider clients lazily, so template rendering and
+benchmark configuration parsing do not require API keys.
 
 ### 4. LLM Provider Layer
+
 - `llm_quest_benchmark/llm/client.py`:
   - provider/model normalization (`provider:model` + aliases)
   - adapters: OpenAI, Anthropic, Google Gemini, DeepSeek
@@ -40,38 +77,60 @@ The runtime loop is:
   - token/cost usage tracking per completion call
 
 ### 5. Execution and Analysis Layer
+
 - `llm_quest_benchmark/core/runner.py`: Core quest run loop.
-- `llm_quest_benchmark/core/analyzer.py`: Post-run analysis and benchmark summaries.
+- `llm_quest_benchmark/core/analyzer.py`: Post-run analysis and benchmark
+  summaries.
 - `llm_quest_benchmark/core/benchmark_report.py`: Markdown report generator.
-- `llm_quest_benchmark/core/logging.py`: Quest logger with per-run metrics (repetition_rate, bad_decision_rate).
-- `llm_quest_benchmark/executors/benchmark.py`: Benchmark orchestration with parallel workers.
-- `llm_quest_benchmark/executors/cli/commands.py`: CLI commands (`run`, `play`, `analyze`, `analyze-run`, `benchmark`, `benchmark-report`, `download-quests`, `cleanup`).
+- `llm_quest_benchmark/core/logging.py`: Quest logger with per-run metrics
+  (`repetition_rate`, `bad_decision_rate`).
+- `llm_quest_benchmark/executors/benchmark.py`: Benchmark orchestration with
+  parallel workers.
+- `llm_quest_benchmark/executors/cli/commands.py`: CLI commands (`run`, `play`,
+  `analyze`, `analyze-run`, `benchmark`, `benchmark-report`,
+  `download-quests`, `cleanup`).
 
 ### 6. Prompt Templates
-- `llm_quest_benchmark/prompt_templates/`: Jinja2 templates for each agent mode.
+
+- `llm_quest_benchmark/prompt_templates/`: Jinja2 templates referenced by
+  harnesses.
   - `stub.jinja`: Minimal prompt.
-  - `reasoning.jinja`, `strategic.jinja`, etc.: Short-context or full-history reasoning depending on memory mode.
-  - `stateful_compact.jinja`, `memo_*.jinja`: Compact memory / memo prompts.
-  - `light_hints.jinja`, `stateful_compact_hints.jinja`: Prompt hints.
-  - `planner.jinja`: Planner loop prompts.
-  - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with compact memory, optionally with hints.
+  - `reasoning.jinja`: Short-context or full-history reasoning depending on
+    harness memory.
+  - `stateful_compact.jinja`: Compact memory / 20-word memo prompt.
+  - `stateful_compact_hints.jinja`: Compact memo prompt with mechanics hints.
+  - `memo_cot.jinja`, `memo_extended.jinja`, `memo_structured.jinja`:
+    retained Exp 4 memo variants.
+  - `planner.jinja`: Planner loop prompt.
+  - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with
+    compact memory, optionally with hints.
 
 ## Persistence
+
 - `metrics.db`: Benchmark/run metrics for CLI workflows.
-- `results/<agent>/<quest>/run_<id>/run_summary.json`: Step trace + per-step decisions + aggregated token/cost usage.
+- `results/<harness>/<quest>/run_<id>/run_summary.json`: Step trace,
+  per-step decisions, and aggregated token/cost usage.
 
 ## Configuration
+
 - `.env` (copied from `.env.template`): Provider API keys.
-- `configs/benchmarks/`: Benchmark YAML configs defining model x template x quest matrix.
+- `configs/benchmarks/`: Benchmark YAML configs defining model × harness ×
+  quest matrices.
 
 ## Public Taxonomy (Benchmark Dimension)
-| Label | Template / memory source | Agent Class | Description |
-|------|----------|-------------|-------------|
-| Minimal prompt | stub | LLMAgent | Smallest action-selection prompt |
-| Short-context reasoning | reasoning/strategic + default memory | LLMAgent | Local prompted analysis |
-| Full-history reasoning | reasoning + full transcript memory | LLMAgent | Whole transcript retained in context |
-| Compact memory / memo | reasoning/stateful/memo templates + compaction | LLMAgent | Summarized state instead of unbounded transcript |
-| Prompt hints | light_hints/stateful_compact_hints | LLMAgent | Mechanics hints injected into prompt |
-| Tools + compact memory | tool_augmented | ToolAgent | Quest history/scratchpad tools with compact context |
-| Tools + hints + compact memory | tool_augmented_hints | ToolAgent | Tool scaffold plus prompt hints |
-| Planner loop | planner | PlannerAgent | Plan-maintain-act loop |
+
+| Public label | Harness name | Template | Memory | Tools | Loop |
+|---|---|---|---|---|---|
+| Minimal prompt | `minimal` | `stub.jinja` | `DefaultMemory` | none | react |
+| Short-context reasoning | `reasoning_recent` | `reasoning.jinja` | `DefaultMemory` | none | react |
+| Full-history reasoning | `reasoning_full` | `reasoning.jinja` | `FullTranscriptMemory` | none | react |
+| Compact memory / memo | `memo_compact` | `stateful_compact.jinja` | `CompactionMemory` | none | react |
+| Prompt hints | `hinted_compact` | `stateful_compact_hints.jinja` | `CompactionMemory` | none | react |
+| Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act |
+| Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act |
+| Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | none | plan-maintain-act |
+
+The harness names above are canonical snake_case identifiers used in YAML
+configs, the CLI, result artifacts, and documentation. Public labels can be
+friendlier, but experiment records should preserve the canonical names so runs
+remain comparable.
diff --git a/docs/EXPERIMENTS_LOG.md b/docs/EXPERIMENTS_LOG.md
index dadef6e..a9ca972 100644
--- a/docs/EXPERIMENTS_LOG.md
+++ b/docs/EXPERIMENTS_LOG.md
@@ -1,5 +1,19 @@
 # Experiments Log
 
+## Harness Name Mapping
+
+| Experiment arm | Old label | New harness name |
+|---|---|---|
+| Minimal prompt arms | `stub` | `minimal` |
+| Short-context reasoning arms | `reasoning` + `default` memory | `reasoning_recent` |
+| Full-history reasoning arms | `reasoning` + `full_transcript` memory | `reasoning_full` |
+| Stateful compact memo arms | `stateful_compact` + compaction | `memo_compact` |
+| Hinted compact memo arms | `stateful_compact_hints` + compaction | `hinted_compact` |
+| Tool-augmented compact arms | `tool_augmented` + compaction | `tool_compact` |
+| Tool-augmented hinted arms | `tool_augmented_hints` + compaction | `tool_hinted` |
+| Planner arms | `planner` | `planner` |
+| Memo variation arms | `memo_extended`, `memo_structured`, `memo_cot` | retired experiment variants, not canonical harnesses |
+
 > Historical / non-authoritative notes. This log preserves experiment history
 > and branch-era shorthand. For the current public taxonomy and public
 > comparison slice, use `site/about.html`, `site/leaderboard.json`,
@@ -7,6 +21,145 @@
 
 Record of benchmark experiments, findings, and decisions. Keeps history out of source code.
 
+## Current Coverage Audit (2026-05-11)
+
+Sources reviewed for this audit:
+
+- `docs/EXPERIMENTS_LOG.md`
+- `docs/ARCHITECTURE.md`
+- `configs/benchmarks/*.yaml`
+- `site/leaderboard.json`
+
+This audit uses the post-refactor harness taxonomy: `minimal`,
+`reasoning_recent`, `reasoning_full`, `memo_compact`, `hinted_compact`,
+`tool_compact`, `tool_hinted`, and `planner`.
+
+### Experiment Inventory
+
+| Experiment | Config / source | Harness mapping | Quest scope | Completed runs recorded in log | Audit disposition |
+|---|---|---|---|---:|---|
+| Exp 2: Memory Modes | `memory_full_transcript.yaml`, `memory_compaction.yaml` | `reasoning_full`, `memo_compact` | 14 historical quests including `Prison` | 126 | Unreliable for canonical comparison: loop-breaker bug era. |
+| Exp 3 Arm 1: No Loop Breaker | `exp3_no_loop_breaker.yaml` | `reasoning_full` | 18 quests, excluding `Boat`/`Prison` | 36 | Use only rerun after timeout fix; pre-fix attempt is noisy/incomplete. |
+| Exp 3 Arm 2: Stateful Compact | `exp3_stateful_compact.yaml` | `memo_compact` | 18 quests, excluding `Boat`/`Prison` | 36 | Canonical memo baseline, but only 2 runs/quest. |
+| Exp 4: Compaction No Memo | `exp4_compaction_no_memo.yaml` | retired ablation, not canonical | 18 quests | 36 | Do not aggregate into `memo_compact`. |
+| Exp 4: Memo Extended | `exp4_memo_extended.yaml` | retired `memo_extended` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 4: Memo Structured | `exp4_memo_structured.yaml` | retired `memo_structured` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 4: Memo CoT | `exp4_memo_cot.yaml` | retired `memo_cot` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 5: Baseline Variance | `exp5_stateful_compact_variance.yaml` | `memo_compact` | 18 quests | 90 | Canonical memo baseline variance study. |
+| Exp 6: Prompt Hints | `exp6_prompt_hints.yaml` | `hinted_compact` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 6: Tools | `exp6_tools.yaml` | `tool_compact` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 6: Tools + Hints | `exp6_tools_hints.yaml` | `tool_hinted` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 7: Multi-Model Comparison | `exp7_*.yaml` | `memo_compact` | 5 winnable quests | 75 | Canonical model sweep for memo harness. |
+| Exp 7b: Model Upgrades | `exp7b_model_upgrades.yaml` | `memo_compact` | 18 quests | 108 | Noisy model-upgrade sweep; high timeout rates for Qwen 3.6 and Haiku 4.5. |
+
+### Harness Coverage Matrix
+
+The table below is computed from `site/leaderboard.json` and counts recorded
+leaderboard runs by harness and quest. `Boat` and `Prison` are retained because
+they still appear in the published leaderboard data, but they are retired from
+the canonical experiment set.
+
+| Harness | Badday | Banket | Boat | Codebox | Depth | Driver | Edelweiss | Election | Foncers | Leonardo | Ministry | Pizza | Prison | Robots | Ski | Total |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| `minimal` | 22 | 22 | 23 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 331 |
+| `reasoning_recent` | 22 | 22 | 28 | 22 | 22 | 24 | 25 | 30 | 25 | 25 | 26 | 22 | 28 | 31 | 31 | 383 |
+| `reasoning_full` | 17 | 17 | 9 | 17 | 17 | 15 | 17 | 17 | 17 | 17 | 16 | 17 | 6 | 14 | 14 | 227 |
+| `memo_compact` | 37 | 39 | 18 | 39 | 39 | 39 | 39 | 37 | 39 | 37 | 39 | 37 | 15 | 39 | 34 | 527 |
+| `hinted_compact` | 4 | 4 | 1 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 54 |
+| `tool_compact` | 3 | 3 | 0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 39 |
+| `tool_hinted` | 3 | 3 | 0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 39 |
+| `planner` | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 15 |
+
+Leaderboard scope note: the current public JSON includes 15 quest columns and
+omits several quests from the 18-quest experiment configs, such as `Pilot`,
+`Disk`, `Player`, `Shashki`, and `Sortirovka1`. A future leaderboard refresh
+should either add them or explicitly document why the public slice excludes
+them.
+
+### Gap Analysis
+
+All zero-run cells in the published leaderboard matrix are retired quest cells:
+
+- `tool_compact` x `Boat`: 0 runs.
+- `tool_compact` x `Prison`: 0 runs.
+- `tool_hinted` x `Boat`: 0 runs.
+- `tool_hinted` x `Prison`: 0 runs.
+
+Because `Boat` and `Prison` are retired, these do not require new canonical
+runs. They do indicate that the public leaderboard mixes active and retired
+quest scopes.
+
+Non-zero cells with fewer than 3 runs (zero-run cells are listed above):
+
+- `hinted_compact` x `Boat`: 1 run; retired quest.
+- `hinted_compact` x `Prison`: 1 run; retired quest.
+- `planner`: 1 run on every published quest.
+
+Canonical action item: the planner harness has insufficient variance coverage.
+For active quests, it needs at least two additional runs per quest to reach the
+minimum 3-run threshold.
+
+The following harnesses have been run with only one model, even where a
+cell's run count reaches the 3-run minimum: `tool_compact`, `tool_hinted`,
+and `planner`. Their comparison is promising, but not yet robust across
+models.
+
+### Noise And Anomalies
+
+Loop-breaker bug era:
+
+- Exp 2 memory-mode runs are unreliable. The experiment log documents a
+  number-normalization bug in `_normalize_for_signature` and aggressive loop
+  breaker overrides that changed correct model decisions.
+- Exp 3 Arm 1 has a pre-fix/incomplete attempt affected by SDK timeout issues.
+  Only the rerun after the timeout fix should be considered.
+- Any leaderboard entry whose provenance traces to Exp 2 or the Exp 3 pre-fix
+  attempt should be marked non-canonical until regenerated or excluded.
+
+High-timeout model-upgrade runs:
+
+- Exp 7b `Qwen 3.6 Flash`: 17/36 timeouts (47%).
+- Exp 7b `Claude Haiku 4.5`: 19/36 timeouts (53%).
+- Exp 7b `DeepSeek V4 Flash`: 5/36 timeouts (14%), below the 30% high-timeout
+  threshold, but still noisy because success was 0/36.
+
+Retired quests:
+
+- `Boat`: trivial / smoke-test-like quest; removed from canonical experiment
+  configs.
+- `Prison`: loops endlessly; removed from canonical experiment configs.
+
+Retired harness variants:
+
+- `memo_extended`
+- `memo_structured`
+- `memo_cot`
+- `compaction_no_memo` ablation
+
+These variants should not be merged into canonical `memo_compact` results.
+
+### Budget Estimate
+
+Top-priority new runs to close actionable gaps while avoiding retired quests:
+
+| Priority | Harness | Quest(s) | New runs needed | Reason |
+|---:|---|---|---:|---|
+| 1 | `planner` | 13 active published quests (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`) | 26 | Bring 1-run planner cells up to the 3-run minimum on active leaderboard quests. |
+| 2 | `planner` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest so planner effects are not single-model artifacts. |
+| 3 | `tool_compact` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. |
+| 4 | `tool_hinted` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. |
+| 5 | Public leaderboard refresh | `Pilot`, `Disk`, `Player`, `Shashki`, `Sortirovka1` | Scope-dependent | These quests are present in canonical 18-quest configs/logs but absent from the current public leaderboard matrix. Backfill or explicitly exclude them. |
+
+Do not spend new budget on `Boat` or `Prison` unless the goal is only to
+reproduce historical/public rows; both are retired from canonical analysis.
+
+### Leaderboard Integrity
+
+Recommended integrity rule: canonical leaderboard aggregates should require
+non-retired quests, canonical harness names, no loop-breaker bug provenance, at
+least 3 runs per harness x quest cell, and at least two models for claims about
+harness effects rather than model effects.
+
 ## Exp 2: Memory Modes (2026-04-27)
 
 **Config**: `configs/benchmarks/memory_full_transcript.yaml`, `configs/benchmarks/memory_compaction.yaml`
@@ -37,7 +190,7 @@ The `_apply_loop_breaker` mechanism was overriding correct LLM decisions. Eviden
 
 ### Decision
 
-- **Disabled loop breaker** entirely in all agent types (llm_agent, planner_agent, tool_agent)
+- **Disabled loop breaker** entirely in all harness types
 - **Removed number normalization** from state signature computation
 - Kept `_state_action_counts` and `_state_signature` (used by safety filter and loop escape)
 - Removed `_apply_loop_breaker` method and `_loop_repetition_threshold` field as dead code
diff --git a/docs/SPEC.md b/docs/SPEC.md
index 99289fb..44ef498 100644
--- a/docs/SPEC.md
+++ b/docs/SPEC.md
@@ -7,8 +7,11 @@ For the public narrative and interpretation of results, use the project
 ## Purpose
 
 LLM Quest Benchmark evaluates how LLMs make sequential choices in Space
-Rangers text quests. The benchmark varies the context scaffold around a model
-while holding the quest environment and result logging consistent.
+Rangers text quests. The benchmark varies the agent harness around a model
+while holding the quest environment and result logging consistent. A harness is
+the wrapper that decides what context the model sees and how its response is
+converted into an action: prompt template, memory strategy, tools, and loop
+shape.
 
 The core question is practical: which kinds of context help, hurt, or expose
 state-tracking failures during 10-50 turn interactive fiction tasks?
@@ -35,29 +38,47 @@ analysis, but the public slice is the authoritative comparison surface.
 
 ## Current Taxonomy
 
-Use these labels for current public descriptions of benchmark modes:
+Use these labels for current public descriptions of benchmark harnesses:
 
-| Label | Implementation source | Agent class |
-|---|---|---|
-| Minimal prompt | `stub.jinja` | `LLMAgent` |
-| Short-context reasoning | `reasoning.jinja`, `strategic.jinja` with default/recent context | `LLMAgent` |
-| Full-history reasoning | reasoning templates with `full_transcript` memory | `LLMAgent` |
-| Compact memory / memo | `stateful_compact.jinja`, memo templates, compaction memory | `LLMAgent` |
-| Prompt hints | `light_hints.jinja`, `stateful_compact_hints.jinja` | `LLMAgent` |
-| Tools + compact memory | `tool_augmented.jinja` | `ToolAgent` |
-| Tools + hints + compact memory | `tool_augmented_hints.jinja` | `ToolAgent` |
-| Planner loop | `planner.jinja` | `PlannerAgent` |
+| Label | Harness name | Template | Memory | Tools / loop |
+|---|---|---|---|---|
+| Minimal prompt | `minimal` | `stub.jinja` | `DefaultMemory` | no tools, react loop |
+| Short-context reasoning | `reasoning_recent` | `reasoning.jinja` | `DefaultMemory` | no tools, react loop |
+| Full-history reasoning | `reasoning_full` | `reasoning.jinja` | `FullTranscriptMemory` | no tools, react loop |
+| Compact memory / memo | `memo_compact` | `stateful_compact.jinja` | `CompactionMemory` | no tools, react loop |
+| Prompt hints | `hinted_compact` | `stateful_compact_hints.jinja` | `CompactionMemory` | no tools, react loop |
+| Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history; tool-select-then-act loop |
+| Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history; tool-select-then-act loop |
+| Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | no tools, plan-maintain-act loop |
 
 Older internal experiment labels are historical and should not be presented as
 the current public taxonomy.
 
+## Current Interpretation
+
+The strongest pattern so far is that bigger scaffolds are not automatically
+better. A concise 20-word memo produced a useful sweet spot: it improved over
+no-memo and full-transcript baselines, while longer or more structured memo
+variants regressed. The likely mechanism is selective pressure: the short memo
+forces the harness to preserve only state that matters for future decisions.
+
+Tools and hints showed a synergy effect. Prompt hints alone hurt, and tools
+alone were modest, but tools plus hints improved outcomes because the hints
+pointed the model toward quantities and quest mechanics while the calculator,
+scratchpad, and history search gave it ways to act on those signals.
+
+Verbosity is a recurring failure mode. Some newer or larger models timed out
+more often because they spent too much of the quest budget generating long step
+responses. For sequential decision tasks, a harness that elicits concise,
+actionable state updates can outperform one that invites broad reasoning.
+
 ## Implemented Runtime
 
 - Quest execution uses the TypeScript `space-rangers-quest` submodule through
   the Python bridge in `llm_quest_benchmark/executors/ts_bridge/`.
 - Environment state is exposed through `llm_quest_benchmark/environments/qm.py`.
-- Agents live under `llm_quest_benchmark/agents/` and are selected by template
-  aliases and agent factory wiring.
+- Agent harnesses live under `llm_quest_benchmark/harnesses/` and are selected
+  by canonical snake_case harness names.
 - Provider calls are normalized in `llm_quest_benchmark/llm/client.py` with
   OpenAI-compatible, Anthropic, Google, and DeepSeek adapters.
 - Benchmark execution is CLI + YAML driven through `uv run llm-quest ...`.
@@ -107,7 +128,7 @@ Provider API keys are required for real LLM runs. Tests and static validation
 should run without external credentials in a prepared checkout.
 
 Reproducible benchmark rows depend on recording the quest file, model/provider
-ID, prompt templates, memory mode, run ID, outcome, and run summaries with
-usage/metrics. Agent responses are parsed into a chosen action plus optional
+ID, harness name, run ID, outcome, and run summaries with usage/metrics.
+Harness responses are parsed into a chosen action plus optional
 analysis/reasoning so action validity, terminal outcome, steps, tokens/cost,
 and repetition diagnostics can be regenerated from stored artifacts.
diff --git a/llm_quest_benchmark/agents/__init__.py b/llm_quest_benchmark/agents/__init__.py
deleted file mode 100644
index 852fb91..0000000
--- a/llm_quest_benchmark/agents/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from .agent_factory import create_agent
-from .base import QuestPlayer
-from .llm_agent import LLMAgent
-from .planner_agent import PlannerAgent
-from .random_agent import RandomAgent
-from .tool_agent import ToolAgent
-
-__all__ = [
-    "create_agent",
-    "QuestPlayer",
-    "RandomAgent",
-    "LLMAgent",
-    "PlannerAgent",
-    "ToolAgent",
-]
diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py
deleted file mode 100644
index d7b889b..0000000
--- a/llm_quest_benchmark/agents/agent_factory.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""Factory for creating quest agents"""
-
-import logging
-
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.agents.human_player import HumanPlayer
-from llm_quest_benchmark.agents.llm_agent import LLMAgent
-from llm_quest_benchmark.agents.planner_agent import PlannerAgent
-from llm_quest_benchmark.agents.random_agent import RandomAgent
-from llm_quest_benchmark.agents.tool_agent import ToolAgent
-from llm_quest_benchmark.constants import (
-    DEFAULT_MODEL,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TEMPLATE,
-    SYSTEM_ROLE_TEMPLATE,
-    normalize_template_name,
-)
-
-logger = logging.getLogger(__name__)
-
-
-def create_agent(
-    model: str = DEFAULT_MODEL,
-    system_template: str = SYSTEM_ROLE_TEMPLATE,
-    action_template: str = DEFAULT_TEMPLATE,
-    temperature: float = DEFAULT_TEMPERATURE,
-    skip_single: bool = False,
-    debug: bool = False,
-    memory_mode: str = "default",
-    compaction_interval: int = 10,
-) -> QuestPlayer:
-    """Create a quest agent based on model name and parameters.
-
-    Args:
-        model (str): Model identifier. Can be:
-            - LLM model name (e.g. 'gpt-5-mini', 'claude-sonnet-4-5')
-            - 'random_choice' for random testing agent (can include seed e.g. 'random_choice_123')
-            - 'human' for interactive human player
-        debug (bool): Enable debug logging
-        system_template (str): System template for LLM agents
-        action_template (str): Action template for LLM agents
-        temperature (float): Temperature for LLM sampling
-        skip_single (bool): Auto-select single choices
-
-    Returns:
-        QuestPlayer: Appropriate agent instance
-
-    Raises:
-        ValueError: If model type is not recognized
-    """
-    logger.debug(f"Creating agent for model: {model}")
-    resolved_action_template = normalize_template_name(action_template)
-
-    # Human player
-    if model == "human":
-        return HumanPlayer(skip_single=skip_single)
-
-    # Random choice agent
-    if model.startswith("random_choice"):
-        seed = None
-        if "_" in model:
-            try:
-                seed = int(model.split("_")[-1])
-            except ValueError:
-                pass
-        return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
-
-    if resolved_action_template == "planner.jinja":
-        return PlannerAgent(
-            debug=debug,
-            model_name=model,
-            system_template=system_template,
-            action_template=resolved_action_template,
-            temperature=temperature,
-            skip_single=skip_single,
-            memory_mode=memory_mode,
-            compaction_interval=compaction_interval,
-        )
-
-    if resolved_action_template in ("tool_augmented.jinja", "tool_augmented_hints.jinja"):
-        return ToolAgent(
-            debug=debug,
-            model_name=model,
-            system_template=system_template,
-            action_template=resolved_action_template,
-            temperature=temperature,
-            skip_single=skip_single,
-            memory_mode=memory_mode,
-            compaction_interval=compaction_interval,
-        )
-
-    # Default to LLM agent
-    return LLMAgent(
-        debug=debug,
-        model_name=model,
-        system_template=system_template,
-        action_template=resolved_action_template,
-        temperature=temperature,
-        skip_single=skip_single,
-        memory_mode=memory_mode,
-        compaction_interval=compaction_interval,
-    )
diff --git a/llm_quest_benchmark/agents/llm_agent.py b/llm_quest_benchmark/agents/llm_agent.py
deleted file mode 100644
index 64ff0cc..0000000
--- a/llm_quest_benchmark/agents/llm_agent.py
+++ /dev/null
@@ -1,968 +0,0 @@
-"""LLM agent for Space Rangers quests"""
-
-import hashlib
-import json
-import logging
-import re
-from typing import Any
-
-from json_repair import repair_json
-
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.constants import (
-    DEFAULT_MODEL,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TEMPLATE,
-    MODEL_CHOICES,
-    SYSTEM_ROLE_TEMPLATE,
-    normalize_template_name,
-)
-from llm_quest_benchmark.llm.client import (
-    get_llm_client,
-    is_supported_model_name,
-    parse_model_name,
-)
-from llm_quest_benchmark.llm.prompt import PromptRenderer
-from llm_quest_benchmark.schemas.response import LLMResponse
-
-RISKY_CHOICE_KEYWORDS = (
-    "улететь",
-    "сдаться",
-    "отказ",
-    "провал",
-    "убежать",
-    "surrender",
-    "give up",
-)
-
-SAFE_CHOICE_KEYWORDS = (
-    "пройти мимо",
-    "избежать",
-    "подготов",
-    "библиотек",
-    "изуч",
-    "wait",
-    "avoid",
-    "study",
-)
-
-
-def _parse_json_response(
-    response: str,
-    debug: bool = False,
-    logger: logging.Logger | None = None,
-) -> tuple[dict[str, Any] | None, str | None]:
-    """Try to parse response as JSON, with repair attempt if needed."""
-    cleaned_response = (response or "").strip()
-    if not cleaned_response:
-        return None, None
-
-    try:
-        # Extract JSON from response if there are backticks
-        if "```json" in cleaned_response:
-            # Find the start and end of the JSON block
-            start = cleaned_response.find("```json") + 7
-            end = cleaned_response.find("```", start)
-            if end > start:
-                json_str = cleaned_response[start:end].strip()
-                if debug and logger:
-                    logger.debug(f"Extracted JSON: {json_str}")
-                result = json.loads(json_str)
-                if debug and logger:
-                    logger.debug(f"Parsed JSON: {result}")
-                return result, "json_fenced"
-
-        # Extract a probable JSON object from free-form text.
-        embedded_json = re.search(r"\{[\s\S]*\}", cleaned_response)
-        if embedded_json:
-            candidate = embedded_json.group(0).strip()
-            if candidate and candidate != cleaned_response:
-                try:
-                    result = json.loads(candidate)
-                    if debug and logger:
-                        logger.debug(f"Parsed embedded JSON: {result}")
-                    return result, "json_embedded"
-                except json.JSONDecodeError:
-                    pass
-
-        # Try to parse directly
-        result = json.loads(cleaned_response)
-        if debug and logger:
-            logger.debug(f"Direct JSON parse successful: {result}")
-        return result, "json_direct"
-    except json.JSONDecodeError:
-        if debug and logger:
-            logger.debug("Initial JSON parse failed, attempting repair")
-        try:
-            repaired = repair_json(cleaned_response)
-            if debug and logger:
-                logger.debug(f"Repaired JSON: {repaired}")
-            result = json.loads(repaired)
-            if debug and logger:
-                logger.debug(f"Parse of repaired JSON successful: {result}")
-            return result, "json_repaired"
-        except Exception as e:
-            if debug and logger:
-                logger.error(f"JSON repair failed: {e}")
-            return None, None
-
-
-def _validate_action_number(
-    action: int, num_choices: int, debug: bool = False, logger: logging.Logger | None = None
-) -> bool:
-    """Validate that action number is within valid range"""
-    if 1 <= action <= num_choices:
-        return True
-    if debug and logger:
-        logger.error(f"Action number {action} out of range [1, {num_choices}]")
-    return False
-
-
-def _extract_action_from_text(response: str, num_choices: int) -> int | None:
-    """Extract a candidate action from free-form text."""
-    for match in re.finditer(r"\b(\d+)\b", response):
-        action = int(match.group(1))
-        if 1 <= action <= num_choices:
-            return action
-    return None
-
-
-def _extract_field_from_text(response: str, field: str) -> str | None:
-    """Best-effort extraction of analysis/reasoning from loosely formatted output."""
-    if not response:
-        return None
-
-    # JSON-like field forms: "analysis": "...", 'analysis': '...'
-    json_pattern = re.compile(
-        rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>.*?)['"]""",
-        re.IGNORECASE | re.DOTALL,
-    )
-    match = json_pattern.search(response)
-    if match:
-        value = " ".join(match.group("value").strip().split())
-        if value:
-            return value
-
-    # Partial JSON field forms without a closing quote in truncated outputs.
-    partial_json_pattern = re.compile(
-        rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>[^"\n\r]+)""",
-        re.IGNORECASE,
-    )
-    match = partial_json_pattern.search(response)
-    if match:
-        value = " ".join(match.group("value").strip().split())
-        if value:
-            return value
-
-    # Label forms: Analysis: ..., Reasoning - ...
-    label_pattern = re.compile(
-        rf"""(?im)^\s*{re.escape(field)}\s*[:\-]\s*(?P<value>.+?)\s*$""",
-    )
-    match = label_pattern.search(response)
-    if match:
-        value = " ".join(match.group("value").strip().split())
-        if value:
-            return value
-
-    return None
-
-
-def _raw_reasoning_fallback(response: str) -> str | None:
-    compact = " ".join((response or "").strip().split())
-    if not compact:
-        return None
-    if len(compact) > 240:
-        compact = compact[:237] + "..."
-    return f"raw_response: {compact}"
-
-
-def _is_numeric_raw_reasoning(reasoning: str | None) -> bool:
-    if not reasoning:
-        return False
-    if not reasoning.startswith("raw_response:"):
-        return False
-    payload = reasoning.split(":", 1)[1].strip()
-    return payload.isdigit()
-
-
-def parse_llm_response(
-    response: str, num_choices: int, debug: bool = False, logger: logging.Logger | None = None
-) -> LLMResponse:
-    """Parse LLM response and return structured response object."""
-    if debug and logger:
-        logger.debug(f"Raw LLM response: {response}")
-
-    extracted_analysis = _extract_field_from_text(response, "analysis")
-    extracted_reasoning = _extract_field_from_text(response, "reasoning")
-    raw_reasoning = _raw_reasoning_fallback(response)
-
-    # Try parsing as JSON first
-    response_json, json_parse_mode = _parse_json_response(response, debug, logger)
-    if response_json and isinstance(response_json, dict):
-        analysis = response_json.get("analysis") or extracted_analysis
-        reasoning = response_json.get("reasoning") or response_json.get("thinking") or extracted_reasoning
-        if not reasoning and analysis:
-            reasoning = analysis
-        if not analysis and not reasoning:
-            reasoning = raw_reasoning
-
-        memo_raw = response_json.get("memo")
-        memo = str(memo_raw) if memo_raw is not None else None
-
-        # Check for either 'action' or 'result' field
-        action_value = response_json.get("action") or response_json.get("result") or response_json.get("choice")
-        if action_value is not None:
-            try:
-                action = int(action_value)
-                if _validate_action_number(action, num_choices, debug, logger):
-                    return LLMResponse(
-                        action=action,
-                        reasoning=reasoning,
-                        analysis=analysis,
-                        memo=memo,
-                        is_default=False,
-                        parse_mode=json_parse_mode or "json",
-                    )
-            except (ValueError, TypeError):
-                if debug and logger:
-                    logger.error(f"Invalid action value in JSON: {action_value}")
-
-    # Try parsing as plain number
-    try:
-        action = int(response.strip())
-        if _validate_action_number(action, num_choices, debug, logger):
-            return LLMResponse(
-                action=action,
-                reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
-                analysis=extracted_analysis,
-                is_default=False,
-                parse_mode="number_only",
-            )
-    except ValueError:
-        if debug and logger:
-            logger.error(f"Could not parse response as number: {response}")
-
-    # Fallback: extract first valid integer from text.
-    extracted_action = _extract_action_from_text(response, num_choices)
-    if extracted_action is not None:
-        return LLMResponse(
-            action=extracted_action,
-            reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
-            analysis=extracted_analysis,
-            is_default=False,
-            parse_mode="number_extracted",
-        )
-
-    # Default to first choice if all parsing attempts fail
-    if debug and logger:
-        logger.error(f"Error during response parsing, defaulting to first choice. Response: {response[:100]}...")
-    return LLMResponse(
-        action=1,
-        reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
-        analysis=extracted_analysis,
-        is_default=True,
-        parse_mode="default_first",
-    )
-
-
-class LLMAgent(QuestPlayer):
-    """LLM-powered agent for Space Rangers quests"""
-
-    SUPPORTED_MODELS = MODEL_CHOICES
-
-    def __init__(
-        self,
-        model_name: str = DEFAULT_MODEL,
-        system_template: str = SYSTEM_ROLE_TEMPLATE,
-        action_template: str = DEFAULT_TEMPLATE,
-        temperature: float = DEFAULT_TEMPERATURE,
-        skip_single: bool = False,
-        debug: bool = False,
-        memory_mode: str = "default",
-        compaction_interval: int = 10,
-    ):
-        super().__init__(skip_single=skip_single)
-        self.debug = debug
-        self.model_name = model_name.lower()
-        self.system_template = normalize_template_name(system_template)
-        self.action_template = normalize_template_name(action_template)
-        self.temperature = temperature
-        # Set agent_id for database records
-        self.agent_id = f"llm_{self.model_name}"
-
-        if not is_supported_model_name(self.model_name):
-            raise ValueError(f"Unsupported model: {model_name}. Supported models are: {self.SUPPORTED_MODELS}")
-
-        self.model_spec = parse_model_name(self.model_name)
-        self.logger = logging.getLogger(self.__class__.__name__)
-        if self.debug:
-            self.logger.setLevel(logging.DEBUG)
-            self.logger.propagate = False
-            if not any(getattr(h, "_llm_quest_handler", False) for h in self.logger.handlers):
-                handler = logging.StreamHandler()
-                handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
-                handler._llm_quest_handler = True
-                self.logger.addHandler(handler)
-
-        # Initialize prompt renderer
-        self.prompt_renderer = PromptRenderer(
-            None, system_template=self.system_template, action_template=self.action_template
-        )
-
-        # Delay API client creation so template-only flows and tests do not require API keys.
-        self.llm = None
-        self.history: list[LLMResponse] = []
-        self._observation_history: list[str] = []
-        self._decision_history: list[dict[str, Any]] = []
-        self._state_action_counts: dict[str, dict[int, int]] = {}
-        self._context_window = 3
-        self._context_chars = 220
-        self._decision_window = 5
-        self._max_state_signatures = 200
-        self._use_safety_filter = True
-        self._last_response = LLMResponse(action=1, is_default=True)
-
-        # Quest briefing: pinned first observation (mission goal)
-        self._quest_briefing: str | None = None
-
-        # Memory mode: "default", "full_transcript", "compaction"
-        if memory_mode not in ("default", "full_transcript", "compaction"):
-            raise ValueError(f"Invalid memory_mode: {memory_mode}")
-        self._memory_mode = memory_mode
-        self._transcript: list[dict[str, Any]] = []
-        self._compaction_interval = compaction_interval
-        self._compaction_summary: str | None = None
-        self._steps_since_compaction = 0
-        self._step_count = 0
-
-    def _ensure_llm(self):
-        """Lazily create the provider client only when inference is needed."""
-        if self.llm is None:
-            self.llm = get_llm_client(
-                self.model_name,
-                system_prompt=self.prompt_renderer.render_system_prompt(),
-                temperature=self.temperature,
-            )
-
-    def get_last_response(self) -> LLMResponse | None:
-        """Get the last LLM response from history"""
-        return self._last_response
-
-    def get_action(self, observation: str, choices: list[dict[str, str]]) -> int:
-        """Track observation history for context, then delegate base action flow."""
-        self._remember_observation(observation)
-        return super().get_action(observation, choices)
-
-    def _remember_observation(self, observation: str) -> None:
-        clean = (observation or "").strip()
-        if not clean:
-            return
-        if self._quest_briefing is None:
-            self._quest_briefing = clean
-        self._observation_history.append(clean)
-        if len(self._observation_history) > 20:
-            self._observation_history = self._observation_history[-20:]
-
-    def _build_contextual_state(self, state: str) -> str:
-        """Build context-augmented state based on memory mode."""
-        if self._memory_mode == "full_transcript":
-            return self._build_full_transcript_state(state)
-        if self._memory_mode == "compaction":
-            return self._build_compaction_state(state)
-        return self._build_default_state(state)
-
-    def _briefing_block(self, state: str) -> str | None:
-        """Return quest briefing block if available and not redundant with current state."""
-        if not self._quest_briefing:
-            return None
-        if state.strip() == self._quest_briefing:
-            return None
-        briefing = self._quest_briefing
-        if len(briefing) > 800:
-            briefing = briefing[:800] + "..."
-        return f"Quest briefing (your mission):\n{briefing}"
-
-    def _build_default_state(self, state: str) -> str:
-        """Original sliding-window context, now with pinned briefing."""
-        blocks: list[str] = []
-
-        briefing = self._briefing_block(state)
-        if briefing:
-            blocks.append(briefing)
-
-        if len(self._observation_history) > 1:
-            previous = self._observation_history[:-1][-self._context_window :]
-            if previous:
-                snippets = []
-                for idx, text in enumerate(previous, start=1):
-                    clipped = text if len(text) <= self._context_chars else text[: self._context_chars] + "..."
-                    snippets.append(f"[Previous {idx}] {clipped}")
-                blocks.append("Recent context from previous steps:\n" + "\n\n".join(snippets))
-
-        if self._decision_history:
-            recent_memos = []
-            for item in self._decision_history[-self._decision_window :]:
-                m = (item.get("memo") or "").strip()
-                if not m:
-                    continue
-                if recent_memos and recent_memos[-1] == m:
-                    continue
-                recent_memos.append(m)
-            if recent_memos:
-                lines = [f"[Memo {idx}] {m}" for idx, m in enumerate(recent_memos, start=1)]
-                blocks.append("State memo (recent):\n" + "\n".join(lines))
-
-            recent_decisions = self._decision_history[-self._decision_window :]
-            decision_lines = []
-            for idx, item in enumerate(recent_decisions, start=1):
-                choice = item.get("choice", "")
-                parse_mode = item.get("parse_mode", "unknown")
-                memo_val = item.get("memo")
-                memo_suffix = f" | memo: {memo_val}" if memo_val else ""
-                decision_lines.append(
-                    f"[Decision {idx}] action {item.get('action')}: {choice} (parse={parse_mode}){memo_suffix}"
-                )
-            blocks.append("Recent selected actions:\n" + "\n".join(decision_lines))
-
-        if not blocks:
-            return state
-
-        sep = "\n\n"
-        return f"{sep.join(blocks)}\n\nCurrent story state:\n{state}"
-
-    def _build_full_transcript_state(self, state: str) -> str:
-        """Full decision transcript with pinned briefing."""
-        blocks: list[str] = []
-
-        briefing = self._briefing_block(state)
-        if briefing:
-            blocks.append(briefing)
-
-        if self._transcript:
-            lines = []
-            entries = self._transcript
-            # Budget: keep first 3 + last N that fit under ~40 entries total
-            if len(entries) > 40:
-                entries = entries[:3] + [{"_gap": len(entries) - 40}] + entries[-(40 - 3) :]
-            for entry in entries:
-                if "_gap" in entry:
-                    lines.append(f"  ... ({entry['_gap']} steps omitted) ...")
-                    continue
-                step = entry.get("step", "?")
-                obs = entry.get("observation", "")
-                if len(obs) > 400:
-                    obs = obs[:400] + "..."
-                chosen = entry.get("choice_text", "")
-                reasoning = entry.get("reasoning", "")
-                line = f"Step {step}: {obs}"
-                if chosen:
-                    line += f"\n  You chose: {chosen}"
-                if reasoning:
-                    line += f"\n  Reasoning: {reasoning[:800]}"
-                state_notes = entry.get("memo", "")
-                if state_notes:
-                    line += f"\n  State: {state_notes[:350]}"
-                lines.append(line)
-            blocks.append("=== QUEST TRANSCRIPT ===\n" + "\n\n".join(lines))
-
-        blocks.append(f"Step {self._step_count} (CURRENT):\n{state}")
-        return "\n\n".join(blocks)
-
-    def _build_compaction_state(self, state: str) -> str:
-        """Compacted memory summary + recent steps since last compaction."""
-        blocks: list[str] = []
-
-        briefing = self._briefing_block(state)
-        if briefing:
-            blocks.append(briefing)
-
-        if self._compaction_summary:
-            blocks.append(
-                f"=== QUEST MEMORY (compacted at step {self._step_count - self._steps_since_compaction}) ===\n{self._compaction_summary}"
-            )
-
-        if self._transcript:
-            recent = self._transcript[-self._steps_since_compaction :] if self._steps_since_compaction > 0 else []
-            if recent:
-                lines = []
-                for entry in recent:
-                    step = entry.get("step", "?")
-                    obs = entry.get("observation", "")
-                    if len(obs) > 400:
-                        obs = obs[:400] + "..."
-                    chosen = entry.get("choice_text", "")
-                    line = f"Step {step}: {obs}"
-                    if chosen:
-                        line += f"\n  You chose: {chosen}"
-                    state_notes = entry.get("memo", "")
-                    if state_notes:
-                        line += f"\n  State: {state_notes[:350]}"
-                    lines.append(line)
-                blocks.append("=== RECENT STEPS ===\n" + "\n\n".join(lines))
-
-        blocks.append(f"Step {self._step_count} (CURRENT):\n{state}")
-        return "\n\n".join(blocks)
-
-    def _maybe_compact(self) -> None:
-        """Run compaction if interval reached. Called after recording a decision."""
-        if self._memory_mode != "compaction":
-            return
-        if self._steps_since_compaction < self._compaction_interval:
-            return
-
-        transcript_text = self._format_transcript_for_compaction()
-        if not transcript_text:
-            return
-
-        prompt_parts = []
-        prompt_parts.append("You are summarizing an agent's progress through a text quest.")
-        if self._quest_briefing:
-            prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}")
-        if self._compaction_summary:
-            prompt_parts.append(f"\nPREVIOUS SUMMARY:\n{self._compaction_summary}")
-        prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}")
-        prompt_parts.append(
-            "\nSummarize the agent's progress. Include:\n"
-            "- Current objective (what the agent should do next)\n"
-            "- Progress so far (what has been accomplished)\n"
-            "- Key facts (NPCs, items, locations, deadlines discovered)\n"
-            "- Failed approaches (actions/paths that didn't work)\n"
-            "- Map knowledge (locations visited and connections)\n\n"
-            "Write a concise summary in plain text, max 300 words."
-        )
-
-        compaction_prompt = "\n".join(prompt_parts)
-        try:
-            self._ensure_llm()
-            summary = self.llm.get_completion(compaction_prompt)
-            compaction_usage = self.llm.get_last_usage() or {}
-            if compaction_usage:
-                pt = int(
-                    compaction_usage.get("prompt_tokens", 0)
-                    if isinstance(compaction_usage, dict)
-                    else getattr(compaction_usage, "prompt_tokens", 0)
-                )
-                ct = int(
-                    compaction_usage.get("completion_tokens", 0)
-                    if isinstance(compaction_usage, dict)
-                    else getattr(compaction_usage, "completion_tokens", 0)
-                )
-                self._record_compaction_usage(pt, ct)
-            stripped = (summary or "").strip()
-            if not stripped:
-                if self.debug:
-                    self.logger.warning("Compaction returned empty summary at step %d", self._step_count)
-                self._steps_since_compaction = max(0, self._compaction_interval // 2)
-                return
-            self._compaction_summary = stripped
-            self._transcript = []
-            self._steps_since_compaction = 0
-            if self.debug:
-                self.logger.debug(
-                    "Compaction completed at step %d: %s", self._step_count, self._compaction_summary[:200]
-                )
-        except Exception as e:
-            if self.debug:
-                self.logger.warning("Compaction failed at step %d: %s", self._step_count, e)
-            self._steps_since_compaction = max(0, self._compaction_interval // 2)
-
-    def _record_compaction_usage(self, prompt_tokens: int, completion_tokens: int) -> None:
-        """Record token usage from compaction calls into agent history."""
-        compaction_response = LLMResponse(
-            action=0,
-            is_default=True,
-            parse_mode="compaction",
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
-        self.history.append(compaction_response)
-
-    def _format_transcript_for_compaction(self) -> str:
-        """Format recent transcript entries for the compaction prompt."""
-        recent = (
-            self._transcript[-self._steps_since_compaction :]
-            if self._steps_since_compaction > 0
-            else self._transcript[-self._compaction_interval :]
-        )
-        lines = []
-        for entry in recent:
-            step = entry.get("step", "?")
-            obs = entry.get("observation", "")
-            if len(obs) > 400:
-                obs = obs[:400] + "..."
-            chosen = entry.get("choice_text", "")
-            reasoning = entry.get("reasoning", "")
-            state_notes = entry.get("memo", "")
-            line = f"Step {step}: {obs}"
-            if chosen:
-                line += f"\n  Chose: {chosen}"
-            if state_notes:
-                line += f"\n  State: {state_notes[:350]}"
-            if reasoning:
-                line += f"\n  Reasoning: {reasoning[:800]}"
-            lines.append(line)
-        return "\n\n".join(lines)
-
-    @staticmethod
-    def _normalize_for_signature(value: str, max_len: int = 320) -> str:
-        text = (value or "").lower()
-        text = re.sub(r"\s+", " ", text).strip()
-        if len(text) > max_len:
-            return text[:max_len]
-        return text
-
-    def _state_signature(self, state: str, choices: list[dict[str, str]]) -> str:
-        normalized_state = self._normalize_for_signature(state, max_len=420)
-        normalized_choices = "|".join(
-            self._normalize_for_signature(choice.get("text", ""), max_len=110) for choice in choices
-        )
-        raw_signature = f"{normalized_state}||{normalized_choices}"
-        return hashlib.sha1(raw_signature.encode("utf-8", errors="ignore")).hexdigest()[:20]
-
-    def _remember_decision(
-        self,
-        state: str,
-        choices: list[dict[str, str]],
-        state_signature: str,
-        response: LLMResponse,
-    ) -> None:
-        action = int(response.action)
-        counts = self._state_action_counts.setdefault(state_signature, {})
-        counts[action] = counts.get(action, 0) + 1
-
-        if len(self._state_action_counts) > self._max_state_signatures:
-            oldest_key = next(iter(self._state_action_counts.keys()))
-            if oldest_key != state_signature:
-                self._state_action_counts.pop(oldest_key, None)
-
-        selected_text = ""
-        if 1 <= action <= len(choices):
-            selected_text = choices[action - 1].get("text", "")
-        state_snippet = state.strip()
-        if len(state_snippet) > self._context_chars:
-            state_snippet = state_snippet[: self._context_chars] + "..."
-
-        self._decision_history.append(
-            {
-                "state": state_snippet,
-                "action": action,
-                "choice": selected_text,
-                "parse_mode": response.parse_mode or "unknown",
-                "memo": (response.memo or "").strip()[:350] or None,
-            }
-        )
-        if len(self._decision_history) > 40:
-            self._decision_history = self._decision_history[-40:]
-
-        # Transcript for full_transcript and compaction modes
-        if self._memory_mode in ("full_transcript", "compaction"):
-            self._step_count += 1
-            self._steps_since_compaction += 1
-            self._transcript.append(
-                {
-                    "step": self._step_count,
-                    "observation": state_snippet if self._memory_mode == "compaction" else state.strip()[:400],
-                    "choice_text": selected_text,
-                    "reasoning": (response.reasoning or "")[:800],
-                    "memo": (response.memo or "").strip()[:350] or None,
-                    "action": action,
-                }
-            )
-            self._maybe_compact()
-
-    def _choice_risk_score(self, choice_text: str) -> int:
-        text = (choice_text or "").lower()
-        score = 0
-        for keyword in RISKY_CHOICE_KEYWORDS:
-            if keyword in text:
-                score += 2
-        for keyword in SAFE_CHOICE_KEYWORDS:
-            if keyword in text:
-                score -= 1
-        return score
-
-    def _apply_safety_filter(self, action: int, choices: list[dict[str, str]]) -> int:
-        """Replace obviously risky actions when a clearly safer alternative exists."""
-        if not self._use_safety_filter or len(choices) < 2:
-            return action
-
-        current_idx = action - 1
-        if current_idx < 0 or current_idx >= len(choices):
-            return action
-
-        scored = [(idx + 1, self._choice_risk_score(c.get("text", ""))) for idx, c in enumerate(choices)]
-        scored.sort(key=lambda item: item[1])
-
-        best_action, best_score = scored[0]
-        current_score = self._choice_risk_score(choices[current_idx].get("text", ""))
-
-        # Only override when the chosen action is materially riskier than the best option.
-        if current_score - best_score >= 2:
-            if self.debug:
-                self.logger.debug(
-                    "Safety filter override: %s -> %s (risk %s -> %s)",
-                    action,
-                    best_action,
-                    current_score,
-                    best_score,
-                )
-            return best_action
-        return action
-
-    @staticmethod
-    def _state_fingerprint(state: str) -> str:
-        """Create a stable fingerprint for loop detection."""
-        compact = " ".join((state or "").lower().split())
-        if len(compact) > 500:
-            compact = compact[:500]
-        return compact
-
-    def _apply_loop_escape(
-        self,
-        state_key: str,
-        action: int,
-        choices: list[dict[str, str]],
-    ) -> tuple[int, bool]:
-        """Diversify action when the same state repeats with no apparent progress."""
-        if len(choices) <= 1:
-            return action, False
-
-        counts = self._state_action_counts.get(state_key, {})
-        total_visits = sum(counts.values())
-        if total_visits < 3:
-            return action, False
-
-        current_count = counts.get(action, 0)
-        if current_count < 2:
-            return action, False
-        all_actions = list(range(1, len(choices) + 1))
-        ranked = sorted(
-            all_actions,
-            key=lambda a: (
-                counts.get(a, 0),
-                self._choice_risk_score(choices[a - 1].get("text", "")),
-            ),
-        )
-        best_action = ranked[0]
-
-        if best_action != action and counts.get(best_action, 0) < current_count:
-            return best_action, True
-        if total_visits >= 5 and current_count >= 3 and best_action != action:
-            return best_action, True
-        return action, False
-
-    @staticmethod
-    def _normalize_usage(usage: dict[str, Any] | None) -> dict[str, Any]:
-        usage = usage or {}
-        prompt_tokens = int(usage.get("prompt_tokens") or 0)
-        completion_tokens = int(usage.get("completion_tokens") or 0)
-        total_tokens = int(usage.get("total_tokens") or (prompt_tokens + completion_tokens))
-        estimated_cost_usd = usage.get("estimated_cost_usd")
-        if estimated_cost_usd is not None:
-            estimated_cost_usd = float(estimated_cost_usd)
-        return {
-            "prompt_tokens": prompt_tokens,
-            "completion_tokens": completion_tokens,
-            "total_tokens": total_tokens,
-            "estimated_cost_usd": estimated_cost_usd,
-        }
-
-    @classmethod
-    def _merge_usage(cls, first: dict[str, Any] | None, second: dict[str, Any] | None) -> dict[str, Any]:
-        a = cls._normalize_usage(first)
-        b = cls._normalize_usage(second)
-        merged_cost = None
-        if a["estimated_cost_usd"] is not None or b["estimated_cost_usd"] is not None:
-            merged_cost = (a["estimated_cost_usd"] or 0.0) + (b["estimated_cost_usd"] or 0.0)
-        return {
-            "prompt_tokens": a["prompt_tokens"] + b["prompt_tokens"],
-            "completion_tokens": a["completion_tokens"] + b["completion_tokens"],
-            "total_tokens": a["total_tokens"] + b["total_tokens"],
-            "estimated_cost_usd": merged_cost,
-        }
-
-    def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
-        """Implementation of action selection logic.
-
-        Args:
-            state (str): Current game state text
-            choices (List[Dict[str, str]]): List of available choices
-
-        Returns:
-            int: Selected action number (1-based)
-        """
-        if self.debug:
-            self.logger.debug(f"Getting action for state with {len(choices)} choices available")
-            for i, choice in enumerate(choices):
-                self.logger.debug(f"Choice {i + 1}: {choice.get('text', 'NO TEXT')}")
-        try:
-            state_signature = self._state_signature(state, choices)
-            # Format prompt
-            prompt = self._format_prompt(self._build_contextual_state(state), choices)
-            if self.debug:
-                self.logger.debug(f"\nPrompt:\n{prompt}")
-
-            # Get LLM response
-            self._ensure_llm()
-            llm_response = self.llm.get_completion(prompt)
-            llm_usage = self.llm.get_last_usage()
-            if self.debug:
-                self.logger.debug(f"LLM response: {llm_response}")
-                choices_debug = []
-                for i, c in enumerate(choices):
-                    choices_debug.append(f"{i + 1}: {c['text']}")
-                self.logger.debug(f"Available choices: {choices_debug}")
-
-            # Parse response
-            first_response = parse_llm_response(
-                llm_response,
-                len(choices),
-                self.debug,
-                self.logger,
-            )
-            parsed_response = first_response
-
-            if parsed_response.is_default:
-                retry_response = self.llm.get_completion(self._format_retry_prompt(state, choices))
-                retry_usage = self.llm.get_last_usage()
-                llm_usage = self._merge_usage(llm_usage, retry_usage)
-                retry_parsed = parse_llm_response(retry_response, len(choices), self.debug, self.logger)
-                if not retry_parsed.is_default:
-                    retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
-                    parsed_response = retry_parsed
-                elif self._needs_force_numeric_retry():
-                    # GPT-5/o models occasionally return empty visible text on long prompts.
-                    # Use a tiny final retry that asks for number-only output.
-                    force_retry_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices))
-                    force_retry_usage = self.llm.get_last_usage()
-                    llm_usage = self._merge_usage(llm_usage, force_retry_usage)
-                    force_retry_parsed = parse_llm_response(
-                        force_retry_response,
-                        len(choices),
-                        self.debug,
-                        self.logger,
-                    )
-                    if not force_retry_parsed.is_default:
-                        force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}"
-                        parsed_response = force_retry_parsed
-
-            action_before_policy = parsed_response.action
-            if parsed_response is not first_response:
-                if parsed_response.analysis is None and first_response.analysis is not None:
-                    parsed_response.analysis = first_response.analysis
-                if _is_numeric_raw_reasoning(parsed_response.reasoning):
-                    if first_response.reasoning and not _is_numeric_raw_reasoning(first_response.reasoning):
-                        parsed_response.reasoning = first_response.reasoning
-                    else:
-                        first_raw_reasoning = _raw_reasoning_fallback(llm_response)
-                        if first_raw_reasoning and not _is_numeric_raw_reasoning(first_raw_reasoning):
-                            parsed_response.reasoning = first_raw_reasoning
-
-            parsed_response.action = self._apply_safety_filter(parsed_response.action, choices)
-            if parsed_response.action != action_before_policy and not parsed_response.reasoning:
-                parsed_response.reasoning = "policy_safety_override"
-            usage_payload = self._normalize_usage(llm_usage)
-            parsed_response.prompt_tokens = usage_payload["prompt_tokens"]
-            parsed_response.completion_tokens = usage_payload["completion_tokens"]
-            parsed_response.total_tokens = usage_payload["total_tokens"]
-            parsed_response.estimated_cost_usd = usage_payload["estimated_cost_usd"]
-
-            if self.debug:
-                self.logger.debug(f"Parsed LLM response: {parsed_response}")
-                self.logger.debug(f"Final action to be returned: {parsed_response.action}")
-
-            # Store response in history
-            self.history.append(parsed_response)
-            self._last_response = parsed_response
-            self._remember_decision(state, choices, state_signature, parsed_response)
-
-            # Check that action is within valid range before returning
-            if parsed_response.action < 1 or parsed_response.action > len(choices):
-                self.logger.error(f"INVALID ACTION DETECTED: {parsed_response.action} not in range 1-{len(choices)}")
-                # Use default first action instead
-                parsed_response.action = 1
-                self.logger.warning("Defaulting to action 1 instead")
-
-            return parsed_response.action
-
-        except Exception as e:
-            self.logger.error(f"Error during LLM call: {e}")
-            default_response = LLMResponse(
-                action=1,
-                is_default=True,
-                parse_mode="error_default",
-                reasoning=_raw_reasoning_fallback(f"llm_call_error: {e}"),
-            )
-            self.history.append(default_response)
-            self._last_response = default_response
-            return 1  # Default to first choice on error
-
-    def reset(self) -> None:
-        """Reset agent state"""
-        self.history = []
-        self._observation_history = []
-        self._decision_history = []
-        self._state_action_counts = {}
-        self._last_response = LLMResponse(action=1, is_default=True)
-        self._quest_briefing = None
-        self._transcript = []
-        self._compaction_summary = None
-        self._steps_since_compaction = 0
-        self._step_count = 0
-
-    def on_game_start(self) -> None:
-        """Called when game starts"""
-        super().on_game_start()
-        self._observation_history = []
-        self._decision_history = []
-        self._state_action_counts = {}
-        self._last_response = LLMResponse(action=1, is_default=True)
-        self._quest_briefing = None
-        self._transcript = []
-        self._compaction_summary = None
-        self._steps_since_compaction = 0
-        self._step_count = 0
-
-    def on_game_end(self, final_state: dict[str, Any]) -> None:
-        """Log final state for analysis"""
-        if self.debug:
-            self.logger.debug(f"Game ended with state: {final_state}")
-
-    def __str__(self) -> str:
-        """String representation of the agent"""
-        return f"LLMAgent(model={self.model_name}, system_template={self.system_template}, action_template={self.action_template}, temperature={self.temperature})"
-
-    def _format_prompt(self, state: str, choices: list[dict[str, str]]) -> str:
-        """Format the prompt for the LLM"""
-        return self.prompt_renderer.render_action_prompt(state, choices).strip()
-
-    def _format_retry_prompt(self, state: str, choices: list[dict[str, str]]) -> str:
-        """Fallback prompt that still preserves reasoning for log analysis."""
-        clipped_state = (state or "").strip()
-        if len(clipped_state) > 500:
-            clipped_state = clipped_state[:500] + "..."
-        choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:160]}" for i, c in enumerate(choices)])
-        return f"""Choose the best action.
-State: {clipped_state}
-Actions:
-{choices_text}
-
-Return valid JSON only:
-{{
-  "analysis": "<max 25 words>",
-  "reasoning": "<max 25 words>",
-  "result": <integer from 1 to {len(choices)}>
-}}"""
-
-    def _format_force_numeric_retry_prompt(self, choices: list[dict[str, str]]) -> str:
-        """Very short retry prompt used for models that return empty visible output."""
-        choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:110]}" for i, c in enumerate(choices)])
-        return f"""Pick one action number.
-{choices_text}
-Reply with one integer only: 1 to {len(choices)}."""
-
-    def _needs_force_numeric_retry(self) -> bool:
-        return self.model_spec.provider == "openai" and (
-            self.model_spec.model_id.startswith("gpt-5") or self.model_spec.model_id.startswith("o")
-        )
diff --git a/llm_quest_benchmark/agents/strategic_agent.py b/llm_quest_benchmark/agents/strategic_agent.py
deleted file mode 100644
index 387c650..0000000
--- a/llm_quest_benchmark/agents/strategic_agent.py
+++ /dev/null
@@ -1,94 +0,0 @@
-"""Strategic agent decorator that adds analysis capabilities"""
-
-import logging
-from typing import Any
-
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.llm.prompt import PromptRenderer
-
-
-class StrategicAgent(QuestPlayer):
-    """Decorator that adds strategic thinking to any quest player"""
-
-    def __init__(self, base_agent: QuestPlayer, debug: bool = False, template: str = "advanced.jinja"):
-        """Initialize strategic agent wrapper
-
-        Args:
-            base_agent: Base agent to wrap (usually LLMAgent)
-            debug: Enable debug logging
-            template: Template to use for enhanced prompts
-        """
-        super().__init__(skip_single=base_agent.skip_single)
-        self.agent = base_agent
-        self.debug = debug
-        self.history = []
-
-        # Setup logging
-        self.logger = logging.getLogger(self.__class__.__name__)
-        if self.debug:
-            self.logger.setLevel(logging.DEBUG)
-            handler = logging.StreamHandler()
-            handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
-            self.logger.addHandler(handler)
-
-        # Initialize prompt renderer
-        self.prompt_renderer = PromptRenderer(None, template=template)
-
-    def _get_action_impl(self, observation: str, choices: list) -> str:
-        """Implementation of action selection logic with strategic analysis"""
-        if hasattr(self.agent, "llm"):
-            # First, get situation analysis
-            if self.debug:
-                self.logger.debug(f"\nObservation:\n{observation}")
-
-            analysis = self.agent.llm(
-                "Analyze this situation and explain your thinking step-by-step instead of choosing an action:\n"
-                + observation
-            )
-
-            if self.debug:
-                self.logger.debug(f"\nAnalysis:\n{analysis}")
-
-            # Store analysis in history
-            self.history.append({"observation": observation, "analysis": analysis})
-
-            # Get enhanced context with history
-            enhanced_context = self.get_enhanced_context(observation, choices)
-            if self.debug:
-                self.logger.debug(f"\nEnhanced Context:\n{enhanced_context}")
-
-            # Then make the actual choice with analysis context
-            return self.agent.get_action(enhanced_context, choices)
-        else:
-            # If agent doesn't have LLM capability, just pass through
-            return self.agent.get_action(observation, choices)
-
-    def get_enhanced_context(self, observation: str, choices: list) -> str:
-        """Build context for advanced prompt with historical analysis"""
-        context = [
-            f"Turn {len(self.history) + 1}: {entry['analysis']}"
-            for entry in self.history[-3:]  # Last 3 analyses
-        ]
-        return self.prompt_renderer.render_action_prompt(
-            observation=observation, choices=choices, state_tracker=context
-        )
-
-    def reset(self) -> None:
-        """Reset both strategic and base agent state"""
-        self.history = []
-        self.agent.reset()
-
-    def on_game_start(self) -> None:
-        """Pass through to base agent"""
-        if self.debug:
-            self.logger.debug("Starting new game with strategic analysis")
-        self.agent.on_game_start()
-
-    def on_game_end(self, final_state: dict[str, Any]) -> None:
-        """Pass through to base agent and log analysis history"""
-        self.agent.on_game_end(final_state)
-        if self.debug:
-            self.logger.debug("Final Analysis History:")
-            for entry in self.history:
-                self.logger.debug(f"\nObservation: {entry['observation']}")
-                self.logger.debug(f"Analysis: {entry['analysis']}")
diff --git a/llm_quest_benchmark/agents/tool_agent.py b/llm_quest_benchmark/agents/tool_agent.py
deleted file mode 100644
index 694d1ac..0000000
--- a/llm_quest_benchmark/agents/tool_agent.py
+++ /dev/null
@@ -1,384 +0,0 @@
-"""Tool-augmented agent with lightweight structured prompting."""
-
-import ast
-import re
-from typing import Any
-
-from llm_quest_benchmark.agents.llm_agent import (
-    LLMAgent,
-    LLMResponse,
-    _parse_json_response,
-    parse_llm_response,
-)
-
-
-class ToolAgent(LLMAgent):
-    """LLM agent with generic run-local tools for history, math, and state notes."""
-
-    DEFAULT_HISTORY_WINDOW = 10
-    MAX_SCRATCHPAD_CHARS = 1200
-    MAX_TOOL_INPUT_CHARS = 500
-
-    def __init__(
-        self,
-        *args,
-        action_template: str = "tool_augmented.jinja",
-        history_window: int | None = None,
-        **kwargs,
-    ):
-        super().__init__(*args, action_template=action_template, **kwargs)
-        self.agent_id = f"tool_{self.model_name}"
-        self._step_log: list[dict[str, Any]] = []
-        self._history_window = history_window or self.DEFAULT_HISTORY_WINDOW
-        self._scratchpad = ""
-
-    def _recent_steps(self) -> list[str]:
-        snippets = []
-        for entry in self._step_log[-self._history_window :]:
-            snippets.append(f"Step {entry['step']}: {entry['observation']} -> {entry.get('selected_choice', 'n/a')}")
-        return snippets
-
-    def _tool_descriptions(self) -> list[str]:
-        return [
-            "quest_history(query): search earlier observations and chosen actions in this quest.",
-            "calculator(expression): evaluate arithmetic and simple comparisons.",
-            "scratchpad(operation, content): read or replace one persistent note. operation is read or write_replace.",
-        ]
-
-    def quest_history(self, query: str) -> str:
-        """Return relevant previous steps from this quest run via keyword match."""
-        if not self._step_log:
-            return "No prior quest steps recorded yet."
-
-        tokens = set(re.findall(r"[a-zA-Z\u0400-\u04ff0-9_]{3,}", (query or "").lower()))
-        scored = []
-        for entry in self._step_log:
-            haystack = " ".join(
-                [
-                    entry.get("observation", ""),
-                    " ".join(entry.get("choices", [])),
-                    entry.get("selected_choice", ""),
-                ]
-            ).lower()
-            score = sum(1 for token in tokens if token in haystack)
-            scored.append((score, entry))
-
-        scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True)
-        best = [entry for s, entry in scored if s > 0][: self._history_window]
-        if not best:
-            best = [entry for _, entry in scored[-self._history_window :]]
-
-        lines = []
-        for entry in best:
-            lines.append(
-                f"Step {entry['step']}: obs={entry['observation']} | "
-                f"choices={'; '.join(entry['choices'])} | picked={entry.get('selected_choice', 'n/a')}"
-            )
-        return "\n".join(lines)
-
-    @staticmethod
-    def calculator(expression: str) -> str:
-        """Evaluate a restricted arithmetic/comparison expression."""
-        expr = (expression or "").strip()
-        if not expr:
-            return "error: empty expression"
-        if len(expr) > 240:
-            return "error: expression too long"
-        if not re.fullmatch(r"[0-9a-zA-Z\s+\-*/().,<>=!%]+", expr):
-            return "error: unsupported characters"
-
-        allowed_nodes = (
-            ast.Expression,
-            ast.Constant,
-            ast.UnaryOp,
-            ast.UAdd,
-            ast.USub,
-            ast.BinOp,
-            ast.Add,
-            ast.Sub,
-            ast.Mult,
-            ast.Div,
-            ast.FloorDiv,
-            ast.Mod,
-            ast.Pow,
-            ast.Compare,
-            ast.Eq,
-            ast.NotEq,
-            ast.Lt,
-            ast.LtE,
-            ast.Gt,
-            ast.GtE,
-            ast.BoolOp,
-            ast.And,
-            ast.Or,
-        )
-        try:
-            tree = ast.parse(expr, mode="eval")
-            for node in ast.walk(tree):
-                if not isinstance(node, allowed_nodes):
-                    return f"error: unsupported expression element {node.__class__.__name__}"
-                if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, bool)):
-                    return "error: constants must be numeric or boolean"
-            result = ToolAgent._eval_calculator_node(tree.body)
-        except Exception as exc:
-            return f"error: {exc}"
-        return f"{expr} = {result}"
-
-    @staticmethod
-    def _eval_calculator_node(node: ast.AST) -> int | float | bool:
-        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float, bool)):
-            return node.value
-        if isinstance(node, ast.UnaryOp):
-            value = ToolAgent._eval_calculator_node(node.operand)
-            if isinstance(node.op, ast.UAdd):
-                return +value
-            if isinstance(node.op, ast.USub):
-                return -value
-        if isinstance(node, ast.BinOp):
-            left = ToolAgent._eval_calculator_node(node.left)
-            right = ToolAgent._eval_calculator_node(node.right)
-            if isinstance(node.op, ast.Add):
-                return left + right
-            if isinstance(node.op, ast.Sub):
-                return left - right
-            if isinstance(node.op, ast.Mult):
-                return left * right
-            if isinstance(node.op, ast.Div):
-                return left / right
-            if isinstance(node.op, ast.FloorDiv):
-                return left // right
-            if isinstance(node.op, ast.Mod):
-                return left % right
-            if isinstance(node.op, ast.Pow):
-                if abs(right) > 8:
-                    raise ValueError("exponent too large")
-                return left**right
-        if isinstance(node, ast.BoolOp):
-            values = [bool(ToolAgent._eval_calculator_node(value)) for value in node.values]
-            if isinstance(node.op, ast.And):
-                return all(values)
-            if isinstance(node.op, ast.Or):
-                return any(values)
-        if isinstance(node, ast.Compare):
-            left = ToolAgent._eval_calculator_node(node.left)
-            for op, comparator in zip(node.ops, node.comparators, strict=True):
-                right = ToolAgent._eval_calculator_node(comparator)
-                if isinstance(op, ast.Eq):
-                    ok = left == right
-                elif isinstance(op, ast.NotEq):
-                    ok = left != right
-                elif isinstance(op, ast.Lt):
-                    ok = left < right
-                elif isinstance(op, ast.LtE):
-                    ok = left <= right
-                elif isinstance(op, ast.Gt):
-                    ok = left > right
-                elif isinstance(op, ast.GtE):
-                    ok = left >= right
-                else:
-                    raise ValueError("unsupported comparison")
-                if not ok:
-                    return False
-                left = right
-            return True
-        raise ValueError("unsupported expression")
-
-    def scratchpad(self, operation: str, content: str = "") -> str:
-        """Read or replace one persistent free-form note blob."""
-        op = (operation or "").strip().lower()
-        if op == "read":
-            return self._scratchpad or "(empty)"
-        if op == "write_replace":
-            note = " ".join((content or "").strip().split())
-            self._scratchpad = note[: self.MAX_SCRATCHPAD_CHARS]
-            return f"updated: {self._scratchpad or '(empty)'}"
-        return "error: operation must be read or write_replace"
-
-    def _build_tool_prompt(
-        self,
-        observation: str,
-        choices: list[dict[str, str]],
-        prompt_kind: str,
-        tool_results: list[str] | None = None,
-    ) -> str:
-        template = self.prompt_renderer.get_template(self.action_template)
-        return template.render(
-            prompt_kind=prompt_kind,
-            observation=observation,
-            choices=[{"text": choice.get("text", "")} for choice in choices],
-            tool_descriptions=self._tool_descriptions(),
-            tool_results=tool_results or [],
-            recent_steps=self._recent_steps(),
-            scratchpad_note=self._scratchpad,
-        ).strip()
-
-    @staticmethod
-    def _extract_tool_calls(response: str) -> list[dict[str, Any]]:
-        payload, _ = _parse_json_response(response)
-        if not isinstance(payload, dict):
-            return []
-
-        tool_calls = payload.get("tool_calls")
-        if not isinstance(tool_calls, list):
-            return []
-
-        normalized = []
-        for item in tool_calls[:1]:
-            if not isinstance(item, dict):
-                continue
-            tool_name = str(item.get("tool") or "").strip()
-            tool_input = item.get("input")
-            operation = str(item.get("operation") or "").strip()
-            content = str(item.get("content") or "").strip()
-            if isinstance(tool_input, dict):
-                operation = operation or str(tool_input.get("operation") or "").strip()
-                content = content or str(tool_input.get("content") or "").strip()
-                tool_input = tool_input.get("expression") or tool_input.get("query") or tool_input.get("content") or ""
-            tool_input = str(tool_input or "").strip()
-            if len(tool_input) > ToolAgent.MAX_TOOL_INPUT_CHARS:
-                tool_input = tool_input[: ToolAgent.MAX_TOOL_INPUT_CHARS]
-            if len(content) > ToolAgent.MAX_TOOL_INPUT_CHARS:
-                content = content[: ToolAgent.MAX_TOOL_INPUT_CHARS]
-            if tool_name:
-                normalized.append(
-                    {
-                        "tool": tool_name,
-                        "input": tool_input,
-                        "operation": operation,
-                        "content": content,
-                    }
-                )
-        return normalized
-
-    def _execute_tool_calls(self, tool_calls: list[dict[str, Any]]) -> list[str]:
-        results = []
-        for tc in tool_calls:
-            name, inp = tc["tool"], tc.get("input", "")
-            if name == "quest_history":
-                result = self.quest_history(inp)
-            elif name == "calculator":
-                result = self.calculator(inp)
-            elif name == "scratchpad":
-                operation = tc.get("operation") or inp
-                result = self.scratchpad(str(operation), str(tc.get("content") or ""))
-            else:
-                result = f"unknown tool: {name}"
-            call_repr = inp
-            if name == "scratchpad":
-                call_repr = f"{tc.get('operation') or inp}, {tc.get('content') or ''}".strip(", ")
-            results.append(f"{name}({call_repr}) => {result}")
-        return results
-
-    def _final_choice(
-        self,
-        observation: str,
-        choices: list[dict[str, str]],
-        tool_results: list[str] | None = None,
-    ) -> tuple[LLMResponse, dict[str, Any]]:
-        prompt = self._build_tool_prompt(
-            observation,
-            choices,
-            prompt_kind="final",
-            tool_results=tool_results,
-        )
-        llm_response = self.llm.get_completion(prompt)
-        llm_usage = self.llm.get_last_usage()
-        parsed_response = parse_llm_response(llm_response, len(choices), self.debug, self.logger)
-
-        if parsed_response.is_default:
-            retry_response = self.llm.get_completion(self._format_retry_prompt(observation, choices))
-            retry_usage = self.llm.get_last_usage()
-            llm_usage = self._merge_usage(llm_usage, retry_usage)
-            retry_parsed = parse_llm_response(retry_response, len(choices), self.debug, self.logger)
-            if not retry_parsed.is_default:
-                retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
-                parsed_response = retry_parsed
-            elif self._needs_force_numeric_retry():
-                force_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices))
-                force_usage = self.llm.get_last_usage()
-                llm_usage = self._merge_usage(llm_usage, force_usage)
-                force_parsed = parse_llm_response(force_response, len(choices), self.debug, self.logger)
-                if not force_parsed.is_default:
-                    force_parsed.parse_mode = f"force_retry_{force_parsed.parse_mode or 'parsed'}"
-                    parsed_response = force_parsed
-
-        return parsed_response, llm_usage
-
-    def _log_step(self, observation: str, choices: list[dict[str, str]], response: LLMResponse) -> None:
-        selected = ""
-        if 1 <= response.action <= len(choices):
-            selected = choices[response.action - 1].get("text", "")
-
-        clipped = " ".join((observation or "").strip().split())
-        if len(clipped) > 180:
-            clipped = clipped[:180] + "..."
-
-        self._step_log.append(
-            {
-                "step": len(self._step_log) + 1,
-                "observation": clipped,
-                "choices": [c.get("text", "") for c in choices],
-                "selected_choice": selected,
-            }
-        )
-
-    def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
-        try:
-            state_signature = self._state_signature(state, choices)
-            contextual_state = self._build_contextual_state(state)
-            self._ensure_llm()
-
-            selection_prompt = self._build_tool_prompt(contextual_state, choices, prompt_kind="select")
-            selection_response = self.llm.get_completion(selection_prompt)
-            selection_usage = self.llm.get_last_usage()
-            tool_calls = self._extract_tool_calls(selection_response)
-            parsed_response = parse_llm_response(selection_response, len(choices), self.debug, self.logger)
-            tool_results: list[str] = []
-
-            total_usage = self._normalize_usage(selection_usage)
-            if tool_calls:
-                tool_results = self._execute_tool_calls(tool_calls)
-                parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results)
-                total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
-            elif parsed_response.is_default:
-                parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[])
-                total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
-
-            action_before_policy = parsed_response.action
-            parsed_response.action = self._apply_safety_filter(parsed_response.action, choices)
-            if parsed_response.action != action_before_policy and not parsed_response.reasoning:
-                parsed_response.reasoning = "policy_safety_override"
-
-            parsed_response.prompt_tokens = total_usage["prompt_tokens"]
-            parsed_response.completion_tokens = total_usage["completion_tokens"]
-            parsed_response.total_tokens = total_usage["total_tokens"]
-            parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
-            parsed_response.tool_calls = tool_calls or None
-            parsed_response.tool_results = tool_results or None
-
-            self.history.append(parsed_response)
-            self._last_response = parsed_response
-            self._remember_decision(state, choices, state_signature, parsed_response)
-            self._log_step(state, choices, parsed_response)
-            return parsed_response.action
-        except Exception as exc:
-            self.logger.error("Tool agent error during LLM call: %s", exc)
-            default_response = LLMResponse(
-                action=1,
-                is_default=True,
-                parse_mode="error_default",
-                reasoning=f"tool_agent_error: {exc}",
-            )
-            self.history.append(default_response)
-            self._last_response = default_response
-            return 1
-
-    def reset(self) -> None:
-        super().reset()
-        self._step_log = []
-        self._scratchpad = ""
-
-    def on_game_start(self) -> None:
-        super().on_game_start()
-        self._step_log = []
-        self._scratchpad = ""
diff --git a/llm_quest_benchmark/core/leaderboard.py b/llm_quest_benchmark/core/leaderboard.py
index 032ad48..078648e 100644
--- a/llm_quest_benchmark/core/leaderboard.py
+++ b/llm_quest_benchmark/core/leaderboard.py
@@ -28,9 +28,6 @@
     "stub": ("minimal_prompt", TAXONOMY_MODES["minimal_prompt"]),
     "strategic": ("short_context_reasoning", TAXONOMY_MODES["short_context_reasoning"]),
     "stateful_compact": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]),
-    "memo_cot": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]),
-    "memo_extended": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]),
-    "memo_structured": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]),
     "light_hints": ("prompt_hints", TAXONOMY_MODES["prompt_hints"]),
     "stateful_compact_hints": ("prompt_hints", TAXONOMY_MODES["prompt_hints"]),
     "planner": ("planner_loop", TAXONOMY_MODES["planner_loop"]),
@@ -38,6 +35,26 @@
     "tool_augmented_hints": ("tools_hints_compact_memory", TAXONOMY_MODES["tools_hints_compact_memory"]),
 }
 
+RETIRED_BENCHMARK_NAMES = {
+    "exp4_compaction_no_memo",
+    "exp4_memo_cot",
+    "exp4_memo_extended",
+    "exp4_memo_structured",
+}
+
+RETIRED_HARNESSES = {
+    "compaction_no_memo",
+    "memo_cot",
+    "memo_extended",
+    "memo_structured",
+}
+
+RETIRED_TEMPLATE_IDS = {
+    "memo_cot",
+    "memo_extended",
+    "memo_structured",
+}
+
 REASONING_STYLE_TEMPLATES = {
     "reasoning",
     "strategic",
@@ -87,6 +104,25 @@ def _mode_from_template(template_name: str, memory_mode: str | None = None) -> t
     return TEMPLATE_TO_MODE.get(template_id, (template_id or "unknown", template_id or "unknown"))
 
 
+def _is_retired_result(
+    source_name: str | None,
+    benchmark_id: str | None,
+    result_row: dict[str, Any],
+    agent_config: dict[str, Any],
+    template_name: str,
+) -> bool:
+    source_names = {str(value) for value in (source_name, benchmark_id) if value}
+    if source_names & RETIRED_BENCHMARK_NAMES:
+        return True
+
+    harness = str(result_row.get("harness") or agent_config.get("harness") or "")
+    if harness in RETIRED_HARNESSES:
+        return True
+
+    template_id = _strip_template_suffix(template_name)
+    return template_id in RETIRED_TEMPLATE_IDS
+
+
 def _agent_config(db_run: dict[str, Any]) -> dict[str, Any]:
     raw_config = db_run.get("agent_config")
     if not isinstance(raw_config, str) or not raw_config:
@@ -298,6 +334,7 @@ def generate_leaderboard(
             continue
 
         benchmark_id = summary.get("benchmark_id")
+        source_name = summary.get("name")
         if benchmark_id:
             benchmark_ids.append(str(benchmark_id))
 
@@ -348,7 +385,15 @@ def generate_leaderboard(
             template_from_config = str(config.get("action_template") or "")
             if template_from_config:
                 template = template_from_config
-            memory_mode = config.get("memory_mode")
+            memory_mode = config.get("memory_mode") or result_row.get("memory_mode")
+            if _is_retired_result(
+                str(source_name) if source_name else None,
+                str(benchmark_id) if benchmark_id else None,
+                result_row,
+                config,
+                template,
+            ):
+                continue
             mode_id, mode_label = _mode_from_template(template, str(memory_mode) if memory_mode is not None else None)
 
             try:
diff --git a/llm_quest_benchmark/core/runner.py b/llm_quest_benchmark/core/runner.py
index e5f531a..d86c07b 100644
--- a/llm_quest_benchmark/core/runner.py
+++ b/llm_quest_benchmark/core/runner.py
@@ -10,24 +10,23 @@
 from copy import deepcopy
 from typing import Any
 
-from llm_quest_benchmark.agents.base import QuestPlayer
 from llm_quest_benchmark.constants import DEFAULT_QUEST_TIMEOUT
 from llm_quest_benchmark.core.logging import LogManager, QuestLogger
 from llm_quest_benchmark.environments.qm import QMPlayerEnv as QuestEnvironment
 from llm_quest_benchmark.environments.state import QuestOutcome
-from llm_quest_benchmark.schemas.config import AgentConfig
+from llm_quest_benchmark.players.base import QuestPlayer
+from llm_quest_benchmark.schemas.config import HarnessConfig
 from llm_quest_benchmark.schemas.state import AgentState
 
 # Configure logging
 logging.getLogger("quest").setLevel(logging.WARNING)
-logging.getLogger("LLMAgent").setLevel(logging.WARNING)
 
 
 def run_quest_with_timeout(
     quest_path: str,
     agent: QuestPlayer,
     timeout: int = DEFAULT_QUEST_TIMEOUT,
-    agent_config: AgentConfig | None = None,
+    agent_config: HarnessConfig | Any | None = None,
     debug: bool = False,
     callbacks: list[Callable[[str, Any], None]] = None,
 ) -> QuestOutcome | None:
diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py
index 0b78062..14dacaf 100644
--- a/llm_quest_benchmark/executors/benchmark.py
+++ b/llm_quest_benchmark/executors/benchmark.py
@@ -12,10 +12,10 @@
 from pathlib import Path
 from typing import Any
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
 from llm_quest_benchmark.core.logging import DEFAULT_DB_PATH
 from llm_quest_benchmark.core.runner import run_quest_with_timeout
 from llm_quest_benchmark.environments.state import QuestOutcome
+from llm_quest_benchmark.harnesses.factory import create_harness
 from llm_quest_benchmark.llm import tracing
 from llm_quest_benchmark.schemas.config import BenchmarkConfig
 
@@ -34,6 +34,68 @@
 logger = logging.getLogger(__name__)
 
 
+def _agent_harness(agent_config) -> str:
+    """Return the configured harness name."""
+    return agent_config.harness
+
+
+def _agent_model(agent_config) -> str:
+    """Return the result model label for the executed harness."""
+    harness = _agent_harness(agent_config)
+    if harness == "human":
+        return "human"
+    if harness.startswith("random_choice"):
+        return "random_policy"
+    return agent_config.model
+
+
+def _agent_id(agent_config) -> str:
+    """Return the stable result identifier for legacy and harness configs."""
+    return getattr(agent_config, "harness_id", None) or agent_config.agent_id
+
+
+def _agent_template(agent_config) -> str:
+    """Return legacy template name for result artifacts."""
+    if hasattr(agent_config, "action_template"):
+        return agent_config.action_template
+
+    harness_templates = {
+        "minimal": "stub.jinja",
+        "reasoning_recent": "reasoning.jinja",
+        "reasoning_full": "reasoning.jinja",
+        "memo_compact": "stateful_compact.jinja",
+        "hinted_compact": "stateful_compact_hints.jinja",
+        "tool_compact": "tool_augmented.jinja",
+        "tool_hinted": "tool_augmented_hints.jinja",
+        "planner": "planner.jinja",
+        "compaction_no_memo": "reasoning.jinja",
+        "memo_cot": "memo_cot.jinja",
+        "memo_extended": "memo_extended.jinja",
+        "memo_structured": "memo_structured.jinja",
+    }
+    return harness_templates.get(_agent_harness(agent_config), "reasoning.jinja")
+
+
+def _agent_memory_mode(agent_config) -> str:
+    """Return legacy memory mode for result artifacts."""
+    if hasattr(agent_config, "memory_mode"):
+        return agent_config.memory_mode
+
+    harness_memory_modes = {
+        "reasoning_full": "full_transcript",
+        "memo_compact": "compaction",
+        "hinted_compact": "compaction",
+        "tool_compact": "compaction",
+        "tool_hinted": "compaction",
+        "planner": "compaction",
+        "compaction_no_memo": "compaction",
+        "memo_cot": "compaction",
+        "memo_extended": "compaction",
+        "memo_structured": "compaction",
+    }
+    return harness_memory_modes.get(_agent_harness(agent_config), "default")
+
+
 def _result_entry(
     quest: str,
     agent_config,
@@ -44,10 +106,12 @@ def _result_entry(
 ) -> dict[str, Any]:
     return {
         "quest": quest,
-        "model": agent_config.model,
+        "model": _agent_model(agent_config),
         "temperature": agent_config.temperature,
-        "template": agent_config.action_template,
-        "agent_id": agent_config.agent_id,
+        "harness": _agent_harness(agent_config),
+        "template": _agent_template(agent_config),
+        "memory_mode": _agent_memory_mode(agent_config),
+        "agent_id": _agent_id(agent_config),
         "attempt": attempt,
         "outcome": outcome,
         "reward": reward,
@@ -78,7 +142,7 @@ def _mark_run_timeout(run_id: int | None, quest: str, agent_config, benchmark_id
                 WHERE id = ?
                 """,
                 (
-                    agent_config.agent_id,
+                    _agent_id(agent_config),
                     agent_config_json,
                     benchmark_id,
                     QuestOutcome.TIMEOUT.name,
@@ -101,7 +165,7 @@ def _mark_run_timeout(run_id: int | None, quest: str, agent_config, benchmark_id
                     Path(quest).stem,
                     end_time,
                     end_time,
-                    agent_config.agent_id,
+                    _agent_id(agent_config),
                     agent_config_json,
                     QuestOutcome.TIMEOUT.name,
                     0.0,
@@ -132,15 +196,14 @@ def callback(event: str, data: Any = None) -> None:
             )
 
     try:
-        agent = create_agent(
+        agent = create_harness(
+            harness=_agent_harness(agent_config),
             model=agent_config.model,
             temperature=agent_config.temperature,
-            system_template=agent_config.system_template,
-            action_template=agent_config.action_template,
             skip_single=agent_config.skip_single,
             debug=agent_config.debug,
-            memory_mode=agent_config.memory_mode,
             compaction_interval=agent_config.compaction_interval,
+            system_template=agent_config.system_template,
         )
         outcome = run_quest_with_timeout(
             quest,
@@ -254,7 +317,7 @@ def _write_benchmark_artifacts(config: BenchmarkConfig, results: list[dict[str,
                 "temperature": agent.temperature,
                 "runs": agent.runs,
                 "system_template": agent.system_template,
-                "action_template": agent.action_template,
+                "harness": _agent_harness(agent),
             }
             for agent in config.agents
         ],
@@ -281,7 +344,7 @@ def _write_benchmark_artifacts(config: BenchmarkConfig, results: list[dict[str,
             {
                 "model": agent.model,
                 "system_template": agent.system_template,
-                "action_template": agent.action_template,
+                "harness": _agent_harness(agent),
                 "temperature": agent.temperature,
                 "runs": agent.runs,
                 "skip_single": agent.skip_single,
@@ -346,7 +409,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
 
                 logger.info(
                     "Queued agent %s quest %s (attempt %s/%s)",
-                    agent_config.agent_id,
+                    _agent_id(agent_config),
                     quest_name,
                     attempt,
                     agent_config.runs,
@@ -378,7 +441,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
             }
             logger.info(
                 "Agent %s running quest %s (attempt %s/%s)",
-                agent_config.agent_id,
+                _agent_id(agent_config),
                 task["quest_name"],
                 task["attempt"],
                 agent_config.runs,
@@ -391,7 +454,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
                     "total_runs": total_runs,
                     "quest": task["quest"],
                     "quest_name": task["quest_name"],
-                    "agent_id": agent_config.agent_id,
+                    "agent_id": _agent_id(agent_config),
                     "model": agent_config.model,
                     "attempt": task["attempt"],
                 },
@@ -426,7 +489,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
                             "total_runs": total_runs,
                             "quest": task["quest"],
                             "quest_name": task["quest_name"],
-                            "agent_id": agent_config.agent_id,
+                            "agent_id": _agent_id(agent_config),
                             "model": agent_config.model,
                             "attempt": task["attempt"],
                             "outcome": result["outcome"],
@@ -471,7 +534,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
                         "total_runs": total_runs,
                         "quest": task["quest"],
                         "quest_name": task["quest_name"],
-                        "agent_id": agent_config.agent_id,
+                        "agent_id": _agent_id(agent_config),
                         "model": agent_config.model,
                         "attempt": task["attempt"],
                         "outcome": QuestOutcome.TIMEOUT.name,
diff --git a/llm_quest_benchmark/executors/cli/commands.py b/llm_quest_benchmark/executors/cli/commands.py
index e3cecbd..d554f70 100644
--- a/llm_quest_benchmark/executors/cli/commands.py
+++ b/llm_quest_benchmark/executors/cli/commands.py
@@ -18,13 +18,10 @@
 
 import typer
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
-from llm_quest_benchmark.agents.human_player import HumanPlayer
 from llm_quest_benchmark.constants import (
     DEFAULT_MODEL,
     DEFAULT_QUEST,
     DEFAULT_TEMPERATURE,
-    DEFAULT_TEMPLATE,
     INFINITE_TIMEOUT,
     MODEL_CHOICES,
     SYSTEM_ROLE_TEMPLATE,
@@ -40,9 +37,10 @@
     print_summary,
     run_benchmark,
 )
+from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
 from llm_quest_benchmark.llm import tracing
 from llm_quest_benchmark.renderers.terminal import RichRenderer
-from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
 
 # Initialize logging
 log_manager = LogManager()
@@ -53,6 +51,8 @@
     rich_markup_mode="rich",
 )
 
+HARNESS_CHOICES = list(HARNESS_REGISTRY.keys())
+
 
 def version_callback(value: bool):
     if value:
@@ -348,7 +348,12 @@ def run(
     model: str = typer.Option(DEFAULT_MODEL, help=f"Model for the LLM agent (choices: {', '.join(MODEL_CHOICES)})."),
     temperature: float = typer.Option(DEFAULT_TEMPERATURE, help="Temperature for LLM sampling"),
     system_template: str = typer.Option(SYSTEM_ROLE_TEMPLATE, help="Template to use for system instructions."),
-    action_template: str = typer.Option(DEFAULT_TEMPLATE, help="Template to use for action prompts."),
+    harness: str = typer.Option(
+        "reasoning_recent",
+        "--harness",
+        help="Harness to use for quest decisions.",
+    ),
+    compaction_interval: int = typer.Option(50, help="Advanced override for compaction interval."),
     timeout: int = typer.Option(60, help="Timeout in seconds for run (0 for no timeout)."),
     skip: bool = typer.Option(True, help="Auto-select single choices without asking agent."),
     debug: bool = typer.Option(False, help="Enable debug logging and output, remove terminal UI."),
@@ -365,23 +370,25 @@ def run(
         log_manager.setup(debug)
 
         # Create agent config
-        agent_config = AgentConfig(
+        agent_config = HarnessConfig(
             model=model,
             system_template=system_template,
-            action_template=action_template,
+            harness=harness,
             temperature=temperature,
             skip_single=skip,
             debug=debug,
+            compaction_interval=compaction_interval,
         )
 
         # Create agent
-        agent = create_agent(
+        agent = create_harness(
+            harness=harness,
             model=model,
             system_template=system_template,
-            action_template=action_template,
             temperature=temperature,
             skip_single=skip,
             debug=debug,
+            compaction_interval=compaction_interval,
         )
 
         log.warning(f"Starting quest run with agent {str(agent)}")
@@ -458,7 +465,7 @@ def play(
         log.debug(f"Quest file: {quest}")
 
         # Create interactive player
-        player = HumanPlayer(skip_single=skip, debug=debug)
+        player = create_harness(harness="human", skip_single=skip, debug=debug)
 
         # Run quest in interactive mode
         result = run_quest_with_timeout(quest_path=str(quest), agent=player, timeout=INFINITE_TIMEOUT, debug=debug)
@@ -952,7 +959,7 @@ def benchmark(
 
     This command runs benchmark evaluation using a YAML configuration file that specifies:
     - quests: list of quest files or directories to test
-    - agents: list of agents with their model, template, and temperature settings
+    - agents: list of harness entries, each with its model, harness name, and temperature settings
     - other settings: debug, timeout, workers, etc.
 
     Example:
diff --git a/llm_quest_benchmark/harnesses/__init__.py b/llm_quest_benchmark/harnesses/__init__.py
new file mode 100644
index 0000000..75cef22
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/__init__.py
@@ -0,0 +1,3 @@
+from llm_quest_benchmark.harnesses.base import BaseHarness
+
+__all__ = ["BaseHarness"]
diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py
new file mode 100644
index 0000000..fd8864b
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/base.py
@@ -0,0 +1,581 @@
+"""Base harness class for quest benchmark experiments."""
+
+import hashlib
+import json
+import logging
+import re
+from abc import abstractmethod
+from typing import Any
+
+from json_repair import repair_json
+
+from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, normalize_template_name
+from llm_quest_benchmark.llm.client import get_llm_client, parse_model_name
+from llm_quest_benchmark.llm.prompt import PromptRenderer
+from llm_quest_benchmark.players.base import QuestPlayer
+from llm_quest_benchmark.schemas.response import LLMResponse
+
+RISKY_CHOICE_KEYWORDS = (
+    "улететь",
+    "сдаться",
+    "отказ",
+    "провал",
+    "убежать",
+    "surrender",
+    "give up",
+)
+
+SAFE_CHOICE_KEYWORDS = (
+    "пройти мимо",
+    "избежать",
+    "подготов",
+    "библиотек",
+    "изуч",
+    "wait",
+    "avoid",
+    "study",
+)
+
+
+def _parse_json_response(
+    response: str,
+    debug: bool = False,
+    logger: logging.Logger | None = None,
+) -> tuple[dict[str, Any] | None, str | None]:
+    """Try to parse response as JSON, with repair attempt if needed."""
+    cleaned_response = (response or "").strip()
+    if not cleaned_response:
+        return None, None
+
+    try:
+        if "```json" in cleaned_response:
+            start = cleaned_response.find("```json") + 7
+            end = cleaned_response.find("```", start)
+            if end > start:
+                json_str = cleaned_response[start:end].strip()
+                if debug and logger:
+                    logger.debug("Extracted JSON: %s", json_str)
+                result = json.loads(json_str)
+                if debug and logger:
+                    logger.debug("Parsed JSON: %s", result)
+                return result, "json_fenced"
+
+        embedded_json = re.search(r"\{[\s\S]*\}", cleaned_response)
+        if embedded_json:
+            candidate = embedded_json.group(0).strip()
+            if candidate and candidate != cleaned_response:
+                try:
+                    result = json.loads(candidate)
+                    if debug and logger:
+                        logger.debug("Parsed embedded JSON: %s", result)
+                    return result, "json_embedded"
+                except json.JSONDecodeError:
+                    pass
+
+        result = json.loads(cleaned_response)
+        if debug and logger:
+            logger.debug("Direct JSON parse successful: %s", result)
+        return result, "json_direct"
+    except json.JSONDecodeError:
+        if debug and logger:
+            logger.debug("Initial JSON parse failed, attempting repair")
+        try:
+            repaired = repair_json(cleaned_response)
+            if debug and logger:
+                logger.debug("Repaired JSON: %s", repaired)
+            result = json.loads(repaired)
+            if debug and logger:
+                logger.debug("Parse of repaired JSON successful: %s", result)
+            return result, "json_repaired"
+        except Exception as exc:
+            if debug and logger:
+                logger.error("JSON repair failed: %s", exc)
+            return None, None
+
+
+def _validate_action_number(
+    action: int,
+    num_choices: int,
+    debug: bool = False,
+    logger: logging.Logger | None = None,
+) -> bool:
+    """Return True when ``action`` is a valid 1-based choice index, i.e. within 1..num_choices."""
+    in_range = 1 <= action <= num_choices
+    # Rejections are only reported when debug is on and a logger was supplied.
+    if not in_range and debug and logger:
+        logger.error("Action number %s out of range [1, %s]", action, num_choices)
+    return in_range
+
+
+def _extract_action_from_text(response: str, num_choices: int) -> int | None:
+    """Return the first standalone integer in ``response`` that lies within 1..num_choices, or None."""
+    # Candidates are produced lazily, in the order they appear in the text.
+    numbers = (int(found.group(1)) for found in re.finditer(r"\b(\d+)\b", response))
+    # Out-of-range numbers are skipped rather than ending the search.
+    in_range = (number for number in numbers if 1 <= number <= num_choices)
+    return next(in_range, None)
+
+
+def _extract_field_from_text(response: str, field: str) -> str | None:
+    """Best-effort lookup of ``field``'s text value in loosely formatted model output.
+
+    Patterns are tried in order: a fully quoted ``"field": "value"`` pair, a quoted
+    pair read up to the next double quote or line break, and a ``field: value`` or
+    ``field - value`` label line. The first non-empty match wins, whitespace-collapsed.
+    """
+    if not response:
+        return None
+
+    # Tried in this order; every pattern exposes a ``value`` group.
+    patterns = (
+        re.compile(
+            rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>.*?)['"]""",
+            re.IGNORECASE | re.DOTALL,
+        ),
+        re.compile(
+            rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>[^"\n\r]+)""",
+            re.IGNORECASE,
+        ),
+        re.compile(
+            rf"""(?im)^\s*{re.escape(field)}\s*[:\-]\s*(?P<value>.+?)\s*$""",
+        ),
+    )
+
+    for pattern in patterns:
+        found = pattern.search(response)
+        if not found:
+            continue
+        # Collapse whitespace runs; a capture that ends up empty falls through.
+        value = " ".join(found.group("value").split())
+        if value:
+            return value
+
+    return None
+
+
+def _raw_reasoning_fallback(response: str) -> str | None:
+    """Collapse whitespace in the raw reply and return it, capped at 240 chars, tagged ``raw_response:``."""
+    flattened = " ".join((response or "").split())
+    if not flattened:
+        return None
+    clipped = flattened if len(flattened) <= 240 else flattened[:237] + "..."
+    return f"raw_response: {clipped}"
+
+
+def _is_numeric_raw_reasoning(reasoning: str | None) -> bool:
+    """Return True when ``reasoning`` is a ``raw_response:`` fallback whose payload is only digits."""
+    prefix = "raw_response:"
+    # str.isdigit() is False for an empty payload, so a bare prefix does not count as numeric.
+    return bool(reasoning) and reasoning.startswith(prefix) and reasoning[len(prefix) :].strip().isdigit()
+
+
+def parse_llm_response(
+    response: str,
+    num_choices: int,
+    debug: bool = False,
+    logger: logging.Logger | None = None,
+) -> LLMResponse:
+    """Parse an LLM response and return a structured response object."""
+    if debug and logger:
+        logger.debug("Raw LLM response: %s", response)
+
+    extracted_analysis = _extract_field_from_text(response, "analysis")
+    extracted_reasoning = _extract_field_from_text(response, "reasoning")
+    raw_reasoning = _raw_reasoning_fallback(response)
+
+    response_json, json_parse_mode = _parse_json_response(response, debug, logger)
+    if response_json and isinstance(response_json, dict):
+        analysis = response_json.get("analysis") or extracted_analysis
+        reasoning = response_json.get("reasoning") or response_json.get("thinking") or extracted_reasoning
+        if not reasoning and analysis:
+            reasoning = analysis
+        if not analysis and not reasoning:
+            reasoning = raw_reasoning
+
+        memo_raw = response_json.get("memo")
+        memo = str(memo_raw) if memo_raw is not None else None
+        action_value = response_json.get("action") or response_json.get("result") or response_json.get("choice")
+        if action_value is not None:
+            try:
+                action = int(action_value)
+                if _validate_action_number(action, num_choices, debug, logger):
+                    return LLMResponse(
+                        action=action,
+                        reasoning=reasoning,
+                        analysis=analysis,
+                        memo=memo,
+                        is_default=False,
+                        parse_mode=json_parse_mode or "json",
+                    )
+            except (ValueError, TypeError):
+                if debug and logger:
+                    logger.error("Invalid action value in JSON: %s", action_value)
+
+    try:
+        action = int(response.strip())
+        if _validate_action_number(action, num_choices, debug, logger):
+            return LLMResponse(
+                action=action,
+                reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
+                analysis=extracted_analysis,
+                is_default=False,
+                parse_mode="number_only",
+            )
+    except ValueError:
+        if debug and logger:
+            logger.error("Could not parse response as number: %s", response)
+
+    extracted_action = _extract_action_from_text(response, num_choices)
+    if extracted_action is not None:
+        return LLMResponse(
+            action=extracted_action,
+            reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
+            analysis=extracted_analysis,
+            is_default=False,
+            parse_mode="number_extracted",
+        )
+
+    if debug and logger:
+        logger.error("Error during response parsing, defaulting to first choice. Response: %s...", response[:100])
+    return LLMResponse(
+        action=1,
+        reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
+        analysis=extracted_analysis,
+        is_default=True,
+        parse_mode="default_first",
+    )
+
+
+class BaseHarness(QuestPlayer):
+    """Abstract LLM harness base class."""
+
+    def __init__(
+        self,
+        model_name,
+        system_template,
+        temperature,
+        skip_single,
+        debug,
+        memory_module=None,
+        tools=None,
+        action_template=DEFAULT_TEMPLATE,
+    ):
+        super().__init__(skip_single=skip_single)
+        self.debug = debug
+        self.model_name = model_name.lower()
+        self.system_template = normalize_template_name(system_template)
+        self.action_template = normalize_template_name(action_template)
+        self.temperature = temperature
+        self.harness_name = getattr(self.__class__, "harness_name", "")
+        self.agent_id = f"harness_{self.model_name}"
+        self.memory_module = memory_module
+        self.tools = tools or []
+        self.model_spec = parse_model_name(self.model_name)
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if self.debug:
+            self.logger.setLevel(logging.DEBUG)
+            self.logger.propagate = False
+            if not any(getattr(h, "_llm_quest_handler", False) for h in self.logger.handlers):
+                handler = logging.StreamHandler()
+                handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
+                handler._llm_quest_handler = True
+                self.logger.addHandler(handler)
+
+        self.prompt_renderer = PromptRenderer(
+            None,
+            system_template=self.system_template,
+            action_template=self.action_template,
+        )
+        self.llm = None
+        self.history: list[LLMResponse] = []
+        self._use_safety_filter = True
+        self._last_response = LLMResponse(action=1, is_default=True)
+        self._observation_history: list[str] = []
+        self._decision_history: list[dict[str, Any]] = []
+        self._state_action_counts: dict[str, dict[int, int]] = {}
+        self._step_count = 0
+
+    def _ensure_llm(self) -> None:
+        """Lazily create the provider client only when inference is needed."""
+        if self.llm is None:
+            self.llm = get_llm_client(
+                self.model_name,
+                system_prompt=self.prompt_renderer.render_system_prompt(),
+                temperature=self.temperature,
+            )
+        if self.memory_module is not None and hasattr(self.memory_module, "llm_client"):
+            self.memory_module.llm_client = self.llm
+
+    @abstractmethod
+    def _get_action_impl(self, observation, choices) -> int:
+        """Return the selected 1-based action number."""
+        pass
+
+    def reset(self) -> None:
+        """Reset harness state between episodes."""
+        super().reset()
+        self.history = []
+        self._last_response = LLMResponse(action=1, is_default=True)
+        self._observation_history = []
+        self._decision_history = []
+        self._state_action_counts = {}
+        self._step_count = 0
+        if self.memory_module is not None:
+            self.memory_module.reset()
+
+    def get_action(self, observation: str, choices: list[dict[str, str]]) -> int:
+        clean = (observation or "").strip()
+        if clean:
+            self._observation_history.append(clean)
+            if len(self._observation_history) > 20:
+                self._observation_history = self._observation_history[-20:]
+        return super().get_action(observation, choices)
+
+    def on_game_start(self) -> None:
+        super().on_game_start()
+        self.reset()
+
+    def on_game_end(self, final_state: dict[str, Any]) -> None:
+        if self.debug:
+            self.logger.debug("Game ended with state: %s", final_state)
+
+    def get_last_response(self) -> LLMResponse | None:
+        return self._last_response
+
+    def _build_contextual_state(self, state: str) -> str:
+        if self.memory_module is None:
+            return state
+        context = self.memory_module.get_context(self._step_count + 1)
+        if not context:
+            return state
+        return f"{context}\n\nCurrent story state:\n{state}"
+
+    @staticmethod
+    def _normalize_for_signature(value: str, max_len: int = 320) -> str:
+        """Lowercase ``value``, squeeze whitespace runs to single spaces, and cap it at ``max_len`` chars."""
+        squeezed = re.sub(r"\s+", " ", (value or "").lower()).strip()
+        return squeezed[:max_len]
+
+    def _state_signature(self, state: str, choices: list[dict[str, str]]) -> str:
+        normalized_state = self._normalize_for_signature(state, max_len=420)
+        normalized_choices = "|".join(
+            self._normalize_for_signature(choice.get("text", ""), max_len=110) for choice in choices
+        )
+        raw_signature = f"{normalized_state}||{normalized_choices}"
+        return hashlib.sha1(raw_signature.encode("utf-8", errors="ignore")).hexdigest()[:20]
+
+    def _remember_decision(
+        self,
+        state: str,
+        choices: list[dict[str, str]],
+        state_signature: str,
+        response: LLMResponse,
+    ) -> None:
+        action = int(response.action)
+        counts = self._state_action_counts.setdefault(state_signature, {})
+        counts[action] = counts.get(action, 0) + 1
+
+        selected_text = ""
+        if 1 <= action <= len(choices):
+            selected_text = choices[action - 1].get("text", "")
+        state_snippet = (state or "").strip()
+        if len(state_snippet) > 220:
+            state_snippet = state_snippet[:220] + "..."
+
+        decision = {
+            "state": state_snippet,
+            "action": action,
+            "choice": selected_text,
+            "choice_text": selected_text,
+            "parse_mode": response.parse_mode or "unknown",
+            "memo": (response.memo or "").strip()[:350] or None,
+            "reasoning": (response.reasoning or "")[:800],
+        }
+        self._decision_history.append(decision)
+        if len(self._decision_history) > 40:
+            self._decision_history = self._decision_history[-40:]
+
+        self._step_count += 1
+        if self.memory_module is not None:
+            self.memory_module.update(
+                {
+                    "step": self._step_count,
+                    "observation": state,
+                    "choices": [c.get("text", "") for c in choices],
+                    **decision,
+                }
+            )
+
+    def _format_prompt(self, observation, choices, memo=None, context=None) -> str:
+        """Render the action Jinja template for the current decision."""
+        return self.prompt_renderer.action_template.render(
+            observation=observation,
+            choices=[{"text": c.get("text", "")} for c in choices],
+            memo=memo,
+            context=context,
+        ).strip()
+
+    def _parse_llm_response(self, response, num_choices) -> LLMResponse:
+        """Parse an LLM response into a structured response object."""
+        return parse_llm_response(response, num_choices, self.debug, self.logger)
+
+    def _call_llm(self, prompt, system_prompt=None) -> str:
+        """Call the LLM client with lightweight retry handling."""
+        self._ensure_llm()
+        last_error: Exception | None = None
+        for attempt in range(3):
+            try:
+                if system_prompt is not None:
+                    return self.llm.get_completion(prompt, system_prompt=system_prompt)
+                return self.llm.get_completion(prompt)
+            except TypeError:
+                if system_prompt is not None:
+                    return self.llm.get_completion(prompt)
+                raise
+            except Exception as exc:
+                last_error = exc
+                if self.debug:
+                    self.logger.warning("LLM call failed on attempt %d: %s", attempt + 1, exc)
+        raise last_error or RuntimeError("LLM call failed")
+
+    def _choice_risk_score(self, choice_text: str) -> int:
+        """Score a choice: +2 per risky keyword present, -1 per safe keyword present (lowercased match)."""
+        lowered = (choice_text or "").lower()
+        risky_hits = sum(
+            1 for keyword in RISKY_CHOICE_KEYWORDS if keyword in lowered
+        )
+        safe_hits = sum(
+            1 for keyword in SAFE_CHOICE_KEYWORDS if keyword in lowered
+        )
+        return 2 * risky_hits - safe_hits
+
+    def _apply_safety_filter(self, choices, preferred_action) -> int:
+        """Replace obviously risky actions when a clearly safer alternative exists."""
+        if not self._use_safety_filter or len(choices) < 2:
+            return preferred_action
+
+        current_idx = preferred_action - 1
+        if current_idx < 0 or current_idx >= len(choices):
+            return preferred_action
+
+        scored = [(idx + 1, self._choice_risk_score(c.get("text", ""))) for idx, c in enumerate(choices)]
+        scored.sort(key=lambda item: item[1])
+
+        best_action, best_score = scored[0]
+        current_score = self._choice_risk_score(choices[current_idx].get("text", ""))
+        if current_score - best_score >= 2:
+            if self.debug:
+                self.logger.debug(
+                    "Safety filter override: %s -> %s (risk %s -> %s)",
+                    preferred_action,
+                    best_action,
+                    current_score,
+                    best_score,
+                )
+            return best_action
+        return preferred_action
+
+    @staticmethod
+    def _normalize_usage(usage: dict[str, Any] | None) -> dict[str, Any]:
+        usage = usage or {}
+        prompt_tokens = int(usage.get("prompt_tokens") or 0)
+        completion_tokens = int(usage.get("completion_tokens") or 0)
+        total_tokens = int(usage.get("total_tokens") or (prompt_tokens + completion_tokens))
+        estimated_cost_usd = usage.get("estimated_cost_usd")
+        if estimated_cost_usd is not None:
+            estimated_cost_usd = float(estimated_cost_usd)
+        return {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens,
+            "estimated_cost_usd": estimated_cost_usd,
+        }
+
+    @classmethod
+    def _merge_usage(cls, first: dict[str, Any] | None, second: dict[str, Any] | None) -> dict[str, Any]:
+        a = cls._normalize_usage(first)
+        b = cls._normalize_usage(second)
+        merged_cost = None
+        if a["estimated_cost_usd"] is not None or b["estimated_cost_usd"] is not None:
+            merged_cost = (a["estimated_cost_usd"] or 0.0) + (b["estimated_cost_usd"] or 0.0)
+        return {
+            "prompt_tokens": a["prompt_tokens"] + b["prompt_tokens"],
+            "completion_tokens": a["completion_tokens"] + b["completion_tokens"],
+            "total_tokens": a["total_tokens"] + b["total_tokens"],
+            "estimated_cost_usd": merged_cost,
+        }
+
+    def _format_retry_prompt(self, state: str, choices: list[dict[str, str]]) -> str:
+        clipped_state = (state or "").strip()
+        if len(clipped_state) > 500:
+            clipped_state = clipped_state[:500] + "..."
+        choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:160]}" for i, c in enumerate(choices)])
+        return f"""Choose the best action.
+State: {clipped_state}
+Actions:
+{choices_text}
+
+Return valid JSON only:
+{{
+  "analysis": "<max 25 words>",
+  "reasoning": "<max 25 words>",
+  "result": <integer from 1 to {len(choices)}>
+}}"""
+
+    def _format_force_numeric_retry_prompt(self, choices: list[dict[str, str]]) -> str:
+        choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:110]}" for i, c in enumerate(choices)])
+        return f"""Pick one action number.
+{choices_text}
+Reply with one integer only: 1 to {len(choices)}."""
+
+    def _needs_force_numeric_retry(self) -> bool:
+        """Return True for ``openai``-provider models whose id starts with ``gpt-5`` or ``o``."""
+        spec = self.model_spec
+        return spec.provider == "openai" and spec.model_id.startswith(("gpt-5", "o"))
+
+    def _parse_with_retries(self, prompt: str, observation: str, choices: list[dict[str, str]]) -> LLMResponse:
+        """Call the model, parse, and retry once on invalid/default output."""
+        llm_response = self._call_llm(prompt)
+        llm_usage = self.llm.get_last_usage()
+        first_response = self._parse_llm_response(llm_response, len(choices))
+        parsed_response = first_response
+
+        if parsed_response.is_default:
+            retry_response = self._call_llm(self._format_retry_prompt(observation, choices))
+            retry_usage = self.llm.get_last_usage()
+            llm_usage = self._merge_usage(llm_usage, retry_usage)
+            retry_parsed = self._parse_llm_response(retry_response, len(choices))
+            if not retry_parsed.is_default:
+                retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
+                parsed_response = retry_parsed
+            elif self._needs_force_numeric_retry():
+                force_retry_response = self._call_llm(self._format_force_numeric_retry_prompt(choices))
+                force_retry_usage = self.llm.get_last_usage()
+                llm_usage = self._merge_usage(llm_usage, force_retry_usage)
+                force_retry_parsed = self._parse_llm_response(force_retry_response, len(choices))
+                if not force_retry_parsed.is_default:
+                    force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}"
+                    parsed_response = force_retry_parsed
+
+        if parsed_response is not first_response:
+            if parsed_response.analysis is None and first_response.analysis is not None:
+                parsed_response.analysis = first_response.analysis
+            if _is_numeric_raw_reasoning(parsed_response.reasoning):
+                if first_response.reasoning and not _is_numeric_raw_reasoning(first_response.reasoning):
+                    parsed_response.reasoning = first_response.reasoning
+                else:
+                    first_raw_reasoning = _raw_reasoning_fallback(llm_response)
+                    if first_raw_reasoning and not _is_numeric_raw_reasoning(first_raw_reasoning):
+                        parsed_response.reasoning = first_raw_reasoning
+
+        action_before_policy = parsed_response.action
+        parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
+        if parsed_response.action != action_before_policy and not parsed_response.reasoning:
+            parsed_response.reasoning = "policy_safety_override"
+
+        usage_payload = self._normalize_usage(llm_usage)
+        parsed_response.prompt_tokens = usage_payload["prompt_tokens"]
+        parsed_response.completion_tokens = usage_payload["completion_tokens"]
+        parsed_response.total_tokens = usage_payload["total_tokens"]
+        parsed_response.estimated_cost_usd = usage_payload["estimated_cost_usd"]
+        return parsed_response
diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
new file mode 100644
index 0000000..87e2d77
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -0,0 +1,92 @@
+"""Factory for creating harness-based quest players."""
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL
+from llm_quest_benchmark.harnesses.memo import (
+    CompactionNoMemoHarness,
+    HintedCompactHarness,
+    MemoCompactHarness,
+    MemoCotHarness,
+    MemoExtendedHarness,
+    MemoStructuredHarness,
+)
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+from llm_quest_benchmark.harnesses.planner import PlannerHarness
+from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness
+from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness
+from llm_quest_benchmark.players.base import QuestPlayer
+from llm_quest_benchmark.players.human import HumanPlayer
+from llm_quest_benchmark.players.random import RandomPlayer
+
+HARNESS_REGISTRY = {
+    "minimal": MinimalHarness,
+    "reasoning_recent": ReasoningRecentHarness,
+    "reasoning_full": ReasoningFullTranscriptHarness,
+    "memo_compact": MemoCompactHarness,
+    "hinted_compact": HintedCompactHarness,
+    "tool_compact": ToolCompactHarness,
+    "tool_hinted": ToolHintedHarness,
+    "planner": PlannerHarness,
+    "compaction_no_memo": CompactionNoMemoHarness,
+    "memo_cot": MemoCotHarness,
+    "memo_extended": MemoExtendedHarness,
+    "memo_structured": MemoStructuredHarness,
+}
+
+SPECIAL_HARNESSES = ("human", "random_choice", "random_choice_<seed>")
+
+
+def _parse_random_choice_seed(identifier: str) -> tuple[bool, int | None]:
+    """Return ``(is_random, seed)``; seed is set only for the ``random_choice_<digits>`` form."""
+    if identifier == "random_choice":
+        return True, None
+    suffix = identifier.removeprefix("random_choice_")
+    seeded = suffix != identifier and suffix.isdigit()
+    return (True, int(suffix)) if seeded else (False, None)
+
+
+def is_random_choice_harness(identifier: str) -> bool:
+    """Return True when ``identifier`` is ``random_choice`` or ``random_choice_<digits>``."""
+    return _parse_random_choice_seed(identifier)[0]
+
+
+def create_harness(
+    harness: str,
+    model: str = DEFAULT_MODEL,
+    temperature: float = 0.4,
+    skip_single: bool = False,
+    debug: bool = False,
+    compaction_interval: int = 50,
+    system_template: str = "system_role.jinja",
+) -> QuestPlayer:
+    """Build the player for ``harness``: a registry LLM harness, ``human``, or ``random_choice[_<seed>]``.
+    Raises ValueError for an unknown harness or an incompatible harness/model pairing."""
+    valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
+    harness_is_random, seed = _parse_random_choice_seed(harness)
+    model_is_random, _ = _parse_random_choice_seed(model)
+    if harness_is_random:
+        if model_is_random and model != "random_choice":
+            raise ValueError("Encode random seeds in harness, for example harness='random_choice_123'")
+        if model not in (DEFAULT_MODEL, "random_choice"):
+            raise ValueError("Use model='random_choice' with random_choice harnesses")
+        return RandomPlayer(seed=seed, debug=debug, skip_single=skip_single)
+    if harness == "human":
+        # NOTE(review): debug is not forwarded to HumanPlayer here — confirm that is intended.
+        return HumanPlayer(skip_single=skip_single)
+    if harness not in HARNESS_REGISTRY:  # malformed random_choice_* names end up here as well
+        raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}")
+    if model_is_random:
+        raise ValueError(
+            "Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness"
+        )
+    if model.startswith("random_choice"):
+        raise ValueError(f"Unknown random_choice model '{model}'. Valid: {valid}")
+    if model == "human":
+        raise ValueError("Use harness='human' for human runs instead of pairing human model with an LLM harness")
+    return HARNESS_REGISTRY[harness](
+        model_name=model,
+        temperature=temperature,
+        skip_single=skip_single,
+        debug=debug,
+        compaction_interval=compaction_interval,
+        system_template=system_template,
+    )
diff --git a/llm_quest_benchmark/harnesses/memo.py b/llm_quest_benchmark/harnesses/memo.py
new file mode 100644
index 0000000..63bfb60
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/memo.py
@@ -0,0 +1,102 @@
+"""Compacted-memory harness variants."""
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.memory import CompactionMemory
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+
+
+class MemoCompactHarness(MinimalHarness):
+    """Minimal action loop backed by a compacting memory module.
+
+    Uses the ``stateful_compact.jinja`` action template by default. When no
+    memory module is injected, a ``CompactionMemory`` configured with
+    ``compaction_interval`` is created.
+    """
+
+    harness_name = "memo_compact"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "stateful_compact.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        compaction_interval: int = 50,
+        memory_module=None,
+        **kwargs,
+    ):
+        # Build the default memory only when the caller did not supply one.
+        if memory_module is None:
+            memory_module = CompactionMemory(compaction_interval=compaction_interval)
+
+        base_options = {
+            "model_name": model_name,
+            "system_template": system_template,
+            "action_template": action_template,
+            "temperature": temperature,
+            "skip_single": skip_single,
+            "debug": debug,
+            "memory_module": memory_module,
+        }
+        super().__init__(**base_options, **kwargs)
+
+        # Set unconditionally, even when a custom memory module was injected.
+        self._memory_mode = "compaction"
+        self._compaction_interval = compaction_interval
+
+
+class HintedCompactHarness(MemoCompactHarness):
+    """``MemoCompactHarness`` variant that defaults to ``stateful_compact_hints.jinja``."""
+
+    harness_name = "hinted_compact"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "stateful_compact_hints.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        compaction_interval: int = 50,
+        memory_module=None,
+        **kwargs,
+    ):
+        # Pure pass-through: only the default action template differs from the parent.
+        forwarded = {
+            "model_name": model_name,
+            "system_template": system_template,
+            "action_template": action_template,
+            "temperature": temperature,
+            "skip_single": skip_single,
+            "debug": debug,
+            "compaction_interval": compaction_interval,
+            "memory_module": memory_module,
+        }
+        super().__init__(**forwarded, **kwargs)
+
+
+class CompactionNoMemoHarness(MemoCompactHarness):
+    """Retired Exp 4 ablation: compacted transcript without memo-oriented prompting."""
+
+    harness_name = "compaction_no_memo"
+
+    def __init__(self, *args, action_template: str = "reasoning.jinja", **kwargs):
+        """Forward all arguments to the parent, defaulting the action template to ``reasoning.jinja``."""
+        forwarded = {**kwargs, "action_template": action_template}
+        super().__init__(*args, **forwarded)
+
+
+class MemoExtendedHarness(MemoCompactHarness):
+    """Retired Exp 4 variant with a larger generic memo field."""
+
+    harness_name = "memo_extended"
+
+    def __init__(self, *args, action_template: str = "memo_extended.jinja", **kwargs):
+        """Forward all arguments to the parent, defaulting the action template to ``memo_extended.jinja``."""
+        forwarded = {**kwargs, "action_template": action_template}
+        super().__init__(*args, **forwarded)
+
+
+class MemoStructuredHarness(MemoCompactHarness):
+    """Retired Exp 4 variant with structured memo prompting."""
+
+    harness_name = "memo_structured"
+
+    def __init__(self, *args, action_template: str = "memo_structured.jinja", **kwargs):
+        """Forward all arguments to the parent, defaulting the action template to ``memo_structured.jinja``."""
+        forwarded = {**kwargs, "action_template": action_template}
+        super().__init__(*args, **forwarded)
+
+
+class MemoCotHarness(MemoCompactHarness):
+    """Retired Exp 4 variant with scratchpad-style memo prompting."""
+
+    harness_name = "memo_cot"
+
+    def __init__(self, *args, action_template: str = "memo_cot.jinja", **kwargs):
+        """Forward all arguments to the parent, defaulting the action template to ``memo_cot.jinja``."""
+        forwarded = {**kwargs, "action_template": action_template}
+        super().__init__(*args, **forwarded)
diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py
new file mode 100644
index 0000000..45ba5e5
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/memory.py
@@ -0,0 +1,353 @@
+"""Memory modules for harness-based quest players."""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+class MemoryModule(ABC):
+    """Abstract interface for the memory backing a quest-playing harness.
+
+    Subclasses must implement ``get_context``, ``update`` and ``reset``. The
+    remaining members provide inert defaults: no briefing, an empty transcript,
+    a zero compaction counter, and setters that raise ``TypeError``.
+    """
+
+    @abstractmethod
+    def get_context(self, step: int) -> str:
+        """Return the memory rendered as prompt context for ``step``."""
+
+    @abstractmethod
+    def update(self, step_data: dict) -> None:
+        """Record one step's data in memory."""
+
+    @abstractmethod
+    def reset(self) -> None:
+        """Clear all remembered state."""
+
+    @property
+    def quest_briefing(self) -> str | None:
+        """Quest briefing text; the base implementation has none."""
+        return None
+
+    @property
+    def transcript(self) -> list[dict[str, Any]]:
+        """Recorded transcript entries; the base implementation returns a new empty list."""
+        return []
+
+    @transcript.setter
+    def transcript(self, value: list[dict[str, Any]]) -> None:
+        raise TypeError(f"{self.__class__.__name__} does not support transcript assignment")
+
+    @property
+    def steps_since_compaction(self) -> int:
+        """Steps recorded since the last compaction; always ``0`` in the base implementation."""
+        return 0
+
+    @steps_since_compaction.setter
+    def steps_since_compaction(self, value: int) -> None:
+        raise TypeError(f"{self.__class__.__name__} does not support compaction counters")
+
+    def set_quest_briefing(self, briefing: str) -> None:
+        """Store the stripped briefing, or ``None`` when it is blank.
+
+        Has no effect on instances that lack a ``_quest_briefing`` attribute.
+        """
+        cleaned = (briefing or "").strip()
+        if not hasattr(self, "_quest_briefing"):
+            return
+        self._quest_briefing = cleaned if cleaned else None
+
+    def _briefing_block(self, current_state: str) -> str | None:
+        """Render the briefing section, or ``None`` when it should be omitted.
+
+        The section is omitted when there is no briefing or when the stripped
+        ``current_state`` equals the briefing. Briefings longer than 800
+        characters are cut to 800 and suffixed with ``...``.
+        """
+        text = self.quest_briefing
+        if not text or current_state.strip() == text:
+            return None
+        shown = text if len(text) <= 800 else text[:800] + "..."
+        return f"Quest briefing (your mission):\n{shown}"
+
+
+class DefaultMemory(MemoryModule):
+    """Recent N observations window without compaction.
+
+    Keeps bounded lists of observations and decisions and renders the most
+    recent ones as prompt context.
+
+    Args:
+        context_window: Number of previous observations (excluding the current
+            one) to include. Non-positive values include none.
+        context_chars: Maximum characters kept from each previous observation
+            before it is clipped with ``...``.
+        decision_window: Number of most recent decisions to include.
+            Non-positive values include none.
+    """
+
+    # Retention caps for stored history and the per-memo character limit.
+    MAX_OBSERVATIONS = 20
+    MAX_DECISIONS = 40
+    MAX_MEMO_CHARS = 350
+
+    def __init__(self, context_window: int = 3, context_chars: int = 220, decision_window: int = 5):
+        self.context_window = context_window
+        self.context_chars = context_chars
+        self.decision_window = decision_window
+        self._quest_briefing: str | None = None
+        self._observations: list[str] = []
+        self._decisions: list[dict[str, Any]] = []
+
+    @property
+    def quest_briefing(self) -> str | None:
+        """First non-empty observation seen, unless overridden via ``set_quest_briefing``."""
+        return self._quest_briefing
+
+    @staticmethod
+    def _tail(items: list, count: int) -> list:
+        """Return the last ``count`` items, or an empty list when ``count`` is not positive."""
+        # A bare ``items[-count:]`` yields the WHOLE list for ``count == 0``
+        # because ``-0 == 0``; a zero-sized window must select nothing.
+        return items[-count:] if count > 0 else []
+
+    def get_context(self, step: int) -> str:
+        """Render briefing, recent observations, memos and decisions as one string.
+
+        Args:
+            step: Current step number; not used by this module.
+
+        Returns:
+            The non-empty sections joined by blank lines, or ``""`` when there
+            is nothing to show.
+        """
+        blocks: list[str] = []
+        current = self._observations[-1] if self._observations else ""
+
+        briefing = self._briefing_block(current)
+        if briefing:
+            blocks.append(briefing)
+
+        # Everything except the current observation, limited to the window.
+        previous = self._tail(self._observations[:-1], self.context_window)
+        if previous:
+            snippets = []
+            for idx, text in enumerate(previous, start=1):
+                clipped = text if len(text) <= self.context_chars else text[: self.context_chars] + "..."
+                snippets.append(f"[Previous {idx}] {clipped}")
+            blocks.append("Recent context from previous steps:\n" + "\n\n".join(snippets))
+
+        recent_decisions = self._tail(self._decisions, self.decision_window)
+        if recent_decisions:
+            recent_memos: list[str] = []
+            for item in recent_decisions:
+                memo = (item.get("memo") or "").strip()
+                if not memo:
+                    continue
+                # Collapse runs of identical consecutive memos.
+                if recent_memos and recent_memos[-1] == memo:
+                    continue
+                recent_memos.append(memo)
+            if recent_memos:
+                lines = [f"[Memo {idx}] {memo}" for idx, memo in enumerate(recent_memos, start=1)]
+                blocks.append("State memo (recent):\n" + "\n".join(lines))
+
+            decision_lines = []
+            for idx, item in enumerate(recent_decisions, start=1):
+                choice = item.get("choice") or item.get("choice_text", "")
+                parse_mode = item.get("parse_mode", "unknown")
+                memo_val = item.get("memo")
+                memo_suffix = f" | memo: {memo_val}" if memo_val else ""
+                decision_lines.append(
+                    f"[Decision {idx}] action {item.get('action')}: {choice} (parse={parse_mode}){memo_suffix}"
+                )
+            blocks.append("Recent selected actions:\n" + "\n".join(decision_lines))
+
+        return "\n\n".join(blocks)
+
+    def update(self, step_data: dict) -> None:
+        """Record an observation and/or a decision from ``step_data``.
+
+        The observation is read from ``"observation"`` (falling back to
+        ``"state"``). A decision is recorded when any of ``"action"``,
+        ``"choice"``, ``"choice_text"`` or ``"memo"`` is present.
+        """
+        observation = (step_data.get("observation") or step_data.get("state") or "").strip()
+        if observation:
+            # The first observation doubles as the quest briefing.
+            if self._quest_briefing is None:
+                self._quest_briefing = observation
+            self._observations.append(observation)
+            if len(self._observations) > self.MAX_OBSERVATIONS:
+                self._observations = self._observations[-self.MAX_OBSERVATIONS :]
+
+        if any(key in step_data for key in ("action", "choice", "choice_text", "memo")):
+            memo = (step_data.get("memo") or "").strip()[: self.MAX_MEMO_CHARS] or None
+            self._decisions.append(
+                {
+                    "action": step_data.get("action"),
+                    "choice": step_data.get("choice") or step_data.get("choice_text", ""),
+                    "parse_mode": step_data.get("parse_mode", "unknown"),
+                    "memo": memo,
+                }
+            )
+            if len(self._decisions) > self.MAX_DECISIONS:
+                self._decisions = self._decisions[-self.MAX_DECISIONS :]
+
+    def reset(self) -> None:
+        """Forget the briefing, observations and decisions."""
+        self._quest_briefing = None
+        self._observations = []
+        self._decisions = []
+
+
+class FullTranscriptMemory(MemoryModule):
+    """Unbounded full transcript in context."""
+
+    def __init__(self):
+        self._quest_briefing: str | None = None
+        self._transcript: list[dict[str, Any]] = []
+
+    @property
+    def quest_briefing(self) -> str | None:
+        """First non-empty observation seen, unless overridden via ``set_quest_briefing``."""
+        return self._quest_briefing
+
+    @property
+    def transcript(self) -> list[dict[str, Any]]:
+        """The live list of recorded entries (not a copy)."""
+        return self._transcript
+
+    @transcript.setter
+    def transcript(self, value: list[dict[str, Any]]) -> None:
+        self._transcript = value
+
+    @staticmethod
+    def _render_entry(entry: dict[str, Any]) -> str:
+        """Format one transcript entry as a multi-line text block."""
+        observation = entry.get("observation", "")
+        if len(observation) > 400:
+            observation = observation[:400] + "..."
+        parts = [f"Step {entry.get('step', '?')}: {observation}"]
+
+        picked = entry.get("choice_text") or entry.get("choice", "")
+        if picked:
+            parts.append(f"  You chose: {picked}")
+
+        rationale = entry.get("reasoning", "")
+        if rationale:
+            parts.append(f"  Reasoning: {rationale[:800]}")
+
+        notes = entry.get("memo", "")
+        if notes:
+            parts.append(f"  State: {notes[:350]}")
+        return "\n".join(parts)
+
+    def get_context(self, step: int) -> str:
+        """Render the briefing (when applicable) followed by every recorded step.
+
+        ``step`` is accepted for interface compatibility and is not used.
+        """
+        latest = self._transcript[-1].get("observation", "") if self._transcript else ""
+        sections: list[str] = []
+
+        header = self._briefing_block(latest)
+        if header:
+            sections.append(header)
+
+        if self._transcript:
+            rendered = [self._render_entry(entry) for entry in self._transcript]
+            sections.append("=== QUEST TRANSCRIPT ===\n" + "\n\n".join(rendered))
+
+        return "\n\n".join(sections)
+
+    def update(self, step_data: dict) -> None:
+        """Append a copy of ``step_data`` with a normalized observation and step number."""
+        observation = (step_data.get("observation") or step_data.get("state") or "").strip()
+        # The first non-empty observation becomes the quest briefing.
+        if self._quest_briefing is None and observation:
+            self._quest_briefing = observation
+
+        entry = dict(step_data)
+        entry["observation"] = observation
+        # Missing or falsy step values fall back to the 1-based transcript position.
+        if not entry.get("step"):
+            entry["step"] = len(self._transcript) + 1
+        self._transcript.append(entry)
+
+    def reset(self) -> None:
+        """Forget the briefing and the whole transcript."""
+        self._quest_briefing = None
+        self._transcript = []
+
+
+class CompactionMemory(MemoryModule):
+    """Periodic LLM summarization plus 20-word memo field.
+
+    Steps are appended to a transcript. Once ``compaction_interval`` steps
+    have accumulated and an ``llm_client`` is attached, the pending steps are
+    summarized and the transcript is cleared on success.
+
+    Args:
+        compaction_interval: Number of recorded steps between summarization
+            attempts.
+        llm_client: Object exposing ``get_completion(prompt)``. Compaction is
+            skipped while it is ``None``.
+    """
+
+    def __init__(self, compaction_interval: int = 50, llm_client=None):
+        self.compaction_interval = compaction_interval
+        self.llm_client = llm_client
+        self._quest_briefing: str | None = None
+        self._transcript: list[dict[str, Any]] = []
+        self._compaction_summary: str | None = None
+        self._steps_since_compaction = 0
+        # Entries already folded into the summary and dropped from the
+        # transcript; keeps auto-assigned step numbers increasing after a
+        # compaction instead of restarting at 1.
+        self._compacted_steps = 0
+
+    @property
+    def quest_briefing(self) -> str | None:
+        """First non-empty observation seen, unless overridden via ``set_quest_briefing``."""
+        return self._quest_briefing
+
+    @property
+    def transcript(self) -> list[dict[str, Any]]:
+        """The live list of entries not yet cleared by compaction (not a copy)."""
+        return self._transcript
+
+    @transcript.setter
+    def transcript(self, value: list[dict[str, Any]]) -> None:
+        self._transcript = value
+
+    @property
+    def steps_since_compaction(self) -> int:
+        """Number of steps recorded since the compaction counter was last reset."""
+        return self._steps_since_compaction
+
+    @steps_since_compaction.setter
+    def steps_since_compaction(self, value: int) -> None:
+        self._steps_since_compaction = value
+
+    def get_context(self, step: int) -> str:
+        """Render briefing, the compaction summary and the steps recorded since.
+
+        Args:
+            step: Current step number, used only to label when the summary
+                was produced.
+        """
+        blocks: list[str] = []
+        current_state = self._transcript[-1].get("observation", "") if self._transcript else ""
+        briefing = self._briefing_block(current_state)
+        if briefing:
+            blocks.append(briefing)
+
+        if self._compaction_summary:
+            compacted_at = max(0, step - self._steps_since_compaction)
+            blocks.append(f"=== QUEST MEMORY (compacted at step {compacted_at}) ===\n{self._compaction_summary}")
+
+        # Guard the zero case explicitly: ``[-0:]`` would select the whole transcript.
+        recent = self._transcript[-self._steps_since_compaction :] if self._steps_since_compaction > 0 else []
+        if recent:
+            lines = []
+            for entry in recent:
+                step_value = entry.get("step", "?")
+                obs = entry.get("observation", "")
+                if len(obs) > 400:
+                    obs = obs[:400] + "..."
+                chosen = entry.get("choice_text") or entry.get("choice", "")
+                line = f"Step {step_value}: {obs}"
+                if chosen:
+                    line += f"\n  You chose: {chosen}"
+                state_notes = entry.get("memo", "")
+                if state_notes:
+                    line += f"\n  State: {state_notes[:350]}"
+                lines.append(line)
+            blocks.append("=== RECENT STEPS ===\n" + "\n\n".join(lines))
+
+        return "\n\n".join(blocks)
+
+    def update(self, step_data: dict) -> None:
+        """Record one step and trigger compaction when the interval is reached.
+
+        The stored observation is cut to 400 characters and any memo is
+        reduced to its first 20 whitespace-separated words.
+        """
+        observation = (step_data.get("observation") or step_data.get("state") or "").strip()
+        if observation and self._quest_briefing is None:
+            self._quest_briefing = observation
+        entry = dict(step_data)
+        entry["observation"] = observation[:400]
+        # Count entries removed by earlier compactions so a missing step
+        # number continues the sequence rather than restarting at 1.
+        entry["step"] = entry.get("step") or self._compacted_steps + len(self._transcript) + 1
+        if entry.get("memo"):
+            entry["memo"] = self._twenty_word_memo(str(entry["memo"]))
+        self._transcript.append(entry)
+        self._steps_since_compaction += 1
+        self._maybe_compact()
+
+    def reset(self) -> None:
+        """Forget the briefing, transcript, summary and all counters."""
+        self._quest_briefing = None
+        self._transcript = []
+        self._compaction_summary = None
+        self._steps_since_compaction = 0
+        self._compacted_steps = 0
+
+    def _maybe_compact(self) -> None:
+        """Summarize pending steps once the interval is reached and a client is attached."""
+        if self._steps_since_compaction < self.compaction_interval:
+            return
+        if self.llm_client is None:
+            logger.debug("Skipping compaction because no LLM client is attached")
+            return
+        transcript_text = self._format_transcript_for_compaction()
+        if not transcript_text:
+            self._steps_since_compaction = 0
+            return
+
+        prompt_parts = ["You are summarizing a quest player's progress through a text quest."]
+        if self._quest_briefing:
+            prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}")
+        if self._compaction_summary:
+            prompt_parts.append(f"\nPREVIOUS SUMMARY:\n{self._compaction_summary}")
+        prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}")
+        prompt_parts.append(
+            "\nSummarize the agent's progress. Include:\n"
+            "- Current objective (what the player should do next)\n"
+            "- Progress so far (what has been accomplished)\n"
+            "- Key facts (NPCs, items, locations, deadlines discovered)\n"
+            "- Failed approaches (actions/paths that didn't work)\n"
+            "- Map knowledge (locations visited and connections)\n\n"
+            "Write a concise summary in plain text, max 300 words."
+        )
+
+        try:
+            summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip()
+        except Exception as exc:
+            logger.debug("Skipping compaction because summarization failed: %s", exc)
+            # NOTE(review): resetting the counter here (and after an empty
+            # summary below) also removes the un-summarized steps from the
+            # RECENT STEPS section rendered by get_context - confirm intended.
+            self._steps_since_compaction = 0
+            return
+        if summary:
+            self._compaction_summary = summary
+            # Remember how many entries are being dropped so later
+            # auto-assigned step numbers keep increasing.
+            self._compacted_steps += len(self._transcript)
+            self._transcript = []
+        self._steps_since_compaction = 0
+
+    def _format_transcript_for_compaction(self) -> str:
+        """Render the steps pending compaction as text for the summarization prompt."""
+        recent = (
+            self._transcript[-self._steps_since_compaction :]
+            if self._steps_since_compaction > 0
+            else self._transcript[-self.compaction_interval :]
+        )
+        lines = []
+        for entry in recent:
+            step = entry.get("step", "?")
+            obs = entry.get("observation", "")
+            if len(obs) > 400:
+                obs = obs[:400] + "..."
+            chosen = entry.get("choice_text") or entry.get("choice", "")
+            reasoning = entry.get("reasoning", "")
+            state_notes = entry.get("memo", "")
+            line = f"Step {step}: {obs}"
+            if chosen:
+                line += f"\n  Chose: {chosen}"
+            if state_notes:
+                line += f"\n  State: {state_notes[:350]}"
+            if reasoning:
+                line += f"\n  Reasoning: {reasoning[:800]}"
+            lines.append(line)
+        return "\n\n".join(lines)
+
+    @staticmethod
+    def _twenty_word_memo(memo: str) -> str:
+        """Return the first 20 whitespace-separated words of ``memo``, single-spaced."""
+        return " ".join(memo.split()[:20])
diff --git a/llm_quest_benchmark/harnesses/minimal.py b/llm_quest_benchmark/harnesses/minimal.py
new file mode 100644
index 0000000..462d128
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/minimal.py
@@ -0,0 +1,56 @@
+"""Minimal harness implementation."""
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.base import BaseHarness
+from llm_quest_benchmark.harnesses.memory import DefaultMemory
+from llm_quest_benchmark.schemas.response import LLMResponse
+
+
+class MinimalHarness(BaseHarness):
+    """Simple prompt-call-parse action loop with recent-memory context."""
+
+    harness_name = "minimal"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "stub.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        memory_module=None,
+        **_,
+    ):
+        # Extra keyword arguments are accepted and discarded.
+        memory = memory_module or DefaultMemory()
+        base_options = {
+            "model_name": model_name,
+            "system_template": system_template,
+            "action_template": action_template,
+            "temperature": temperature,
+            "skip_single": skip_single,
+            "debug": debug,
+            "memory_module": memory,
+        }
+        super().__init__(**base_options)
+
+    def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> int:
+        """Ask the model for an action and return its 1-based choice number.
+
+        An out-of-range parsed action is replaced with ``1``. Any exception is
+        logged and answered with a default response that selects action ``1``.
+        """
+        try:
+            signature = self._state_signature(observation, choices)
+            contextual_state = self._build_contextual_state(observation)
+            prompt = self._format_prompt(contextual_state, choices)
+            response = self._parse_with_retries(prompt, observation, choices)
+
+            chosen = response.action
+            if chosen < 1 or chosen > len(choices):
+                response.action = 1
+
+            self.history.append(response)
+            self._last_response = response
+            self._remember_decision(observation, choices, signature, response)
+            return response.action
+        except Exception as exc:
+            self.logger.error("Harness error during LLM call: %s", exc)
+            fallback = LLMResponse(
+                action=1,
+                is_default=True,
+                parse_mode="error_default",
+                reasoning=f"llm_call_error: {exc}",
+            )
+            # The fallback is recorded in history but not passed to _remember_decision.
+            self.history.append(fallback)
+            self._last_response = fallback
+            return 1
diff --git a/llm_quest_benchmark/agents/planner_agent.py b/llm_quest_benchmark/harnesses/planner.py
similarity index 62%
rename from llm_quest_benchmark/agents/planner_agent.py
rename to llm_quest_benchmark/harnesses/planner.py
index 1999afd..810440c 100644
--- a/llm_quest_benchmark/agents/planner_agent.py
+++ b/llm_quest_benchmark/harnesses/planner.py
@@ -1,33 +1,53 @@
-"""Planner agent with a lightweight plan-maintain-act loop."""
+"""Planner harness implementation."""
 
 import logging
 import re
 from typing import Any
 
-from llm_quest_benchmark.agents.llm_agent import LLMAgent, LLMResponse, parse_llm_response
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.base import BaseHarness
+from llm_quest_benchmark.harnesses.memory import CompactionMemory
+from llm_quest_benchmark.schemas.response import LLMResponse
 
 
-class PlannerAgent(LLMAgent):
-    """LLM agent that maintains a short plan and re-plans on notable changes."""
+class PlannerHarness(BaseHarness):
+    """Compacted-memory harness with a lightweight plan-maintain-act loop."""
+
+    harness_name = "planner"
 
     def __init__(
         self,
-        *args,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
         action_template: str = "planner.jinja",
-        **kwargs,
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        compaction_interval: int = 50,
+        memory_module=None,
+        **_,
     ):
-        super().__init__(*args, action_template=action_template, **kwargs)
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval),
+        )
         self.agent_id = f"planner_{self.model_name}"
         self.current_plan: str | None = None
         self._plan_history: list[str] = []
+        self._memory_mode = "compaction"
+        self._compaction_interval = compaction_interval
 
     def _recent_actions(self) -> list[str]:
         entries = []
         for item in self._decision_history[-3:]:
             choice = (item.get("choice") or "").strip()
-            if not choice:
-                continue
-            entries.append(f"{item.get('action')}. {choice}")
+            if choice:
+                entries.append(f"{item.get('action')}. {choice}")
         return entries
 
     @staticmethod
@@ -35,7 +55,6 @@ def _normalize_plan(raw_plan: str) -> str:
         compact = " ".join((raw_plan or "").strip().split())
         if not compact:
             return ""
-
         sentences = re.split(r"(?<=[.!?])\s+", compact)
         sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
         if len(sentences) >= 5:
@@ -60,14 +79,8 @@ def _build_planner_prompt(
         ).strip()
 
     def _observation_changed_significantly(self, observation: str) -> bool:
-        """Check if the observation differs enough from the previous one to warrant re-planning.
-
-        Uses token-level overlap ratio: if less than 50% of tokens are shared,
-        the scene has changed significantly.
-        """
         if len(self._observation_history) < 2:
             return False
-
         prev_tokens = set(self._observation_history[-2].lower().split())
         curr_tokens = set((observation or "").lower().split())
         if not prev_tokens or not curr_tokens:
@@ -78,13 +91,10 @@ def _observation_changed_significantly(self, observation: str) -> bool:
     def _should_replan(self, observation: str, state_signature: str) -> tuple[bool, str | None]:
         if not self.current_plan:
             return True, "No plan exists yet."
-
         if any(self._state_action_counts.get(state_signature, {}).values()):
             return True, "This state has repeated, so a previous action already failed to progress."
-
         if self._observation_changed_significantly(observation):
             return True, "The scene changed significantly from the previous observation."
-
         return False, None
 
     def _update_plan(
@@ -94,23 +104,14 @@ def _update_plan(
         replan_reason: str | None,
     ) -> dict[str, Any]:
         self._ensure_llm()
-        prompt = self._build_planner_prompt(
-            observation,
-            choices,
-            prompt_kind="plan",
-            replan_reason=replan_reason,
-        )
-        plan_response = self.llm.get_completion(prompt)
+        prompt = self._build_planner_prompt(observation, choices, prompt_kind="plan", replan_reason=replan_reason)
+        plan_response = self._call_llm(prompt)
         usage = self.llm.get_last_usage()
         plan = self._normalize_plan(plan_response)
         if not plan:
-            if self.current_plan:
-                plan = self.current_plan
-            else:
-                plan = (
-                    "Gather clues, protect resources, and avoid obvious traps while "
-                    "advancing toward the main objective."
-                )
+            plan = self.current_plan or (
+                "Gather clues, protect resources, and avoid obvious traps while advancing toward the main objective."
+            )
         self.current_plan = plan
         self._plan_history.append(plan)
         if len(self._plan_history) > 10:
@@ -123,48 +124,18 @@ def _choose_action_with_plan(
         choices: list[dict[str, str]],
         replan_reason: str | None,
     ) -> tuple[LLMResponse, dict[str, Any]]:
-        prompt = self._build_planner_prompt(
-            observation,
-            choices,
-            prompt_kind="act",
-            replan_reason=replan_reason,
-        )
-        llm_response = self.llm.get_completion(prompt)
-        llm_usage = self.llm.get_last_usage()
-        parsed_response = parse_llm_response(llm_response, len(choices), self.debug, self.logger)
-
-        if parsed_response.is_default:
-            retry_response = self.llm.get_completion(self._format_retry_prompt(observation, choices))
-            retry_usage = self.llm.get_last_usage()
-            llm_usage = self._merge_usage(llm_usage, retry_usage)
-            retry_parsed = parse_llm_response(
-                retry_response,
-                len(choices),
-                self.debug,
-                self.logger,
-            )
-            if not retry_parsed.is_default:
-                retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
-                parsed_response = retry_parsed
-            elif self._needs_force_numeric_retry():
-                force_retry_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices))
-                force_retry_usage = self.llm.get_last_usage()
-                llm_usage = self._merge_usage(llm_usage, force_retry_usage)
-                force_retry_parsed = parse_llm_response(
-                    force_retry_response,
-                    len(choices),
-                    self.debug,
-                    self.logger,
-                )
-                if not force_retry_parsed.is_default:
-                    force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}"
-                    parsed_response = force_retry_parsed
-
-        return parsed_response, llm_usage
+        prompt = self._build_planner_prompt(observation, choices, prompt_kind="act", replan_reason=replan_reason)
+        parsed_response = self._parse_with_retries(prompt, observation, choices)
+        return parsed_response, {
+            "prompt_tokens": parsed_response.prompt_tokens,
+            "completion_tokens": parsed_response.completion_tokens,
+            "total_tokens": parsed_response.total_tokens,
+            "estimated_cost_usd": parsed_response.estimated_cost_usd,
+        }
 
     def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
         if self.debug:
-            self.logger.debug("PlannerAgent evaluating state with %s choices", len(choices))
+            self.logger.debug("PlannerHarness evaluating state with %s choices", len(choices))
         try:
             state_signature = self._state_signature(state, choices)
             contextual_state = self._build_contextual_state(state)
@@ -178,37 +149,29 @@ def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
                 choices,
                 replan_reason if should_replan else None,
             )
+
             action_before_policy = parsed_response.action
-            parsed_response.action = self._apply_safety_filter(parsed_response.action, choices)
+            parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
             if parsed_response.action != action_before_policy and not parsed_response.reasoning:
                 parsed_response.reasoning = "policy_safety_override"
 
             total_usage = (
                 self._merge_usage(plan_usage, action_usage) if plan_usage else self._normalize_usage(action_usage)
             )
-            if plan_usage:
-                total_usage = self._normalize_usage(total_usage)
-
+            total_usage = self._normalize_usage(total_usage)
             parsed_response.prompt_tokens = total_usage["prompt_tokens"]
             parsed_response.completion_tokens = total_usage["completion_tokens"]
             parsed_response.total_tokens = total_usage["total_tokens"]
             parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
 
+            if parsed_response.action < 1 or parsed_response.action > len(choices):
+                parsed_response.action = 1
             self.history.append(parsed_response)
             self._last_response = parsed_response
             self._remember_decision(state, choices, state_signature, parsed_response)
-
-            if parsed_response.action < 1 or parsed_response.action > len(choices):
-                self.logger.error(
-                    "INVALID ACTION DETECTED: %s not in range 1-%s",
-                    parsed_response.action,
-                    len(choices),
-                )
-                parsed_response.action = 1
-
             return parsed_response.action
         except Exception as exc:
-            self.logger.error("Planner agent error during LLM call: %s", exc)
+            self.logger.error("Planner harness error during LLM call: %s", exc)
             default_response = LLMResponse(
                 action=1,
                 is_default=True,
diff --git a/llm_quest_benchmark/harnesses/reasoning.py b/llm_quest_benchmark/harnesses/reasoning.py
new file mode 100644
index 0000000..79564d5
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/reasoning.py
@@ -0,0 +1,57 @@
+"""Reasoning harness variants."""
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.memory import DefaultMemory, FullTranscriptMemory
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+
+
+class ReasoningRecentHarness(MinimalHarness):  # MinimalHarness preset: reasoning template + DefaultMemory
+    harness_name = "reasoning_recent"  # variant identifier (presumably the `harness:` config value; confirm)
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "reasoning.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        memory_module=None,  # optional override for the default memory object
+        **kwargs,  # forwarded unchanged to MinimalHarness.__init__
+    ):
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or DefaultMemory(),  # NOTE: `or` also replaces any falsy module
+            **kwargs,
+        )
+
+
+class ReasoningFullTranscriptHarness(MinimalHarness):  # same preset, but FullTranscriptMemory by default
+    harness_name = "reasoning_full"  # variant identifier (presumably the `harness:` config value; confirm)
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "reasoning.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        memory_module=None,  # optional override for the default memory object
+        **kwargs,  # forwarded unchanged to MinimalHarness.__init__
+    ):
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or FullTranscriptMemory(),  # NOTE: `or` also replaces any falsy module
+            **kwargs,
+        )
diff --git a/llm_quest_benchmark/harnesses/tool_harness.py b/llm_quest_benchmark/harnesses/tool_harness.py
new file mode 100644
index 0000000..0acc699
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/tool_harness.py
@@ -0,0 +1,241 @@
+"""Tool-augmented harness implementations."""
+
+from typing import Any
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.base import BaseHarness, _parse_json_response
+from llm_quest_benchmark.harnesses.memory import CompactionMemory
+from llm_quest_benchmark.harnesses.tools import QuestHistoryTool, Scratchpad, calculator
+from llm_quest_benchmark.schemas.response import LLMResponse
+
+
+class ToolCompactHarness(BaseHarness):
+    """Compacted-memory harness with a two-phase tool selection/action loop."""
+
+    harness_name = "tool_compact"
+    DEFAULT_HISTORY_WINDOW = 10  # default size of the recent-steps list and of the history tool window
+    MAX_TOOL_INPUT_CHARS = 500  # model-supplied tool input/content is truncated to this many characters
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "tool_augmented.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        compaction_interval: int = 50,
+        memory_module=None,
+        history_window: int | None = None,
+        **_,  # any other keyword arguments are accepted and silently ignored
+    ):
+        self._step_log: list[dict[str, Any]] = []  # one dict per decided step, appended by _log_step
+        self._history_window = history_window or self.DEFAULT_HISTORY_WINDOW  # None or 0 falls back to default
+        self._scratchpad_tool = Scratchpad()
+        self._history_tool = QuestHistoryTool(self._step_log, self._history_window)  # shares the list object
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval),
+            tools=[calculator, self._scratchpad_tool, self._history_tool],
+        )
+        self._memory_mode = "compaction"  # set unconditionally, even when a custom memory_module was passed
+        self._compaction_interval = compaction_interval
+
+    def _recent_steps(self) -> list[str]:  # one summary line per logged step, limited to the last window
+        return [
+            f"Step {entry['step']}: {entry['observation']} -> {entry.get('selected_choice', 'n/a')}"
+            for entry in self._step_log[-self._history_window :]
+        ]
+
+    def _tool_descriptions(self) -> list[str]:  # tool signatures handed to the prompt template
+        return [
+            "quest_history(query): search earlier observations and chosen actions in this quest.",
+            "calculator(expression): evaluate arithmetic and simple comparisons.",
+            "scratchpad(operation, content): read or replace one persistent note. operation is read or write_replace.",
+        ]
+
+    def quest_history(self, query: str) -> str:  # thin wrapper over QuestHistoryTool.search
+        return self._history_tool.search(query)
+
+    @staticmethod
+    def calculator(expression: str) -> str:  # delegates to the module-level calculator imported above
+        return calculator(expression)
+
+    def scratchpad(self, operation: str, content: str = "") -> str:  # dispatches read / write_replace
+        op = (operation or "").strip().lower()  # case- and whitespace-insensitive operation name
+        if op == "read":
+            return self._scratchpad_tool.read()
+        if op == "write_replace":
+            return self._scratchpad_tool.write_replace(content)
+        return "error: operation must be read or write_replace"  # reported as text rather than raised
+
+    def _build_tool_prompt(
+        self,
+        observation: str,
+        choices: list[dict[str, str]],
+        prompt_kind: str,  # "select" and "final" are the values used within this class
+        tool_results: list[str] | None = None,
+    ) -> str:
+        template = self.prompt_renderer.get_template(self.action_template)
+        return template.render(
+            prompt_kind=prompt_kind,
+            observation=observation,
+            choices=[{"text": choice.get("text", "")} for choice in choices],  # only the text is exposed
+            tool_descriptions=self._tool_descriptions(),
+            tool_results=tool_results or [],
+            recent_steps=self._recent_steps(),
+            scratchpad_note=self._scratchpad_tool.read() if self._scratchpad_tool.read() != "(empty)" else "",
+        ).strip()
+
+    @staticmethod
+    def _extract_tool_calls(response: str) -> list[dict[str, Any]]:  # normalizes the "tool_calls" JSON field
+        payload, _ = _parse_json_response(response)
+        if not isinstance(payload, dict):
+            return []
+        tool_calls = payload.get("tool_calls")
+        if not isinstance(tool_calls, list):
+            return []
+
+        normalized = []
+        for item in tool_calls[:1]:  # only the first requested tool call is considered
+            if not isinstance(item, dict):
+                continue
+            tool_name = str(item.get("tool") or "").strip()
+            tool_input = item.get("input")
+            operation = str(item.get("operation") or "").strip()
+            content = str(item.get("content") or "").strip()
+            if isinstance(tool_input, dict):  # structured input: top-level operation/content take precedence
+                operation = operation or str(tool_input.get("operation") or "").strip()
+                content = content or str(tool_input.get("content") or "").strip()
+                tool_input = tool_input.get("expression") or tool_input.get("query") or tool_input.get("content") or ""
+            tool_input = str(tool_input or "").strip()
+            if len(tool_input) > ToolCompactHarness.MAX_TOOL_INPUT_CHARS:
+                tool_input = tool_input[: ToolCompactHarness.MAX_TOOL_INPUT_CHARS]
+            if len(content) > ToolCompactHarness.MAX_TOOL_INPUT_CHARS:
+                content = content[: ToolCompactHarness.MAX_TOOL_INPUT_CHARS]
+            if tool_name:  # entries without a tool name are dropped
+                normalized.append({"tool": tool_name, "input": tool_input, "operation": operation, "content": content})
+        return normalized
+
+    def _execute_tool_calls(self, tool_calls: list[dict[str, Any]]) -> list[str]:  # "name(args) => result" lines
+        results = []
+        for tc in tool_calls:
+            name, inp = tc["tool"], tc.get("input", "")
+            if name == "quest_history":
+                result = self.quest_history(inp)
+            elif name == "calculator":
+                result = self.calculator(inp)
+            elif name == "scratchpad":
+                operation = tc.get("operation") or inp  # a bare input string doubles as the operation name
+                result = self.scratchpad(str(operation), str(tc.get("content") or ""))
+            else:
+                result = f"unknown tool: {name}"  # unknown tools produce a textual result, not an exception
+            call_repr = inp
+            if name == "scratchpad":
+                call_repr = f"{tc.get('operation') or inp}, {tc.get('content') or ''}".strip(", ")
+            results.append(f"{name}({call_repr}) => {result}")
+        return results
+
+    def _final_choice(
+        self,
+        observation: str,
+        choices: list[dict[str, str]],
+        tool_results: list[str] | None = None,
+    ) -> tuple[LLMResponse, dict[str, Any]]:
+        prompt = self._build_tool_prompt(observation, choices, prompt_kind="final", tool_results=tool_results)
+        parsed_response = self._parse_with_retries(prompt, observation, choices)
+        return parsed_response, {  # second element: usage figures copied from the parsed response
+            "prompt_tokens": parsed_response.prompt_tokens,
+            "completion_tokens": parsed_response.completion_tokens,
+            "total_tokens": parsed_response.total_tokens,
+            "estimated_cost_usd": parsed_response.estimated_cost_usd,
+        }
+
+    def _log_step(self, observation: str, choices: list[dict[str, str]], response: LLMResponse) -> None:
+        selected = ""  # stays empty when the action is outside 1..len(choices)
+        if 1 <= response.action <= len(choices):
+            selected = choices[response.action - 1].get("text", "")
+        clipped = " ".join((observation or "").strip().split())  # collapse all whitespace runs to one space
+        if len(clipped) > 180:
+            clipped = clipped[:180] + "..."
+        self._step_log.append(
+            {
+                "step": len(self._step_log) + 1,  # 1-based position in the log
+                "observation": clipped,
+                "choices": [c.get("text", "") for c in choices],
+                "selected_choice": selected,
+            }
+        )
+
+    def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
+        try:
+            state_signature = self._state_signature(state, choices)
+            contextual_state = self._build_contextual_state(state)
+            self._ensure_llm()
+
+            selection_prompt = self._build_tool_prompt(contextual_state, choices, prompt_kind="select")
+            selection_response = self._call_llm(selection_prompt)  # phase 1: model may answer or request a tool
+            selection_usage = self.llm.get_last_usage()
+            tool_calls = self._extract_tool_calls(selection_response)
+            parsed_response = self._parse_llm_response(selection_response, len(choices))
+            tool_results: list[str] = []
+            final_choice_used = False
+
+            total_usage = self._normalize_usage(selection_usage)
+            if tool_calls:  # phase 2: run the tool, then ask again with its result in the prompt
+                tool_results = self._execute_tool_calls(tool_calls)
+                parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results)
+                total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
+                final_choice_used = True
+            elif parsed_response.is_default:  # no tool call and no usable answer: retry through the final prompt
+                parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[])
+                total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
+                final_choice_used = True
+
+            if not final_choice_used:  # NOTE(review): final path presumably filtered in _parse_with_retries; confirm
+                action_before_policy = parsed_response.action
+                parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
+                if parsed_response.action != action_before_policy and not parsed_response.reasoning:
+                    parsed_response.reasoning = "policy_safety_override"
+
+            parsed_response.prompt_tokens = total_usage["prompt_tokens"]  # usage covers both LLM phases
+            parsed_response.completion_tokens = total_usage["completion_tokens"]
+            parsed_response.total_tokens = total_usage["total_tokens"]
+            parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
+            parsed_response.tool_calls = tool_calls or None
+            parsed_response.tool_results = tool_results or None
+
+            self.history.append(parsed_response)
+            self._last_response = parsed_response
+            self._remember_decision(state, choices, state_signature, parsed_response)
+            self._log_step(state, choices, parsed_response)  # logs the raw state, not the contextual one
+            return parsed_response.action
+        except Exception as exc:  # any failure falls back to action 1; this step is not added to _step_log
+            self.logger.error("Tool harness error during LLM call: %s", exc)
+            default_response = LLMResponse(
+                action=1,
+                is_default=True,
+                parse_mode="error_default",
+                reasoning=f"tool_harness_error: {exc}",
+            )
+            self.history.append(default_response)
+            self._last_response = default_response
+            return 1
+
+    def reset(self) -> None:
+        super().reset()
+        self._step_log = []
+        self._scratchpad_tool.reset()
+        self._history_tool.step_log = self._step_log  # re-point the history tool at the fresh list
+
+
+class ToolHintedHarness(ToolCompactHarness):  # ToolCompactHarness with a different default action template
+    harness_name = "tool_hinted"
+
+    def __init__(self, *args, action_template: str = "tool_augmented_hints.jinja", **kwargs):
+        super().__init__(*args, action_template=action_template, **kwargs)  # NOTE: 3+ positional args collide
diff --git a/llm_quest_benchmark/harnesses/tools.py b/llm_quest_benchmark/harnesses/tools.py
new file mode 100644
index 0000000..9978c58
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/tools.py
@@ -0,0 +1,173 @@
+"""Reusable tools for harness-based quest players."""
+
+import ast
+import re
+
+MAX_SCRATCHPAD_CHARS = 1200
+
+
+def calculator(expression: str) -> str:
+    """Evaluate a restricted arithmetic/comparison expression."""
+    expr = (expression or "").strip()
+    if not expr:
+        return "error: empty expression"
+    if len(expr) > 240:  # hard cap on input length
+        return "error: expression too long"
+    if not re.fullmatch(r"[0-9a-zA-Z\s+\-*/().,<>=!%]+", expr):  # character whitelist checked before parsing
+        return "error: unsupported characters"
+    # Only these AST node types may appear anywhere in the parsed expression.
+    allowed_nodes = (
+        ast.Expression,
+        ast.Constant,
+        ast.UnaryOp,
+        ast.UAdd,
+        ast.USub,
+        ast.BinOp,
+        ast.Add,
+        ast.Sub,
+        ast.Mult,
+        ast.Div,
+        ast.FloorDiv,
+        ast.Mod,
+        ast.Pow,
+        ast.Compare,
+        ast.Eq,
+        ast.NotEq,
+        ast.Lt,
+        ast.LtE,
+        ast.Gt,
+        ast.GtE,
+        ast.BoolOp,
+        ast.And,
+        ast.Or,
+    )
+    try:
+        tree = ast.parse(expr, mode="eval")
+        for node in ast.walk(tree):  # validate the whole tree before evaluating anything
+            if not isinstance(node, allowed_nodes):
+                return f"error: unsupported expression element {node.__class__.__name__}"
+            if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, bool)):
+                return "error: constants must be numeric or boolean"
+        result = _eval_calculator_node(tree.body)
+    except Exception as exc:  # syntax and arithmetic failures are reported as text, never raised
+        return f"error: {exc}"
+    return f"{expr} = {result}"
+
+
+def _eval_calculator_node(node: ast.AST) -> int | float | bool:  # recursive evaluator; raises ValueError
+    if isinstance(node, ast.Constant) and isinstance(node.value, (int, float, bool)):
+        return node.value
+    if isinstance(node, ast.UnaryOp):
+        value = _eval_calculator_node(node.operand)
+        if isinstance(node.op, ast.UAdd):
+            return +value
+        if isinstance(node.op, ast.USub):
+            return -value
+    if isinstance(node, ast.BinOp):
+        left = _eval_calculator_node(node.left)
+        right = _eval_calculator_node(node.right)
+        if isinstance(node.op, ast.Add):
+            return left + right
+        if isinstance(node.op, ast.Sub):
+            return left - right
+        if isinstance(node.op, ast.Mult):
+            return left * right
+        if isinstance(node.op, ast.Div):
+            return left / right
+        if isinstance(node.op, ast.FloorDiv):
+            return left // right
+        if isinstance(node.op, ast.Mod):
+            return left % right
+        if isinstance(node.op, ast.Pow):  # exponent-only cap let ((9**8)**8)**8... grow without bound
+            if abs(right) > 8 or (isinstance(left, int) and abs(left) > 10**30):
+                raise ValueError("exponent or base too large")
+            return left**right
+    if isinstance(node, ast.BoolOp):
+        values = [bool(_eval_calculator_node(value)) for value in node.values]  # all operands are evaluated
+        if isinstance(node.op, ast.And):
+            return all(values)
+        if isinstance(node.op, ast.Or):
+            return any(values)
+    if isinstance(node, ast.Compare):
+        left = _eval_calculator_node(node.left)
+        for op, comparator in zip(node.ops, node.comparators, strict=True):
+            right = _eval_calculator_node(comparator)
+            if isinstance(op, ast.Eq):
+                ok = left == right
+            elif isinstance(op, ast.NotEq):
+                ok = left != right
+            elif isinstance(op, ast.Lt):
+                ok = left < right
+            elif isinstance(op, ast.LtE):
+                ok = left <= right
+            elif isinstance(op, ast.Gt):
+                ok = left > right
+            elif isinstance(op, ast.GtE):
+                ok = left >= right
+            else:
+                raise ValueError("unsupported comparison")
+            if not ok:
+                return False
+            left = right  # chained comparison: the next link compares against this operand
+        return True
+    raise ValueError("unsupported expression")  # also reached by unary/binary/bool ops not handled above
+
+
+class Scratchpad:
+    """Persistent free-form note blob with read and replace operations."""
+
+    def __init__(self, max_chars: int = MAX_SCRATCHPAD_CHARS):
+        self.max_chars = max_chars  # stored notes are truncated to this many characters
+        self._content = ""
+
+    def read(self) -> str:
+        return self._content or "(empty)"  # placeholder text is returned when nothing is stored
+
+    def write_replace(self, content: str = "") -> str:
+        note = " ".join((content or "").strip().split())  # collapse whitespace runs, including newlines
+        self._content = note[: self.max_chars]  # replaces the previous note entirely
+        return f"updated: {self._content or '(empty)'}"
+
+    def reset(self) -> None:
+        self._content = ""  # clears the note; max_chars is kept
+
+
+class QuestHistoryTool:
+    """Keyword search over a run-local quest step log."""
+
+    def __init__(self, step_log: list[dict] | None = None, history_window: int = 10):
+        self.step_log = step_log if step_log is not None else []  # a provided list is shared, not copied
+        self.history_window = history_window  # maximum number of steps returned by search()
+
+    def search(self, query: str) -> str:
+        """Return relevant previous steps from this quest run via keyword match."""
+        if not self.step_log:
+            return "No prior quest steps recorded yet."
+        # Keywords are lowercased runs of 3+ Latin/Cyrillic letters, digits or underscores.
+        tokens = set(re.findall(r"[a-zA-Z\u0400-\u04ff0-9_]{3,}", (query or "").lower()))
+        scored = []
+        for entry in self.step_log:
+            # "choices" may be a list or an already-joined string (the rendering loop below
+            # accepts both); " ".join() on a string would put a space between every
+            # character and defeat keyword matching, so only lists are joined here.
+            choices = entry.get("choices", [])
+            choices_text = choices if isinstance(choices, str) else " ".join(choices)
+            fields = [entry.get("observation", ""), choices_text, entry.get("selected_choice", "")]
+            haystack = " ".join(fields).lower()
+            score = sum(1 for token in tokens if token in haystack)  # number of distinct keywords present
+            scored.append((score, entry))
+        # Highest score first; ties are broken in favour of the most recent step.
+        scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True)
+        best = [entry for score, entry in scored if score > 0][: self.history_window]
+        if not best:
+            best = [entry for _, entry in scored[: self.history_window]]  # no hit: fall back to the ranking head
+
+        lines = []
+        for entry in best:
+            choices = entry.get("choices", [])
+            choices_text = choices if isinstance(choices, str) else "; ".join(choices)
+            lines.append(
+                f"Step {entry.get('step', '?')}: obs={entry.get('observation', '')} | "
+                f"choices={choices_text} | picked={entry.get('selected_choice', 'n/a')}"
+            )
+        return "\n".join(lines)
diff --git a/llm_quest_benchmark/players/__init__.py b/llm_quest_benchmark/players/__init__.py
new file mode 100644
index 0000000..aa71d5b
--- /dev/null
+++ b/llm_quest_benchmark/players/__init__.py
@@ -0,0 +1,17 @@
+__all__ = ["QuestPlayer", "HumanPlayer", "RandomPlayer"]
+
+
+def __getattr__(name):
+    """Lazily import and return a public player class on attribute access (PEP 562)."""
+    # Each import is deferred until the corresponding attribute is first requested.
+    if name == "QuestPlayer":
+        from .base import QuestPlayer
+        return QuestPlayer
+    if name == "HumanPlayer":
+        from .human import HumanPlayer
+        return HumanPlayer
+    if name == "RandomPlayer":
+        from .random import RandomPlayer
+        return RandomPlayer
+    # Use the interpreter's standard wording so the error names the module as well.
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/llm_quest_benchmark/agents/base.py b/llm_quest_benchmark/players/base.py
similarity index 92%
rename from llm_quest_benchmark/agents/base.py
rename to llm_quest_benchmark/players/base.py
index eed7609..9e53750 100644
--- a/llm_quest_benchmark/agents/base.py
+++ b/llm_quest_benchmark/players/base.py
@@ -1,4 +1,4 @@
-"""Base classes for quest players (both human and LLM)"""
+"""Base class for quest players and harnesses."""
 
 from abc import ABC, abstractmethod
 from typing import Any
@@ -13,7 +13,7 @@ def __init__(self, skip_single: bool = False):
         """Initialize player with skip_single option"""
         self.skip_single = skip_single
         self._last_response: LLMResponse = None
-        self.agent_id = "base_agent"  # Default agent ID
+        self.agent_id = "base_player"
 
     def get_action(self, observation: str, choices: list) -> int:
         """Get action number from observation and choices
@@ -55,7 +55,7 @@ def _get_action_impl(self, observation: str, choices: list) -> int:
         pass
 
     def get_last_response(self) -> LLMResponse:
-        """Get the last response from the agent"""
+        """Get the last response from the player or harness."""
         return self._last_response
 
     @abstractmethod
diff --git a/llm_quest_benchmark/agents/human_player.py b/llm_quest_benchmark/players/human.py
similarity index 91%
rename from llm_quest_benchmark/agents/human_player.py
rename to llm_quest_benchmark/players/human.py
index 721c43d..b5d74f4 100644
--- a/llm_quest_benchmark/agents/human_player.py
+++ b/llm_quest_benchmark/players/human.py
@@ -3,7 +3,7 @@
 import logging
 from typing import Any
 
-from llm_quest_benchmark.agents.base import QuestPlayer
+from llm_quest_benchmark.players.base import QuestPlayer
 
 
 class HumanPlayer(QuestPlayer):
@@ -15,7 +15,7 @@ def __init__(self, skip_single: bool = False, debug: bool = False):
         self.logger = logging.getLogger(__name__)
         if debug:
             self.logger.setLevel(logging.DEBUG)
-        # Set agent_id for database records
+        # Keep the persisted identifier stable for existing result artifacts.
         self.agent_id = "human"
 
     def _get_action_impl(self, observation: str, choices: list) -> int:
diff --git a/llm_quest_benchmark/agents/random_agent.py b/llm_quest_benchmark/players/random.py
similarity index 75%
rename from llm_quest_benchmark/agents/random_agent.py
rename to llm_quest_benchmark/players/random.py
index e428353..a8fea29 100644
--- a/llm_quest_benchmark/agents/random_agent.py
+++ b/llm_quest_benchmark/players/random.py
@@ -1,17 +1,19 @@
-"""Random agent for testing quests"""
+"""Random player for testing quests"""
 
 import logging
 import random
 
-from llm_quest_benchmark.agents.base import QuestPlayer
+from llm_quest_benchmark.players.base import QuestPlayer
 
 
-class RandomAgent(QuestPlayer):
-    """Agent that randomly selects from available choices.
-    Used for testing quests and finding edge cases."""
+class RandomPlayer(QuestPlayer):
+    """Player that randomly selects from available choices.
+
+    Used for testing quests and finding edge cases.
+    """
 
     def __init__(self, seed: int = None, debug: bool = False, skip_single: bool = False):
-        """Initialize random agent.
+        """Initialize random player.
 
         Args:
             seed (int, optional): Random seed for reproducibility. Defaults to None.
@@ -24,7 +26,7 @@ def __init__(self, seed: int = None, debug: bool = False, skip_single: bool = Fa
         if debug:
             self.logger.setLevel(logging.DEBUG)
         self.rng = random.Random(seed)
-        # Set agent_id for database records
+        # Keep the persisted identifier stable for existing result artifacts.
         self.agent_id = f"random_{seed}" if seed is not None else "random"
 
     def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> int:
@@ -43,5 +45,5 @@ def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> i
         return self.rng.randint(1, len(choices))
 
     def reset(self) -> None:
-        """Reset agent state - nothing to reset for random agent"""
+        """Reset player state; nothing to reset for random choice."""
         pass
diff --git a/llm_quest_benchmark/prompt_templates/consequence_scan.jinja b/llm_quest_benchmark/prompt_templates/consequence_scan.jinja
deleted file mode 100644
index 55ce54b..0000000
--- a/llm_quest_benchmark/prompt_templates/consequence_scan.jinja
+++ /dev/null
@@ -1,18 +0,0 @@
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Mission objective: complete the quest successfully.
-
-Decision method:
-1. For each action, estimate immediate consequence in 5 words max.
-2. Prefer actions that preserve progress and gather information.
-3. Avoid options that abandon, surrender, or end the mission early.
-4. If uncertain, choose the lowest-risk progress action.
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja b/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja
deleted file mode 100644
index 7fd4236..0000000
--- a/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja
+++ /dev/null
@@ -1,19 +0,0 @@
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Mission objective: complete the quest successfully.
-
-Decision method:
-1. Use any provided memo from prior turns to stay consistent.
-2. For each action, estimate immediate consequence in 5 words max.
-3. Prefer actions that preserve progress and gather information.
-4. Avoid options that abandon, surrender, or end the mission early.
-5. If uncertain, choose the lowest-risk progress action.
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","memo":"<max 12 words next objective>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/light_hints.jinja b/llm_quest_benchmark/prompt_templates/light_hints.jinja
deleted file mode 100644
index eb3ab60..0000000
--- a/llm_quest_benchmark/prompt_templates/light_hints.jinja
+++ /dev/null
@@ -1,18 +0,0 @@
-General hints for this type of quest:
-- Read the scene literally. Win/loss constraints are usually stated directly in the text.
-- Preparation, study, negotiation, and reconnaissance are often safer than direct combat or bravado.
-- Prefer actions that gather clues or unlock safer options before committing to irreversible moves.
-- Avoid choices that abandon the mission, surrender, or waste scarce time/resources for no gain.
-- If a scene repeats, the last branch did not help - try a different action.
-- Prioritize the core objective over optional heroic detours.
-
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja b/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja
deleted file mode 100644
index 38a9343..0000000
--- a/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja
+++ /dev/null
@@ -1,19 +0,0 @@
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Mission objective: complete the quest successfully.
-
-Decision policy:
-1. Prefer actions that preserve progress and avoid premature failure.
-2. Use Status/context hints (stats, resources, relationships) to reduce obvious risks.
-3. If this scene appears repeated, avoid repeating the same action that did not progress.
-4. When uncertain, choose the safest reversible action that keeps the mission alive.
-5. Do not surrender/quit unless it is clearly required for success.
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/objective_guard.jinja b/llm_quest_benchmark/prompt_templates/objective_guard.jinja
deleted file mode 100644
index b80d482..0000000
--- a/llm_quest_benchmark/prompt_templates/objective_guard.jinja
+++ /dev/null
@@ -1,18 +0,0 @@
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Choose the action that best supports mission completion.
-
-Guardrails:
-1. Keep the run alive unless ending is clearly successful.
-2. Penalize actions that look like quitting, escaping, or self-sabotage.
-3. Prefer actions that unlock clues, credentials, access, or progression gates.
-4. Resolve ambiguity by selecting the most reversible safe option.
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/strategic.jinja b/llm_quest_benchmark/prompt_templates/strategic.jinja
deleted file mode 100644
index 1668c41..0000000
--- a/llm_quest_benchmark/prompt_templates/strategic.jinja
+++ /dev/null
@@ -1,32 +0,0 @@
-{# Tier 3: Strategic agent prompt #}
-{# Contextual State Tracker #}
-{% if state_tracker %}
-Historical context:
-{% for entry in state_tracker %}
-- {{ entry }}
-{% endfor %}
-{% endif %}
-
-Current Situation:
-{{ observation }}
-
-Available Actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Analysis Framework:
-1. Immediate Context [<50 words]
-2. Plausible Hypotheses [2-3 possibilities]
-3. Action Impact Forecast [short/long-term]
-4. Confidence Estimate [High/Medium/Low]
-
-Response format:
-```json
-{
-    "hypotheses": ["...", "..."],
-    "reasoning": "<concise reason for selected action>",
-    "choice": <number>,
-    "confidence": "<level>"
-}
-```
diff --git a/llm_quest_benchmark/prompt_templates/system_role_completion.jinja b/llm_quest_benchmark/prompt_templates/system_role_completion.jinja
deleted file mode 100644
index 918a695..0000000
--- a/llm_quest_benchmark/prompt_templates/system_role_completion.jinja
+++ /dev/null
@@ -1,11 +0,0 @@
-You are a mission-completion specialist for interactive fiction quests.
-
-Core behavior:
-1. Infer the current objective from narrative clues.
-2. Prioritize actions that maintain progress and optionality.
-3. Avoid premature terminal outcomes unless success is explicit.
-4. Prefer evidence-based choices over stylistic roleplay.
-
-When the state is ambiguous:
-- choose the safest action that still advances the mission.
-- avoid speculative high-risk branches without support in the text.
diff --git a/llm_quest_benchmark/prompt_templates/system_role_risk.jinja b/llm_quest_benchmark/prompt_templates/system_role_risk.jinja
deleted file mode 100644
index ea19c36..0000000
--- a/llm_quest_benchmark/prompt_templates/system_role_risk.jinja
+++ /dev/null
@@ -1,16 +0,0 @@
-{# Enhanced system role for interactive fiction #}
-You are an experienced interactive fiction player. Your capabilities include:
-
-1. Dynamic Goal Recognition: Infer objectives from narrative context
-2. Clue Chaining: Connect information across scenes
-3. Consequence Forecasting: Predict 2-3 steps ahead for each action
-4. Narrative Consistency: Maintain character/story logic
-
-Follow these principles:
-- Treat each choice as part of an unfolding mystery
-- Track objects/characters/relationships as state components
-- Consider both practical and thematic implications
-- Admit uncertainty when clues are ambiguous
-- Flag potential contradictions in story logic
-
-Any bad move can fail the quest, so prefer robust low-risk progress over flashy but uncertain options.
diff --git a/llm_quest_benchmark/renderers/factory.py b/llm_quest_benchmark/renderers/factory.py
index 8b18218..0a8f3e5 100644
--- a/llm_quest_benchmark/renderers/factory.py
+++ b/llm_quest_benchmark/renderers/factory.py
@@ -1,7 +1,7 @@
 """Factory for creating appropriate renderers based on agent type and mode"""
 
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.agents.human_player import HumanPlayer
+from llm_quest_benchmark.players.base import QuestPlayer
+from llm_quest_benchmark.players.human import HumanPlayer
 from llm_quest_benchmark.renderers.base import BaseRenderer
 from llm_quest_benchmark.renderers.null import NoRenderer
 from llm_quest_benchmark.renderers.progress import ProgressRenderer
@@ -25,7 +25,7 @@ def create_renderer(
     The factory follows these rules:
     1. In debug mode, always use NoRenderer
     2. For human players, use RichRenderer
-    3. For automated agents (LLM, Random):
+    3. For automated players (LLM, Random):
        - In benchmark mode (total_quests provided), use ProgressRenderer
        - Otherwise, use NoRenderer
     """
diff --git a/llm_quest_benchmark/renderers/progress.py b/llm_quest_benchmark/renderers/progress.py
index 9d2cde9..a5097d2 100644
--- a/llm_quest_benchmark/renderers/progress.py
+++ b/llm_quest_benchmark/renderers/progress.py
@@ -45,23 +45,23 @@ def __init__(self, total_quests: int, total_runs: int):
         self.console.print("\n[bold cyan]Benchmark Progress[/]")
 
     def render_game_state(self, state: dict[str, Any]) -> None:
-        """No game state rendering needed for automated agents"""
+        """No game state rendering needed for automated players"""
         pass
 
     def render_title(self) -> None:
-        """No title rendering needed for automated agents"""
+        """No title rendering needed for automated players"""
         pass
 
     def render_quest_text(self, text: str) -> None:
-        """No quest text rendering needed for automated agents"""
+        """No quest text rendering needed for automated players"""
         pass
 
     def render_choices(self, choices: list) -> None:
-        """No choices rendering needed for automated agents"""
+        """No choices rendering needed for automated players"""
         pass
 
     def render_parameters(self, params: list) -> None:
-        """No parameters rendering needed for automated agents"""
+        """No parameters rendering needed for automated players"""
         pass
 
     def render_error(self, message: str) -> None:
diff --git a/llm_quest_benchmark/schemas/__init__.py b/llm_quest_benchmark/schemas/__init__.py
index 34fee08..cb0338f 100644
--- a/llm_quest_benchmark/schemas/__init__.py
+++ b/llm_quest_benchmark/schemas/__init__.py
@@ -1,9 +1,16 @@
 """Schema exports for LLM Quest Benchmark"""
 
-__all__ = ["QMState", "AgentState", "LLMResponse", "QMBridgeState", "BenchmarkConfig", "AgentConfig"]
+__all__ = [
+    "QMState",
+    "AgentState",
+    "LLMResponse",
+    "QMBridgeState",
+    "BenchmarkConfig",
+    "HarnessConfig",
+]
 
 # Import directly from the schema modules using relative imports
 from .bridge import QMBridgeState
-from .config import AgentConfig, BenchmarkConfig
+from .config import BenchmarkConfig, HarnessConfig
 from .response import LLMResponse
 from .state import AgentState, QMState
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index 6a030b2..5cd93b2 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -8,7 +8,6 @@
 from llm_quest_benchmark.constants import (
     DEFAULT_MODEL,
     DEFAULT_TEMPERATURE,
-    DEFAULT_TEMPLATE,
     MODEL_CHOICES,
     SYSTEM_ROLE_TEMPLATE,
     normalize_template_name,
@@ -18,8 +17,8 @@
 DEFAULT_BENCHMARK_CONFIG = {
     "quests": ["quests/Boat.qm"],
     "agents": [
-        {"model": "random_choice", "skip_single": True, "temperature": 0.0, "template": "reasoning.jinja"},
-        {"model": "gpt-5-mini", "skip_single": True, "temperature": 0.4, "template": "reasoning.jinja"},
+        {"model": "random_choice", "skip_single": True, "temperature": 0.0, "harness": "random_choice"},
+        {"model": "gpt-5-mini", "skip_single": True, "temperature": 0.4, "harness": "reasoning_recent"},
     ],
     "debug": False,
     "quest_timeout": 30,
@@ -27,6 +26,18 @@
     "name": "Default Benchmark",
 }
 
+COMPACTION_HARNESSES = {
+    "memo_compact",
+    "hinted_compact",
+    "tool_compact",
+    "tool_hinted",
+    "planner",
+    "compaction_no_memo",
+    "memo_cot",
+    "memo_extended",
+    "memo_structured",
+}
+
 
 def get_default_benchmark_yaml() -> str:
     """Get the default benchmark configuration from default.yaml file"""
@@ -43,8 +54,9 @@ def get_default_benchmark_yaml() -> str:
   - quests/Boat.qm
 agents:
   - model: random_choice
+    harness: random_choice
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
 debug: true
 # One worker per agent will be used automatically
 output_dir: results/benchmarks"""
@@ -55,25 +67,71 @@ def get_default_benchmark_yaml() -> str:
 
 
 @dataclass
-class AgentConfig:
-    """Configuration for a single agent in benchmark"""
+class HarnessConfig:
+    """Configuration for a single harness in benchmark"""
 
     model: str = DEFAULT_MODEL
     system_template: str = SYSTEM_ROLE_TEMPLATE
-    action_template: str = DEFAULT_TEMPLATE
+    harness: str = "reasoning_recent"
     temperature: float = DEFAULT_TEMPERATURE
     runs: int = 1
     skip_single: bool = False
     debug: bool = False
     benchmark_id: str | None = None
-    memory_mode: str = "default"
-    compaction_interval: int = 10
+    compaction_interval: int = 50
+
+    def __init__(
+        self,
+        model: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        harness: str = "reasoning_recent",
+        temperature: float = DEFAULT_TEMPERATURE,
+        runs: int = 1,
+        skip_single: bool = False,
+        debug: bool = False,
+        benchmark_id: str | None = None,
+        compaction_interval: int = 50,
+        **legacy_keys,
+    ):
+        if "template" in legacy_keys or "action_template" in legacy_keys:
+            raise ValueError("Use harness: key instead of template:")
+        if "memory_mode" in legacy_keys:
+            raise ValueError("Use harness: key instead of memory_mode:")
+        if legacy_keys:
+            unexpected = ", ".join(sorted(legacy_keys))
+            raise TypeError(f"Unexpected HarnessConfig key(s): {unexpected}")
+
+        self.model = model
+        self.system_template = system_template
+        self.harness = harness
+        self.temperature = temperature
+        self.runs = runs
+        self.skip_single = skip_single
+        self.debug = debug
+        self.benchmark_id = benchmark_id
+        self.compaction_interval = compaction_interval
+        self.__post_init__()
 
     def __post_init__(self):
         self.system_template = normalize_template_name(self.system_template)
-        self.action_template = normalize_template_name(self.action_template)
-        if self.model not in ("random_choice", "human"):
-            # Keep parser compatibility for legacy names while UI remains clean.
+        from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, SPECIAL_HARNESSES, is_random_choice_harness
+
+        if (
+            self.harness not in HARNESS_REGISTRY
+            and self.harness != "human"
+            and not is_random_choice_harness(self.harness)
+        ):
+            valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
+            raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {valid}")
+        if self.harness == "human" and self.model != "human":
+            raise ValueError("Use model: human with harness: human")
+        if self.model == "human" and self.harness != "human":
+            raise ValueError("Use harness: human with model: human")
+        if is_random_choice_harness(self.harness) and self.model != "random_choice":
+            raise ValueError("Use model: random_choice with random_choice harnesses")
+        if is_random_choice_harness(self.model) and not is_random_choice_harness(self.harness):
+            raise ValueError("Use harness: random_choice with model: random_choice")
+        if self.model not in ("human",) and not is_random_choice_harness(self.model):
             from llm_quest_benchmark.llm.client import is_supported_model_name
 
             if not is_supported_model_name(self.model):
@@ -82,20 +140,23 @@ def __post_init__(self):
             raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}")
         if self.runs < 1:
             raise ValueError(f"runs must be >= 1, got {self.runs}")
-        if self.memory_mode not in ("default", "full_transcript", "compaction"):
-            raise ValueError(f"Invalid memory_mode: {self.memory_mode}")
-        if self.memory_mode == "compaction" and self.compaction_interval < 1:
+        if self.compaction_interval < 1:
             raise ValueError(f"compaction_interval must be >= 1, got {self.compaction_interval}")
 
     @property
-    def agent_id(self) -> str:
-        """Generate a unique agent ID based on configuration values"""
+    def harness_id(self) -> str:
+        """Generate a stable harness ID based on configuration values"""
         import hashlib
 
-        interval_tag = f"_ci{self.compaction_interval}" if self.memory_mode == "compaction" else ""
-        config_str = f"{self.model}_{self.temperature}_{self.system_template}_{self.action_template}_{self.memory_mode}{interval_tag}"
+        interval_tag = f"_ci{self.compaction_interval}" if self.harness in COMPACTION_HARNESSES else ""
+        config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.system_template}{interval_tag}"
         hash_val = hashlib.md5(config_str.encode()).hexdigest()[:8]
-        return f"{self.model}_t{self.temperature}_{hash_val}"
+        return f"{self.model}_t{self.temperature}_{self.harness}_{hash_val}"
+
+    @property
+    def agent_id(self) -> str:
+        """DB-compatible alias for harness_id"""
+        return self.harness_id
 
 
 @dataclass
@@ -103,7 +164,7 @@ class BenchmarkConfig:
     """Configuration for benchmark run"""
 
     quests: list[str]  # List of quest files or directories
-    agents: list[AgentConfig]  # List of agent configurations to test
+    agents: list[HarnessConfig]  # List of harness configurations to test
     debug: bool = False
     quest_timeout: int = 60  # Timeout per quest
     benchmark_timeout: int | None = None  # Total timeout for all quests, defaults to quest_timeout * num_quests
@@ -137,10 +198,11 @@ def from_yaml(cls, yaml_path: str) -> "BenchmarkConfig":
         if "agents" in data:
             agents = []
             for agent in data["agents"]:
-                # Handle 'template' key which maps to action_template in AgentConfig
                 if "template" in agent:
-                    agent["action_template"] = agent.pop("template")
-                agents.append(AgentConfig(**agent))
+                    raise ValueError("Use harness: key instead of template:")
+                if "memory_mode" in agent:
+                    raise ValueError("Use harness: key instead of memory_mode:")
+                agents.append(HarnessConfig(**agent))
             data["agents"] = agents
 
         return cls(**data)
diff --git a/llm_quest_benchmark/tests/agents/test_mode_agents.py b/llm_quest_benchmark/tests/agents/test_mode_agents.py
deleted file mode 100644
index c650127..0000000
--- a/llm_quest_benchmark/tests/agents/test_mode_agents.py
+++ /dev/null
@@ -1,257 +0,0 @@
-"""Tests for planner and tool-augmented agent modes."""
-
-from unittest.mock import Mock
-
-from llm_quest_benchmark.agents.agent_factory import create_agent
-from llm_quest_benchmark.agents.llm_agent import LLMAgent
-from llm_quest_benchmark.agents.planner_agent import PlannerAgent
-from llm_quest_benchmark.agents.tool_agent import ToolAgent
-
-
-def test_create_agent_uses_planner_template_alias():
-    agent = create_agent(model="gpt-5-mini", action_template="planner")
-    assert isinstance(agent, PlannerAgent)
-
-
-def test_create_agent_uses_tool_template_alias():
-    agent = create_agent(model="gpt-5-mini", action_template="tool_augmented")
-    assert isinstance(agent, ToolAgent)
-
-
-def test_create_agent_propagates_memory_mode_to_planner_and_tool_agents():
-    planner = create_agent(
-        model="gpt-5-mini",
-        action_template="planner",
-        memory_mode="compaction",
-        compaction_interval=50,
-    )
-    tool = create_agent(
-        model="gpt-5-mini",
-        action_template="tool_augmented",
-        memory_mode="compaction",
-        compaction_interval=50,
-    )
-
-    assert isinstance(planner, PlannerAgent)
-    assert isinstance(tool, ToolAgent)
-    assert planner._memory_mode == "compaction"
-    assert planner._compaction_interval == 50
-    assert tool._memory_mode == "compaction"
-    assert tool._compaction_interval == 50
-
-
-def test_create_agent_uses_light_hints_template_with_standard_llm_agent():
-    agent = create_agent(model="gpt-5-mini", action_template="light_hints")
-    assert isinstance(agent, LLMAgent)
-    assert not isinstance(agent, (PlannerAgent, ToolAgent))
-
-
-def test_light_hints_template_injects_general_mechanics():
-    agent = LLMAgent(model_name="gpt-5-mini", action_template="light_hints")
-
-    prompt = agent._format_prompt("A sealed vault blocks the route.", [{"text": "Study the vault"}])
-
-    assert "General hints for this type of quest" in prompt
-    assert "Preparation, study, negotiation" in prompt
-
-
-def test_planner_agent_first_turn_generates_plan_then_acts():
-    agent = PlannerAgent(model_name="gpt-5-mini")
-    mocked_llm = Mock()
-    mocked_llm.get_completion.side_effect = [
-        "Gather clues first. Avoid direct fights. Preserve resources.",
-        '{"analysis":"plan says scout","reasoning":"safer branch","result":2}',
-    ]
-    mocked_llm.get_last_usage.side_effect = [
-        {"prompt_tokens": 30, "completion_tokens": 12, "total_tokens": 42, "estimated_cost_usd": 0.001},
-        {"prompt_tokens": 20, "completion_tokens": 8, "total_tokens": 28, "estimated_cost_usd": 0.0007},
-    ]
-    agent.llm = mocked_llm
-
-    action = agent.get_action("You enter a pirate station.", [{"text": "Scout ahead"}, {"text": "Attack now"}])
-
-    assert action == 2
-    assert agent.current_plan is not None
-    assert "Avoid direct fights" in agent.current_plan
-    assert mocked_llm.get_completion.call_count == 2
-    assert agent.get_last_response().total_tokens == 70
-
-
-def test_planner_agent_reuses_plan_when_state_is_stable():
-    agent = PlannerAgent(model_name="gpt-5-mini")
-    agent.current_plan = "Keep moving carefully and avoid a direct fight."
-    agent._observation_history = ["Quiet corridor."]
-    mocked_llm = Mock()
-    mocked_llm.get_completion.return_value = '{"analysis":"plan still fits","reasoning":"careful progress","result":1}'
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 18,
-        "completion_tokens": 7,
-        "total_tokens": 25,
-        "estimated_cost_usd": 0.0005,
-    }
-    agent.llm = mocked_llm
-
-    action = agent.get_action("Quiet corridor.", [{"text": "Open the door"}, {"text": "Run"}])
-
-    assert action == 1
-    assert mocked_llm.get_completion.call_count == 1
-
-
-def test_planner_agent_uses_contextual_memory_state():
-    agent = PlannerAgent(model_name="gpt-5-mini", memory_mode="compaction", compaction_interval=50)
-    agent._quest_briefing = "Original mission: win the election."
-    agent._transcript = [
-        {
-            "step": 1,
-            "observation": "You learned Maloqs value strength.",
-            "choice_text": "Ask about Maloqs",
-            "memo": "Maloqs value strength",
-            "action": 1,
-        }
-    ]
-    agent._steps_since_compaction = 1
-    mocked_llm = Mock()
-    mocked_llm.get_completion.side_effect = [
-        "Use the remembered cultural clue.",
-        '{"analysis":"use clue","reasoning":"fits plan","result":1}',
-    ]
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 1,
-        "completion_tokens": 1,
-        "total_tokens": 2,
-        "estimated_cost_usd": 0.0,
-    }
-    agent.llm = mocked_llm
-
-    agent.get_action("Current banquet scene.", [{"text": "Greet like a warrior"}])
-
-    first_prompt = mocked_llm.get_completion.call_args_list[0].args[0]
-    assert "Quest briefing" in first_prompt
-    assert "RECENT STEPS" in first_prompt
-    assert "Maloqs value strength" in first_prompt
-
-
-def test_tool_agent_can_use_quest_history():
-    agent = ToolAgent(model_name="gpt-5-mini")
-    agent._step_log = [
-        {
-            "step": 1,
-            "observation": "Merchant mentioned low fuel.",
-            "choices": ["Buy fuel", "Keep flying"],
-            "selected_choice": "Buy fuel",
-        }
-    ]
-    mocked_llm = Mock()
-    mocked_llm.get_completion.side_effect = [
-        '{"analysis":"need history","tool_calls":[{"tool":"quest_history","input":"fuel merchant"}],"result":null}',
-        '{"analysis":"fuel clue matters","reasoning":"play safe","result":1}',
-    ]
-    mocked_llm.get_last_usage.side_effect = [
-        {"prompt_tokens": 24, "completion_tokens": 10, "total_tokens": 34, "estimated_cost_usd": 0.0008},
-        {"prompt_tokens": 22, "completion_tokens": 9, "total_tokens": 31, "estimated_cost_usd": 0.0007},
-    ]
-    agent.llm = mocked_llm
-
-    action = agent.get_action("Your fuel gauge is blinking.", [{"text": "Refuel"}, {"text": "Attack pirates"}])
-
-    assert action == 1
-    assert mocked_llm.get_completion.call_count == 2
-    assert agent.get_last_response().total_tokens == 65
-    assert len(agent._step_log) == 2
-
-
-def test_tool_agent_calculator_supports_arithmetic_and_comparisons():
-    assert ToolAgent.calculator("55 + 12 - 5") == "55 + 12 - 5 = 62"
-    assert ToolAgent.calculator("60 >= 55 and 62 >= 80") == "60 >= 55 and 62 >= 80 = False"
-    assert ToolAgent.calculator("__import__('os')").startswith("error:")
-
-
-def test_tool_agent_scratchpad_read_write_and_reset():
-    agent = ToolAgent(model_name="gpt-5-mini")
-
-    assert agent.scratchpad("read") == "(empty)"
-    assert (
-        agent.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") == "updated: Board: W B _ ; failed door 2"
-    )
-    assert agent.scratchpad("read") == "Board: W B _ ; failed door 2"
-
-    agent.reset()
-
-    assert agent.scratchpad("read") == "(empty)"
-
-
-def test_tool_agent_can_use_calculator_and_records_tool_metadata():
-    agent = ToolAgent(model_name="gpt-5-mini")
-    mocked_llm = Mock()
-    mocked_llm.get_completion.side_effect = [
-        '{"memo":"Need mix math","analysis":"calculate target","tool_calls":[{"tool":"calculator","input":"50 + 3 >= 55"}],"result":null}',
-        '{"memo":"Need more strength","analysis":"math failed","reasoning":"choose strength","result":2}',
-    ]
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 10,
-        "completion_tokens": 5,
-        "total_tokens": 15,
-        "estimated_cost_usd": 0.0,
-    }
-    agent.llm = mocked_llm
-
-    action = agent.get_action("Strength is 50. Need at least 55.", [{"text": "Add water"}, {"text": "Add repusator"}])
-
-    response = agent.get_last_response()
-    assert action == 2
-    assert response.tool_calls == [{"tool": "calculator", "input": "50 + 3 >= 55", "operation": "", "content": ""}]
-    assert response.tool_results == ["calculator(50 + 3 >= 55) => 50 + 3 >= 55 = False"]
-    assert response.memo == "Need more strength"
-
-
-def test_tool_agent_uses_contextual_memory_state():
-    agent = ToolAgent(model_name="gpt-5-mini", memory_mode="compaction", compaction_interval=50)
-    agent._quest_briefing = "Original mission: pass pilot certification."
-    agent._transcript = [
-        {
-            "step": 1,
-            "observation": "Hogger is greedy.",
-            "choice_text": "Bribe Hogger",
-            "memo": "Hogger is greedy",
-            "action": 1,
-        }
-    ]
-    agent._steps_since_compaction = 1
-    mocked_llm = Mock()
-    mocked_llm.get_completion.return_value = (
-        '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}'
-    )
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 10,
-        "completion_tokens": 5,
-        "total_tokens": 15,
-        "estimated_cost_usd": 0.0,
-    }
-    agent.llm = mocked_llm
-
-    agent.get_action("Current exam room.", [{"text": "Offer a bribe"}])
-
-    prompt = mocked_llm.get_completion.call_args.args[0]
-    assert "Quest briefing" in prompt
-    assert "RECENT STEPS" in prompt
-    assert "Hogger is greedy" in prompt
-
-
-def test_tool_agent_can_finish_without_tools_in_one_call():
-    agent = ToolAgent(model_name="gpt-5-mini")
-    mocked_llm = Mock()
-    mocked_llm.get_completion.return_value = (
-        '{"analysis":"no tools needed","tool_calls":[],"reasoning":"direct clue","result":2}'
-    )
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 15,
-        "completion_tokens": 6,
-        "total_tokens": 21,
-        "estimated_cost_usd": 0.0004,
-    }
-    agent.llm = mocked_llm
-
-    action = agent.get_action("A guard points at the safe exit.", [{"text": "Fight"}, {"text": "Leave"}])
-
-    assert action == 2
-    assert mocked_llm.get_completion.call_count == 1
diff --git a/llm_quest_benchmark/tests/executors/cli/test_commands.py b/llm_quest_benchmark/tests/executors/cli/test_commands.py
index db0daf1..1bd972e 100644
--- a/llm_quest_benchmark/tests/executors/cli/test_commands.py
+++ b/llm_quest_benchmark/tests/executors/cli/test_commands.py
@@ -19,8 +19,11 @@ def test_version():
 
 
 def test_run_quest():
-    """Test running a quest with random agent"""
-    result = runner.invoke(app, ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--debug"])
+    """Test running a quest with random player"""
+    result = runner.invoke(
+        app,
+        ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--harness", "random_choice", "--debug"],
+    )
     assert result.exit_code in [0, 1, 2]
 
 
@@ -31,7 +34,9 @@ def test_run_quest_invalid_args():
     assert result.exit_code == 2
 
     # Test missing quest file
-    result = runner.invoke(app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice"])
+    result = runner.invoke(
+        app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice", "--harness", "random_choice"]
+    )
     assert result.exit_code == 2
 
 
diff --git a/llm_quest_benchmark/tests/harnesses/__init__.py b/llm_quest_benchmark/tests/harnesses/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/llm_quest_benchmark/tests/agents/test_anthropic.py b/llm_quest_benchmark/tests/harnesses/test_anthropic.py
similarity index 50%
rename from llm_quest_benchmark/tests/agents/test_anthropic.py
rename to llm_quest_benchmark/tests/harnesses/test_anthropic.py
index 5dd1f95..ba60f97 100644
--- a/llm_quest_benchmark/tests/agents/test_anthropic.py
+++ b/llm_quest_benchmark/tests/harnesses/test_anthropic.py
@@ -1,15 +1,15 @@
-"""Deterministic tests for Anthropic-backed agent behavior."""
+"""Deterministic tests for Anthropic-backed harness behavior."""
 
 from unittest.mock import Mock, patch
 
 import pytest
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
+from llm_quest_benchmark.harnesses.factory import create_harness
 
 
 @patch("llm_quest_benchmark.llm.client.anthropic.Anthropic")
-def test_anthropic_agent_mocked_completion(mock_anthropic_cls):
-    """Agent should parse a mocked Anthropic completion without network calls."""
+def test_anthropic_harness_mocked_completion(mock_anthropic_cls):
+    """Harness should parse a mocked Anthropic completion without network calls."""
     mock_client = Mock()
     mock_response = Mock()
     mock_block = Mock()
@@ -18,15 +18,15 @@ def test_anthropic_agent_mocked_completion(mock_anthropic_cls):
     mock_client.messages.create.return_value = mock_response
     mock_anthropic_cls.return_value = mock_client
 
-    agent = create_agent("claude-sonnet-4-5")
-    action = agent.get_action("Test prompt", [{"text": "A"}, {"text": "B"}])
+    harness = create_harness("minimal", model="claude-sonnet-4-5")
+    action = harness.get_action("Test prompt", [{"text": "A"}, {"text": "B"}])
 
     assert action == 2
     assert mock_client.messages.create.call_count == 1
 
 
-def test_anthropic_agent_empty_choices_raises():
+def test_anthropic_harness_empty_choices_raises():
     """Base player contract should reject empty choices."""
-    agent = create_agent("claude-sonnet-4-5")
+    harness = create_harness("minimal", model="claude-sonnet-4-5")
     with pytest.raises(ValueError, match="No choices provided"):
-        agent.get_action("Test prompt", [])
+        harness.get_action("Test prompt", [])
diff --git a/llm_quest_benchmark/tests/agents/test_llm_agent.py b/llm_quest_benchmark/tests/harnesses/test_base.py
similarity index 64%
rename from llm_quest_benchmark/tests/agents/test_llm_agent.py
rename to llm_quest_benchmark/tests/harnesses/test_base.py
index 06ff32f..280fd0a 100644
--- a/llm_quest_benchmark/tests/agents/test_llm_agent.py
+++ b/llm_quest_benchmark/tests/harnesses/test_base.py
@@ -1,10 +1,11 @@
-"""Tests for LLM agent"""
+"""Tests for the base LLM harness behavior."""
 
 from unittest.mock import Mock, patch
 
 import pytest
 
-from llm_quest_benchmark.agents.llm_agent import LLMAgent, parse_llm_response
+from llm_quest_benchmark.harnesses.base import parse_llm_response
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
 from llm_quest_benchmark.schemas.response import LLMResponse
 
 
@@ -20,8 +21,8 @@ def example_choices():
 
 @pytest.mark.timeout(5)  # Quick unit test
 @patch("llm_quest_benchmark.llm.client.OpenAI")
-def test_agent_basic_flow(mock_openai, monkeypatch):
-    """Test basic agent functionality with mocked LLM"""
+def test_harness_basic_flow(mock_openai, monkeypatch):
+    """Test basic harness functionality with mocked LLM"""
     monkeypatch.setenv("OPENAI_API_KEY", "test-key")
     # Setup mock
     mock_chat = Mock()
@@ -41,14 +42,14 @@ def test_agent_basic_flow(mock_openai, monkeypatch):
     observation = "You are at a trading station."
     choices = [{"id": "1", "text": "Talk to merchant"}, {"id": "2", "text": "Leave station"}]
 
-    # Create agent and test
-    agent = LLMAgent(model_name="gpt-5-mini")
-    result = agent.get_action(observation, choices)
+    # Create harness and test
+    harness = MinimalHarness(model_name="gpt-5-mini")
+    result = harness.get_action(observation, choices)
 
     # Verify results
     assert result == 1  # Expect an integer
     assert mock_chat.completions.create.call_count == 1
-    last_response = agent.get_last_response()
+    last_response = harness.get_last_response()
     assert last_response.prompt_tokens == 9
     assert last_response.completion_tokens == 2
     assert last_response.total_tokens == 11
@@ -56,47 +57,54 @@ def test_agent_basic_flow(mock_openai, monkeypatch):
 
 def test_template_rendering():
     """Test that templates are rendered correctly"""
-    agent = LLMAgent()
+    harness = MinimalHarness()
     observation = "Test observation"
     choices = [{"text": "Option 1"}, {"text": "Option 2"}]
 
     # Test that prompt is rendered correctly
-    prompt = agent.prompt_renderer.render_action_prompt(observation, choices)
+    prompt = harness.prompt_renderer.render_action_prompt(observation, choices)
     assert "Test observation" in prompt
     assert "Option 1" in prompt
     assert "Option 2" in prompt
 
 
-def test_agent_initialization_without_api_key(monkeypatch):
-    """Agent construction should not require provider API keys before inference."""
+def test_harness_initialization_without_api_key(monkeypatch):
+    """Harness construction should not require provider API keys before inference."""
     monkeypatch.delenv("OPENAI_API_KEY", raising=False)
     monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
-    agent = LLMAgent(model_name="gpt-5-mini")
-    assert agent.llm is None
+    harness = MinimalHarness(model_name="gpt-5-mini")
+    assert harness.llm is None
 
 
 def test_gemini_prompt_uses_selected_template():
-    agent = LLMAgent(model_name="gemini-2.5-flash")
-    prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}])
+    harness = MinimalHarness(model_name="gemini-2.5-flash", action_template="reasoning.jinja")
+    prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
     assert "Return ONLY valid JSON" in prompt
     assert "A" in prompt
     assert "B" in prompt
 
 
 def test_non_gemini_prompt_uses_selected_template():
-    agent = LLMAgent(model_name="gpt-5-mini", action_template="stub.jinja")
-    prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}])
+    harness = MinimalHarness(model_name="gpt-5-mini", action_template="stub.jinja")
+    prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
     assert "IMPORTANT: Please respond with ONLY a single number" in prompt
 
 
+def test_formatted_user_prompt_does_not_duplicate_system_prompt():
+    harness = MinimalHarness(model_name="gpt-5-mini", action_template="stub.jinja")
+    prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
+
+    assert "experienced interactive fiction player" not in prompt
+
+
 def test_template_alias_without_suffix_is_supported():
-    agent = LLMAgent(model_name="gpt-5-mini", action_template="reasoning")
-    prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}])
+    harness = MinimalHarness(model_name="gpt-5-mini", action_template="reasoning")
+    prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
     assert '"result"' in prompt
 
 
 def test_gpt5_force_numeric_retry_path():
-    agent = LLMAgent(model_name="gpt-5-mini")
+    harness = MinimalHarness(model_name="gpt-5-mini")
     mocked_llm = Mock()
     mocked_llm.get_completion.side_effect = ["```json\n{", "```json\n{", "2"]
     mocked_llm.get_last_usage.side_effect = [
@@ -104,58 +112,57 @@ def test_gpt5_force_numeric_retry_path():
         {"prompt_tokens": 6, "completion_tokens": 1, "total_tokens": 7, "estimated_cost_usd": 0.0005},
         {"prompt_tokens": 4, "completion_tokens": 1, "total_tokens": 5, "estimated_cost_usd": 0.0003},
     ]
-    agent.llm = mocked_llm
+    harness.llm = mocked_llm
 
-    action = agent.get_action("state", [{"text": "A"}, {"text": "B"}])
+    action = harness.get_action("state", [{"text": "A"}, {"text": "B"}])
 
     assert action == 2
     assert mocked_llm.get_completion.call_count == 3
-    last = agent.get_last_response()
+    last = harness.get_last_response()
     assert last.total_tokens == 24
     assert last.estimated_cost_usd == pytest.approx(0.0018)
     assert last.parse_mode == "force_retry_number_only"
 
 
 def test_contextual_state_includes_previous_observations():
-    agent = LLMAgent(model_name="gpt-5-mini")
-    agent._remember_observation("Previous hint")
-    agent._remember_observation("Current state")
-    contextual = agent._build_contextual_state("Current state")
+    harness = MinimalHarness(model_name="gpt-5-mini")
+    harness.memory_module.update({"observation": "Previous hint"})
+    harness.memory_module.update({"observation": "Current state"})
+    contextual = harness._build_contextual_state("Current state")
     assert "Recent context from previous steps" in contextual
     assert "Previous hint" in contextual
 
 
 def test_contextual_state_includes_recent_decisions():
-    agent = LLMAgent(model_name="gpt-5-mini")
-    agent._decision_history = [
-        {"action": 2, "choice": "Inspect the terminal", "parse_mode": "json_direct"},
-        {"action": 1, "choice": "Ask for access", "parse_mode": "retry_json_repaired"},
-    ]
-    contextual = agent._build_contextual_state("Current state")
+    harness = MinimalHarness(model_name="gpt-5-mini")
+    harness.memory_module.update({"observation": "Previous state"})
+    harness.memory_module.update({"action": 2, "choice": "Inspect the terminal", "parse_mode": "json_direct"})
+    harness.memory_module.update({"action": 1, "choice": "Ask for access", "parse_mode": "retry_json_repaired"})
+    contextual = harness._build_contextual_state("Current state")
     assert "Recent selected actions" in contextual
     assert "Inspect the terminal" in contextual
     assert "parse=json_direct" in contextual
 
 
 def test_safety_filter_prefers_lower_risk_choice():
-    agent = LLMAgent(model_name="gpt-5-mini")
+    harness = MinimalHarness(model_name="gpt-5-mini")
     choices = [
         {"text": "Пойти в космопорт и улететь, чтобы завтра не позориться"},
         {"text": "Постараться пройти мимо"},
     ]
-    assert agent._apply_safety_filter(1, choices) == 2
+    assert harness._apply_safety_filter(choices, 1) == 2
 
 
 def test_get_last_response_uses_skip_single_result():
-    agent = LLMAgent(model_name="gpt-5-mini", skip_single=True)
-    agent.history.append(LLMResponse(action=2, is_default=False))
-    agent._last_response = LLMResponse(action=2, is_default=False)
+    harness = MinimalHarness(model_name="gpt-5-mini", skip_single=True)
+    harness.history.append(LLMResponse(action=2, is_default=False))
+    harness._last_response = LLMResponse(action=2, is_default=False)
 
-    action = agent.get_action("state", [{"id": "1", "text": "Only option"}])
+    action = harness.get_action("state", [{"id": "1", "text": "Only option"}])
 
     assert action == 1
-    assert agent.get_last_response().action == 1
-    assert agent.get_last_response().reasoning == "auto_single_choice"
+    assert harness.get_last_response().action == 1
+    assert harness.get_last_response().reasoning == "auto_single_choice"
 
 
 def test_parse_llm_response_number_only_tracks_parse_mode():
@@ -194,7 +201,7 @@ def test_parse_llm_response_uses_analysis_as_reasoning_when_truncated():
 
 
 def test_llm_error_default_response_keeps_reasoning_marker():
-    agent = LLMAgent(model_name="gemini-2.5-flash")
+    harness = MinimalHarness(model_name="gemini-2.5-flash")
     mocked_llm = Mock()
     mocked_llm.get_completion.side_effect = RuntimeError("provider returned empty message")
     mocked_llm.get_last_usage.return_value = {
@@ -203,20 +210,20 @@ def test_llm_error_default_response_keeps_reasoning_marker():
         "total_tokens": 0,
         "estimated_cost_usd": None,
     }
-    agent.llm = mocked_llm
+    harness.llm = mocked_llm
 
-    action = agent.get_action("state", [{"text": "A"}, {"text": "B"}])
+    action = harness.get_action("state", [{"text": "A"}, {"text": "B"}])
 
     assert action == 1
-    last = agent.get_last_response()
+    last = harness.get_last_response()
     assert last.is_default is True
     assert last.reasoning is not None
     assert "llm_call_error" in last.reasoning
 
 
 def test_retry_prompt_requests_json_payload():
-    agent = LLMAgent(model_name="gemini-2.5-flash")
-    prompt = agent._format_retry_prompt("state", [{"text": "A"}, {"text": "B"}])
+    harness = MinimalHarness(model_name="gemini-2.5-flash")
+    prompt = harness._format_retry_prompt("state", [{"text": "A"}, {"text": "B"}])
     assert "Return valid JSON only" in prompt
     assert '"analysis"' in prompt
     assert '"reasoning"' in prompt
@@ -224,7 +231,7 @@ def test_retry_prompt_requests_json_payload():
 
 
 def test_retry_preserves_reasoning_from_first_attempt():
-    agent = LLMAgent(model_name="gemini-2.5-flash")
+    harness = MinimalHarness(model_name="gemini-2.5-flash")
     mocked_llm = Mock()
     mocked_llm.get_completion.side_effect = [
         "Analysis: low oxygen\nReasoning: safer move first\n```json\n{",
@@ -244,12 +251,12 @@ def test_retry_preserves_reasoning_from_first_attempt():
             "estimated_cost_usd": 0.0002,
         },
     ]
-    agent.llm = mocked_llm
+    harness.llm = mocked_llm
 
-    action = agent.get_action("state", [{"text": "A"}, {"text": "B"}])
+    action = harness.get_action("state", [{"text": "A"}, {"text": "B"}])
 
     assert action == 2
-    last = agent.get_last_response()
+    last = harness.get_last_response()
     assert last.analysis is not None
     assert "low oxygen" in last.analysis
     assert last.reasoning is not None
diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py
new file mode 100644
index 0000000..49062fe
--- /dev/null
+++ b/llm_quest_benchmark/tests/harnesses/test_factory.py
@@ -0,0 +1,203 @@
+import pytest
+
+from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
+from llm_quest_benchmark.harnesses.memo import MemoCompactHarness
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+from llm_quest_benchmark.players.human import HumanPlayer
+from llm_quest_benchmark.players.random import RandomPlayer
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
+
+
+def test_create_minimal_harness():
+    harness = create_harness("minimal", model="gpt-5-mini")
+
+    assert isinstance(harness, MinimalHarness)
+
+
+def test_all_harness_names_instantiate():
+    for harness_name, harness_cls in HARNESS_REGISTRY.items():
+        harness = create_harness(harness_name, model="gpt-5-mini")
+
+        assert isinstance(harness, harness_cls)
+
+
+def test_create_human_harness():
+    harness = create_harness("human")
+
+    assert isinstance(harness, HumanPlayer)
+
+
+def test_create_random_choice_harness():
+    harness = create_harness("random_choice")
+
+    assert isinstance(harness, RandomPlayer)
+
+
+def test_create_seeded_random_choice_harness():
+    harness = create_harness("random_choice_123", model="random_choice")
+
+    assert isinstance(harness, RandomPlayer)
+    assert harness.agent_id == "random_123"
+
+
+def test_create_bad_harness_name_raises():
+    with pytest.raises(ValueError, match="minimal"):
+        create_harness("bad_name", model="gpt-5-mini")
+
+
+def test_create_bad_random_choice_seed_raises():
+    with pytest.raises(ValueError, match="random_choice_<seed>"):
+        create_harness("random_choice_bad")
+
+
+def test_random_choice_model_does_not_hide_bad_harness():
+    with pytest.raises(ValueError, match="bad_name"):
+        create_harness("bad_name", model="random_choice_123")
+
+
+def test_random_choice_model_requires_random_harness():
+    with pytest.raises(ValueError, match="harness='random_choice'"):
+        create_harness("minimal", model="random_choice")
+
+
+def test_seeded_random_model_is_rejected():
+    with pytest.raises(ValueError, match="Encode random seeds in harness"):
+        create_harness("random_choice", model="random_choice_123")
+
+
+def test_human_model_requires_human_harness():
+    with pytest.raises(ValueError, match="harness='human'"):
+        create_harness("minimal", model="human")
+
+
+def test_harness_config_stable_harness_id():
+    config = HarnessConfig(harness="memo_compact", model="gpt-5-mini")
+
+    assert isinstance(config.harness_id, str)
+    assert config.harness_id == HarnessConfig(harness="memo_compact", model="gpt-5-mini").harness_id
+
+
+def test_harness_config_system_template_affects_harness_id():
+    first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role.jinja")
+    second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="custom_system_role.jinja")
+
+    assert first.harness_id != second.harness_id
+
+
+def test_non_compaction_harness_id_ignores_compaction_interval():
+    first = HarnessConfig(harness="reasoning_recent", model="gpt-5-mini", compaction_interval=10)
+    second = HarnessConfig(harness="reasoning_recent", model="gpt-5-mini", compaction_interval=99)
+
+    assert first.harness_id == second.harness_id
+
+
+def test_compaction_harness_id_includes_compaction_interval():
+    first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", compaction_interval=10)
+    second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", compaction_interval=99)
+
+    assert first.harness_id != second.harness_id
+
+
+def test_harness_config_allows_seeded_random_choice_harness():
+    config = HarnessConfig(harness="random_choice_123", model="random_choice")
+
+    assert config.harness == "random_choice_123"
+
+
+def test_harness_config_rejects_llm_model_with_random_harness():
+    with pytest.raises(ValueError, match="model: random_choice"):
+        HarnessConfig(harness="random_choice", model="gpt-5-mini")
+
+
+def test_harness_config_rejects_llm_model_with_human_harness():
+    with pytest.raises(ValueError, match="model: human"):
+        HarnessConfig(harness="human", model="gpt-5-mini")
+
+
+def test_harness_config_rejects_random_model_with_llm_harness():
+    with pytest.raises(ValueError, match="harness: random_choice"):
+        HarnessConfig(harness="minimal", model="random_choice")
+
+
+def test_harness_config_rejects_human_model_with_llm_harness():
+    with pytest.raises(ValueError, match="harness: human"):
+        HarnessConfig(harness="minimal", model="human")
+
+
+def test_harness_config_allows_retired_exp4_aliases():
+    for harness_name in ("compaction_no_memo", "memo_cot", "memo_extended", "memo_structured"):
+        config = HarnessConfig(harness=harness_name, model="gpt-5-mini")
+
+        assert config.harness == harness_name
+
+
+def test_harness_config_rejects_old_template_key():
+    with pytest.raises(ValueError, match="Use harness: key instead of template:"):
+        HarnessConfig(model="gpt-5-mini", template="reasoning.jinja")
+
+
+def test_harness_config_rejects_old_memory_mode_key():
+    with pytest.raises(ValueError, match="Use harness: key instead of memory_mode:"):
+        HarnessConfig(model="gpt-5-mini", harness="memo_compact", memory_mode="compaction")
+
+
+def test_benchmark_config_from_yaml_parses_harness(tmp_path):
+    quest_path = tmp_path / "quest.qm"
+    quest_path.write_text("", encoding="utf-8")
+    config_path = tmp_path / "benchmark.yaml"
+    config_path.write_text(
+        f"""
+quests:
+  - {quest_path}
+agents:
+  - model: gpt-5-mini
+    harness: memo_compact
+""",
+        encoding="utf-8",
+    )
+
+    config = BenchmarkConfig.from_yaml(str(config_path))
+
+    assert len(config.agents) == 1
+    assert isinstance(config.agents[0], HarnessConfig)
+    assert isinstance(create_harness(config.agents[0].harness, model=config.agents[0].model), MemoCompactHarness)
+    assert config.agents[0].harness == "memo_compact"
+
+
+def test_benchmark_config_from_yaml_rejects_template(tmp_path):
+    quest_path = tmp_path / "quest.qm"
+    quest_path.write_text("", encoding="utf-8")
+    config_path = tmp_path / "benchmark.yaml"
+    config_path.write_text(
+        f"""
+quests:
+  - {quest_path}
+agents:
+  - model: gpt-5-mini
+    template: reasoning.jinja
+""",
+        encoding="utf-8",
+    )
+
+    with pytest.raises(ValueError, match="Use harness: key instead of template:"):
+        BenchmarkConfig.from_yaml(str(config_path))
+
+
+def test_benchmark_config_from_yaml_rejects_memory_mode(tmp_path):
+    quest_path = tmp_path / "quest.qm"
+    quest_path.write_text("", encoding="utf-8")
+    config_path = tmp_path / "benchmark.yaml"
+    config_path.write_text(
+        f"""
+quests:
+  - {quest_path}
+agents:
+  - model: gpt-5-mini
+    harness: memo_compact
+    memory_mode: compaction
+""",
+        encoding="utf-8",
+    )
+
+    with pytest.raises(ValueError, match="Use harness: key instead of memory_mode:"):
+        BenchmarkConfig.from_yaml(str(config_path))
diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
new file mode 100644
index 0000000..efa03bb
--- /dev/null
+++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
@@ -0,0 +1,374 @@
+"""Comprehensive tests for concrete harness behavior."""
+
+from unittest.mock import Mock
+
+from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
+from llm_quest_benchmark.harnesses.memo import (
+    CompactionNoMemoHarness,
+    HintedCompactHarness,
+    MemoCompactHarness,
+    MemoCotHarness,
+    MemoExtendedHarness,
+    MemoStructuredHarness,
+)
+from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+from llm_quest_benchmark.harnesses.planner import PlannerHarness
+from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness
+from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness
+
+HARNESS_SPECS = {
+    "minimal": (MinimalHarness, "stub.jinja", DefaultMemory),
+    "reasoning_recent": (ReasoningRecentHarness, "reasoning.jinja", DefaultMemory),
+    "reasoning_full": (ReasoningFullTranscriptHarness, "reasoning.jinja", FullTranscriptMemory),
+    "memo_compact": (MemoCompactHarness, "stateful_compact.jinja", CompactionMemory),
+    "hinted_compact": (HintedCompactHarness, "stateful_compact_hints.jinja", CompactionMemory),
+    "tool_compact": (ToolCompactHarness, "tool_augmented.jinja", CompactionMemory),
+    "tool_hinted": (ToolHintedHarness, "tool_augmented_hints.jinja", CompactionMemory),
+    "planner": (PlannerHarness, "planner.jinja", CompactionMemory),
+    "compaction_no_memo": (CompactionNoMemoHarness, "reasoning.jinja", CompactionMemory),
+    "memo_cot": (MemoCotHarness, "memo_cot.jinja", CompactionMemory),
+    "memo_extended": (MemoExtendedHarness, "memo_extended.jinja", CompactionMemory),
+    "memo_structured": (MemoStructuredHarness, "memo_structured.jinja", CompactionMemory),
+}
+
+
+def assert_harness_configuration(harness_name: str) -> None:
+    expected_class, expected_template, expected_memory_class = HARNESS_SPECS[harness_name]
+
+    harness = create_harness(harness_name, model="gpt-5-mini")
+
+    assert isinstance(harness, expected_class)
+    assert harness.harness_name == harness_name
+    assert harness.action_template == expected_template
+    assert isinstance(harness.memory_module, expected_memory_class)
+
+
+def test_minimal_harness_configuration():
+    assert_harness_configuration("minimal")
+
+
+def test_reasoning_recent_harness_configuration():
+    assert_harness_configuration("reasoning_recent")
+
+
+def test_reasoning_full_harness_configuration():
+    assert_harness_configuration("reasoning_full")
+
+
+def test_memo_compact_harness_configuration():
+    assert_harness_configuration("memo_compact")
+
+
+def test_hinted_compact_harness_configuration():
+    assert_harness_configuration("hinted_compact")
+
+
+def test_tool_compact_harness_configuration():
+    assert_harness_configuration("tool_compact")
+
+
+def test_tool_hinted_harness_configuration():
+    assert_harness_configuration("tool_hinted")
+
+
+def test_planner_harness_configuration():
+    assert_harness_configuration("planner")
+
+
+def test_exp4_retired_harness_configuration():
+    assert_harness_configuration("compaction_no_memo")
+    assert_harness_configuration("memo_cot")
+    assert_harness_configuration("memo_extended")
+    assert_harness_configuration("memo_structured")
+
+
+def test_all_registry_harnesses_have_configuration_specs():
+    assert set(HARNESS_REGISTRY) == set(HARNESS_SPECS)
+
+
+def test_all_registry_harnesses_instantiate_with_expected_names():
+    for harness_name in HARNESS_REGISTRY:
+        harness = create_harness(harness_name, model="gpt-5-mini")
+
+        assert harness.harness_name == harness_name
+
+
+def test_memo_compact_mocked_llm_returns_action_and_reuses_memo_context():
+    harness = MemoCompactHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        '{"memo":"Merchant needs fuel payment","analysis":"pay first","reasoning":"quest clue","result":2}',
+        '{"memo":"Paid fuel merchant","analysis":"memo says paid","reasoning":"continue","result":1}',
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    first_action = harness.get_action("A merchant offers fuel for a fee.", [{"text": "Leave"}, {"text": "Pay"}])
+    second_action = harness.get_action("The fuel gauge still blinks.", [{"text": "Check receipt"}, {"text": "Leave"}])
+
+    assert first_action == 2
+    assert second_action == 1
+    assert harness.get_last_response().memo == "Paid fuel merchant"
+    second_prompt = mocked_llm.get_completion.call_args_list[1].args[0]
+    assert "Merchant needs fuel payment" in second_prompt
+
+
+def test_compaction_memory_receives_existing_llm_client():
+    harness = MemoCompactHarness(model_name="gpt-5-mini", compaction_interval=1)
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        '{"memo":"Paid fuel merchant","analysis":"pay first","reasoning":"quest clue","result":2}',
+        "Summary: paid the fuel merchant and should keep receipt.",
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("A merchant offers fuel for a fee.", [{"text": "Leave"}, {"text": "Pay"}])
+
+    assert action == 2
+    assert harness.memory_module.llm_client is mocked_llm
+    assert harness.memory_module._compaction_summary == "Summary: paid the fuel merchant and should keep receipt."
+    assert harness.memory_module.steps_since_compaction == 0
+
+
+def test_planner_harness_first_turn_generates_plan_then_acts():
+    harness = PlannerHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        "Gather clues first. Avoid direct fights. Preserve resources.",
+        '{"analysis":"plan says scout","reasoning":"safer branch","result":2}',
+    ]
+    mocked_llm.get_last_usage.side_effect = [
+        {"prompt_tokens": 30, "completion_tokens": 12, "total_tokens": 42, "estimated_cost_usd": 0.001},
+        {"prompt_tokens": 20, "completion_tokens": 8, "total_tokens": 28, "estimated_cost_usd": 0.0007},
+    ]
+    harness.llm = mocked_llm
+
+    action = harness.get_action("You enter a pirate station.", [{"text": "Scout ahead"}, {"text": "Attack now"}])
+
+    assert action == 2
+    assert harness.current_plan is not None
+    assert "Avoid direct fights" in harness.current_plan
+    assert mocked_llm.get_completion.call_count == 2
+    assert harness.get_last_response().total_tokens == 70
+
+
+def test_planner_harness_reuses_plan_when_state_is_stable():
+    harness = PlannerHarness(model_name="gpt-5-mini")
+    harness.current_plan = "Keep moving carefully and avoid a direct fight."
+    harness._observation_history = ["Quiet corridor."]
+    mocked_llm = Mock()
+    mocked_llm.get_completion.return_value = '{"analysis":"plan still fits","reasoning":"careful progress","result":1}'
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 18,
+        "completion_tokens": 7,
+        "total_tokens": 25,
+        "estimated_cost_usd": 0.0005,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("Quiet corridor.", [{"text": "Open the door"}, {"text": "Run"}])
+
+    assert action == 1
+    assert mocked_llm.get_completion.call_count == 1
+
+
+def test_planner_harness_uses_contextual_memory_state():
+    harness = PlannerHarness(model_name="gpt-5-mini", compaction_interval=50)
+    harness.memory_module.set_quest_briefing("Original mission: win the election.")
+    harness.memory_module.transcript = [
+        {
+            "step": 1,
+            "observation": "You learned Maloqs value strength.",
+            "choice_text": "Ask about Maloqs",
+            "memo": "Maloqs value strength",
+            "action": 1,
+        }
+    ]
+    harness.memory_module.steps_since_compaction = 1
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        "Use the remembered cultural clue.",
+        '{"analysis":"use clue","reasoning":"fits plan","result":1}',
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 1,
+        "completion_tokens": 1,
+        "total_tokens": 2,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    harness.get_action("Current banquet scene.", [{"text": "Greet like a warrior"}])
+
+    first_prompt = mocked_llm.get_completion.call_args_list[0].args[0]
+    assert "Quest briefing" in first_prompt
+    assert "RECENT STEPS" in first_prompt
+    assert "Maloqs value strength" in first_prompt
+
+
+def test_tool_compact_harness_can_use_quest_history():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+    harness._step_log = [
+        {
+            "step": 1,
+            "observation": "Merchant mentioned low fuel.",
+            "choices": ["Buy fuel", "Keep flying"],
+            "selected_choice": "Buy fuel",
+        }
+    ]
+    harness._history_tool.step_log = harness._step_log
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        '{"analysis":"need history","tool_calls":[{"tool":"quest_history","input":"fuel merchant"}],"result":null}',
+        '{"analysis":"fuel clue matters","reasoning":"play safe","result":1}',
+    ]
+    mocked_llm.get_last_usage.side_effect = [
+        {"prompt_tokens": 24, "completion_tokens": 10, "total_tokens": 34, "estimated_cost_usd": 0.0008},
+        {"prompt_tokens": 22, "completion_tokens": 9, "total_tokens": 31, "estimated_cost_usd": 0.0007},
+    ]
+    harness.llm = mocked_llm
+
+    action = harness.get_action("Your fuel gauge is blinking.", [{"text": "Refuel"}, {"text": "Attack pirates"}])
+
+    assert action == 1
+    assert mocked_llm.get_completion.call_count == 2
+    assert harness.get_last_response().total_tokens == 65
+    assert len(harness._step_log) == 2
+    assert harness.get_last_response().tool_results
+    assert "Merchant mentioned low fuel" in harness.get_last_response().tool_results[0]
+
+
+def test_tool_compact_calculator_supports_arithmetic_and_comparisons():
+    assert ToolCompactHarness.calculator("55 + 12 - 5") == "55 + 12 - 5 = 62"
+    assert ToolCompactHarness.calculator("60 >= 55 and 62 >= 80") == "60 >= 55 and 62 >= 80 = False"
+    assert ToolCompactHarness.calculator("__import__('os')").startswith("error:")
+
+
+def test_tool_compact_scratchpad_read_write_and_reset():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+
+    assert harness.scratchpad("read") == "(empty)"
+    assert (
+        harness.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") == "updated: Board: W B _ ; failed door 2"
+    )
+    assert harness.scratchpad("read") == "Board: W B _ ; failed door 2"
+
+    harness.reset()
+
+    assert harness.scratchpad("read") == "(empty)"
+
+
+def test_tool_compact_harness_can_use_calculator_and_records_tool_metadata():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        '{"memo":"Need mix math","analysis":"calculate target","tool_calls":[{"tool":"calculator","input":"50 + 3 >= 55"}],"result":null}',
+        '{"memo":"Need more strength","analysis":"math failed","reasoning":"choose strength","result":2}',
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("Strength is 50. Need at least 55.", [{"text": "Add water"}, {"text": "Add repusator"}])
+
+    response = harness.get_last_response()
+    assert action == 2
+    assert response.tool_calls == [{"tool": "calculator", "input": "50 + 3 >= 55", "operation": "", "content": ""}]
+    assert response.tool_results == ["calculator(50 + 3 >= 55) => 50 + 3 >= 55 = False"]
+    assert response.memo == "Need more strength"
+
+
+def test_tool_compact_harness_can_use_scratchpad_tool_call():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        (
+            '{"analysis":"save board","tool_calls":[{"tool":"scratchpad",'
+            '"operation":"write_replace","content":"Board: red blue blank"}],"result":null}'
+        ),
+        '{"analysis":"note saved","reasoning":"use saved board","result":1}',
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("A colored board blocks the hall.", [{"text": "Use red-blue order"}])
+
+    assert action == 1
+    assert harness.scratchpad("read") == "Board: red blue blank"
+    assert harness.get_last_response().tool_results == [
+        "scratchpad(write_replace, Board: red blue blank) => updated: Board: red blue blank"
+    ]
+
+
+def test_tool_compact_harness_uses_contextual_memory_state():
+    harness = ToolCompactHarness(model_name="gpt-5-mini", compaction_interval=50)
+    harness.memory_module.set_quest_briefing("Original mission: pass pilot certification.")
+    harness.memory_module.transcript = [
+        {
+            "step": 1,
+            "observation": "Hogger is greedy.",
+            "choice_text": "Bribe Hogger",
+            "memo": "Hogger is greedy",
+            "action": 1,
+        }
+    ]
+    harness.memory_module.steps_since_compaction = 1
+    mocked_llm = Mock()
+    mocked_llm.get_completion.return_value = (
+        '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}'
+    )
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    harness.get_action("Current exam room.", [{"text": "Offer a bribe"}])
+
+    prompt = mocked_llm.get_completion.call_args.args[0]
+    assert "Quest briefing" in prompt
+    assert "RECENT STEPS" in prompt
+    assert "Hogger is greedy" in prompt
+
+
+def test_tool_compact_harness_can_finish_without_tools_in_one_call():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.return_value = (
+        '{"analysis":"no tools needed","tool_calls":[],"reasoning":"direct clue","result":2}'
+    )
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 15,
+        "completion_tokens": 6,
+        "total_tokens": 21,
+        "estimated_cost_usd": 0.0004,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("A guard points at the safe exit.", [{"text": "Fight"}, {"text": "Leave"}])
+
+    assert action == 2
+    assert mocked_llm.get_completion.call_count == 1
diff --git a/llm_quest_benchmark/tests/integration/test_benchmark.py b/llm_quest_benchmark/tests/integration/test_benchmark.py
index ee0704e..1c56d35 100644
--- a/llm_quest_benchmark/tests/integration/test_benchmark.py
+++ b/llm_quest_benchmark/tests/integration/test_benchmark.py
@@ -5,11 +5,11 @@
 
 import pytest
 
-from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.constants import SYSTEM_ROLE_TEMPLATE
 from llm_quest_benchmark.environments.state import QuestOutcome
 from llm_quest_benchmark.executors import benchmark as benchmark_module
 from llm_quest_benchmark.executors.benchmark import run_benchmark
-from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
 
 
 def _fake_task_for_parallel_test(task, result_queue):
@@ -58,10 +58,10 @@ def test_benchmark_e2e(caplog, tmp_path):
     config = BenchmarkConfig(
         quests=[str(quest_path)],
         agents=[
-            AgentConfig(
+            HarnessConfig(
                 model="random_choice",  # Use random_choice for testing
+                harness="random_choice",
                 system_template=SYSTEM_ROLE_TEMPLATE,
-                action_template=DEFAULT_TEMPLATE,
                 temperature=0.0,
                 skip_single=True,
             )
@@ -83,9 +83,9 @@ def test_benchmark_e2e(caplog, tmp_path):
         # Check first result
         result = results[0]
         assert result["quest"] == str(quest_path)
-        assert result["model"] == "random_choice"
+        assert result["model"] == "random_policy"
         assert result["temperature"] == 0.0
-        assert result["template"] == DEFAULT_TEMPLATE
+        assert result["template"] == "reasoning.jinja"
         assert result["attempt"] == 1
         assert "agent_id" in result
         assert "outcome" in result
@@ -122,9 +122,9 @@ def test_benchmark_supports_multiple_runs_per_agent(tmp_path):
     config = BenchmarkConfig(
         quests=[str(quest_path)],
         agents=[
-            AgentConfig(
+            HarnessConfig(
                 model="random_choice",
-                action_template="reasoning",
+                harness="random_choice",
                 temperature=0.0,
                 runs=2,
                 skip_single=True,
@@ -154,7 +154,7 @@ def test_benchmark_uses_max_workers(monkeypatch, tmp_path):
 
     config = BenchmarkConfig(
         quests=[str(quest_path)],
-        agents=[AgentConfig(model="random_choice", runs=4)],
+        agents=[HarnessConfig(model="random_choice", harness="random_choice", runs=4)],
         quest_timeout=5,
         max_workers=2,
         output_dir=str(tmp_path),
@@ -187,7 +187,7 @@ def test_benchmark_enforces_child_process_timeout(monkeypatch, tmp_path):
 
     config = BenchmarkConfig(
         quests=[str(quest_path)],
-        agents=[AgentConfig(model="random_choice", runs=1)],
+        agents=[HarnessConfig(model="random_choice", harness="random_choice", runs=1)],
         quest_timeout=1,
         max_workers=1,
         output_dir=str(tmp_path),
diff --git a/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py b/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py
index 5563ca2..2ceeaca 100644
--- a/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py
+++ b/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py
@@ -1,12 +1,12 @@
-"""Integration tests for planner/tool modes on real quest execution loops."""
+"""Integration tests for planner/tool harness modes on real quest execution loops."""
 
 from pathlib import Path
 
 import pytest
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
 from llm_quest_benchmark.core.runner import run_quest_with_timeout
 from llm_quest_benchmark.environments.state import QuestOutcome
+from llm_quest_benchmark.harnesses.factory import create_harness
 
 QUEST_PATHS = [
     "quests/Boat.qm",
@@ -38,18 +38,18 @@ def get_last_usage(self):
 
 @pytest.mark.timeout(15)
 @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded")
-def test_planner_agent_runs_three_quests_across_openai_and_anthropic_models(monkeypatch):
+def test_planner_harness_runs_three_quests_across_openai_and_anthropic_models(monkeypatch):
     requested_models = []
 
     def fake_get_llm_client(model_name, **kwargs):
         requested_models.append(model_name)
         return FakeLLM("planner")
 
-    monkeypatch.setattr("llm_quest_benchmark.agents.llm_agent.get_llm_client", fake_get_llm_client)
+    monkeypatch.setattr("llm_quest_benchmark.harnesses.base.get_llm_client", fake_get_llm_client)
 
     for model_name in ["gpt-5-mini", "claude-sonnet-4-5"]:
         for quest_path in QUEST_PATHS:
-            agent = create_agent(model=model_name, action_template="planner", skip_single=True)
+            agent = create_harness("planner", model=model_name, skip_single=True)
             outcome = run_quest_with_timeout(quest_path, agent, timeout=10)
             assert outcome in {QuestOutcome.SUCCESS, QuestOutcome.FAILURE, QuestOutcome.TIMEOUT}
             assert outcome != QuestOutcome.ERROR
@@ -60,14 +60,14 @@ def fake_get_llm_client(model_name, **kwargs):
 
 @pytest.mark.timeout(15)
 @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded")
-def test_tool_agent_runs_three_quests(monkeypatch):
+def test_tool_harness_runs_three_quests(monkeypatch):
     monkeypatch.setattr(
-        "llm_quest_benchmark.agents.llm_agent.get_llm_client",
+        "llm_quest_benchmark.harnesses.base.get_llm_client",
         lambda model_name, **kwargs: FakeLLM("tool"),
     )
 
     for quest_path in QUEST_PATHS:
-        agent = create_agent(model="gpt-5-mini", action_template="tool_augmented", skip_single=True)
+        agent = create_harness("tool_compact", model="gpt-5-mini", skip_single=True)
         outcome = run_quest_with_timeout(quest_path, agent, timeout=10)
         assert outcome in {QuestOutcome.SUCCESS, QuestOutcome.FAILURE, QuestOutcome.TIMEOUT}
         assert outcome != QuestOutcome.ERROR
@@ -75,9 +75,9 @@ def test_tool_agent_runs_three_quests(monkeypatch):
 
 @pytest.mark.timeout(15)
 @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded")
-def test_reused_mode_agents_reset_between_quest_runs():
+def test_reused_mode_harnesses_reset_between_quest_runs():
     quest_path = "quests/sr_2_1_2121_eng/Borzukhan_eng.qm"
-    planner_agent = create_agent(model="gpt-5-mini", action_template="planner", skip_single=True)
+    planner_agent = create_harness("planner", model="gpt-5-mini", skip_single=True)
     planner_agent.llm = FakeLLM("planner")
 
     first_outcome = run_quest_with_timeout(quest_path, planner_agent, timeout=10)
@@ -92,7 +92,7 @@ def test_reused_mode_agents_reset_between_quest_runs():
     assert "stale plan from previous run" not in planner_agent._plan_history
     assert "stale observation" not in planner_agent._observation_history
 
-    tool_agent = create_agent(model="gpt-5-mini", action_template="tool_augmented", skip_single=True)
+    tool_agent = create_harness("tool_compact", model="gpt-5-mini", skip_single=True)
     tool_agent.llm = FakeLLM("tool")
 
     first_outcome = run_quest_with_timeout(quest_path, tool_agent, timeout=10)
diff --git a/llm_quest_benchmark/tests/integration/test_quest_e2e.py b/llm_quest_benchmark/tests/integration/test_quest_e2e.py
index 8ebfb91..3d02d1a 100644
--- a/llm_quest_benchmark/tests/integration/test_quest_e2e.py
+++ b/llm_quest_benchmark/tests/integration/test_quest_e2e.py
@@ -5,10 +5,10 @@
 
 import pytest
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
-from llm_quest_benchmark.constants import DEFAULT_QUEST, DEFAULT_TEMPLATE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.constants import DEFAULT_QUEST, SYSTEM_ROLE_TEMPLATE
 from llm_quest_benchmark.core.runner import run_quest_with_timeout
 from llm_quest_benchmark.environments.state import QuestOutcome
+from llm_quest_benchmark.harnesses.factory import create_harness
 
 TIMEOUT = 20  # 20s should be enough for test quests to complete
 
@@ -19,11 +19,11 @@ def test_quest_run_with_llm(caplog):
     """Test that quest runs with LLM agent and reaches a final state"""
     caplog.set_level(logging.DEBUG)  # Show all logs in test output
 
-    # Create LLM agent
-    agent = create_agent(
+    # Create random harness
+    agent = create_harness(
+        harness="random_choice",
         model="random_choice",  # Use random for testing
         system_template=SYSTEM_ROLE_TEMPLATE,
-        action_template=DEFAULT_TEMPLATE,
         temperature=0.0,
         skip_single=False,
         debug=True,
@@ -63,13 +63,13 @@ def mock_callback(event: str, data: Any) -> None:
 
 @pytest.mark.e2e
 @pytest.mark.timeout(TIMEOUT)
-def test_random_agent_on_test_quest(caplog):
-    """Test that random agent can complete a test quest"""
+def test_random_player_on_test_quest(caplog):
+    """Test that random player can complete a test quest"""
     caplog.set_level(logging.DEBUG)  # Show all logs in test output
 
-    # Create random agent
-    agent = create_agent("random_choice", skip_single=True, debug=True)
-    assert agent is not None, "Failed to create random agent"
+    # Create random player
+    agent = create_harness("random_choice", skip_single=True, debug=True)
+    assert agent is not None, "Failed to create random player"
 
     # Mock callback for testing
     def mock_callback(event: str, data: Any) -> None:
@@ -80,7 +80,7 @@ def mock_callback(event: str, data: Any) -> None:
         elif event == "error":
             caplog.error(f"Error: {data}")
 
-    # Run quest with random agent
+    # Run quest with random player
     try:
         outcome = run_quest_with_timeout(
             quest_path=str(DEFAULT_QUEST),
diff --git a/llm_quest_benchmark/tests/agents/test_human_player.py b/llm_quest_benchmark/tests/players/test_human_player.py
similarity index 95%
rename from llm_quest_benchmark/tests/agents/test_human_player.py
rename to llm_quest_benchmark/tests/players/test_human_player.py
index 8334ebd..7108f78 100644
--- a/llm_quest_benchmark/tests/agents/test_human_player.py
+++ b/llm_quest_benchmark/tests/players/test_human_player.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from llm_quest_benchmark.agents.human_player import HumanPlayer
+from llm_quest_benchmark.players.human import HumanPlayer
 
 
 def test_human_player_initialization():
diff --git a/llm_quest_benchmark/tests/test_benchmark_with_directory.py b/llm_quest_benchmark/tests/test_benchmark_with_directory.py
index 7661f3d..87b2221 100644
--- a/llm_quest_benchmark/tests/test_benchmark_with_directory.py
+++ b/llm_quest_benchmark/tests/test_benchmark_with_directory.py
@@ -6,20 +6,20 @@
 
 import pytest
 
+from llm_quest_benchmark.executors.benchmark import _result_entry, run_benchmark
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
+
 # Configure logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 
-from llm_quest_benchmark.executors.benchmark import run_benchmark
-from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig
-
 
 def create_test_config():
     """Create a test benchmark configuration with directory path"""
     return {
         "name": "Directory Benchmark Test",
         "quests": ["quests/sr_2_1_2121_eng"],
-        "agents": [{"model": "random_choice", "skip_single": True, "temperature": 0.7}],
+        "agents": [{"model": "random_choice", "harness": "random_choice", "skip_single": True, "temperature": 0.7}],
         "quest_timeout": 4,  # Keep runtime below pytest global timeout
         "max_quests": 1,
         "debug": True,
@@ -27,6 +27,26 @@ def create_test_config():
     }
 
 
+def test_result_entry_logs_random_harness_model_as_random_policy():
+    """A ``random_choice`` harness row must report ``random_policy`` as its model."""
+    random_cfg = HarnessConfig(model="random_choice", harness="random_choice")
+    entry = _result_entry("quests/Boat.qm", random_cfg, 1, "FAILURE")
+
+    # Compare both identity fields in one shot so a mismatch shows the full pair.
+    observed = (entry["model"], entry["harness"])
+    assert observed == ("random_policy", "random_choice")
+
+
+def test_result_entry_logs_human_harness_model_as_human():
+    """A ``human`` harness row must report ``human`` as its model."""
+    human_cfg = HarnessConfig(model="human", harness="human")
+    entry = _result_entry("quests/Boat.qm", human_cfg, 1, "FAILURE")
+
+    # Compare both identity fields in one shot so a mismatch shows the full pair.
+    observed = (entry["model"], entry["harness"])
+    assert observed == ("human", "human")
+
+
 @pytest.mark.skipif(not Path("quests/sr_2_1_2121_eng").exists(), reason="Quest files not downloaded")
 def test_benchmark_with_directory():
     """Test running a benchmark with a directory path"""
@@ -34,8 +54,8 @@ def test_benchmark_with_directory():
     config_dict = create_test_config()
     logger.info(f"Created test config: {json.dumps(config_dict, indent=2)}")
 
-    # Convert agent dictionaries to AgentConfig objects first
-    config_dict["agents"] = [AgentConfig(**agent_dict) for agent_dict in config_dict["agents"]]
+    # Convert agent dictionaries to HarnessConfig objects first
+    config_dict["agents"] = [HarnessConfig(**agent_dict) for agent_dict in config_dict["agents"]]
     config = BenchmarkConfig(**config_dict)
     logger.info("Config validation passed")
 
diff --git a/llm_quest_benchmark/tests/test_database.py b/llm_quest_benchmark/tests/test_database.py
index a04f53d..d00c6b1 100644
--- a/llm_quest_benchmark/tests/test_database.py
+++ b/llm_quest_benchmark/tests/test_database.py
@@ -249,8 +249,8 @@ def test_run_summary_export_tracks_repetition_rate(tmp_path, monkeypatch, quest_
     assert exported["metrics"]["bad_decision_rate"] == 0.0
 
 
-def test_random_agent_does_not_export_json(tmp_path, monkeypatch, quest_logger):
-    """Random agent runs should not create result artifacts in results/."""
+def test_random_player_does_not_export_json(tmp_path, monkeypatch, quest_logger):
+    """Random player runs should not create result artifacts in results/."""
     monkeypatch.setattr(logging_module, "RESULTS_DIR", tmp_path)
 
     quest_logger.agent = "random_choice"
diff --git a/llm_quest_benchmark/tests/test_leaderboard.py b/llm_quest_benchmark/tests/test_leaderboard.py
index aa22296..46407cf 100644
--- a/llm_quest_benchmark/tests/test_leaderboard.py
+++ b/llm_quest_benchmark/tests/test_leaderboard.py
@@ -243,6 +243,73 @@ def test_generate_leaderboard_filters_public_slice(tmp_path, monkeypatch):
     assert {row["model"] for row in leaderboard["results"]} == {"model-a", "model-b", "model-c"}
 
 
+def test_generate_leaderboard_excludes_retired_exp4_variants(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)  # relative results/ and site/ paths below resolve inside tmp_path
+
+    active_dir = Path("results/benchmarks/active")
+    active_dir.mkdir(parents=True, exist_ok=True)
+    retired_dir = Path("results/benchmarks/retired")
+    retired_dir.mkdir(parents=True, exist_ok=True)
+
+    active_row = {  # expected to be the only row kept (see the asserts at the end)
+        "quest": "quests/Core.qm",
+        "model": "gpt-5-mini",
+        "template": "stateful_compact.jinja",
+        "harness": "memo_compact",
+        "agent_id": "active",
+        "attempt": 1,
+        "outcome": "SUCCESS",
+    }
+    retired_rows = [  # rows for the harness variants the leaderboard is expected to drop
+        {
+            "quest": "quests/Core.qm",
+            "model": "gpt-5-mini",
+            "template": "reasoning.jinja",
+            "harness": "compaction_no_memo",
+            "agent_id": "retired-no-memo",
+            "attempt": 1,
+            "outcome": "FAILURE",
+        },
+        {
+            "quest": "quests/Core.qm",
+            "model": "gpt-5-mini",
+            "template": "memo_extended.jinja",
+            "harness": "memo_extended",
+            "agent_id": "retired-extended",
+            "attempt": 1,
+            "outcome": "FAILURE",
+        },
+    ]
+
+    (active_dir / "benchmark_summary.json").write_text(
+        json.dumps({"benchmark_id": "active", "name": "active", "agents": [], "results": [active_row], "db_runs": []}),
+        encoding="utf-8",
+    )
+    (retired_dir / "benchmark_summary.json").write_text(
+        json.dumps(
+            {
+                "benchmark_id": "retired",
+                "name": "exp4_compaction_no_memo",
+                "agents": [],
+                "results": retired_rows,
+                "db_runs": [],
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    leaderboard = generate_leaderboard(
+        [str(active_dir), str(retired_dir)],  # both summaries are fed in; filtering is left to the callee
+        "site/leaderboard.json",
+        min_runs=0,  # presumably disables any minimum-run cutoff — confirm in generate_leaderboard
+        public_model_ids=None,  # presumably means no public-model filtering — confirm
+    )
+
+    assert len(leaderboard["results"]) == 1  # the two retired rows must not appear
+    assert leaderboard["results"][0]["mode"] == "compact_memory_memo"
+    assert leaderboard["results"][0]["runs"] == 1
+
+
 def test_generate_leaderboard_matches_db_runs_by_identifiers(tmp_path, monkeypatch):
     monkeypatch.chdir(tmp_path)
 
@@ -310,3 +377,44 @@ def test_generate_leaderboard_matches_db_runs_by_identifiers(tmp_path, monkeypat
     rows = {(row["quest"], row["mode"]): row for row in leaderboard["results"]}
     assert rows[("Alpha", "compact_memory_memo")]["avg_steps"] == 10.0
     assert rows[("Beta", "full_history_reasoning")]["avg_steps"] == 20.0
+
+
+def test_generate_leaderboard_uses_result_row_memory_mode_without_db_config(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)  # relative results/ and site/ paths below resolve inside tmp_path
+    benchmark_dir = Path("results/benchmarks/bench_result_memory_mode")
+    benchmark_dir.mkdir(parents=True, exist_ok=True)
+    results = [
+        {
+            "quest": "quests/Beta.qm",
+            "model": "gpt-5-mini",
+            "template": "reasoning.jinja",
+            "memory_mode": "full_transcript",  # memory mode is set on the result row itself
+            "agent_id": "harness_gpt-5-mini",
+            "outcome": "SUCCESS",
+        }
+    ]
+    db_runs = [  # the agent_config below has no memory_mode key, only model and harness
+        {
+            "id": 20,
+            "quest_file": "quests/Beta.qm",
+            "quest_name": "Beta",
+            "agent_id": "harness_gpt-5-mini",
+            "agent_config": json.dumps({"model": "gpt-5-mini", "harness": "reasoning_full"}),
+            "outcome": "SUCCESS",
+        }
+    ]
+    (benchmark_dir / "benchmark_summary.json").write_text(
+        json.dumps(
+            {"benchmark_id": "bench_result_memory_mode", "harnesses": [], "results": results, "db_runs": db_runs}
+        ),
+        encoding="utf-8",
+    )
+
+    leaderboard = generate_leaderboard(
+        [str(benchmark_dir)],
+        "site/leaderboard.json",
+        min_runs=0,  # presumably disables any minimum-run cutoff — confirm in generate_leaderboard
+        public_model_ids=None,  # presumably means no public-model filtering — confirm
+    )
+
+    assert leaderboard["results"][0]["mode"] == "full_history_reasoning"