diff --git a/README.md b/README.md index 3ff854c..013fb57 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ uv run llm-quest benchmark --config configs/benchmarks/memory_full_transcript.ya uv run llm-quest benchmark-report --benchmark-id <benchmark_id> --output report.md # Analyze a single run -uv run llm-quest analyze-run --run-summary results/<agent>/<quest>/run_<id>/run_summary.json +uv run llm-quest analyze-run --run-summary results/<harness>/<quest>/run_<id>/run_summary.json # Play as human in terminal uv run llm-quest play --quest quests/Boat.qm @@ -107,7 +107,8 @@ Provider-specific keys in `.env`: ## Project Structure -- `llm_quest_benchmark/agents/` - Agent implementations (LLM, planner, tool-augmented) +- `llm_quest_benchmark/harnesses/` - LLM harness implementations for prompt, memory, tools, and planning experiments +- `llm_quest_benchmark/players/` - Non-LLM player primitives (`human`, `random_choice`) - `llm_quest_benchmark/prompt_templates/` - Jinja2 prompt templates for the public context-scaffold taxonomy - `llm_quest_benchmark/executors/` - CLI, benchmark orchestration, TS bridge - `configs/benchmarks/` - YAML benchmark configurations diff --git a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml index 4ab3e65..812e94a 100644 --- a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml +++ b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml @@ -21,49 +21,45 @@ quests: agents: # 1. Minimal prompt - model: gpt-5-mini - template: stub + harness: minimal temperature: 0.4 runs: 3 # 2. Short-context reasoning - model: gpt-5-mini - template: reasoning + harness: reasoning_recent temperature: 0.4 runs: 3 # 3. Full-history reasoning - model: gpt-5-mini - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript # 4. Compact memory / memo - model: gpt-5-mini - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 # 5. 
Prompt hints - model: gpt-5-mini - template: light_hints + harness: hinted_compact temperature: 0.4 runs: 3 # 6. Tools + compact memory - model: gpt-5-mini - template: tool_augmented + harness: tool_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 # 7. Tools + hints + compact memory - model: gpt-5-mini - template: tool_augmented_hints + harness: tool_hinted temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 # 8. Planner loop - model: gpt-5-mini - template: planner + harness: planner temperature: 0.4 runs: 3 debug: false diff --git a/configs/benchmarks/exp3_no_loop_breaker.yaml b/configs/benchmarks/exp3_no_loop_breaker.yaml index 64240fe..57e7124 100644 --- a/configs/benchmarks/exp3_no_loop_breaker.yaml +++ b/configs/benchmarks/exp3_no_loop_breaker.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 2 - memory_mode: full_transcript debug: false quest_timeout: 600 max_workers: 2 diff --git a/configs/benchmarks/exp3_stateful_compact.yaml b/configs/benchmarks/exp3_stateful_compact.yaml index b43fc6b..bb9973c 100644 --- a/configs/benchmarks/exp3_stateful_compact.yaml +++ b/configs/benchmarks/exp3_stateful_compact.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp4_compaction_no_memo.yaml b/configs/benchmarks/exp4_compaction_no_memo.yaml index 5ef4130..4ab63e6 100644 --- a/configs/benchmarks/exp4_compaction_no_memo.yaml +++ b/configs/benchmarks/exp4_compaction_no_memo.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: 
"openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: compaction_no_memo temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp4_memo_cot.yaml b/configs/benchmarks/exp4_memo_cot.yaml index fe97bca..320da54 100644 --- a/configs/benchmarks/exp4_memo_cot.yaml +++ b/configs/benchmarks/exp4_memo_cot.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: memo_cot + harness: memo_cot temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp4_memo_extended.yaml b/configs/benchmarks/exp4_memo_extended.yaml index 66d1bf4..a5d6613 100644 --- a/configs/benchmarks/exp4_memo_extended.yaml +++ b/configs/benchmarks/exp4_memo_extended.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: memo_extended + harness: memo_extended temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp4_memo_structured.yaml b/configs/benchmarks/exp4_memo_structured.yaml index 83502c7..f70ab81 100644 --- a/configs/benchmarks/exp4_memo_structured.yaml +++ b/configs/benchmarks/exp4_memo_structured.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: memo_structured + harness: memo_structured temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp5_stateful_compact_variance.yaml b/configs/benchmarks/exp5_stateful_compact_variance.yaml index 6f99f29..89cc80b 100644 --- a/configs/benchmarks/exp5_stateful_compact_variance.yaml +++ 
b/configs/benchmarks/exp5_stateful_compact_variance.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 5 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp6_prompt_hints.yaml b/configs/benchmarks/exp6_prompt_hints.yaml index 098b1db..4c70e61 100644 --- a/configs/benchmarks/exp6_prompt_hints.yaml +++ b/configs/benchmarks/exp6_prompt_hints.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: stateful_compact_hints + harness: hinted_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp6_tools.yaml b/configs/benchmarks/exp6_tools.yaml index 8630bb0..b254005 100644 --- a/configs/benchmarks/exp6_tools.yaml +++ b/configs/benchmarks/exp6_tools.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: tool_augmented + harness: tool_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp6_tools_hints.yaml b/configs/benchmarks/exp6_tools_hints.yaml index b7949fc..0c0c3b6 100644 --- a/configs/benchmarks/exp6_tools_hints.yaml +++ b/configs/benchmarks/exp6_tools_hints.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: tool_augmented_hints + harness: tool_hinted temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp6_unified_tools_screen.yaml b/configs/benchmarks/exp6_unified_tools_screen.yaml index 
0c43290..b80f8c0 100644 --- a/configs/benchmarks/exp6_unified_tools_screen.yaml +++ b/configs/benchmarks/exp6_unified_tools_screen.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: tool_augmented + harness: tool_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_deepseek.yaml b/configs/benchmarks/exp7_deepseek.yaml index 1b82664..6971569 100644 --- a/configs/benchmarks/exp7_deepseek.yaml +++ b/configs/benchmarks/exp7_deepseek.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "openrouter:deepseek/deepseek-chat-v3-0324" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_haiku.yaml b/configs/benchmarks/exp7_haiku.yaml index 72cd6c2..8546c80 100644 --- a/configs/benchmarks/exp7_haiku.yaml +++ b/configs/benchmarks/exp7_haiku.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "anthropic:claude-3-5-haiku-latest" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_llama.yaml b/configs/benchmarks/exp7_llama.yaml index 27eda5a..61e156c 100644 --- a/configs/benchmarks/exp7_llama.yaml +++ b/configs/benchmarks/exp7_llama.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "openrouter:meta-llama/llama-4-scout" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_mistral.yaml b/configs/benchmarks/exp7_mistral.yaml index 76f1a40..f570882 100644 --- 
a/configs/benchmarks/exp7_mistral.yaml +++ b/configs/benchmarks/exp7_mistral.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "openrouter:mistralai/mistral-small-2603" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_qwen.yaml b/configs/benchmarks/exp7_qwen.yaml index 572d7a6..27496cc 100644 --- a/configs/benchmarks/exp7_qwen.yaml +++ b/configs/benchmarks/exp7_qwen.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "openrouter:qwen/qwen3-30b-a3b" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7b_model_upgrades.yaml b/configs/benchmarks/exp7b_model_upgrades.yaml index 4c35c8b..80ab53c 100644 --- a/configs/benchmarks/exp7b_model_upgrades.yaml +++ b/configs/benchmarks/exp7b_model_upgrades.yaml @@ -20,22 +20,19 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:deepseek/deepseek-v4-flash" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 - model: "openrouter:qwen/qwen3.6-flash" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 - - model: "claude:claude-haiku-4-5-20251001" - template: stateful_compact + - model: "anthropic:claude-haiku-4-5-20251001" + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/memory_compaction.yaml b/configs/benchmarks/memory_compaction.yaml index 1bb10a8..c403665 100644 --- a/configs/benchmarks/memory_compaction.yaml +++ b/configs/benchmarks/memory_compaction.yaml @@ -18,45 +18,39 @@ 
quests: agents: # Gemini 3 Flash - compaction interval 10 - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 10 # Gemini 3 Flash - compaction interval 20 - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 20 # GPT-5.4 Mini - compaction interval 10 - model: "openrouter:openai/gpt-5.4-mini" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 10 # GPT-5.4 Mini - compaction interval 20 - model: "openrouter:openai/gpt-5.4-mini" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 20 # DeepSeek V3.2 - compaction interval 10 - model: "openrouter:deepseek/deepseek-v3.2" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 10 # DeepSeek V3.2 - compaction interval 20 - model: "openrouter:deepseek/deepseek-v3.2" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 20 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/memory_full_transcript.yaml b/configs/benchmarks/memory_full_transcript.yaml index 04ad152..9fc82a4 100644 --- a/configs/benchmarks/memory_full_transcript.yaml +++ b/configs/benchmarks/memory_full_transcript.yaml @@ -18,22 +18,19 @@ quests: agents: # Gemini 3 Flash - full transcript - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript # GPT-5.4 Mini - full transcript - model: "openrouter:openai/gpt-5.4-mini" - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript # DeepSeek V3.2 - full transcript - model: 
"openrouter:deepseek/deepseek-v3.2" - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript debug: false quest_timeout: 600 max_workers: 2 diff --git a/configs/benchmarks/memory_modes_pilot.yaml b/configs/benchmarks/memory_modes_pilot.yaml index 2e4d862..db6aa23 100644 --- a/configs/benchmarks/memory_modes_pilot.yaml +++ b/configs/benchmarks/memory_modes_pilot.yaml @@ -5,31 +5,27 @@ quests: agents: # Short-context reasoning - default memory (3 obs, 5 decisions) - model: openrouter:google/gemini-3-flash-preview - template: reasoning + harness: reasoning_recent temperature: 0.4 runs: 3 - memory_mode: default # Short-context reasoning - loop-aware template - model: openrouter:google/gemini-3-flash-preview - template: loop_aware_reasoning + harness: reasoning_recent temperature: 0.4 runs: 3 - memory_mode: default # Full-history reasoning - model: openrouter:google/gemini-3-flash-preview - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript # Compact memory / memo (compact every 10 steps) - model: openrouter:google/gemini-3-flash-preview - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 10 debug: false diff --git a/configs/benchmarks/openrouter_smoke_test.yaml b/configs/benchmarks/openrouter_smoke_test.yaml index 6194df3..2fb50be 100644 --- a/configs/benchmarks/openrouter_smoke_test.yaml +++ b/configs/benchmarks/openrouter_smoke_test.yaml @@ -3,23 +3,23 @@ quests: - quests/Boat.qm agents: - model: "openrouter:anthropic/claude-sonnet-4-6" - template: stub + harness: minimal temperature: 0.4 runs: 1 - model: "openrouter:openai/gpt-5.4-mini" - template: stub + harness: minimal temperature: 0.4 runs: 1 - model: "openrouter:google/gemini-2.5-flash" - template: stub + harness: minimal temperature: 0.4 runs: 1 - model: "openrouter:deepseek/deepseek-chat" - template: stub + harness: minimal temperature: 0.4 runs: 1 
- model: "openrouter:qwen/qwen3-235b-a22b" - template: stub + harness: minimal temperature: 0.4 runs: 1 debug: false diff --git a/configs/default.yaml b/configs/default.yaml index d7dbe67..3159029 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -5,27 +5,27 @@ quests: agents: - model: random_choice - template: reasoning.jinja + harness: random_choice temperature: 0.0 skip_single: true - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true diff --git a/configs/kr1.yaml b/configs/kr1.yaml index c7771e6..c31cc3b 100644 --- a/configs/kr1.yaml +++ b/configs/kr1.yaml @@ -5,22 +5,22 @@ quests: agents: - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true diff --git a/configs/kr1_micro.yaml b/configs/kr1_micro.yaml index c19bd1a..ac3df96 100644 --- a/configs/kr1_micro.yaml +++ b/configs/kr1_micro.yaml @@ -8,12 +8,12 @@ quests: agents: # Just 2 agents to validate the process - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent temperature: 0.6 skip_single: true diff --git a/configs/kr1_test.yaml b/configs/kr1_test.yaml 
index fbe843c..bb8ed98 100644 --- a/configs/kr1_test.yaml +++ b/configs/kr1_test.yaml @@ -7,12 +7,12 @@ quests: agents: # Just 2 agents to validate the process - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent temperature: 0.6 skip_single: true diff --git a/configs/kr2_en_benchmark.yaml b/configs/kr2_en_benchmark.yaml deleted file mode 100644 index 76b6c21..0000000 --- a/configs/kr2_en_benchmark.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Benchmark configuration for Kr2 English quests -# Using recommended models with optimized temperature settings - -quests: - - quests/kr2_en - -agents: - # OpenAI models - - model: gpt-4o - template: reasoning.jinja - temperature: 0.5 - skip_single: true - - - model: gpt-4o-mini - template: reasoning.jinja - temperature: 0.7 - skip_single: true - - # Anthropic models - - model: claude-3-7-sonnet-latest - template: reasoning.jinja - temperature: 0.5 - skip_single: true - - - model: claude-3-5-sonnet-latest - template: reasoning.jinja - temperature: 0.6 - skip_single: true - -# Debug mode enables more detailed logging -debug: true - -# Quest timeout in seconds -quest_timeout: 120 - -# Output directory for benchmark results -output_dir: metrics/kr2_en - -# Optional name for this benchmark run -name: kr2_en_benchmark \ No newline at end of file diff --git a/configs/kr2_en_test.yaml b/configs/kr2_en_test.yaml index 7dbe160..94cfaa3 100644 --- a/configs/kr2_en_test.yaml +++ b/configs/kr2_en_test.yaml @@ -5,7 +5,7 @@ quests: agents: - model: random_choice # Use random agent for speed and reliability temperature: 0.5 - template: reasoning.jinja + harness: random_choice quest_timeout: 10 # short timeout for testing debug: true output_dir: results/benchmarks diff --git a/configs/test/parallel_agents_test.yaml b/configs/test/parallel_agents_test.yaml index 37bca75..873d3ed 100644 --- 
a/configs/test/parallel_agents_test.yaml +++ b/configs/test/parallel_agents_test.yaml @@ -5,8 +5,9 @@ quests: - quests/kr_1_ru/Diamond.qm agents: - model: random_choice + harness: random_choice - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent debug: true # No max_workers setting - we'll use one worker per agent output_dir: results/benchmarks diff --git a/configs/test/temperature_test.yaml b/configs/test/temperature_test.yaml index 8f8e0cc..d79b705 100644 --- a/configs/test/temperature_test.yaml +++ b/configs/test/temperature_test.yaml @@ -7,32 +7,32 @@ quests: agents: - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.3 skip_single: true - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.3 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true diff --git a/configs/test/test_benchmark.yaml b/configs/test/test_benchmark.yaml index 3c89dab..b3321d9 100644 --- a/configs/test/test_benchmark.yaml +++ b/configs/test/test_benchmark.yaml @@ -3,8 +3,9 @@ quests: - quests/Boat.qm agents: - model: random_choice + harness: random_choice - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent debug: true quest_timeout: 60 max_workers: 2 diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index c7f556d..2588ee2 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -1,38 +1,75 @@ # Architecture ## Overview -LLM Quest Benchmark evaluates how different agent architectures complete interactive 
fiction quests (Space Rangers `.qm` format). + +LLM Quest Benchmark evaluates how **agent harnesses** complete interactive +fiction quests in the Space Rangers `.qm` format. The benchmark holds the quest +environment and result logging constant while varying the harness around the +model: prompt template, memory strategy, tools, and action loop. + The runtime loop is: + 1. Parse or step quest state via the TypeScript engine bridge. -2. Build an action prompt from current state and available choices. -3. Get agent choice (human/random/LLM with varying agent modes). -4. Apply choice, log step, and detect outcome. +2. Build harness context from current state, available choices, and memory. +3. Get a choice from a human, random policy, or LLM-backed harness. +4. Apply the choice, log the step, and detect the terminal outcome. 5. Persist run metrics and run summaries. +## Harness Engineering Framing + +This project treats the **agent harness** as the primary experimental object. +An agent harness is the wrapper around a model that controls what the model +sees, what state is carried forward, what external tools are available, and how +a raw completion is converted into a quest action. In this codebase, harnesses +are not incidental plumbing: they are the independent variable. + +This follows the practical question raised by "How Much Heavy Lifting Can an +Agent Harness Do?" (arXiv:2604.07236): how much performance comes from the +surrounding scaffold rather than the base model alone? Space Rangers text +quests are useful because they are long enough to stress memory, planning, and +state tracking, but concrete enough to score with terminal success/failure +outcomes. + +Closest text-game benchmarks such as TextQuests and TALE-Suite usually vary +models under a mostly fixed evaluation scaffold. LLM Quest Benchmark can hold +the model fixed and vary the harness to ask which prompt, memory, tool, and +planning choices change behavior. + ## Main Runtime Layers ### 1. 
Quest Engine Layer -- `space-rangers-quest/`: - TypeScript quest parser/player submodule. -- `llm_quest_benchmark/executors/ts_bridge/consoleplayer.ts`: - Node entrypoint for parse/step execution. -- `llm_quest_benchmark/executors/ts_bridge/bridge.py`: - Python subprocess bridge with startup preflight and actionable errors. + +- `space-rangers-quest/`: TypeScript quest parser/player submodule. +- `llm_quest_benchmark/executors/ts_bridge/consoleplayer.ts`: Node entrypoint + for parse/step execution. +- `llm_quest_benchmark/executors/ts_bridge/bridge.py`: Python subprocess + bridge with startup preflight and actionable errors. ### 2. Environment Layer -- `llm_quest_benchmark/environments/qm.py`: - Wraps bridge into Python environment semantics (`reset`, `step`, terminal detection). -### 3. Agent Layer -- `llm_quest_benchmark/agents/llm_agent.py`: Base LLM agent with template-driven prompts, retry logic, loop-breaking, and safety filters. -- `llm_quest_benchmark/agents/planner_agent.py`: Planner loop with observation-diff heuristic for re-planning. -- `llm_quest_benchmark/agents/tool_agent.py`: Tool-using scaffold with quest history tool. -- `llm_quest_benchmark/agents/agent_factory.py`: Factory that maps Prompt Template choices to agent classes. -- `llm_quest_benchmark/agents/human_player.py`, `random_agent.py`: Non-LLM agents. +- `llm_quest_benchmark/environments/qm.py`: Wraps the bridge into Python + environment semantics (`reset`, `step`, terminal detection). + +### 3. Harness Layer -`LLMAgent` lazily initializes provider clients, so template rendering and agent construction do not require API keys. +- `llm_quest_benchmark/harnesses/base.py`: `BaseHarness`, the shared + LLM-backed `QuestPlayer` implementation for prompt rendering, response + parsing, retries, contextual state, and safety filtering. +- `llm_quest_benchmark/harnesses/memory.py`: `DefaultMemory`, + `FullTranscriptMemory`, and `CompactionMemory`. 
+- `llm_quest_benchmark/harnesses/tools.py`: Calculator, scratchpad, and quest + history helpers used by tool harnesses. +- `llm_quest_benchmark/harnesses/factory.py`: `create_harness()` and the + canonical harness registry. +- `llm_quest_benchmark/players/human.py`, + `llm_quest_benchmark/players/random.py`: Non-LLM `QuestPlayer` + implementations preserved for interactive and random baselines. + +Harness construction lazily initializes provider clients, so template rendering +and benchmark configuration parsing do not require API keys. ### 4. LLM Provider Layer + - `llm_quest_benchmark/llm/client.py`: - provider/model normalization (`provider:model` + aliases) - adapters: OpenAI, Anthropic, Google Gemini, DeepSeek @@ -40,38 +77,60 @@ The runtime loop is: - token/cost usage tracking per completion call ### 5. Execution and Analysis Layer + - `llm_quest_benchmark/core/runner.py`: Core quest run loop. -- `llm_quest_benchmark/core/analyzer.py`: Post-run analysis and benchmark summaries. +- `llm_quest_benchmark/core/analyzer.py`: Post-run analysis and benchmark + summaries. - `llm_quest_benchmark/core/benchmark_report.py`: Markdown report generator. -- `llm_quest_benchmark/core/logging.py`: Quest logger with per-run metrics (repetition_rate, bad_decision_rate). -- `llm_quest_benchmark/executors/benchmark.py`: Benchmark orchestration with parallel workers. -- `llm_quest_benchmark/executors/cli/commands.py`: CLI commands (`run`, `play`, `analyze`, `analyze-run`, `benchmark`, `benchmark-report`, `download-quests`, `cleanup`). +- `llm_quest_benchmark/core/logging.py`: Quest logger with per-run metrics + (`repetition_rate`, `bad_decision_rate`). +- `llm_quest_benchmark/executors/benchmark.py`: Benchmark orchestration with + parallel workers. +- `llm_quest_benchmark/executors/cli/commands.py`: CLI commands (`run`, `play`, + `analyze`, `analyze-run`, `benchmark`, `benchmark-report`, + `download-quests`, `cleanup`). ### 6. 
Prompt Templates -- `llm_quest_benchmark/prompt_templates/`: Jinja2 templates for each agent mode. + +- `llm_quest_benchmark/prompt_templates/`: Jinja2 templates referenced by + harnesses. - `stub.jinja`: Minimal prompt. - - `reasoning.jinja`, `strategic.jinja`, etc.: Short-context or full-history reasoning depending on memory mode. - - `stateful_compact.jinja`, `memo_*.jinja`: Compact memory / memo prompts. - - `light_hints.jinja`, `stateful_compact_hints.jinja`: Prompt hints. - - `planner.jinja`: Planner loop prompts. - - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with compact memory, optionally with hints. + - `reasoning.jinja`: Short-context or full-history reasoning depending on + harness memory. + - `stateful_compact.jinja`: Compact memory / 20-word memo prompt. + - `stateful_compact_hints.jinja`: Compact memo prompt with mechanics hints. + - `memo_cot.jinja`, `memo_extended.jinja`, `memo_structured.jinja`: + retained Exp 4 memo variants. + - `planner.jinja`: Planner loop prompt. + - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with + compact memory, optionally with hints. ## Persistence + - `metrics.db`: Benchmark/run metrics for CLI workflows. -- `results/<agent>/<quest>/run_<id>/run_summary.json`: Step trace + per-step decisions + aggregated token/cost usage. +- `results/<harness>/<quest>/run_<id>/run_summary.json`: Step trace, + per-step decisions, and aggregated token/cost usage. ## Configuration + - `.env` (copied from `.env.template`): Provider API keys. -- `configs/benchmarks/`: Benchmark YAML configs defining model x template x quest matrix. +- `configs/benchmarks/`: Benchmark YAML configs defining model × harness × + quest matrices. 
## Public Taxonomy (Benchmark Dimension) -| Label | Template / memory source | Agent Class | Description | -|------|----------|-------------|-------------| -| Minimal prompt | stub | LLMAgent | Smallest action-selection prompt | -| Short-context reasoning | reasoning/strategic + default memory | LLMAgent | Local prompted analysis | -| Full-history reasoning | reasoning + full transcript memory | LLMAgent | Whole transcript retained in context | -| Compact memory / memo | reasoning/stateful/memo templates + compaction | LLMAgent | Summarized state instead of unbounded transcript | -| Prompt hints | light_hints/stateful_compact_hints | LLMAgent | Mechanics hints injected into prompt | -| Tools + compact memory | tool_augmented | ToolAgent | Quest history/scratchpad tools with compact context | -| Tools + hints + compact memory | tool_augmented_hints | ToolAgent | Tool scaffold plus prompt hints | -| Planner loop | planner | PlannerAgent | Plan-maintain-act loop | + +| Public label | Harness name | Template | Memory | Tools | Loop | +|---|---|---|---|---|---| +| Minimal prompt | `minimal` | `stub.jinja` | `DefaultMemory` | none | react | +| Short-context reasoning | `reasoning_recent` | `reasoning.jinja` | `DefaultMemory` | none | react | +| Full-history reasoning | `reasoning_full` | `reasoning.jinja` | `FullTranscriptMemory` | none | react | +| Compact memory / memo | `memo_compact` | `stateful_compact.jinja` | `CompactionMemory` | none | react | +| Prompt hints | `hinted_compact` | `stateful_compact_hints.jinja` | `CompactionMemory` | none | react | +| Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act | +| Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act | +| Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | none | plan-maintain-act | + +The 
harness names above are canonical snake_case identifiers used in YAML +configs, the CLI, result artifacts, and documentation. Public labels can be +friendlier, but experiment records should preserve the canonical names so runs +remain comparable. diff --git a/docs/EXPERIMENTS_LOG.md b/docs/EXPERIMENTS_LOG.md index dadef6e..a9ca972 100644 --- a/docs/EXPERIMENTS_LOG.md +++ b/docs/EXPERIMENTS_LOG.md @@ -1,5 +1,19 @@ # Experiments Log +## Harness Name Mapping + +| Experiment arm | Old label | New harness name | +|---|---|---| +| Minimal prompt arms | `stub` | `minimal` | +| Short-context reasoning arms | `reasoning` + `default` memory | `reasoning_recent` | +| Full-history reasoning arms | `reasoning` + `full_transcript` memory | `reasoning_full` | +| Stateful compact memo arms | `stateful_compact` + compaction | `memo_compact` | +| Hinted compact memo arms | `stateful_compact_hints` + compaction | `hinted_compact` | +| Tool-augmented compact arms | `tool_augmented` + compaction | `tool_compact` | +| Tool-augmented hinted arms | `tool_augmented_hints` + compaction | `tool_hinted` | +| Planner arms | `planner` | `planner` | +| Memo variation arms | `memo_extended`, `memo_structured`, `memo_cot` | retired experiment variants, not canonical harnesses | + > Historical / non-authoritative notes. This log preserves experiment history > and branch-era shorthand. For the current public taxonomy and public > comparison slice, use `site/about.html`, `site/leaderboard.json`, @@ -7,6 +21,145 @@ Record of benchmark experiments, findings, and decisions. Keeps history out of source code. +## Current Coverage Audit (2026-05-11) + +Sources reviewed for this audit: + +- `docs/EXPERIMENTS_LOG.md` +- `docs/ARCHITECTURE.md` +- `configs/benchmarks/*.yaml` +- `site/leaderboard.json` + +This audit uses the post-refactor harness taxonomy: `minimal`, +`reasoning_recent`, `reasoning_full`, `memo_compact`, `hinted_compact`, +`tool_compact`, `tool_hinted`, and `planner`. 
+ +### Experiment Inventory + +| Experiment | Config / source | Harness mapping | Quest scope | Completed runs recorded in log | Audit disposition | +|---|---|---|---|---:|---| +| Exp 2: Memory Modes | `memory_full_transcript.yaml`, `memory_compaction.yaml` | `reasoning_full`, `memo_compact` | 14 historical quests including `Prison` | 126 | Unreliable for canonical comparison: loop-breaker bug era. | +| Exp 3 Arm 1: No Loop Breaker | `exp3_no_loop_breaker.yaml` | `reasoning_full` | 18 quests, excluding `Boat`/`Prison` | 36 | Use only rerun after timeout fix; pre-fix attempt is noisy/incomplete. | +| Exp 3 Arm 2: Stateful Compact | `exp3_stateful_compact.yaml` | `memo_compact` | 18 quests, excluding `Boat`/`Prison` | 36 | Canonical memo baseline, but only 2 runs/quest. | +| Exp 4: Compaction No Memo | `exp4_compaction_no_memo.yaml` | retired ablation, not canonical | 18 quests | 36 | Do not aggregate into `memo_compact`. | +| Exp 4: Memo Extended | `exp4_memo_extended.yaml` | retired `memo_extended` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 4: Memo Structured | `exp4_memo_structured.yaml` | retired `memo_structured` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 4: Memo CoT | `exp4_memo_cot.yaml` | retired `memo_cot` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 5: Baseline Variance | `exp5_stateful_compact_variance.yaml` | `memo_compact` | 18 quests | 90 | Canonical memo baseline variance study. | +| Exp 6: Prompt Hints | `exp6_prompt_hints.yaml` | `hinted_compact` | 18 quests | 54 | Canonical single-model harness comparison. | +| Exp 6: Tools | `exp6_tools.yaml` | `tool_compact` | 18 quests | 54 | Canonical single-model harness comparison. | +| Exp 6: Tools + Hints | `exp6_tools_hints.yaml` | `tool_hinted` | 18 quests | 54 | Canonical single-model harness comparison. | +| Exp 7: Multi-Model Comparison | `exp7_*.yaml` | `memo_compact` | 5 winnable quests | 75 | Canonical model sweep for memo harness. 
| +| Exp 7b: Model Upgrades | `exp7b_model_upgrades.yaml` | `memo_compact` | 18 quests | 108 | Noisy model-upgrade sweep; high timeout rates for Qwen 3.6 and Haiku 4.5. | + +### Harness Coverage Matrix + +The table below is computed from `site/leaderboard.json` and counts recorded +leaderboard runs by harness and quest. `Boat` and `Prison` are retained because +they still appear in the published leaderboard data, but they are retired from +the canonical experiment set. + +| Harness | Badday | Banket | Boat | Codebox | Depth | Driver | Edelweiss | Election | Foncers | Leonardo | Ministry | Pizza | Prison | Robots | Ski | Total | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| `minimal` | 22 | 22 | 23 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 331 | +| `reasoning_recent` | 22 | 22 | 28 | 22 | 22 | 24 | 25 | 30 | 25 | 25 | 26 | 22 | 28 | 31 | 31 | 383 | +| `reasoning_full` | 17 | 17 | 9 | 17 | 17 | 15 | 17 | 17 | 17 | 17 | 16 | 17 | 6 | 14 | 14 | 227 | +| `memo_compact` | 37 | 39 | 18 | 39 | 39 | 39 | 39 | 37 | 39 | 37 | 39 | 37 | 15 | 39 | 34 | 527 | +| `hinted_compact` | 4 | 4 | 1 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 54 | +| `tool_compact` | 3 | 3 | 0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 39 | +| `tool_hinted` | 3 | 3 | 0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 39 | +| `planner` | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 15 | + +Leaderboard scope note: the current public JSON includes 15 quest columns and +does not include several 18-quest experiment-log quests such as `Pilot`, +`Disk`, `Player`, `Shashki`, and `Sortirovka1`. A future leaderboard refresh +should either add them or explicitly document why the public slice excludes +them. + +### Gap Analysis + +All zero-run cells in the published leaderboard matrix are retired quest cells: + +- `tool_compact` x `Boat`: 0 runs. +- `tool_compact` x `Prison`: 0 runs. +- `tool_hinted` x `Boat`: 0 runs. 
+- `tool_hinted` x `Prison`: 0 runs. + +Because `Boat` and `Prison` are retired, these do not require new canonical +runs. They do indicate that the public leaderboard mixes active and retired +quest scopes. + +Cells with fewer than 3 runs: + +- `hinted_compact` x `Boat`: 1 run; retired quest. +- `hinted_compact` x `Prison`: 1 run; retired quest. +- `planner`: 1 run on every published quest. + +Canonical action item: the planner harness has insufficient variance coverage. +For active quests, it needs at least two additional runs per quest to reach the +minimum 3-run threshold. + +The following harnesses have leaderboard cells drawn from only one model, +whatever the per-cell run count (3 for the tool harnesses, 1 for `planner`): +`tool_compact`, `tool_hinted`, and `planner`. Their comparison is promising, +but not yet model-robust. + +### Noise And Anomalies + +Loop-breaker bug era: + +- Exp 2 memory-mode runs are unreliable. The experiment log documents a + number-normalization bug in `_normalize_for_signature` and aggressive loop + breaker overrides that changed correct model decisions. +- Exp 3 Arm 1 has a pre-fix/incomplete attempt affected by SDK timeout issues. + Only the rerun after the timeout fix should be considered. +- Any leaderboard entry whose provenance traces to Exp 2 or the Exp 3 pre-fix + attempt should be marked non-canonical until regenerated or excluded. + +High-timeout model-upgrade runs: + +- Exp 7b `Qwen 3.6 Flash`: 17/36 timeouts (47%). +- Exp 7b `Claude Haiku 4.5`: 19/36 timeouts (53%). +- Exp 7b `DeepSeek V4 Flash`: 5/36 timeouts (14%), below the 30% high-timeout + threshold but still noisy because success was 0/36. + +Retired quests: + +- `Boat`: trivial / smoke-test-like quest; removed from canonical experiment + configs. +- `Prison`: loops endlessly; removed from canonical experiment configs.
+ +Retired harness variants: + +- `memo_extended` +- `memo_structured` +- `memo_cot` +- `compaction_no_memo` ablation + +These variants should not be merged into canonical `memo_compact` results. + +### Budget Estimate + +Top-priority new runs to close actionable gaps while avoiding retired quests: + +| Priority | Harness | Quest(s) | New runs needed | Reason | +|---:|---|---|---:|---| +| 1 | `planner` | 13 active published quests (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`) | 26 | Bring 1-run planner cells up to the 3-run minimum on active leaderboard quests. | +| 2 | `planner` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest so planner effects are not single-model artifacts. | +| 3 | `tool_compact` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. | +| 4 | `tool_hinted` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. | +| 5 | Public leaderboard refresh | `Pilot`, `Disk`, `Player`, `Shashki`, `Sortirovka1` | Scope-dependent | These quests are present in canonical 18-quest configs/logs but absent from the current public leaderboard matrix. Backfill or explicitly exclude them. | + +Do not spend new budget on `Boat` or `Prison` unless the goal is only to +reproduce historical/public rows; both are retired from canonical analysis. + +### Leaderboard Integrity + +Recommended integrity rule: canonical leaderboard aggregates should require +non-retired quests, canonical harness names, no loop-breaker bug provenance, at +least 3 runs per harness x quest cell, and at least two models for claims about +harness effects rather than model effects. 
+ ## Exp 2: Memory Modes (2026-04-27) **Config**: `configs/benchmarks/memory_full_transcript.yaml`, `configs/benchmarks/memory_compaction.yaml` @@ -37,7 +190,7 @@ The `_apply_loop_breaker` mechanism was overriding correct LLM decisions. Eviden ### Decision -- **Disabled loop breaker** entirely in all agent types (llm_agent, planner_agent, tool_agent) +- **Disabled loop breaker** entirely in all harness types - **Removed number normalization** from state signature computation - Kept `_state_action_counts` and `_state_signature` (used by safety filter and loop escape) - Removed `_apply_loop_breaker` method and `_loop_repetition_threshold` field as dead code diff --git a/docs/SPEC.md b/docs/SPEC.md index 99289fb..44ef498 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -7,8 +7,11 @@ For the public narrative and interpretation of results, use the project ## Purpose LLM Quest Benchmark evaluates how LLMs make sequential choices in Space -Rangers text quests. The benchmark varies the context scaffold around a model -while holding the quest environment and result logging consistent. +Rangers text quests. The benchmark varies the agent harness around a model +while holding the quest environment and result logging consistent. A harness is +the wrapper that decides what context the model sees and how its response is +converted into an action: prompt template, memory strategy, tools, and loop +shape. The core question is practical: which kinds of context help, hurt, or expose state-tracking failures during 10-50 turn interactive fiction tasks? @@ -35,29 +38,47 @@ analysis, but the public slice is the authoritative comparison surface. 
## Current Taxonomy -Use these labels for current public descriptions of benchmark modes: +Use these labels for current public descriptions of benchmark harnesses: -| Label | Implementation source | Agent class | -|---|---|---| -| Minimal prompt | `stub.jinja` | `LLMAgent` | -| Short-context reasoning | `reasoning.jinja`, `strategic.jinja` with default/recent context | `LLMAgent` | -| Full-history reasoning | reasoning templates with `full_transcript` memory | `LLMAgent` | -| Compact memory / memo | `stateful_compact.jinja`, memo templates, compaction memory | `LLMAgent` | -| Prompt hints | `light_hints.jinja`, `stateful_compact_hints.jinja` | `LLMAgent` | -| Tools + compact memory | `tool_augmented.jinja` | `ToolAgent` | -| Tools + hints + compact memory | `tool_augmented_hints.jinja` | `ToolAgent` | -| Planner loop | `planner.jinja` | `PlannerAgent` | +| Label | Harness name | Template | Memory | Tools / loop | +|---|---|---|---|---| +| Minimal prompt | `minimal` | `stub.jinja` | `DefaultMemory` | no tools, react loop | +| Short-context reasoning | `reasoning_recent` | `reasoning.jinja` | `DefaultMemory` | no tools, react loop | +| Full-history reasoning | `reasoning_full` | `reasoning.jinja` | `FullTranscriptMemory` | no tools, react loop | +| Compact memory / memo | `memo_compact` | `stateful_compact.jinja` | `CompactionMemory` | no tools, react loop | +| Prompt hints | `hinted_compact` | `stateful_compact_hints.jinja` | `CompactionMemory` | no tools, react loop | +| Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history; tool-select-then-act loop | +| Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history; tool-select-then-act loop | +| Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | no tools, plan-maintain-act loop | Older internal experiment labels are historical and should not be presented as the current public taxonomy.
+## Current Interpretation + +The strongest pattern so far is that bigger scaffolds are not automatically +better. A concise 20-word memo produced a useful sweet spot: it improved over +no-memo and full-transcript baselines, while longer or more structured memo +variants regressed. The likely mechanism is selective pressure: the short memo +forces the harness to preserve only state that matters for future decisions. + +Tools and hints showed a synergy effect. Prompt hints alone hurt, and tools +alone were modest, but tools plus hints improved outcomes because the hints +pointed the model toward quantities and quest mechanics while the calculator, +scratchpad, and history search gave it ways to act on those signals. + +Verbosity is a recurring failure mode. Some newer or larger models timed out +more often because they spent too much of the quest budget generating long step +responses. For sequential decision tasks, a harness that elicits concise, +actionable state updates can outperform one that invites broad reasoning. + ## Implemented Runtime - Quest execution uses the TypeScript `space-rangers-quest` submodule through the Python bridge in `llm_quest_benchmark/executors/ts_bridge/`. - Environment state is exposed through `llm_quest_benchmark/environments/qm.py`. -- Agents live under `llm_quest_benchmark/agents/` and are selected by template - aliases and agent factory wiring. +- Agent harnesses live under `llm_quest_benchmark/harnesses/` and are selected + by canonical snake_case harness names. - Provider calls are normalized in `llm_quest_benchmark/llm/client.py` with OpenAI-compatible, Anthropic, Google, and DeepSeek adapters. - Benchmark execution is CLI + YAML driven through `uv run llm-quest ...`. @@ -107,7 +128,7 @@ Provider API keys are required for real LLM runs. Tests and static validation should run without external credentials in a prepared checkout. 
Reproducible benchmark rows depend on recording the quest file, model/provider -ID, prompt templates, memory mode, run ID, outcome, and run summaries with -usage/metrics. Agent responses are parsed into a chosen action plus optional +ID, harness name, run ID, outcome, and run summaries with usage/metrics. +Harness responses are parsed into a chosen action plus optional analysis/reasoning so action validity, terminal outcome, steps, tokens/cost, and repetition diagnostics can be regenerated from stored artifacts. diff --git a/llm_quest_benchmark/agents/__init__.py b/llm_quest_benchmark/agents/__init__.py deleted file mode 100644 index 852fb91..0000000 --- a/llm_quest_benchmark/agents/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from .agent_factory import create_agent -from .base import QuestPlayer -from .llm_agent import LLMAgent -from .planner_agent import PlannerAgent -from .random_agent import RandomAgent -from .tool_agent import ToolAgent - -__all__ = [ - "create_agent", - "QuestPlayer", - "RandomAgent", - "LLMAgent", - "PlannerAgent", - "ToolAgent", -] diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py deleted file mode 100644 index d7b889b..0000000 --- a/llm_quest_benchmark/agents/agent_factory.py +++ /dev/null @@ -1,102 +0,0 @@ -"""Factory for creating quest agents""" - -import logging - -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.agents.human_player import HumanPlayer -from llm_quest_benchmark.agents.llm_agent import LLMAgent -from llm_quest_benchmark.agents.planner_agent import PlannerAgent -from llm_quest_benchmark.agents.random_agent import RandomAgent -from llm_quest_benchmark.agents.tool_agent import ToolAgent -from llm_quest_benchmark.constants import ( - DEFAULT_MODEL, - DEFAULT_TEMPERATURE, - DEFAULT_TEMPLATE, - SYSTEM_ROLE_TEMPLATE, - normalize_template_name, -) - -logger = logging.getLogger(__name__) - - -def create_agent( - model: str = DEFAULT_MODEL, - 
system_template: str = SYSTEM_ROLE_TEMPLATE, - action_template: str = DEFAULT_TEMPLATE, - temperature: float = DEFAULT_TEMPERATURE, - skip_single: bool = False, - debug: bool = False, - memory_mode: str = "default", - compaction_interval: int = 10, -) -> QuestPlayer: - """Create a quest agent based on model name and parameters. - - Args: - model (str): Model identifier. Can be: - - LLM model name (e.g. 'gpt-5-mini', 'claude-sonnet-4-5') - - 'random_choice' for random testing agent (can include seed e.g. 'random_choice_123') - - 'human' for interactive human player - debug (bool): Enable debug logging - system_template (str): System template for LLM agents - action_template (str): Action template for LLM agents - temperature (float): Temperature for LLM sampling - skip_single (bool): Auto-select single choices - - Returns: - QuestPlayer: Appropriate agent instance - - Raises: - ValueError: If model type is not recognized - """ - logger.debug(f"Creating agent for model: {model}") - resolved_action_template = normalize_template_name(action_template) - - # Human player - if model == "human": - return HumanPlayer(skip_single=skip_single) - - # Random choice agent - if model.startswith("random_choice"): - seed = None - if "_" in model: - try: - seed = int(model.split("_")[-1]) - except ValueError: - pass - return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) - - if resolved_action_template == "planner.jinja": - return PlannerAgent( - debug=debug, - model_name=model, - system_template=system_template, - action_template=resolved_action_template, - temperature=temperature, - skip_single=skip_single, - memory_mode=memory_mode, - compaction_interval=compaction_interval, - ) - - if resolved_action_template in ("tool_augmented.jinja", "tool_augmented_hints.jinja"): - return ToolAgent( - debug=debug, - model_name=model, - system_template=system_template, - action_template=resolved_action_template, - temperature=temperature, - skip_single=skip_single, - 
memory_mode=memory_mode, - compaction_interval=compaction_interval, - ) - - # Default to LLM agent - return LLMAgent( - debug=debug, - model_name=model, - system_template=system_template, - action_template=resolved_action_template, - temperature=temperature, - skip_single=skip_single, - memory_mode=memory_mode, - compaction_interval=compaction_interval, - ) diff --git a/llm_quest_benchmark/agents/llm_agent.py b/llm_quest_benchmark/agents/llm_agent.py deleted file mode 100644 index 64ff0cc..0000000 --- a/llm_quest_benchmark/agents/llm_agent.py +++ /dev/null @@ -1,968 +0,0 @@ -"""LLM agent for Space Rangers quests""" - -import hashlib -import json -import logging -import re -from typing import Any - -from json_repair import repair_json - -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.constants import ( - DEFAULT_MODEL, - DEFAULT_TEMPERATURE, - DEFAULT_TEMPLATE, - MODEL_CHOICES, - SYSTEM_ROLE_TEMPLATE, - normalize_template_name, -) -from llm_quest_benchmark.llm.client import ( - get_llm_client, - is_supported_model_name, - parse_model_name, -) -from llm_quest_benchmark.llm.prompt import PromptRenderer -from llm_quest_benchmark.schemas.response import LLMResponse - -RISKY_CHOICE_KEYWORDS = ( - "улететь", - "сдаться", - "отказ", - "провал", - "убежать", - "surrender", - "give up", -) - -SAFE_CHOICE_KEYWORDS = ( - "пройти мимо", - "избежать", - "подготов", - "библиотек", - "изуч", - "wait", - "avoid", - "study", -) - - -def _parse_json_response( - response: str, - debug: bool = False, - logger: logging.Logger | None = None, -) -> tuple[dict[str, Any] | None, str | None]: - """Try to parse response as JSON, with repair attempt if needed.""" - cleaned_response = (response or "").strip() - if not cleaned_response: - return None, None - - try: - # Extract JSON from response if there are backticks - if "```json" in cleaned_response: - # Find the start and end of the JSON block - start = cleaned_response.find("```json") + 7 - end = 
cleaned_response.find("```", start) - if end > start: - json_str = cleaned_response[start:end].strip() - if debug and logger: - logger.debug(f"Extracted JSON: {json_str}") - result = json.loads(json_str) - if debug and logger: - logger.debug(f"Parsed JSON: {result}") - return result, "json_fenced" - - # Extract a probable JSON object from free-form text. - embedded_json = re.search(r"\{[\s\S]*\}", cleaned_response) - if embedded_json: - candidate = embedded_json.group(0).strip() - if candidate and candidate != cleaned_response: - try: - result = json.loads(candidate) - if debug and logger: - logger.debug(f"Parsed embedded JSON: {result}") - return result, "json_embedded" - except json.JSONDecodeError: - pass - - # Try to parse directly - result = json.loads(cleaned_response) - if debug and logger: - logger.debug(f"Direct JSON parse successful: {result}") - return result, "json_direct" - except json.JSONDecodeError: - if debug and logger: - logger.debug("Initial JSON parse failed, attempting repair") - try: - repaired = repair_json(cleaned_response) - if debug and logger: - logger.debug(f"Repaired JSON: {repaired}") - result = json.loads(repaired) - if debug and logger: - logger.debug(f"Parse of repaired JSON successful: {result}") - return result, "json_repaired" - except Exception as e: - if debug and logger: - logger.error(f"JSON repair failed: {e}") - return None, None - - -def _validate_action_number( - action: int, num_choices: int, debug: bool = False, logger: logging.Logger | None = None -) -> bool: - """Validate that action number is within valid range""" - if 1 <= action <= num_choices: - return True - if debug and logger: - logger.error(f"Action number {action} out of range [1, {num_choices}]") - return False - - -def _extract_action_from_text(response: str, num_choices: int) -> int | None: - """Extract a candidate action from free-form text.""" - for match in re.finditer(r"\b(\d+)\b", response): - action = int(match.group(1)) - if 1 <= action <= 
num_choices: - return action - return None - - -def _extract_field_from_text(response: str, field: str) -> str | None: - """Best-effort extraction of analysis/reasoning from loosely formatted output.""" - if not response: - return None - - # JSON-like field forms: "analysis": "...", 'analysis': '...' - json_pattern = re.compile( - rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P.*?)['"]""", - re.IGNORECASE | re.DOTALL, - ) - match = json_pattern.search(response) - if match: - value = " ".join(match.group("value").strip().split()) - if value: - return value - - # Partial JSON field forms without a closing quote in truncated outputs. - partial_json_pattern = re.compile( - rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P[^"\n\r]+)""", - re.IGNORECASE, - ) - match = partial_json_pattern.search(response) - if match: - value = " ".join(match.group("value").strip().split()) - if value: - return value - - # Label forms: Analysis: ..., Reasoning - ... - label_pattern = re.compile( - rf"""(?im)^\s*{re.escape(field)}\s*[:\-]\s*(?P.+?)\s*$""", - ) - match = label_pattern.search(response) - if match: - value = " ".join(match.group("value").strip().split()) - if value: - return value - - return None - - -def _raw_reasoning_fallback(response: str) -> str | None: - compact = " ".join((response or "").strip().split()) - if not compact: - return None - if len(compact) > 240: - compact = compact[:237] + "..." 
- return f"raw_response: {compact}" - - -def _is_numeric_raw_reasoning(reasoning: str | None) -> bool: - if not reasoning: - return False - if not reasoning.startswith("raw_response:"): - return False - payload = reasoning.split(":", 1)[1].strip() - return payload.isdigit() - - -def parse_llm_response( - response: str, num_choices: int, debug: bool = False, logger: logging.Logger | None = None -) -> LLMResponse: - """Parse LLM response and return structured response object.""" - if debug and logger: - logger.debug(f"Raw LLM response: {response}") - - extracted_analysis = _extract_field_from_text(response, "analysis") - extracted_reasoning = _extract_field_from_text(response, "reasoning") - raw_reasoning = _raw_reasoning_fallback(response) - - # Try parsing as JSON first - response_json, json_parse_mode = _parse_json_response(response, debug, logger) - if response_json and isinstance(response_json, dict): - analysis = response_json.get("analysis") or extracted_analysis - reasoning = response_json.get("reasoning") or response_json.get("thinking") or extracted_reasoning - if not reasoning and analysis: - reasoning = analysis - if not analysis and not reasoning: - reasoning = raw_reasoning - - memo_raw = response_json.get("memo") - memo = str(memo_raw) if memo_raw is not None else None - - # Check for either 'action' or 'result' field - action_value = response_json.get("action") or response_json.get("result") or response_json.get("choice") - if action_value is not None: - try: - action = int(action_value) - if _validate_action_number(action, num_choices, debug, logger): - return LLMResponse( - action=action, - reasoning=reasoning, - analysis=analysis, - memo=memo, - is_default=False, - parse_mode=json_parse_mode or "json", - ) - except (ValueError, TypeError): - if debug and logger: - logger.error(f"Invalid action value in JSON: {action_value}") - - # Try parsing as plain number - try: - action = int(response.strip()) - if _validate_action_number(action, num_choices, 
debug, logger): - return LLMResponse( - action=action, - reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, - analysis=extracted_analysis, - is_default=False, - parse_mode="number_only", - ) - except ValueError: - if debug and logger: - logger.error(f"Could not parse response as number: {response}") - - # Fallback: extract first valid integer from text. - extracted_action = _extract_action_from_text(response, num_choices) - if extracted_action is not None: - return LLMResponse( - action=extracted_action, - reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, - analysis=extracted_analysis, - is_default=False, - parse_mode="number_extracted", - ) - - # Default to first choice if all parsing attempts fail - if debug and logger: - logger.error(f"Error during response parsing, defaulting to first choice. Response: {response[:100]}...") - return LLMResponse( - action=1, - reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, - analysis=extracted_analysis, - is_default=True, - parse_mode="default_first", - ) - - -class LLMAgent(QuestPlayer): - """LLM-powered agent for Space Rangers quests""" - - SUPPORTED_MODELS = MODEL_CHOICES - - def __init__( - self, - model_name: str = DEFAULT_MODEL, - system_template: str = SYSTEM_ROLE_TEMPLATE, - action_template: str = DEFAULT_TEMPLATE, - temperature: float = DEFAULT_TEMPERATURE, - skip_single: bool = False, - debug: bool = False, - memory_mode: str = "default", - compaction_interval: int = 10, - ): - super().__init__(skip_single=skip_single) - self.debug = debug - self.model_name = model_name.lower() - self.system_template = normalize_template_name(system_template) - self.action_template = normalize_template_name(action_template) - self.temperature = temperature - # Set agent_id for database records - self.agent_id = f"llm_{self.model_name}" - - if not is_supported_model_name(self.model_name): - raise ValueError(f"Unsupported model: {model_name}. 
Supported models are: {self.SUPPORTED_MODELS}") - - self.model_spec = parse_model_name(self.model_name) - self.logger = logging.getLogger(self.__class__.__name__) - if self.debug: - self.logger.setLevel(logging.DEBUG) - self.logger.propagate = False - if not any(getattr(h, "_llm_quest_handler", False) for h in self.logger.handlers): - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter("%(name)s - %(message)s")) - handler._llm_quest_handler = True - self.logger.addHandler(handler) - - # Initialize prompt renderer - self.prompt_renderer = PromptRenderer( - None, system_template=self.system_template, action_template=self.action_template - ) - - # Delay API client creation so template-only flows and tests do not require API keys. - self.llm = None - self.history: list[LLMResponse] = [] - self._observation_history: list[str] = [] - self._decision_history: list[dict[str, Any]] = [] - self._state_action_counts: dict[str, dict[int, int]] = {} - self._context_window = 3 - self._context_chars = 220 - self._decision_window = 5 - self._max_state_signatures = 200 - self._use_safety_filter = True - self._last_response = LLMResponse(action=1, is_default=True) - - # Quest briefing: pinned first observation (mission goal) - self._quest_briefing: str | None = None - - # Memory mode: "default", "full_transcript", "compaction" - if memory_mode not in ("default", "full_transcript", "compaction"): - raise ValueError(f"Invalid memory_mode: {memory_mode}") - self._memory_mode = memory_mode - self._transcript: list[dict[str, Any]] = [] - self._compaction_interval = compaction_interval - self._compaction_summary: str | None = None - self._steps_since_compaction = 0 - self._step_count = 0 - - def _ensure_llm(self): - """Lazily create the provider client only when inference is needed.""" - if self.llm is None: - self.llm = get_llm_client( - self.model_name, - system_prompt=self.prompt_renderer.render_system_prompt(), - temperature=self.temperature, - ) - - def 
get_last_response(self) -> LLMResponse | None: - """Get the last LLM response from history""" - return self._last_response - - def get_action(self, observation: str, choices: list[dict[str, str]]) -> int: - """Track observation history for context, then delegate base action flow.""" - self._remember_observation(observation) - return super().get_action(observation, choices) - - def _remember_observation(self, observation: str) -> None: - clean = (observation or "").strip() - if not clean: - return - if self._quest_briefing is None: - self._quest_briefing = clean - self._observation_history.append(clean) - if len(self._observation_history) > 20: - self._observation_history = self._observation_history[-20:] - - def _build_contextual_state(self, state: str) -> str: - """Build context-augmented state based on memory mode.""" - if self._memory_mode == "full_transcript": - return self._build_full_transcript_state(state) - if self._memory_mode == "compaction": - return self._build_compaction_state(state) - return self._build_default_state(state) - - def _briefing_block(self, state: str) -> str | None: - """Return quest briefing block if available and not redundant with current state.""" - if not self._quest_briefing: - return None - if state.strip() == self._quest_briefing: - return None - briefing = self._quest_briefing - if len(briefing) > 800: - briefing = briefing[:800] + "..." - return f"Quest briefing (your mission):\n{briefing}" - - def _build_default_state(self, state: str) -> str: - """Original sliding-window context, now with pinned briefing.""" - blocks: list[str] = [] - - briefing = self._briefing_block(state) - if briefing: - blocks.append(briefing) - - if len(self._observation_history) > 1: - previous = self._observation_history[:-1][-self._context_window :] - if previous: - snippets = [] - for idx, text in enumerate(previous, start=1): - clipped = text if len(text) <= self._context_chars else text[: self._context_chars] + "..." 
- snippets.append(f"[Previous {idx}] {clipped}") - blocks.append("Recent context from previous steps:\n" + "\n\n".join(snippets)) - - if self._decision_history: - recent_memos = [] - for item in self._decision_history[-self._decision_window :]: - m = (item.get("memo") or "").strip() - if not m: - continue - if recent_memos and recent_memos[-1] == m: - continue - recent_memos.append(m) - if recent_memos: - lines = [f"[Memo {idx}] {m}" for idx, m in enumerate(recent_memos, start=1)] - blocks.append("State memo (recent):\n" + "\n".join(lines)) - - recent_decisions = self._decision_history[-self._decision_window :] - decision_lines = [] - for idx, item in enumerate(recent_decisions, start=1): - choice = item.get("choice", "") - parse_mode = item.get("parse_mode", "unknown") - memo_val = item.get("memo") - memo_suffix = f" | memo: {memo_val}" if memo_val else "" - decision_lines.append( - f"[Decision {idx}] action {item.get('action')}: {choice} (parse={parse_mode}){memo_suffix}" - ) - blocks.append("Recent selected actions:\n" + "\n".join(decision_lines)) - - if not blocks: - return state - - sep = "\n\n" - return f"{sep.join(blocks)}\n\nCurrent story state:\n{state}" - - def _build_full_transcript_state(self, state: str) -> str: - """Full decision transcript with pinned briefing.""" - blocks: list[str] = [] - - briefing = self._briefing_block(state) - if briefing: - blocks.append(briefing) - - if self._transcript: - lines = [] - entries = self._transcript - # Budget: keep first 3 + last N that fit under ~40 entries total - if len(entries) > 40: - entries = entries[:3] + [{"_gap": len(entries) - 40}] + entries[-(40 - 3) :] - for entry in entries: - if "_gap" in entry: - lines.append(f" ... ({entry['_gap']} steps omitted) ...") - continue - step = entry.get("step", "?") - obs = entry.get("observation", "") - if len(obs) > 400: - obs = obs[:400] + "..." 
- chosen = entry.get("choice_text", "") - reasoning = entry.get("reasoning", "") - line = f"Step {step}: {obs}" - if chosen: - line += f"\n You chose: {chosen}" - if reasoning: - line += f"\n Reasoning: {reasoning[:800]}" - state_notes = entry.get("memo", "") - if state_notes: - line += f"\n State: {state_notes[:350]}" - lines.append(line) - blocks.append("=== QUEST TRANSCRIPT ===\n" + "\n\n".join(lines)) - - blocks.append(f"Step {self._step_count} (CURRENT):\n{state}") - return "\n\n".join(blocks) - - def _build_compaction_state(self, state: str) -> str: - """Compacted memory summary + recent steps since last compaction.""" - blocks: list[str] = [] - - briefing = self._briefing_block(state) - if briefing: - blocks.append(briefing) - - if self._compaction_summary: - blocks.append( - f"=== QUEST MEMORY (compacted at step {self._step_count - self._steps_since_compaction}) ===\n{self._compaction_summary}" - ) - - if self._transcript: - recent = self._transcript[-self._steps_since_compaction :] if self._steps_since_compaction > 0 else [] - if recent: - lines = [] - for entry in recent: - step = entry.get("step", "?") - obs = entry.get("observation", "") - if len(obs) > 400: - obs = obs[:400] + "..." - chosen = entry.get("choice_text", "") - line = f"Step {step}: {obs}" - if chosen: - line += f"\n You chose: {chosen}" - state_notes = entry.get("memo", "") - if state_notes: - line += f"\n State: {state_notes[:350]}" - lines.append(line) - blocks.append("=== RECENT STEPS ===\n" + "\n\n".join(lines)) - - blocks.append(f"Step {self._step_count} (CURRENT):\n{state}") - return "\n\n".join(blocks) - - def _maybe_compact(self) -> None: - """Run compaction if interval reached. 
Called after recording a decision.""" - if self._memory_mode != "compaction": - return - if self._steps_since_compaction < self._compaction_interval: - return - - transcript_text = self._format_transcript_for_compaction() - if not transcript_text: - return - - prompt_parts = [] - prompt_parts.append("You are summarizing an agent's progress through a text quest.") - if self._quest_briefing: - prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}") - if self._compaction_summary: - prompt_parts.append(f"\nPREVIOUS SUMMARY:\n{self._compaction_summary}") - prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}") - prompt_parts.append( - "\nSummarize the agent's progress. Include:\n" - "- Current objective (what the agent should do next)\n" - "- Progress so far (what has been accomplished)\n" - "- Key facts (NPCs, items, locations, deadlines discovered)\n" - "- Failed approaches (actions/paths that didn't work)\n" - "- Map knowledge (locations visited and connections)\n\n" - "Write a concise summary in plain text, max 300 words." 
- ) - - compaction_prompt = "\n".join(prompt_parts) - try: - self._ensure_llm() - summary = self.llm.get_completion(compaction_prompt) - compaction_usage = self.llm.get_last_usage() or {} - if compaction_usage: - pt = int( - compaction_usage.get("prompt_tokens", 0) - if isinstance(compaction_usage, dict) - else getattr(compaction_usage, "prompt_tokens", 0) - ) - ct = int( - compaction_usage.get("completion_tokens", 0) - if isinstance(compaction_usage, dict) - else getattr(compaction_usage, "completion_tokens", 0) - ) - self._record_compaction_usage(pt, ct) - stripped = (summary or "").strip() - if not stripped: - if self.debug: - self.logger.warning("Compaction returned empty summary at step %d", self._step_count) - self._steps_since_compaction = max(0, self._compaction_interval // 2) - return - self._compaction_summary = stripped - self._transcript = [] - self._steps_since_compaction = 0 - if self.debug: - self.logger.debug( - "Compaction completed at step %d: %s", self._step_count, self._compaction_summary[:200] - ) - except Exception as e: - if self.debug: - self.logger.warning("Compaction failed at step %d: %s", self._step_count, e) - self._steps_since_compaction = max(0, self._compaction_interval // 2) - - def _record_compaction_usage(self, prompt_tokens: int, completion_tokens: int) -> None: - """Record token usage from compaction calls into agent history.""" - compaction_response = LLMResponse( - action=0, - is_default=True, - parse_mode="compaction", - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - self.history.append(compaction_response) - - def _format_transcript_for_compaction(self) -> str: - """Format recent transcript entries for the compaction prompt.""" - recent = ( - self._transcript[-self._steps_since_compaction :] - if self._steps_since_compaction > 0 - else self._transcript[-self._compaction_interval :] - ) - lines = [] - for entry in recent: - step = entry.get("step", 
"?") - obs = entry.get("observation", "") - if len(obs) > 400: - obs = obs[:400] + "..." - chosen = entry.get("choice_text", "") - reasoning = entry.get("reasoning", "") - state_notes = entry.get("memo", "") - line = f"Step {step}: {obs}" - if chosen: - line += f"\n Chose: {chosen}" - if state_notes: - line += f"\n State: {state_notes[:350]}" - if reasoning: - line += f"\n Reasoning: {reasoning[:800]}" - lines.append(line) - return "\n\n".join(lines) - - @staticmethod - def _normalize_for_signature(value: str, max_len: int = 320) -> str: - text = (value or "").lower() - text = re.sub(r"\s+", " ", text).strip() - if len(text) > max_len: - return text[:max_len] - return text - - def _state_signature(self, state: str, choices: list[dict[str, str]]) -> str: - normalized_state = self._normalize_for_signature(state, max_len=420) - normalized_choices = "|".join( - self._normalize_for_signature(choice.get("text", ""), max_len=110) for choice in choices - ) - raw_signature = f"{normalized_state}||{normalized_choices}" - return hashlib.sha1(raw_signature.encode("utf-8", errors="ignore")).hexdigest()[:20] - - def _remember_decision( - self, - state: str, - choices: list[dict[str, str]], - state_signature: str, - response: LLMResponse, - ) -> None: - action = int(response.action) - counts = self._state_action_counts.setdefault(state_signature, {}) - counts[action] = counts.get(action, 0) + 1 - - if len(self._state_action_counts) > self._max_state_signatures: - oldest_key = next(iter(self._state_action_counts.keys())) - if oldest_key != state_signature: - self._state_action_counts.pop(oldest_key, None) - - selected_text = "" - if 1 <= action <= len(choices): - selected_text = choices[action - 1].get("text", "") - state_snippet = state.strip() - if len(state_snippet) > self._context_chars: - state_snippet = state_snippet[: self._context_chars] + "..." 
- - self._decision_history.append( - { - "state": state_snippet, - "action": action, - "choice": selected_text, - "parse_mode": response.parse_mode or "unknown", - "memo": (response.memo or "").strip()[:350] or None, - } - ) - if len(self._decision_history) > 40: - self._decision_history = self._decision_history[-40:] - - # Transcript for full_transcript and compaction modes - if self._memory_mode in ("full_transcript", "compaction"): - self._step_count += 1 - self._steps_since_compaction += 1 - self._transcript.append( - { - "step": self._step_count, - "observation": state_snippet if self._memory_mode == "compaction" else state.strip()[:400], - "choice_text": selected_text, - "reasoning": (response.reasoning or "")[:800], - "memo": (response.memo or "").strip()[:350] or None, - "action": action, - } - ) - self._maybe_compact() - - def _choice_risk_score(self, choice_text: str) -> int: - text = (choice_text or "").lower() - score = 0 - for keyword in RISKY_CHOICE_KEYWORDS: - if keyword in text: - score += 2 - for keyword in SAFE_CHOICE_KEYWORDS: - if keyword in text: - score -= 1 - return score - - def _apply_safety_filter(self, action: int, choices: list[dict[str, str]]) -> int: - """Replace obviously risky actions when a clearly safer alternative exists.""" - if not self._use_safety_filter or len(choices) < 2: - return action - - current_idx = action - 1 - if current_idx < 0 or current_idx >= len(choices): - return action - - scored = [(idx + 1, self._choice_risk_score(c.get("text", ""))) for idx, c in enumerate(choices)] - scored.sort(key=lambda item: item[1]) - - best_action, best_score = scored[0] - current_score = self._choice_risk_score(choices[current_idx].get("text", "")) - - # Only override when the chosen action is materially riskier than the best option. 
- if current_score - best_score >= 2: - if self.debug: - self.logger.debug( - "Safety filter override: %s -> %s (risk %s -> %s)", - action, - best_action, - current_score, - best_score, - ) - return best_action - return action - - @staticmethod - def _state_fingerprint(state: str) -> str: - """Create a stable fingerprint for loop detection.""" - compact = " ".join((state or "").lower().split()) - if len(compact) > 500: - compact = compact[:500] - return compact - - def _apply_loop_escape( - self, - state_key: str, - action: int, - choices: list[dict[str, str]], - ) -> tuple[int, bool]: - """Diversify action when the same state repeats with no apparent progress.""" - if len(choices) <= 1: - return action, False - - counts = self._state_action_counts.get(state_key, {}) - total_visits = sum(counts.values()) - if total_visits < 3: - return action, False - - current_count = counts.get(action, 0) - if current_count < 2: - return action, False - all_actions = list(range(1, len(choices) + 1)) - ranked = sorted( - all_actions, - key=lambda a: ( - counts.get(a, 0), - self._choice_risk_score(choices[a - 1].get("text", "")), - ), - ) - best_action = ranked[0] - - if best_action != action and counts.get(best_action, 0) < current_count: - return best_action, True - if total_visits >= 5 and current_count >= 3 and best_action != action: - return best_action, True - return action, False - - @staticmethod - def _normalize_usage(usage: dict[str, Any] | None) -> dict[str, Any]: - usage = usage or {} - prompt_tokens = int(usage.get("prompt_tokens") or 0) - completion_tokens = int(usage.get("completion_tokens") or 0) - total_tokens = int(usage.get("total_tokens") or (prompt_tokens + completion_tokens)) - estimated_cost_usd = usage.get("estimated_cost_usd") - if estimated_cost_usd is not None: - estimated_cost_usd = float(estimated_cost_usd) - return { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": total_tokens, - "estimated_cost_usd": 
estimated_cost_usd, - } - - @classmethod - def _merge_usage(cls, first: dict[str, Any] | None, second: dict[str, Any] | None) -> dict[str, Any]: - a = cls._normalize_usage(first) - b = cls._normalize_usage(second) - merged_cost = None - if a["estimated_cost_usd"] is not None or b["estimated_cost_usd"] is not None: - merged_cost = (a["estimated_cost_usd"] or 0.0) + (b["estimated_cost_usd"] or 0.0) - return { - "prompt_tokens": a["prompt_tokens"] + b["prompt_tokens"], - "completion_tokens": a["completion_tokens"] + b["completion_tokens"], - "total_tokens": a["total_tokens"] + b["total_tokens"], - "estimated_cost_usd": merged_cost, - } - - def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: - """Implementation of action selection logic. - - Args: - state (str): Current game state text - choices (List[Dict[str, str]]): List of available choices - - Returns: - int: Selected action number (1-based) - """ - if self.debug: - self.logger.debug(f"Getting action for state with {len(choices)} choices available") - for i, choice in enumerate(choices): - self.logger.debug(f"Choice {i + 1}: {choice.get('text', 'NO TEXT')}") - try: - state_signature = self._state_signature(state, choices) - # Format prompt - prompt = self._format_prompt(self._build_contextual_state(state), choices) - if self.debug: - self.logger.debug(f"\nPrompt:\n{prompt}") - - # Get LLM response - self._ensure_llm() - llm_response = self.llm.get_completion(prompt) - llm_usage = self.llm.get_last_usage() - if self.debug: - self.logger.debug(f"LLM response: {llm_response}") - choices_debug = [] - for i, c in enumerate(choices): - choices_debug.append(f"{i + 1}: {c['text']}") - self.logger.debug(f"Available choices: {choices_debug}") - - # Parse response - first_response = parse_llm_response( - llm_response, - len(choices), - self.debug, - self.logger, - ) - parsed_response = first_response - - if parsed_response.is_default: - retry_response = 
self.llm.get_completion(self._format_retry_prompt(state, choices)) - retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, retry_usage) - retry_parsed = parse_llm_response(retry_response, len(choices), self.debug, self.logger) - if not retry_parsed.is_default: - retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}" - parsed_response = retry_parsed - elif self._needs_force_numeric_retry(): - # GPT-5/o models occasionally return empty visible text on long prompts. - # Use a tiny final retry that asks for number-only output. - force_retry_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices)) - force_retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, force_retry_usage) - force_retry_parsed = parse_llm_response( - force_retry_response, - len(choices), - self.debug, - self.logger, - ) - if not force_retry_parsed.is_default: - force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}" - parsed_response = force_retry_parsed - - action_before_policy = parsed_response.action - if parsed_response is not first_response: - if parsed_response.analysis is None and first_response.analysis is not None: - parsed_response.analysis = first_response.analysis - if _is_numeric_raw_reasoning(parsed_response.reasoning): - if first_response.reasoning and not _is_numeric_raw_reasoning(first_response.reasoning): - parsed_response.reasoning = first_response.reasoning - else: - first_raw_reasoning = _raw_reasoning_fallback(llm_response) - if first_raw_reasoning and not _is_numeric_raw_reasoning(first_raw_reasoning): - parsed_response.reasoning = first_raw_reasoning - - parsed_response.action = self._apply_safety_filter(parsed_response.action, choices) - if parsed_response.action != action_before_policy and not parsed_response.reasoning: - parsed_response.reasoning = "policy_safety_override" - usage_payload = self._normalize_usage(llm_usage) - 
parsed_response.prompt_tokens = usage_payload["prompt_tokens"] - parsed_response.completion_tokens = usage_payload["completion_tokens"] - parsed_response.total_tokens = usage_payload["total_tokens"] - parsed_response.estimated_cost_usd = usage_payload["estimated_cost_usd"] - - if self.debug: - self.logger.debug(f"Parsed LLM response: {parsed_response}") - self.logger.debug(f"Final action to be returned: {parsed_response.action}") - - # Store response in history - self.history.append(parsed_response) - self._last_response = parsed_response - self._remember_decision(state, choices, state_signature, parsed_response) - - # Check that action is within valid range before returning - if parsed_response.action < 1 or parsed_response.action > len(choices): - self.logger.error(f"INVALID ACTION DETECTED: {parsed_response.action} not in range 1-{len(choices)}") - # Use default first action instead - parsed_response.action = 1 - self.logger.warning("Defaulting to action 1 instead") - - return parsed_response.action - - except Exception as e: - self.logger.error(f"Error during LLM call: {e}") - default_response = LLMResponse( - action=1, - is_default=True, - parse_mode="error_default", - reasoning=_raw_reasoning_fallback(f"llm_call_error: {e}"), - ) - self.history.append(default_response) - self._last_response = default_response - return 1 # Default to first choice on error - - def reset(self) -> None: - """Reset agent state""" - self.history = [] - self._observation_history = [] - self._decision_history = [] - self._state_action_counts = {} - self._last_response = LLMResponse(action=1, is_default=True) - self._quest_briefing = None - self._transcript = [] - self._compaction_summary = None - self._steps_since_compaction = 0 - self._step_count = 0 - - def on_game_start(self) -> None: - """Called when game starts""" - super().on_game_start() - self._observation_history = [] - self._decision_history = [] - self._state_action_counts = {} - self._last_response = LLMResponse(action=1, 
is_default=True) - self._quest_briefing = None - self._transcript = [] - self._compaction_summary = None - self._steps_since_compaction = 0 - self._step_count = 0 - - def on_game_end(self, final_state: dict[str, Any]) -> None: - """Log final state for analysis""" - if self.debug: - self.logger.debug(f"Game ended with state: {final_state}") - - def __str__(self) -> str: - """String representation of the agent""" - return f"LLMAgent(model={self.model_name}, system_template={self.system_template}, action_template={self.action_template}, temperature={self.temperature})" - - def _format_prompt(self, state: str, choices: list[dict[str, str]]) -> str: - """Format the prompt for the LLM""" - return self.prompt_renderer.render_action_prompt(state, choices).strip() - - def _format_retry_prompt(self, state: str, choices: list[dict[str, str]]) -> str: - """Fallback prompt that still preserves reasoning for log analysis.""" - clipped_state = (state or "").strip() - if len(clipped_state) > 500: - clipped_state = clipped_state[:500] + "..." - choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:160]}" for i, c in enumerate(choices)]) - return f"""Choose the best action. -State: {clipped_state} -Actions: -{choices_text} - -Return valid JSON only: -{{ - "analysis": "", - "reasoning": "", - "result": -}}""" - - def _format_force_numeric_retry_prompt(self, choices: list[dict[str, str]]) -> str: - """Very short retry prompt used for models that return empty visible output.""" - choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:110]}" for i, c in enumerate(choices)]) - return f"""Pick one action number. 
-{choices_text} -Reply with one integer only: 1 to {len(choices)}.""" - - def _needs_force_numeric_retry(self) -> bool: - return self.model_spec.provider == "openai" and ( - self.model_spec.model_id.startswith("gpt-5") or self.model_spec.model_id.startswith("o") - ) diff --git a/llm_quest_benchmark/agents/strategic_agent.py b/llm_quest_benchmark/agents/strategic_agent.py deleted file mode 100644 index 387c650..0000000 --- a/llm_quest_benchmark/agents/strategic_agent.py +++ /dev/null @@ -1,94 +0,0 @@ -"""Strategic agent decorator that adds analysis capabilities""" - -import logging -from typing import Any - -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.llm.prompt import PromptRenderer - - -class StrategicAgent(QuestPlayer): - """Decorator that adds strategic thinking to any quest player""" - - def __init__(self, base_agent: QuestPlayer, debug: bool = False, template: str = "advanced.jinja"): - """Initialize strategic agent wrapper - - Args: - base_agent: Base agent to wrap (usually LLMAgent) - debug: Enable debug logging - template: Template to use for enhanced prompts - """ - super().__init__(skip_single=base_agent.skip_single) - self.agent = base_agent - self.debug = debug - self.history = [] - - # Setup logging - self.logger = logging.getLogger(self.__class__.__name__) - if self.debug: - self.logger.setLevel(logging.DEBUG) - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter("%(name)s - %(message)s")) - self.logger.addHandler(handler) - - # Initialize prompt renderer - self.prompt_renderer = PromptRenderer(None, template=template) - - def _get_action_impl(self, observation: str, choices: list) -> str: - """Implementation of action selection logic with strategic analysis""" - if hasattr(self.agent, "llm"): - # First, get situation analysis - if self.debug: - self.logger.debug(f"\nObservation:\n{observation}") - - analysis = self.agent.llm( - "Analyze this situation and explain your thinking step-by-step 
instead of choosing an action:\n" - + observation - ) - - if self.debug: - self.logger.debug(f"\nAnalysis:\n{analysis}") - - # Store analysis in history - self.history.append({"observation": observation, "analysis": analysis}) - - # Get enhanced context with history - enhanced_context = self.get_enhanced_context(observation, choices) - if self.debug: - self.logger.debug(f"\nEnhanced Context:\n{enhanced_context}") - - # Then make the actual choice with analysis context - return self.agent.get_action(enhanced_context, choices) - else: - # If agent doesn't have LLM capability, just pass through - return self.agent.get_action(observation, choices) - - def get_enhanced_context(self, observation: str, choices: list) -> str: - """Build context for advanced prompt with historical analysis""" - context = [ - f"Turn {len(self.history) + 1}: {entry['analysis']}" - for entry in self.history[-3:] # Last 3 analyses - ] - return self.prompt_renderer.render_action_prompt( - observation=observation, choices=choices, state_tracker=context - ) - - def reset(self) -> None: - """Reset both strategic and base agent state""" - self.history = [] - self.agent.reset() - - def on_game_start(self) -> None: - """Pass through to base agent""" - if self.debug: - self.logger.debug("Starting new game with strategic analysis") - self.agent.on_game_start() - - def on_game_end(self, final_state: dict[str, Any]) -> None: - """Pass through to base agent and log analysis history""" - self.agent.on_game_end(final_state) - if self.debug: - self.logger.debug("Final Analysis History:") - for entry in self.history: - self.logger.debug(f"\nObservation: {entry['observation']}") - self.logger.debug(f"Analysis: {entry['analysis']}") diff --git a/llm_quest_benchmark/agents/tool_agent.py b/llm_quest_benchmark/agents/tool_agent.py deleted file mode 100644 index 694d1ac..0000000 --- a/llm_quest_benchmark/agents/tool_agent.py +++ /dev/null @@ -1,384 +0,0 @@ -"""Tool-augmented agent with lightweight structured 
prompting.""" - -import ast -import re -from typing import Any - -from llm_quest_benchmark.agents.llm_agent import ( - LLMAgent, - LLMResponse, - _parse_json_response, - parse_llm_response, -) - - -class ToolAgent(LLMAgent): - """LLM agent with generic run-local tools for history, math, and state notes.""" - - DEFAULT_HISTORY_WINDOW = 10 - MAX_SCRATCHPAD_CHARS = 1200 - MAX_TOOL_INPUT_CHARS = 500 - - def __init__( - self, - *args, - action_template: str = "tool_augmented.jinja", - history_window: int | None = None, - **kwargs, - ): - super().__init__(*args, action_template=action_template, **kwargs) - self.agent_id = f"tool_{self.model_name}" - self._step_log: list[dict[str, Any]] = [] - self._history_window = history_window or self.DEFAULT_HISTORY_WINDOW - self._scratchpad = "" - - def _recent_steps(self) -> list[str]: - snippets = [] - for entry in self._step_log[-self._history_window :]: - snippets.append(f"Step {entry['step']}: {entry['observation']} -> {entry.get('selected_choice', 'n/a')}") - return snippets - - def _tool_descriptions(self) -> list[str]: - return [ - "quest_history(query): search earlier observations and chosen actions in this quest.", - "calculator(expression): evaluate arithmetic and simple comparisons.", - "scratchpad(operation, content): read or replace one persistent note. operation is read or write_replace.", - ] - - def quest_history(self, query: str) -> str: - """Return relevant previous steps from this quest run via keyword match.""" - if not self._step_log: - return "No prior quest steps recorded yet." 
- - tokens = set(re.findall(r"[a-zA-Z\u0400-\u04ff0-9_]{3,}", (query or "").lower())) - scored = [] - for entry in self._step_log: - haystack = " ".join( - [ - entry.get("observation", ""), - " ".join(entry.get("choices", [])), - entry.get("selected_choice", ""), - ] - ).lower() - score = sum(1 for token in tokens if token in haystack) - scored.append((score, entry)) - - scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True) - best = [entry for s, entry in scored if s > 0][: self._history_window] - if not best: - best = [entry for _, entry in scored[-self._history_window :]] - - lines = [] - for entry in best: - lines.append( - f"Step {entry['step']}: obs={entry['observation']} | " - f"choices={'; '.join(entry['choices'])} | picked={entry.get('selected_choice', 'n/a')}" - ) - return "\n".join(lines) - - @staticmethod - def calculator(expression: str) -> str: - """Evaluate a restricted arithmetic/comparison expression.""" - expr = (expression or "").strip() - if not expr: - return "error: empty expression" - if len(expr) > 240: - return "error: expression too long" - if not re.fullmatch(r"[0-9a-zA-Z\s+\-*/().,<>=!%]+", expr): - return "error: unsupported characters" - - allowed_nodes = ( - ast.Expression, - ast.Constant, - ast.UnaryOp, - ast.UAdd, - ast.USub, - ast.BinOp, - ast.Add, - ast.Sub, - ast.Mult, - ast.Div, - ast.FloorDiv, - ast.Mod, - ast.Pow, - ast.Compare, - ast.Eq, - ast.NotEq, - ast.Lt, - ast.LtE, - ast.Gt, - ast.GtE, - ast.BoolOp, - ast.And, - ast.Or, - ) - try: - tree = ast.parse(expr, mode="eval") - for node in ast.walk(tree): - if not isinstance(node, allowed_nodes): - return f"error: unsupported expression element {node.__class__.__name__}" - if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, bool)): - return "error: constants must be numeric or boolean" - result = ToolAgent._eval_calculator_node(tree.body) - except Exception as exc: - return f"error: {exc}" - return f"{expr} = {result}" - - 
@staticmethod - def _eval_calculator_node(node: ast.AST) -> int | float | bool: - if isinstance(node, ast.Constant) and isinstance(node.value, (int, float, bool)): - return node.value - if isinstance(node, ast.UnaryOp): - value = ToolAgent._eval_calculator_node(node.operand) - if isinstance(node.op, ast.UAdd): - return +value - if isinstance(node.op, ast.USub): - return -value - if isinstance(node, ast.BinOp): - left = ToolAgent._eval_calculator_node(node.left) - right = ToolAgent._eval_calculator_node(node.right) - if isinstance(node.op, ast.Add): - return left + right - if isinstance(node.op, ast.Sub): - return left - right - if isinstance(node.op, ast.Mult): - return left * right - if isinstance(node.op, ast.Div): - return left / right - if isinstance(node.op, ast.FloorDiv): - return left // right - if isinstance(node.op, ast.Mod): - return left % right - if isinstance(node.op, ast.Pow): - if abs(right) > 8: - raise ValueError("exponent too large") - return left**right - if isinstance(node, ast.BoolOp): - values = [bool(ToolAgent._eval_calculator_node(value)) for value in node.values] - if isinstance(node.op, ast.And): - return all(values) - if isinstance(node.op, ast.Or): - return any(values) - if isinstance(node, ast.Compare): - left = ToolAgent._eval_calculator_node(node.left) - for op, comparator in zip(node.ops, node.comparators, strict=True): - right = ToolAgent._eval_calculator_node(comparator) - if isinstance(op, ast.Eq): - ok = left == right - elif isinstance(op, ast.NotEq): - ok = left != right - elif isinstance(op, ast.Lt): - ok = left < right - elif isinstance(op, ast.LtE): - ok = left <= right - elif isinstance(op, ast.Gt): - ok = left > right - elif isinstance(op, ast.GtE): - ok = left >= right - else: - raise ValueError("unsupported comparison") - if not ok: - return False - left = right - return True - raise ValueError("unsupported expression") - - def scratchpad(self, operation: str, content: str = "") -> str: - """Read or replace one persistent 
free-form note blob.""" - op = (operation or "").strip().lower() - if op == "read": - return self._scratchpad or "(empty)" - if op == "write_replace": - note = " ".join((content or "").strip().split()) - self._scratchpad = note[: self.MAX_SCRATCHPAD_CHARS] - return f"updated: {self._scratchpad or '(empty)'}" - return "error: operation must be read or write_replace" - - def _build_tool_prompt( - self, - observation: str, - choices: list[dict[str, str]], - prompt_kind: str, - tool_results: list[str] | None = None, - ) -> str: - template = self.prompt_renderer.get_template(self.action_template) - return template.render( - prompt_kind=prompt_kind, - observation=observation, - choices=[{"text": choice.get("text", "")} for choice in choices], - tool_descriptions=self._tool_descriptions(), - tool_results=tool_results or [], - recent_steps=self._recent_steps(), - scratchpad_note=self._scratchpad, - ).strip() - - @staticmethod - def _extract_tool_calls(response: str) -> list[dict[str, Any]]: - payload, _ = _parse_json_response(response) - if not isinstance(payload, dict): - return [] - - tool_calls = payload.get("tool_calls") - if not isinstance(tool_calls, list): - return [] - - normalized = [] - for item in tool_calls[:1]: - if not isinstance(item, dict): - continue - tool_name = str(item.get("tool") or "").strip() - tool_input = item.get("input") - operation = str(item.get("operation") or "").strip() - content = str(item.get("content") or "").strip() - if isinstance(tool_input, dict): - operation = operation or str(tool_input.get("operation") or "").strip() - content = content or str(tool_input.get("content") or "").strip() - tool_input = tool_input.get("expression") or tool_input.get("query") or tool_input.get("content") or "" - tool_input = str(tool_input or "").strip() - if len(tool_input) > ToolAgent.MAX_TOOL_INPUT_CHARS: - tool_input = tool_input[: ToolAgent.MAX_TOOL_INPUT_CHARS] - if len(content) > ToolAgent.MAX_TOOL_INPUT_CHARS: - content = content[: 
ToolAgent.MAX_TOOL_INPUT_CHARS] - if tool_name: - normalized.append( - { - "tool": tool_name, - "input": tool_input, - "operation": operation, - "content": content, - } - ) - return normalized - - def _execute_tool_calls(self, tool_calls: list[dict[str, Any]]) -> list[str]: - results = [] - for tc in tool_calls: - name, inp = tc["tool"], tc.get("input", "") - if name == "quest_history": - result = self.quest_history(inp) - elif name == "calculator": - result = self.calculator(inp) - elif name == "scratchpad": - operation = tc.get("operation") or inp - result = self.scratchpad(str(operation), str(tc.get("content") or "")) - else: - result = f"unknown tool: {name}" - call_repr = inp - if name == "scratchpad": - call_repr = f"{tc.get('operation') or inp}, {tc.get('content') or ''}".strip(", ") - results.append(f"{name}({call_repr}) => {result}") - return results - - def _final_choice( - self, - observation: str, - choices: list[dict[str, str]], - tool_results: list[str] | None = None, - ) -> tuple[LLMResponse, dict[str, Any]]: - prompt = self._build_tool_prompt( - observation, - choices, - prompt_kind="final", - tool_results=tool_results, - ) - llm_response = self.llm.get_completion(prompt) - llm_usage = self.llm.get_last_usage() - parsed_response = parse_llm_response(llm_response, len(choices), self.debug, self.logger) - - if parsed_response.is_default: - retry_response = self.llm.get_completion(self._format_retry_prompt(observation, choices)) - retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, retry_usage) - retry_parsed = parse_llm_response(retry_response, len(choices), self.debug, self.logger) - if not retry_parsed.is_default: - retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}" - parsed_response = retry_parsed - elif self._needs_force_numeric_retry(): - force_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices)) - force_usage = self.llm.get_last_usage() - llm_usage = 
self._merge_usage(llm_usage, force_usage) - force_parsed = parse_llm_response(force_response, len(choices), self.debug, self.logger) - if not force_parsed.is_default: - force_parsed.parse_mode = f"force_retry_{force_parsed.parse_mode or 'parsed'}" - parsed_response = force_parsed - - return parsed_response, llm_usage - - def _log_step(self, observation: str, choices: list[dict[str, str]], response: LLMResponse) -> None: - selected = "" - if 1 <= response.action <= len(choices): - selected = choices[response.action - 1].get("text", "") - - clipped = " ".join((observation or "").strip().split()) - if len(clipped) > 180: - clipped = clipped[:180] + "..." - - self._step_log.append( - { - "step": len(self._step_log) + 1, - "observation": clipped, - "choices": [c.get("text", "") for c in choices], - "selected_choice": selected, - } - ) - - def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: - try: - state_signature = self._state_signature(state, choices) - contextual_state = self._build_contextual_state(state) - self._ensure_llm() - - selection_prompt = self._build_tool_prompt(contextual_state, choices, prompt_kind="select") - selection_response = self.llm.get_completion(selection_prompt) - selection_usage = self.llm.get_last_usage() - tool_calls = self._extract_tool_calls(selection_response) - parsed_response = parse_llm_response(selection_response, len(choices), self.debug, self.logger) - tool_results: list[str] = [] - - total_usage = self._normalize_usage(selection_usage) - if tool_calls: - tool_results = self._execute_tool_calls(tool_calls) - parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results) - total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage)) - elif parsed_response.is_default: - parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[]) - total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage)) - - 
action_before_policy = parsed_response.action - parsed_response.action = self._apply_safety_filter(parsed_response.action, choices) - if parsed_response.action != action_before_policy and not parsed_response.reasoning: - parsed_response.reasoning = "policy_safety_override" - - parsed_response.prompt_tokens = total_usage["prompt_tokens"] - parsed_response.completion_tokens = total_usage["completion_tokens"] - parsed_response.total_tokens = total_usage["total_tokens"] - parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"] - parsed_response.tool_calls = tool_calls or None - parsed_response.tool_results = tool_results or None - - self.history.append(parsed_response) - self._last_response = parsed_response - self._remember_decision(state, choices, state_signature, parsed_response) - self._log_step(state, choices, parsed_response) - return parsed_response.action - except Exception as exc: - self.logger.error("Tool agent error during LLM call: %s", exc) - default_response = LLMResponse( - action=1, - is_default=True, - parse_mode="error_default", - reasoning=f"tool_agent_error: {exc}", - ) - self.history.append(default_response) - self._last_response = default_response - return 1 - - def reset(self) -> None: - super().reset() - self._step_log = [] - self._scratchpad = "" - - def on_game_start(self) -> None: - super().on_game_start() - self._step_log = [] - self._scratchpad = "" diff --git a/llm_quest_benchmark/core/leaderboard.py b/llm_quest_benchmark/core/leaderboard.py index 032ad48..078648e 100644 --- a/llm_quest_benchmark/core/leaderboard.py +++ b/llm_quest_benchmark/core/leaderboard.py @@ -28,9 +28,6 @@ "stub": ("minimal_prompt", TAXONOMY_MODES["minimal_prompt"]), "strategic": ("short_context_reasoning", TAXONOMY_MODES["short_context_reasoning"]), "stateful_compact": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]), - "memo_cot": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]), - "memo_extended": ("compact_memory_memo", 
TAXONOMY_MODES["compact_memory_memo"]), - "memo_structured": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]), "light_hints": ("prompt_hints", TAXONOMY_MODES["prompt_hints"]), "stateful_compact_hints": ("prompt_hints", TAXONOMY_MODES["prompt_hints"]), "planner": ("planner_loop", TAXONOMY_MODES["planner_loop"]), @@ -38,6 +35,26 @@ "tool_augmented_hints": ("tools_hints_compact_memory", TAXONOMY_MODES["tools_hints_compact_memory"]), } +RETIRED_BENCHMARK_NAMES = { + "exp4_compaction_no_memo", + "exp4_memo_cot", + "exp4_memo_extended", + "exp4_memo_structured", +} + +RETIRED_HARNESSES = { + "compaction_no_memo", + "memo_cot", + "memo_extended", + "memo_structured", +} + +RETIRED_TEMPLATE_IDS = { + "memo_cot", + "memo_extended", + "memo_structured", +} + REASONING_STYLE_TEMPLATES = { "reasoning", "strategic", @@ -87,6 +104,25 @@ def _mode_from_template(template_name: str, memory_mode: str | None = None) -> t return TEMPLATE_TO_MODE.get(template_id, (template_id or "unknown", template_id or "unknown")) +def _is_retired_result( + source_name: str | None, + benchmark_id: str | None, + result_row: dict[str, Any], + agent_config: dict[str, Any], + template_name: str, +) -> bool: + source_names = {str(value) for value in (source_name, benchmark_id) if value} + if source_names & RETIRED_BENCHMARK_NAMES: + return True + + harness = str(result_row.get("harness") or agent_config.get("harness") or "") + if harness in RETIRED_HARNESSES: + return True + + template_id = _strip_template_suffix(template_name) + return template_id in RETIRED_TEMPLATE_IDS + + def _agent_config(db_run: dict[str, Any]) -> dict[str, Any]: raw_config = db_run.get("agent_config") if not isinstance(raw_config, str) or not raw_config: @@ -298,6 +334,7 @@ def generate_leaderboard( continue benchmark_id = summary.get("benchmark_id") + source_name = summary.get("name") if benchmark_id: benchmark_ids.append(str(benchmark_id)) @@ -348,7 +385,15 @@ def generate_leaderboard( template_from_config = 
str(config.get("action_template") or "") if template_from_config: template = template_from_config - memory_mode = config.get("memory_mode") + memory_mode = config.get("memory_mode") or result_row.get("memory_mode") + if _is_retired_result( + str(source_name) if source_name else None, + str(benchmark_id) if benchmark_id else None, + result_row, + config, + template, + ): + continue mode_id, mode_label = _mode_from_template(template, str(memory_mode) if memory_mode is not None else None) try: diff --git a/llm_quest_benchmark/core/runner.py b/llm_quest_benchmark/core/runner.py index e5f531a..d86c07b 100644 --- a/llm_quest_benchmark/core/runner.py +++ b/llm_quest_benchmark/core/runner.py @@ -10,24 +10,23 @@ from copy import deepcopy from typing import Any -from llm_quest_benchmark.agents.base import QuestPlayer from llm_quest_benchmark.constants import DEFAULT_QUEST_TIMEOUT from llm_quest_benchmark.core.logging import LogManager, QuestLogger from llm_quest_benchmark.environments.qm import QMPlayerEnv as QuestEnvironment from llm_quest_benchmark.environments.state import QuestOutcome -from llm_quest_benchmark.schemas.config import AgentConfig +from llm_quest_benchmark.players.base import QuestPlayer +from llm_quest_benchmark.schemas.config import HarnessConfig from llm_quest_benchmark.schemas.state import AgentState # Configure logging logging.getLogger("quest").setLevel(logging.WARNING) -logging.getLogger("LLMAgent").setLevel(logging.WARNING) def run_quest_with_timeout( quest_path: str, agent: QuestPlayer, timeout: int = DEFAULT_QUEST_TIMEOUT, - agent_config: AgentConfig | None = None, + agent_config: HarnessConfig | Any | None = None, debug: bool = False, callbacks: list[Callable[[str, Any], None]] = None, ) -> QuestOutcome | None: diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py index 0b78062..14dacaf 100644 --- a/llm_quest_benchmark/executors/benchmark.py +++ b/llm_quest_benchmark/executors/benchmark.py @@ -12,10 
+12,10 @@ from pathlib import Path from typing import Any -from llm_quest_benchmark.agents.agent_factory import create_agent from llm_quest_benchmark.core.logging import DEFAULT_DB_PATH from llm_quest_benchmark.core.runner import run_quest_with_timeout from llm_quest_benchmark.environments.state import QuestOutcome +from llm_quest_benchmark.harnesses.factory import create_harness from llm_quest_benchmark.llm import tracing from llm_quest_benchmark.schemas.config import BenchmarkConfig @@ -34,6 +34,68 @@ logger = logging.getLogger(__name__) +def _agent_harness(agent_config) -> str: + """Return the configured harness name.""" + return agent_config.harness + + +def _agent_model(agent_config) -> str: + """Return the result model label for the executed harness.""" + harness = _agent_harness(agent_config) + if harness == "human": + return "human" + if harness.startswith("random_choice"): + return "random_policy" + return agent_config.model + + +def _agent_id(agent_config) -> str: + """Return the stable result identifier for legacy and harness configs.""" + return getattr(agent_config, "harness_id", None) or agent_config.agent_id + + +def _agent_template(agent_config) -> str: + """Return legacy template name for result artifacts.""" + if hasattr(agent_config, "action_template"): + return agent_config.action_template + + harness_templates = { + "minimal": "stub.jinja", + "reasoning_recent": "reasoning.jinja", + "reasoning_full": "reasoning.jinja", + "memo_compact": "stateful_compact.jinja", + "hinted_compact": "stateful_compact_hints.jinja", + "tool_compact": "tool_augmented.jinja", + "tool_hinted": "tool_augmented_hints.jinja", + "planner": "planner.jinja", + "compaction_no_memo": "reasoning.jinja", + "memo_cot": "memo_cot.jinja", + "memo_extended": "memo_extended.jinja", + "memo_structured": "memo_structured.jinja", + } + return harness_templates.get(_agent_harness(agent_config), "reasoning.jinja") + + +def _agent_memory_mode(agent_config) -> str: + """Return legacy 
memory mode for result artifacts.""" + if hasattr(agent_config, "memory_mode"): + return agent_config.memory_mode + + harness_memory_modes = { + "reasoning_full": "full_transcript", + "memo_compact": "compaction", + "hinted_compact": "compaction", + "tool_compact": "compaction", + "tool_hinted": "compaction", + "planner": "compaction", + "compaction_no_memo": "compaction", + "memo_cot": "compaction", + "memo_extended": "compaction", + "memo_structured": "compaction", + } + return harness_memory_modes.get(_agent_harness(agent_config), "default") + + def _result_entry( quest: str, agent_config, @@ -44,10 +106,12 @@ def _result_entry( ) -> dict[str, Any]: return { "quest": quest, - "model": agent_config.model, + "model": _agent_model(agent_config), "temperature": agent_config.temperature, - "template": agent_config.action_template, - "agent_id": agent_config.agent_id, + "harness": _agent_harness(agent_config), + "template": _agent_template(agent_config), + "memory_mode": _agent_memory_mode(agent_config), + "agent_id": _agent_id(agent_config), "attempt": attempt, "outcome": outcome, "reward": reward, @@ -78,7 +142,7 @@ def _mark_run_timeout(run_id: int | None, quest: str, agent_config, benchmark_id WHERE id = ? 
""", ( - agent_config.agent_id, + _agent_id(agent_config), agent_config_json, benchmark_id, QuestOutcome.TIMEOUT.name, @@ -101,7 +165,7 @@ def _mark_run_timeout(run_id: int | None, quest: str, agent_config, benchmark_id Path(quest).stem, end_time, end_time, - agent_config.agent_id, + _agent_id(agent_config), agent_config_json, QuestOutcome.TIMEOUT.name, 0.0, @@ -132,15 +196,14 @@ def callback(event: str, data: Any = None) -> None: ) try: - agent = create_agent( + agent = create_harness( + harness=_agent_harness(agent_config), model=agent_config.model, temperature=agent_config.temperature, - system_template=agent_config.system_template, - action_template=agent_config.action_template, skip_single=agent_config.skip_single, debug=agent_config.debug, - memory_mode=agent_config.memory_mode, compaction_interval=agent_config.compaction_interval, + system_template=agent_config.system_template, ) outcome = run_quest_with_timeout( quest, @@ -254,7 +317,7 @@ def _write_benchmark_artifacts(config: BenchmarkConfig, results: list[dict[str, "temperature": agent.temperature, "runs": agent.runs, "system_template": agent.system_template, - "action_template": agent.action_template, + "harness": _agent_harness(agent), } for agent in config.agents ], @@ -281,7 +344,7 @@ def _write_benchmark_artifacts(config: BenchmarkConfig, results: list[dict[str, { "model": agent.model, "system_template": agent.system_template, - "action_template": agent.action_template, + "harness": _agent_harness(agent), "temperature": agent.temperature, "runs": agent.runs, "skip_single": agent.skip_single, @@ -346,7 +409,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ logger.info( "Queued agent %s quest %s (attempt %s/%s)", - agent_config.agent_id, + _agent_id(agent_config), quest_name, attempt, agent_config.runs, @@ -378,7 +441,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ } logger.info( "Agent %s running quest %s (attempt %s/%s)", - 
agent_config.agent_id, + _agent_id(agent_config), task["quest_name"], task["attempt"], agent_config.runs, @@ -391,7 +454,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ "total_runs": total_runs, "quest": task["quest"], "quest_name": task["quest_name"], - "agent_id": agent_config.agent_id, + "agent_id": _agent_id(agent_config), "model": agent_config.model, "attempt": task["attempt"], }, @@ -426,7 +489,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ "total_runs": total_runs, "quest": task["quest"], "quest_name": task["quest_name"], - "agent_id": agent_config.agent_id, + "agent_id": _agent_id(agent_config), "model": agent_config.model, "attempt": task["attempt"], "outcome": result["outcome"], @@ -471,7 +534,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ "total_runs": total_runs, "quest": task["quest"], "quest_name": task["quest_name"], - "agent_id": agent_config.agent_id, + "agent_id": _agent_id(agent_config), "model": agent_config.model, "attempt": task["attempt"], "outcome": QuestOutcome.TIMEOUT.name, diff --git a/llm_quest_benchmark/executors/cli/commands.py b/llm_quest_benchmark/executors/cli/commands.py index e3cecbd..d554f70 100644 --- a/llm_quest_benchmark/executors/cli/commands.py +++ b/llm_quest_benchmark/executors/cli/commands.py @@ -18,13 +18,10 @@ import typer -from llm_quest_benchmark.agents.agent_factory import create_agent -from llm_quest_benchmark.agents.human_player import HumanPlayer from llm_quest_benchmark.constants import ( DEFAULT_MODEL, DEFAULT_QUEST, DEFAULT_TEMPERATURE, - DEFAULT_TEMPLATE, INFINITE_TIMEOUT, MODEL_CHOICES, SYSTEM_ROLE_TEMPLATE, @@ -40,9 +37,10 @@ print_summary, run_benchmark, ) +from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness from llm_quest_benchmark.llm import tracing from llm_quest_benchmark.renderers.terminal import RichRenderer -from llm_quest_benchmark.schemas.config import 
AgentConfig, BenchmarkConfig +from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig # Initialize logging log_manager = LogManager() @@ -53,6 +51,8 @@ rich_markup_mode="rich", ) +HARNESS_CHOICES = list(HARNESS_REGISTRY.keys()) + def version_callback(value: bool): if value: @@ -348,7 +348,12 @@ def run( model: str = typer.Option(DEFAULT_MODEL, help=f"Model for the LLM agent (choices: {', '.join(MODEL_CHOICES)})."), temperature: float = typer.Option(DEFAULT_TEMPERATURE, help="Temperature for LLM sampling"), system_template: str = typer.Option(SYSTEM_ROLE_TEMPLATE, help="Template to use for system instructions."), - action_template: str = typer.Option(DEFAULT_TEMPLATE, help="Template to use for action prompts."), + harness: str = typer.Option( + "reasoning_recent", + "--harness", + help="Harness to use for quest decisions.", + ), + compaction_interval: int = typer.Option(50, help="Advanced override for compaction interval."), timeout: int = typer.Option(60, help="Timeout in seconds for run (0 for no timeout)."), skip: bool = typer.Option(True, help="Auto-select single choices without asking agent."), debug: bool = typer.Option(False, help="Enable debug logging and output, remove terminal UI."), @@ -365,23 +370,25 @@ def run( log_manager.setup(debug) # Create agent config - agent_config = AgentConfig( + agent_config = HarnessConfig( model=model, system_template=system_template, - action_template=action_template, + harness=harness, temperature=temperature, skip_single=skip, debug=debug, + compaction_interval=compaction_interval, ) # Create agent - agent = create_agent( + agent = create_harness( + harness=harness, model=model, system_template=system_template, - action_template=action_template, temperature=temperature, skip_single=skip, debug=debug, + compaction_interval=compaction_interval, ) log.warning(f"Starting quest run with agent {str(agent)}") @@ -458,7 +465,7 @@ def play( log.debug(f"Quest file: {quest}") # Create interactive player - 
player = HumanPlayer(skip_single=skip, debug=debug) + player = create_harness(harness="human", skip_single=skip, debug=debug) # Run quest in interactive mode result = run_quest_with_timeout(quest_path=str(quest), agent=player, timeout=INFINITE_TIMEOUT, debug=debug) @@ -952,7 +959,7 @@ def benchmark( This command runs benchmark evaluation using a YAML configuration file that specifies: - quests: list of quest files or directories to test - - agents: list of agents with their model, template, and temperature settings + - agents: list of harnesses with their model, harness, and temperature settings - other settings: debug, timeout, workers, etc. Example: diff --git a/llm_quest_benchmark/harnesses/__init__.py b/llm_quest_benchmark/harnesses/__init__.py new file mode 100644 index 0000000..75cef22 --- /dev/null +++ b/llm_quest_benchmark/harnesses/__init__.py @@ -0,0 +1,3 @@ +from llm_quest_benchmark.harnesses.base import BaseHarness + +__all__ = ["BaseHarness"] diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py new file mode 100644 index 0000000..fd8864b --- /dev/null +++ b/llm_quest_benchmark/harnesses/base.py @@ -0,0 +1,581 @@ +"""Base harness class for quest benchmark experiments.""" + +import hashlib +import json +import logging +import re +from abc import abstractmethod +from typing import Any + +from json_repair import repair_json + +from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, normalize_template_name +from llm_quest_benchmark.llm.client import get_llm_client, parse_model_name +from llm_quest_benchmark.llm.prompt import PromptRenderer +from llm_quest_benchmark.players.base import QuestPlayer +from llm_quest_benchmark.schemas.response import LLMResponse + +RISKY_CHOICE_KEYWORDS = ( + "улететь", + "сдаться", + "отказ", + "провал", + "убежать", + "surrender", + "give up", +) + +SAFE_CHOICE_KEYWORDS = ( + "пройти мимо", + "избежать", + "подготов", + "библиотек", + "изуч", + "wait", + "avoid", + "study", +) + + +def 
_parse_json_response( + response: str, + debug: bool = False, + logger: logging.Logger | None = None, +) -> tuple[dict[str, Any] | None, str | None]: + """Try to parse response as JSON, with repair attempt if needed.""" + cleaned_response = (response or "").strip() + if not cleaned_response: + return None, None + + try: + if "```json" in cleaned_response: + start = cleaned_response.find("```json") + 7 + end = cleaned_response.find("```", start) + if end > start: + json_str = cleaned_response[start:end].strip() + if debug and logger: + logger.debug("Extracted JSON: %s", json_str) + result = json.loads(json_str) + if debug and logger: + logger.debug("Parsed JSON: %s", result) + return result, "json_fenced" + + embedded_json = re.search(r"\{[\s\S]*\}", cleaned_response) + if embedded_json: + candidate = embedded_json.group(0).strip() + if candidate and candidate != cleaned_response: + try: + result = json.loads(candidate) + if debug and logger: + logger.debug("Parsed embedded JSON: %s", result) + return result, "json_embedded" + except json.JSONDecodeError: + pass + + result = json.loads(cleaned_response) + if debug and logger: + logger.debug("Direct JSON parse successful: %s", result) + return result, "json_direct" + except json.JSONDecodeError: + if debug and logger: + logger.debug("Initial JSON parse failed, attempting repair") + try: + repaired = repair_json(cleaned_response) + if debug and logger: + logger.debug("Repaired JSON: %s", repaired) + result = json.loads(repaired) + if debug and logger: + logger.debug("Parse of repaired JSON successful: %s", result) + return result, "json_repaired" + except Exception as exc: + if debug and logger: + logger.error("JSON repair failed: %s", exc) + return None, None + + +def _validate_action_number( + action: int, + num_choices: int, + debug: bool = False, + logger: logging.Logger | None = None, +) -> bool: + """Validate that action number is within valid range.""" + if 1 <= action <= num_choices: + return True + if debug 
and logger: + logger.error("Action number %s out of range [1, %s]", action, num_choices) + return False + + +def _extract_action_from_text(response: str, num_choices: int) -> int | None: + """Extract a candidate action from free-form text.""" + for match in re.finditer(r"\b(\d+)\b", response): + action = int(match.group(1)) + if 1 <= action <= num_choices: + return action + return None + + +def _extract_field_from_text(response: str, field: str) -> str | None: + """Best-effort extraction of analysis/reasoning from loosely formatted output.""" + if not response: + return None + + json_pattern = re.compile( + rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>.*?)['"]""", + re.IGNORECASE | re.DOTALL, + ) + match = json_pattern.search(response) + if match: + value = " ".join(match.group("value").strip().split()) + if value: + return value + + partial_json_pattern = re.compile( + rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>[^"\n\r]+)""", + re.IGNORECASE, + ) + match = partial_json_pattern.search(response) + if match: + value = " ".join(match.group("value").strip().split()) + if value: + return value + + label_pattern = re.compile( + rf"""(?im)^\s*{re.escape(field)}\s*[:\-]\s*(?P<value>.+?)\s*$""", + ) + match = label_pattern.search(response) + if match: + value = " ".join(match.group("value").strip().split()) + if value: + return value + + return None + + +def _raw_reasoning_fallback(response: str) -> str | None: + compact = " ".join((response or "").strip().split()) + if not compact: + return None + if len(compact) > 240: + compact = compact[:237] + "..." 
+ return f"raw_response: {compact}" + + +def _is_numeric_raw_reasoning(reasoning: str | None) -> bool: + if not reasoning or not reasoning.startswith("raw_response:"): + return False + payload = reasoning.split(":", 1)[1].strip() + return payload.isdigit() + + +def parse_llm_response( + response: str, + num_choices: int, + debug: bool = False, + logger: logging.Logger | None = None, +) -> LLMResponse: + """Parse an LLM response and return a structured response object.""" + if debug and logger: + logger.debug("Raw LLM response: %s", response) + + extracted_analysis = _extract_field_from_text(response, "analysis") + extracted_reasoning = _extract_field_from_text(response, "reasoning") + raw_reasoning = _raw_reasoning_fallback(response) + + response_json, json_parse_mode = _parse_json_response(response, debug, logger) + if response_json and isinstance(response_json, dict): + analysis = response_json.get("analysis") or extracted_analysis + reasoning = response_json.get("reasoning") or response_json.get("thinking") or extracted_reasoning + if not reasoning and analysis: + reasoning = analysis + if not analysis and not reasoning: + reasoning = raw_reasoning + + memo_raw = response_json.get("memo") + memo = str(memo_raw) if memo_raw is not None else None + action_value = response_json.get("action") or response_json.get("result") or response_json.get("choice") + if action_value is not None: + try: + action = int(action_value) + if _validate_action_number(action, num_choices, debug, logger): + return LLMResponse( + action=action, + reasoning=reasoning, + analysis=analysis, + memo=memo, + is_default=False, + parse_mode=json_parse_mode or "json", + ) + except (ValueError, TypeError): + if debug and logger: + logger.error("Invalid action value in JSON: %s", action_value) + + try: + action = int(response.strip()) + if _validate_action_number(action, num_choices, debug, logger): + return LLMResponse( + action=action, + reasoning=extracted_reasoning or extracted_analysis or 
raw_reasoning, + analysis=extracted_analysis, + is_default=False, + parse_mode="number_only", + ) + except ValueError: + if debug and logger: + logger.error("Could not parse response as number: %s", response) + + extracted_action = _extract_action_from_text(response, num_choices) + if extracted_action is not None: + return LLMResponse( + action=extracted_action, + reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, + analysis=extracted_analysis, + is_default=False, + parse_mode="number_extracted", + ) + + if debug and logger: + logger.error("Error during response parsing, defaulting to first choice. Response: %s...", response[:100]) + return LLMResponse( + action=1, + reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, + analysis=extracted_analysis, + is_default=True, + parse_mode="default_first", + ) + + +class BaseHarness(QuestPlayer): + """Abstract LLM harness base class.""" + + def __init__( + self, + model_name, + system_template, + temperature, + skip_single, + debug, + memory_module=None, + tools=None, + action_template=DEFAULT_TEMPLATE, + ): + super().__init__(skip_single=skip_single) + self.debug = debug + self.model_name = model_name.lower() + self.system_template = normalize_template_name(system_template) + self.action_template = normalize_template_name(action_template) + self.temperature = temperature + self.harness_name = getattr(self.__class__, "harness_name", "") + self.agent_id = f"harness_{self.model_name}" + self.memory_module = memory_module + self.tools = tools or [] + self.model_spec = parse_model_name(self.model_name) + self.logger = logging.getLogger(self.__class__.__name__) + if self.debug: + self.logger.setLevel(logging.DEBUG) + self.logger.propagate = False + if not any(getattr(h, "_llm_quest_handler", False) for h in self.logger.handlers): + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter("%(name)s - %(message)s")) + handler._llm_quest_handler = True + 
self.logger.addHandler(handler) + + self.prompt_renderer = PromptRenderer( + None, + system_template=self.system_template, + action_template=self.action_template, + ) + self.llm = None + self.history: list[LLMResponse] = [] + self._use_safety_filter = True + self._last_response = LLMResponse(action=1, is_default=True) + self._observation_history: list[str] = [] + self._decision_history: list[dict[str, Any]] = [] + self._state_action_counts: dict[str, dict[int, int]] = {} + self._step_count = 0 + + def _ensure_llm(self) -> None: + """Lazily create the provider client only when inference is needed.""" + if self.llm is None: + self.llm = get_llm_client( + self.model_name, + system_prompt=self.prompt_renderer.render_system_prompt(), + temperature=self.temperature, + ) + if self.memory_module is not None and hasattr(self.memory_module, "llm_client"): + self.memory_module.llm_client = self.llm + + @abstractmethod + def _get_action_impl(self, observation, choices) -> int: + """Return the selected 1-based action number.""" + pass + + def reset(self) -> None: + """Reset harness state between episodes.""" + super().reset() + self.history = [] + self._last_response = LLMResponse(action=1, is_default=True) + self._observation_history = [] + self._decision_history = [] + self._state_action_counts = {} + self._step_count = 0 + if self.memory_module is not None: + self.memory_module.reset() + + def get_action(self, observation: str, choices: list[dict[str, str]]) -> int: + clean = (observation or "").strip() + if clean: + self._observation_history.append(clean) + if len(self._observation_history) > 20: + self._observation_history = self._observation_history[-20:] + return super().get_action(observation, choices) + + def on_game_start(self) -> None: + super().on_game_start() + self.reset() + + def on_game_end(self, final_state: dict[str, Any]) -> None: + if self.debug: + self.logger.debug("Game ended with state: %s", final_state) + + def get_last_response(self) -> LLMResponse | 
None: + return self._last_response + + def _build_contextual_state(self, state: str) -> str: + if self.memory_module is None: + return state + context = self.memory_module.get_context(self._step_count + 1) + if not context: + return state + return f"{context}\n\nCurrent story state:\n{state}" + + @staticmethod + def _normalize_for_signature(value: str, max_len: int = 320) -> str: + text = (value or "").lower() + text = re.sub(r"\s+", " ", text).strip() + return text[:max_len] if len(text) > max_len else text + + def _state_signature(self, state: str, choices: list[dict[str, str]]) -> str: + normalized_state = self._normalize_for_signature(state, max_len=420) + normalized_choices = "|".join( + self._normalize_for_signature(choice.get("text", ""), max_len=110) for choice in choices + ) + raw_signature = f"{normalized_state}||{normalized_choices}" + return hashlib.sha1(raw_signature.encode("utf-8", errors="ignore")).hexdigest()[:20] + + def _remember_decision( + self, + state: str, + choices: list[dict[str, str]], + state_signature: str, + response: LLMResponse, + ) -> None: + action = int(response.action) + counts = self._state_action_counts.setdefault(state_signature, {}) + counts[action] = counts.get(action, 0) + 1 + + selected_text = "" + if 1 <= action <= len(choices): + selected_text = choices[action - 1].get("text", "") + state_snippet = (state or "").strip() + if len(state_snippet) > 220: + state_snippet = state_snippet[:220] + "..." 
+ + decision = { + "state": state_snippet, + "action": action, + "choice": selected_text, + "choice_text": selected_text, + "parse_mode": response.parse_mode or "unknown", + "memo": (response.memo or "").strip()[:350] or None, + "reasoning": (response.reasoning or "")[:800], + } + self._decision_history.append(decision) + if len(self._decision_history) > 40: + self._decision_history = self._decision_history[-40:] + + self._step_count += 1 + if self.memory_module is not None: + self.memory_module.update( + { + "step": self._step_count, + "observation": state, + "choices": [c.get("text", "") for c in choices], + **decision, + } + ) + + def _format_prompt(self, observation, choices, memo=None, context=None) -> str: + """Render the action Jinja template for the current decision.""" + return self.prompt_renderer.action_template.render( + observation=observation, + choices=[{"text": c.get("text", "")} for c in choices], + memo=memo, + context=context, + ).strip() + + def _parse_llm_response(self, response, num_choices) -> LLMResponse: + """Parse an LLM response into a structured response object.""" + return parse_llm_response(response, num_choices, self.debug, self.logger) + + def _call_llm(self, prompt, system_prompt=None) -> str: + """Call the LLM client with lightweight retry handling.""" + self._ensure_llm() + last_error: Exception | None = None + for attempt in range(3): + try: + if system_prompt is not None: + return self.llm.get_completion(prompt, system_prompt=system_prompt) + return self.llm.get_completion(prompt) + except TypeError: + if system_prompt is not None: + return self.llm.get_completion(prompt) + raise + except Exception as exc: + last_error = exc + if self.debug: + self.logger.warning("LLM call failed on attempt %d: %s", attempt + 1, exc) + raise last_error or RuntimeError("LLM call failed") + + def _choice_risk_score(self, choice_text: str) -> int: + text = (choice_text or "").lower() + score = 0 + for keyword in RISKY_CHOICE_KEYWORDS: + if keyword 
in text: + score += 2 + for keyword in SAFE_CHOICE_KEYWORDS: + if keyword in text: + score -= 1 + return score + + def _apply_safety_filter(self, choices, preferred_action) -> int: + """Replace obviously risky actions when a clearly safer alternative exists.""" + if not self._use_safety_filter or len(choices) < 2: + return preferred_action + + current_idx = preferred_action - 1 + if current_idx < 0 or current_idx >= len(choices): + return preferred_action + + scored = [(idx + 1, self._choice_risk_score(c.get("text", ""))) for idx, c in enumerate(choices)] + scored.sort(key=lambda item: item[1]) + + best_action, best_score = scored[0] + current_score = self._choice_risk_score(choices[current_idx].get("text", "")) + if current_score - best_score >= 2: + if self.debug: + self.logger.debug( + "Safety filter override: %s -> %s (risk %s -> %s)", + preferred_action, + best_action, + current_score, + best_score, + ) + return best_action + return preferred_action + + @staticmethod + def _normalize_usage(usage: dict[str, Any] | None) -> dict[str, Any]: + usage = usage or {} + prompt_tokens = int(usage.get("prompt_tokens") or 0) + completion_tokens = int(usage.get("completion_tokens") or 0) + total_tokens = int(usage.get("total_tokens") or (prompt_tokens + completion_tokens)) + estimated_cost_usd = usage.get("estimated_cost_usd") + if estimated_cost_usd is not None: + estimated_cost_usd = float(estimated_cost_usd) + return { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "estimated_cost_usd": estimated_cost_usd, + } + + @classmethod + def _merge_usage(cls, first: dict[str, Any] | None, second: dict[str, Any] | None) -> dict[str, Any]: + a = cls._normalize_usage(first) + b = cls._normalize_usage(second) + merged_cost = None + if a["estimated_cost_usd"] is not None or b["estimated_cost_usd"] is not None: + merged_cost = (a["estimated_cost_usd"] or 0.0) + (b["estimated_cost_usd"] or 0.0) + return { + "prompt_tokens": 
def _parse_with_retries(self, prompt: str, observation: str, choices: list[dict[str, str]]) -> LLMResponse:
    """Call the model, parse the reply, and retry while parsing yields a default.

    Up to three model calls can be made: the original prompt, a shortened
    retry prompt if the first parse is a default, and a numeric-only prompt
    if the retry is also a default and ``_needs_force_numeric_retry()`` is
    true. Token usage from every call is merged into the returned response.

    Args:
        prompt: Fully rendered action prompt for the first call.
        observation: Current state text, used to build the retry prompt.
        choices: Available actions; their count bounds the parsed action.

    Returns:
        The parsed response (first, retry, or forced-numeric retry) after the
        safety filter has been applied and usage fields have been filled in.
    """
    llm_response = self._call_llm(prompt)
    llm_usage = self.llm.get_last_usage()
    first_response = self._parse_llm_response(llm_response, len(choices))
    parsed_response = first_response

    if parsed_response.is_default:
        # First fallback: a shorter prompt built from the bare observation.
        retry_response = self._call_llm(self._format_retry_prompt(observation, choices))
        retry_usage = self.llm.get_last_usage()
        llm_usage = self._merge_usage(llm_usage, retry_usage)
        retry_parsed = self._parse_llm_response(retry_response, len(choices))
        if not retry_parsed.is_default:
            # Prefix the parse mode so downstream records show a retry was needed.
            retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
            parsed_response = retry_parsed
        elif self._needs_force_numeric_retry():
            # Second fallback: ask for a single integer only.
            force_retry_response = self._call_llm(self._format_force_numeric_retry_prompt(choices))
            force_retry_usage = self.llm.get_last_usage()
            llm_usage = self._merge_usage(llm_usage, force_retry_usage)
            force_retry_parsed = self._parse_llm_response(force_retry_response, len(choices))
            if not force_retry_parsed.is_default:
                force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}"
                parsed_response = force_retry_parsed

    if parsed_response is not first_response:
        # A retry won: carry over analysis/reasoning from the first attempt
        # where the retry did not supply its own.
        if parsed_response.analysis is None and first_response.analysis is not None:
            parsed_response.analysis = first_response.analysis
        # NOTE(review): _is_numeric_raw_reasoning / _raw_reasoning_fallback are
        # defined outside this view; presumably they detect "reasoning" that is
        # just the action number and recover text from the raw reply. Confirm.
        if _is_numeric_raw_reasoning(parsed_response.reasoning):
            if first_response.reasoning and not _is_numeric_raw_reasoning(first_response.reasoning):
                parsed_response.reasoning = first_response.reasoning
            else:
                first_raw_reasoning = _raw_reasoning_fallback(llm_response)
                if first_raw_reasoning and not _is_numeric_raw_reasoning(first_raw_reasoning):
                    parsed_response.reasoning = first_raw_reasoning

    action_before_policy = parsed_response.action
    parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
    # Record why the action changed when the model gave no reasoning of its own.
    if parsed_response.action != action_before_policy and not parsed_response.reasoning:
        parsed_response.reasoning = "policy_safety_override"

    # Usage reflects the sum of every call made above, not just the winning one.
    usage_payload = self._normalize_usage(llm_usage)
    parsed_response.prompt_tokens = usage_payload["prompt_tokens"]
    parsed_response.completion_tokens = usage_payload["completion_tokens"]
    parsed_response.total_tokens = usage_payload["total_tokens"]
    parsed_response.estimated_cost_usd = usage_payload["estimated_cost_usd"]
    return parsed_response
llm_quest_benchmark.constants import DEFAULT_MODEL +from llm_quest_benchmark.harnesses.memo import ( + CompactionNoMemoHarness, + HintedCompactHarness, + MemoCompactHarness, + MemoCotHarness, + MemoExtendedHarness, + MemoStructuredHarness, +) +from llm_quest_benchmark.harnesses.minimal import MinimalHarness +from llm_quest_benchmark.harnesses.planner import PlannerHarness +from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness +from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness +from llm_quest_benchmark.players.base import QuestPlayer +from llm_quest_benchmark.players.human import HumanPlayer +from llm_quest_benchmark.players.random import RandomPlayer + +HARNESS_REGISTRY = { + "minimal": MinimalHarness, + "reasoning_recent": ReasoningRecentHarness, + "reasoning_full": ReasoningFullTranscriptHarness, + "memo_compact": MemoCompactHarness, + "hinted_compact": HintedCompactHarness, + "tool_compact": ToolCompactHarness, + "tool_hinted": ToolHintedHarness, + "planner": PlannerHarness, + "compaction_no_memo": CompactionNoMemoHarness, + "memo_cot": MemoCotHarness, + "memo_extended": MemoExtendedHarness, + "memo_structured": MemoStructuredHarness, +} + +SPECIAL_HARNESSES = ("human", "random_choice", "random_choice_") + + +def _parse_random_choice_seed(identifier: str) -> tuple[bool, int | None]: + if identifier == "random_choice": + return True, None + prefix = "random_choice_" + if identifier.startswith(prefix) and identifier[len(prefix) :].isdigit(): + return True, int(identifier[len(prefix) :]) + return False, None + + +def is_random_choice_harness(identifier: str) -> bool: + is_random, _ = _parse_random_choice_seed(identifier) + return is_random + + +def create_harness( + harness: str, + model: str = DEFAULT_MODEL, + temperature: float = 0.4, + skip_single: bool = False, + debug: bool = False, + compaction_interval: int = 50, + system_template: str = "system_role.jinja", +) -> 
QuestPlayer: + valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] + is_random_harness, seed = _parse_random_choice_seed(harness) + is_random_model, _ = _parse_random_choice_seed(model) + if is_random_harness: + if is_random_model and model != "random_choice": + raise ValueError("Encode random seeds in harness, for example harness='random_choice_123'") + if model not in (DEFAULT_MODEL, "random_choice"): + raise ValueError("Use model='random_choice' with random_choice harnesses") + return RandomPlayer(seed=seed, debug=debug, skip_single=skip_single) + if harness.startswith("random_choice"): + raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}") + if harness == "human": + return HumanPlayer(skip_single=skip_single) + if harness not in HARNESS_REGISTRY: + raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}") + if is_random_model: + raise ValueError( + "Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness" + ) + if model.startswith("random_choice"): + raise ValueError(f"Unknown random_choice model '{model}'. 
class MemoCompactHarness(MinimalHarness):
    """Minimal harness variant backed by compaction memory.

    Defaults to the ``stateful_compact.jinja`` action template and builds a
    ``CompactionMemory`` unless the caller passes its own ``memory_module``.
    """

    harness_name = "memo_compact"

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        system_template: str = SYSTEM_ROLE_TEMPLATE,
        action_template: str = "stateful_compact.jinja",
        temperature: float = DEFAULT_TEMPERATURE,
        skip_single: bool = False,
        debug: bool = False,
        compaction_interval: int = 50,
        memory_module=None,
        **kwargs,
    ):
        # Only build the default backend when the caller supplied none.
        if memory_module is None:
            memory_module = CompactionMemory(compaction_interval=compaction_interval)

        super().__init__(
            model_name=model_name,
            system_template=system_template,
            action_template=action_template,
            temperature=temperature,
            skip_single=skip_single,
            debug=debug,
            memory_module=memory_module,
            **kwargs,
        )

        # Recorded on the instance after base initialisation.
        self._compaction_interval = compaction_interval
        self._memory_mode = "compaction"
False, + debug: bool = False, + compaction_interval: int = 50, + memory_module=None, + **kwargs, + ): + super().__init__( + model_name=model_name, + system_template=system_template, + action_template=action_template, + temperature=temperature, + skip_single=skip_single, + debug=debug, + compaction_interval=compaction_interval, + memory_module=memory_module, + **kwargs, + ) + + +class CompactionNoMemoHarness(MemoCompactHarness): + """Retired Exp 4 ablation: compacted transcript without memo-oriented prompting.""" + + harness_name = "compaction_no_memo" + + def __init__(self, *args, action_template: str = "reasoning.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) + + +class MemoExtendedHarness(MemoCompactHarness): + """Retired Exp 4 variant with a larger generic memo field.""" + + harness_name = "memo_extended" + + def __init__(self, *args, action_template: str = "memo_extended.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) + + +class MemoStructuredHarness(MemoCompactHarness): + """Retired Exp 4 variant with structured memo prompting.""" + + harness_name = "memo_structured" + + def __init__(self, *args, action_template: str = "memo_structured.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) + + +class MemoCotHarness(MemoCompactHarness): + """Retired Exp 4 variant with scratchpad-style memo prompting.""" + + harness_name = "memo_cot" + + def __init__(self, *args, action_template: str = "memo_cot.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py new file mode 100644 index 0000000..45ba5e5 --- /dev/null +++ b/llm_quest_benchmark/harnesses/memory.py @@ -0,0 +1,353 @@ +"""Memory modules for harness-based quest players.""" + +import logging +from abc import ABC, abstractmethod +from typing import Any + +logger = 
logging.getLogger(__name__) + + +class MemoryModule(ABC): + @abstractmethod + def get_context(self, step: int) -> str: + pass + + @abstractmethod + def update(self, step_data: dict) -> None: + pass + + @abstractmethod + def reset(self) -> None: + pass + + @property + def quest_briefing(self) -> str | None: + return None + + @property + def transcript(self) -> list[dict[str, Any]]: + return [] + + @transcript.setter + def transcript(self, value: list[dict[str, Any]]) -> None: + raise TypeError(f"{self.__class__.__name__} does not support transcript assignment") + + @property + def steps_since_compaction(self) -> int: + return 0 + + @steps_since_compaction.setter + def steps_since_compaction(self, value: int) -> None: + raise TypeError(f"{self.__class__.__name__} does not support compaction counters") + + def set_quest_briefing(self, briefing: str) -> None: + clean = (briefing or "").strip() + if hasattr(self, "_quest_briefing"): + self._quest_briefing = clean or None + + def _briefing_block(self, current_state: str) -> str | None: + briefing = self.quest_briefing + if not briefing: + return None + if current_state.strip() == briefing: + return None + if len(briefing) > 800: + briefing = briefing[:800] + "..." 
+ return f"Quest briefing (your mission):\n{briefing}" + + +class DefaultMemory(MemoryModule): + """Recent N observations window without compaction.""" + + def __init__(self, context_window: int = 3, context_chars: int = 220, decision_window: int = 5): + self.context_window = context_window + self.context_chars = context_chars + self.decision_window = decision_window + self._quest_briefing: str | None = None + self._observations: list[str] = [] + self._decisions: list[dict[str, Any]] = [] + + @property + def quest_briefing(self) -> str | None: + return self._quest_briefing + + def get_context(self, step: int) -> str: + blocks: list[str] = [] + current = self._observations[-1] if self._observations else "" + + briefing = self._briefing_block(current) + if briefing: + blocks.append(briefing) + + if len(self._observations) > 1: + previous = self._observations[:-1][-self.context_window :] + if previous: + snippets = [] + for idx, text in enumerate(previous, start=1): + clipped = text if len(text) <= self.context_chars else text[: self.context_chars] + "..." 
def update(self, step_data: dict) -> None:
    """Record one step's observation and, if present, its decision.

    The observation is read from ``"observation"`` or ``"state"`` and
    stripped; the first non-empty one becomes the quest briefing. At most
    the 20 newest observations and 40 newest decisions are kept. A decision
    is recorded whenever *step_data* has any of the keys ``action``,
    ``choice``, ``choice_text`` or ``memo``; memos are stripped and cut to
    350 characters.
    """
    text = step_data.get("observation") or step_data.get("state") or ""
    text = text.strip()
    if text:
        if self._quest_briefing is None:
            self._quest_briefing = text
        self._observations.append(text)
        if len(self._observations) > 20:
            self._observations = self._observations[-20:]

    decision_keys = ("action", "choice", "choice_text", "memo")
    if not any(key in step_data for key in decision_keys):
        return

    raw_memo = step_data.get("memo") or ""
    trimmed_memo = raw_memo.strip()[:350]
    record = {
        "action": step_data.get("action"),
        "choice": step_data.get("choice") or step_data.get("choice_text", ""),
        "parse_mode": step_data.get("parse_mode", "unknown"),
        "memo": trimmed_memo if trimmed_memo else None,
    }
    self._decisions.append(record)
    if len(self._decisions) > 40:
        self._decisions = self._decisions[-40:]
def update(self, step_data: dict) -> None:
    """Append one step to the full transcript.

    The observation is read from ``"observation"`` or ``"state"`` and
    stripped; the first non-empty one becomes the quest briefing. The stored
    entry is a shallow copy of *step_data*, so the caller's dict is not
    modified.

    Args:
        step_data: Step payload. An explicit ``"step"`` value, including
            ``0``, is kept; only a missing or ``None`` step is replaced by
            the entry's 1-based position in the transcript.
    """
    observation = (step_data.get("observation") or step_data.get("state") or "").strip()
    if observation and self._quest_briefing is None:
        self._quest_briefing = observation

    entry = dict(step_data)
    entry["observation"] = observation
    # `is None`, not truthiness: a 0-based caller's first step must stay 0
    # instead of being renumbered to 1 and colliding with the real step 1.
    if entry.get("step") is None:
        entry["step"] = len(self._transcript) + 1
    self._transcript.append(entry)
compaction_interval: int = 50, llm_client=None): + self.compaction_interval = compaction_interval + self.llm_client = llm_client + self._quest_briefing: str | None = None + self._transcript: list[dict[str, Any]] = [] + self._compaction_summary: str | None = None + self._steps_since_compaction = 0 + + @property + def quest_briefing(self) -> str | None: + return self._quest_briefing + + @property + def transcript(self) -> list[dict[str, Any]]: + return self._transcript + + @transcript.setter + def transcript(self, value: list[dict[str, Any]]) -> None: + self._transcript = value + + @property + def steps_since_compaction(self) -> int: + return self._steps_since_compaction + + @steps_since_compaction.setter + def steps_since_compaction(self, value: int) -> None: + self._steps_since_compaction = value + + def get_context(self, step: int) -> str: + blocks: list[str] = [] + current_state = self._transcript[-1].get("observation", "") if self._transcript else "" + briefing = self._briefing_block(current_state) + if briefing: + blocks.append(briefing) + + if self._compaction_summary: + compacted_at = max(0, step - self._steps_since_compaction) + blocks.append(f"=== QUEST MEMORY (compacted at step {compacted_at}) ===\n{self._compaction_summary}") + + recent = self._transcript[-self._steps_since_compaction :] if self._steps_since_compaction > 0 else [] + if recent: + lines = [] + for entry in recent: + step_value = entry.get("step", "?") + obs = entry.get("observation", "") + if len(obs) > 400: + obs = obs[:400] + "..." 
def update(self, step_data: dict) -> None:
    """Append one step to the transcript and run the compaction check.

    The observation is read from ``"observation"`` or ``"state"`` and
    stripped; the first non-empty one becomes the quest briefing
    (untruncated). The stored entry is a shallow copy of *step_data* whose
    observation is cut to 400 characters and whose memo, when present, is
    passed through ``_twenty_word_memo``.

    Args:
        step_data: Step payload. An explicit ``"step"`` value, including
            ``0``, is kept; only a missing or ``None`` step is replaced by
            the entry's 1-based position in the current transcript.
    """
    observation = (step_data.get("observation") or step_data.get("state") or "").strip()
    if observation and self._quest_briefing is None:
        self._quest_briefing = observation

    entry = dict(step_data)
    entry["observation"] = observation[:400]
    # `is None`, not truthiness: a 0-based caller's first step must stay 0
    # instead of being renumbered to 1 and colliding with the real step 1.
    if entry.get("step") is None:
        entry["step"] = len(self._transcript) + 1
    if entry.get("memo"):
        entry["memo"] = self._twenty_word_memo(str(entry["memo"]))

    self._transcript.append(entry)
    self._steps_since_compaction += 1
    self._maybe_compact()
Include:\n" + "- Current objective (what the player should do next)\n" + "- Progress so far (what has been accomplished)\n" + "- Key facts (NPCs, items, locations, deadlines discovered)\n" + "- Failed approaches (actions/paths that didn't work)\n" + "- Map knowledge (locations visited and connections)\n\n" + "Write a concise summary in plain text, max 300 words." + ) + + try: + summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip() + except Exception as exc: + logger.debug("Skipping compaction because summarization failed: %s", exc) + self._steps_since_compaction = 0 + return + if summary: + self._compaction_summary = summary + self._transcript = [] + self._steps_since_compaction = 0 + + def _format_transcript_for_compaction(self) -> str: + recent = ( + self._transcript[-self._steps_since_compaction :] + if self._steps_since_compaction > 0 + else self._transcript[-self.compaction_interval :] + ) + lines = [] + for entry in recent: + step = entry.get("step", "?") + obs = entry.get("observation", "") + if len(obs) > 400: + obs = obs[:400] + "..." 
class MinimalHarness(BaseHarness):
    """Single-call harness: build the prompt, query the model, parse the action.

    Uses ``DefaultMemory`` for context unless another memory module is given.
    """

    harness_name = "minimal"

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        system_template: str = SYSTEM_ROLE_TEMPLATE,
        action_template: str = "stub.jinja",
        temperature: float = DEFAULT_TEMPERATURE,
        skip_single: bool = False,
        debug: bool = False,
        memory_module=None,
        **_,
    ):
        # Extra keyword arguments are accepted and ignored.
        memory = memory_module if memory_module else DefaultMemory()
        super().__init__(
            model_name=model_name,
            system_template=system_template,
            action_template=action_template,
            temperature=temperature,
            skip_single=skip_single,
            debug=debug,
            memory_module=memory,
        )

    def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> int:
        """Return the 1-based action for *observation*; fall back to 1 on any error."""
        try:
            signature = self._state_signature(observation, choices)
            contextual_state = self._build_contextual_state(observation)
            prompt = self._format_prompt(contextual_state, choices)
            response = self._parse_with_retries(prompt, observation, choices)
            # Out-of-range actions are replaced by the first choice.
            if not 1 <= response.action <= len(choices):
                response.action = 1
            self.history.append(response)
            self._last_response = response
            self._remember_decision(observation, choices, signature, response)
            return response.action
        except Exception as exc:
            self.logger.error("Harness error during LLM call: %s", exc)
            fallback = LLMResponse(
                action=1,
                is_default=True,
                parse_mode="error_default",
                reasoning=f"llm_call_error: {exc}",
            )
            self.history.append(fallback)
            self._last_response = fallback
            return 1
skip_single: bool = False, + debug: bool = False, + compaction_interval: int = 50, + memory_module=None, + **_, ): - super().__init__(*args, action_template=action_template, **kwargs) + super().__init__( + model_name=model_name, + system_template=system_template, + action_template=action_template, + temperature=temperature, + skip_single=skip_single, + debug=debug, + memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval), + ) self.agent_id = f"planner_{self.model_name}" self.current_plan: str | None = None self._plan_history: list[str] = [] + self._memory_mode = "compaction" + self._compaction_interval = compaction_interval def _recent_actions(self) -> list[str]: entries = [] for item in self._decision_history[-3:]: choice = (item.get("choice") or "").strip() - if not choice: - continue - entries.append(f"{item.get('action')}. {choice}") + if choice: + entries.append(f"{item.get('action')}. {choice}") return entries @staticmethod @@ -35,7 +55,6 @@ def _normalize_plan(raw_plan: str) -> str: compact = " ".join((raw_plan or "").strip().split()) if not compact: return "" - sentences = re.split(r"(?<=[.!?])\s+", compact) sentences = [sentence.strip() for sentence in sentences if sentence.strip()] if len(sentences) >= 5: @@ -60,14 +79,8 @@ def _build_planner_prompt( ).strip() def _observation_changed_significantly(self, observation: str) -> bool: - """Check if the observation differs enough from the previous one to warrant re-planning. - - Uses token-level overlap ratio: if less than 50% of tokens are shared, - the scene has changed significantly. 
- """ if len(self._observation_history) < 2: return False - prev_tokens = set(self._observation_history[-2].lower().split()) curr_tokens = set((observation or "").lower().split()) if not prev_tokens or not curr_tokens: @@ -78,13 +91,10 @@ def _observation_changed_significantly(self, observation: str) -> bool: def _should_replan(self, observation: str, state_signature: str) -> tuple[bool, str | None]: if not self.current_plan: return True, "No plan exists yet." - if any(self._state_action_counts.get(state_signature, {}).values()): return True, "This state has repeated, so a previous action already failed to progress." - if self._observation_changed_significantly(observation): return True, "The scene changed significantly from the previous observation." - return False, None def _update_plan( @@ -94,23 +104,14 @@ def _update_plan( replan_reason: str | None, ) -> dict[str, Any]: self._ensure_llm() - prompt = self._build_planner_prompt( - observation, - choices, - prompt_kind="plan", - replan_reason=replan_reason, - ) - plan_response = self.llm.get_completion(prompt) + prompt = self._build_planner_prompt(observation, choices, prompt_kind="plan", replan_reason=replan_reason) + plan_response = self._call_llm(prompt) usage = self.llm.get_last_usage() plan = self._normalize_plan(plan_response) if not plan: - if self.current_plan: - plan = self.current_plan - else: - plan = ( - "Gather clues, protect resources, and avoid obvious traps while " - "advancing toward the main objective." - ) + plan = self.current_plan or ( + "Gather clues, protect resources, and avoid obvious traps while advancing toward the main objective." 
+ ) self.current_plan = plan self._plan_history.append(plan) if len(self._plan_history) > 10: @@ -123,48 +124,18 @@ def _choose_action_with_plan( choices: list[dict[str, str]], replan_reason: str | None, ) -> tuple[LLMResponse, dict[str, Any]]: - prompt = self._build_planner_prompt( - observation, - choices, - prompt_kind="act", - replan_reason=replan_reason, - ) - llm_response = self.llm.get_completion(prompt) - llm_usage = self.llm.get_last_usage() - parsed_response = parse_llm_response(llm_response, len(choices), self.debug, self.logger) - - if parsed_response.is_default: - retry_response = self.llm.get_completion(self._format_retry_prompt(observation, choices)) - retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, retry_usage) - retry_parsed = parse_llm_response( - retry_response, - len(choices), - self.debug, - self.logger, - ) - if not retry_parsed.is_default: - retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}" - parsed_response = retry_parsed - elif self._needs_force_numeric_retry(): - force_retry_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices)) - force_retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, force_retry_usage) - force_retry_parsed = parse_llm_response( - force_retry_response, - len(choices), - self.debug, - self.logger, - ) - if not force_retry_parsed.is_default: - force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}" - parsed_response = force_retry_parsed - - return parsed_response, llm_usage + prompt = self._build_planner_prompt(observation, choices, prompt_kind="act", replan_reason=replan_reason) + parsed_response = self._parse_with_retries(prompt, observation, choices) + return parsed_response, { + "prompt_tokens": parsed_response.prompt_tokens, + "completion_tokens": parsed_response.completion_tokens, + "total_tokens": parsed_response.total_tokens, + "estimated_cost_usd": 
parsed_response.estimated_cost_usd, + } def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: if self.debug: - self.logger.debug("PlannerAgent evaluating state with %s choices", len(choices)) + self.logger.debug("PlannerHarness evaluating state with %s choices", len(choices)) try: state_signature = self._state_signature(state, choices) contextual_state = self._build_contextual_state(state) @@ -178,37 +149,29 @@ def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: choices, replan_reason if should_replan else None, ) + action_before_policy = parsed_response.action - parsed_response.action = self._apply_safety_filter(parsed_response.action, choices) + parsed_response.action = self._apply_safety_filter(choices, parsed_response.action) if parsed_response.action != action_before_policy and not parsed_response.reasoning: parsed_response.reasoning = "policy_safety_override" total_usage = ( self._merge_usage(plan_usage, action_usage) if plan_usage else self._normalize_usage(action_usage) ) - if plan_usage: - total_usage = self._normalize_usage(total_usage) - + total_usage = self._normalize_usage(total_usage) parsed_response.prompt_tokens = total_usage["prompt_tokens"] parsed_response.completion_tokens = total_usage["completion_tokens"] parsed_response.total_tokens = total_usage["total_tokens"] parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"] + if parsed_response.action < 1 or parsed_response.action > len(choices): + parsed_response.action = 1 self.history.append(parsed_response) self._last_response = parsed_response self._remember_decision(state, choices, state_signature, parsed_response) - - if parsed_response.action < 1 or parsed_response.action > len(choices): - self.logger.error( - "INVALID ACTION DETECTED: %s not in range 1-%s", - parsed_response.action, - len(choices), - ) - parsed_response.action = 1 - return parsed_response.action except Exception as exc: - self.logger.error("Planner agent error 
during LLM call: %s", exc) + self.logger.error("Planner harness error during LLM call: %s", exc) default_response = LLMResponse( action=1, is_default=True, diff --git a/llm_quest_benchmark/harnesses/reasoning.py b/llm_quest_benchmark/harnesses/reasoning.py new file mode 100644 index 0000000..79564d5 --- /dev/null +++ b/llm_quest_benchmark/harnesses/reasoning.py @@ -0,0 +1,57 @@ +"""Reasoning harness variants.""" + +from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE +from llm_quest_benchmark.harnesses.memory import DefaultMemory, FullTranscriptMemory +from llm_quest_benchmark.harnesses.minimal import MinimalHarness + + +class ReasoningRecentHarness(MinimalHarness): + harness_name = "reasoning_recent" + + def __init__( + self, + model_name: str = DEFAULT_MODEL, + system_template: str = SYSTEM_ROLE_TEMPLATE, + action_template: str = "reasoning.jinja", + temperature: float = DEFAULT_TEMPERATURE, + skip_single: bool = False, + debug: bool = False, + memory_module=None, + **kwargs, + ): + super().__init__( + model_name=model_name, + system_template=system_template, + action_template=action_template, + temperature=temperature, + skip_single=skip_single, + debug=debug, + memory_module=memory_module or DefaultMemory(), + **kwargs, + ) + + +class ReasoningFullTranscriptHarness(MinimalHarness): + harness_name = "reasoning_full" + + def __init__( + self, + model_name: str = DEFAULT_MODEL, + system_template: str = SYSTEM_ROLE_TEMPLATE, + action_template: str = "reasoning.jinja", + temperature: float = DEFAULT_TEMPERATURE, + skip_single: bool = False, + debug: bool = False, + memory_module=None, + **kwargs, + ): + super().__init__( + model_name=model_name, + system_template=system_template, + action_template=action_template, + temperature=temperature, + skip_single=skip_single, + debug=debug, + memory_module=memory_module or FullTranscriptMemory(), + **kwargs, + ) diff --git a/llm_quest_benchmark/harnesses/tool_harness.py 
b/llm_quest_benchmark/harnesses/tool_harness.py new file mode 100644 index 0000000..0acc699 --- /dev/null +++ b/llm_quest_benchmark/harnesses/tool_harness.py @@ -0,0 +1,241 @@ +"""Tool-augmented harness implementations.""" + +from typing import Any + +from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE +from llm_quest_benchmark.harnesses.base import BaseHarness, _parse_json_response +from llm_quest_benchmark.harnesses.memory import CompactionMemory +from llm_quest_benchmark.harnesses.tools import QuestHistoryTool, Scratchpad, calculator +from llm_quest_benchmark.schemas.response import LLMResponse + + +class ToolCompactHarness(BaseHarness): + """Compacted-memory harness with a two-phase tool selection/action loop.""" + + harness_name = "tool_compact" + DEFAULT_HISTORY_WINDOW = 10 + MAX_TOOL_INPUT_CHARS = 500 + + def __init__( + self, + model_name: str = DEFAULT_MODEL, + system_template: str = SYSTEM_ROLE_TEMPLATE, + action_template: str = "tool_augmented.jinja", + temperature: float = DEFAULT_TEMPERATURE, + skip_single: bool = False, + debug: bool = False, + compaction_interval: int = 50, + memory_module=None, + history_window: int | None = None, + **_, + ): + self._step_log: list[dict[str, Any]] = [] + self._history_window = history_window or self.DEFAULT_HISTORY_WINDOW + self._scratchpad_tool = Scratchpad() + self._history_tool = QuestHistoryTool(self._step_log, self._history_window) + super().__init__( + model_name=model_name, + system_template=system_template, + action_template=action_template, + temperature=temperature, + skip_single=skip_single, + debug=debug, + memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval), + tools=[calculator, self._scratchpad_tool, self._history_tool], + ) + self._memory_mode = "compaction" + self._compaction_interval = compaction_interval + + def _recent_steps(self) -> list[str]: + return [ + f"Step {entry['step']}: {entry['observation']} -> 
{entry.get('selected_choice', 'n/a')}" + for entry in self._step_log[-self._history_window :] + ] + + def _tool_descriptions(self) -> list[str]: + return [ + "quest_history(query): search earlier observations and chosen actions in this quest.", + "calculator(expression): evaluate arithmetic and simple comparisons.", + "scratchpad(operation, content): read or replace one persistent note. operation is read or write_replace.", + ] + + def quest_history(self, query: str) -> str: + return self._history_tool.search(query) + + @staticmethod + def calculator(expression: str) -> str: + return calculator(expression) + + def scratchpad(self, operation: str, content: str = "") -> str: + op = (operation or "").strip().lower() + if op == "read": + return self._scratchpad_tool.read() + if op == "write_replace": + return self._scratchpad_tool.write_replace(content) + return "error: operation must be read or write_replace" + + def _build_tool_prompt( + self, + observation: str, + choices: list[dict[str, str]], + prompt_kind: str, + tool_results: list[str] | None = None, + ) -> str: + template = self.prompt_renderer.get_template(self.action_template) + return template.render( + prompt_kind=prompt_kind, + observation=observation, + choices=[{"text": choice.get("text", "")} for choice in choices], + tool_descriptions=self._tool_descriptions(), + tool_results=tool_results or [], + recent_steps=self._recent_steps(), + scratchpad_note=self._scratchpad_tool.read() if self._scratchpad_tool.read() != "(empty)" else "", + ).strip() + + @staticmethod + def _extract_tool_calls(response: str) -> list[dict[str, Any]]: + payload, _ = _parse_json_response(response) + if not isinstance(payload, dict): + return [] + tool_calls = payload.get("tool_calls") + if not isinstance(tool_calls, list): + return [] + + normalized = [] + for item in tool_calls[:1]: + if not isinstance(item, dict): + continue + tool_name = str(item.get("tool") or "").strip() + tool_input = item.get("input") + operation = 
str(item.get("operation") or "").strip() + content = str(item.get("content") or "").strip() + if isinstance(tool_input, dict): + operation = operation or str(tool_input.get("operation") or "").strip() + content = content or str(tool_input.get("content") or "").strip() + tool_input = tool_input.get("expression") or tool_input.get("query") or tool_input.get("content") or "" + tool_input = str(tool_input or "").strip() + if len(tool_input) > ToolCompactHarness.MAX_TOOL_INPUT_CHARS: + tool_input = tool_input[: ToolCompactHarness.MAX_TOOL_INPUT_CHARS] + if len(content) > ToolCompactHarness.MAX_TOOL_INPUT_CHARS: + content = content[: ToolCompactHarness.MAX_TOOL_INPUT_CHARS] + if tool_name: + normalized.append({"tool": tool_name, "input": tool_input, "operation": operation, "content": content}) + return normalized + + def _execute_tool_calls(self, tool_calls: list[dict[str, Any]]) -> list[str]: + results = [] + for tc in tool_calls: + name, inp = tc["tool"], tc.get("input", "") + if name == "quest_history": + result = self.quest_history(inp) + elif name == "calculator": + result = self.calculator(inp) + elif name == "scratchpad": + operation = tc.get("operation") or inp + result = self.scratchpad(str(operation), str(tc.get("content") or "")) + else: + result = f"unknown tool: {name}" + call_repr = inp + if name == "scratchpad": + call_repr = f"{tc.get('operation') or inp}, {tc.get('content') or ''}".strip(", ") + results.append(f"{name}({call_repr}) => {result}") + return results + + def _final_choice( + self, + observation: str, + choices: list[dict[str, str]], + tool_results: list[str] | None = None, + ) -> tuple[LLMResponse, dict[str, Any]]: + prompt = self._build_tool_prompt(observation, choices, prompt_kind="final", tool_results=tool_results) + parsed_response = self._parse_with_retries(prompt, observation, choices) + return parsed_response, { + "prompt_tokens": parsed_response.prompt_tokens, + "completion_tokens": parsed_response.completion_tokens, + "total_tokens": 
parsed_response.total_tokens, + "estimated_cost_usd": parsed_response.estimated_cost_usd, + } + + def _log_step(self, observation: str, choices: list[dict[str, str]], response: LLMResponse) -> None: + selected = "" + if 1 <= response.action <= len(choices): + selected = choices[response.action - 1].get("text", "") + clipped = " ".join((observation or "").strip().split()) + if len(clipped) > 180: + clipped = clipped[:180] + "..." + self._step_log.append( + { + "step": len(self._step_log) + 1, + "observation": clipped, + "choices": [c.get("text", "") for c in choices], + "selected_choice": selected, + } + ) + + def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: + try: + state_signature = self._state_signature(state, choices) + contextual_state = self._build_contextual_state(state) + self._ensure_llm() + + selection_prompt = self._build_tool_prompt(contextual_state, choices, prompt_kind="select") + selection_response = self._call_llm(selection_prompt) + selection_usage = self.llm.get_last_usage() + tool_calls = self._extract_tool_calls(selection_response) + parsed_response = self._parse_llm_response(selection_response, len(choices)) + tool_results: list[str] = [] + final_choice_used = False + + total_usage = self._normalize_usage(selection_usage) + if tool_calls: + tool_results = self._execute_tool_calls(tool_calls) + parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results) + total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage)) + final_choice_used = True + elif parsed_response.is_default: + parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[]) + total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage)) + final_choice_used = True + + if not final_choice_used: + action_before_policy = parsed_response.action + parsed_response.action = self._apply_safety_filter(choices, parsed_response.action) + if 
parsed_response.action != action_before_policy and not parsed_response.reasoning: + parsed_response.reasoning = "policy_safety_override" + + parsed_response.prompt_tokens = total_usage["prompt_tokens"] + parsed_response.completion_tokens = total_usage["completion_tokens"] + parsed_response.total_tokens = total_usage["total_tokens"] + parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"] + parsed_response.tool_calls = tool_calls or None + parsed_response.tool_results = tool_results or None + + self.history.append(parsed_response) + self._last_response = parsed_response + self._remember_decision(state, choices, state_signature, parsed_response) + self._log_step(state, choices, parsed_response) + return parsed_response.action + except Exception as exc: + self.logger.error("Tool harness error during LLM call: %s", exc) + default_response = LLMResponse( + action=1, + is_default=True, + parse_mode="error_default", + reasoning=f"tool_harness_error: {exc}", + ) + self.history.append(default_response) + self._last_response = default_response + return 1 + + def reset(self) -> None: + super().reset() + self._step_log = [] + self._scratchpad_tool.reset() + self._history_tool.step_log = self._step_log + + +class ToolHintedHarness(ToolCompactHarness): + harness_name = "tool_hinted" + + def __init__(self, *args, action_template: str = "tool_augmented_hints.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) diff --git a/llm_quest_benchmark/harnesses/tools.py b/llm_quest_benchmark/harnesses/tools.py new file mode 100644 index 0000000..9978c58 --- /dev/null +++ b/llm_quest_benchmark/harnesses/tools.py @@ -0,0 +1,173 @@ +"""Reusable tools for harness-based quest players.""" + +import ast +import re + +MAX_SCRATCHPAD_CHARS = 1200 + + +def calculator(expression: str) -> str: + """Evaluate a restricted arithmetic/comparison expression.""" + expr = (expression or "").strip() + if not expr: + return "error: empty expression" + if 
len(expr) > 240: + return "error: expression too long" + if not re.fullmatch(r"[0-9a-zA-Z\s+\-*/().,<>=!%]+", expr): + return "error: unsupported characters" + + allowed_nodes = ( + ast.Expression, + ast.Constant, + ast.UnaryOp, + ast.UAdd, + ast.USub, + ast.BinOp, + ast.Add, + ast.Sub, + ast.Mult, + ast.Div, + ast.FloorDiv, + ast.Mod, + ast.Pow, + ast.Compare, + ast.Eq, + ast.NotEq, + ast.Lt, + ast.LtE, + ast.Gt, + ast.GtE, + ast.BoolOp, + ast.And, + ast.Or, + ) + try: + tree = ast.parse(expr, mode="eval") + for node in ast.walk(tree): + if not isinstance(node, allowed_nodes): + return f"error: unsupported expression element {node.__class__.__name__}" + if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, bool)): + return "error: constants must be numeric or boolean" + result = _eval_calculator_node(tree.body) + except Exception as exc: + return f"error: {exc}" + return f"{expr} = {result}" + + +def _eval_calculator_node(node: ast.AST) -> int | float | bool: + if isinstance(node, ast.Constant) and isinstance(node.value, (int, float, bool)): + return node.value + if isinstance(node, ast.UnaryOp): + value = _eval_calculator_node(node.operand) + if isinstance(node.op, ast.UAdd): + return +value + if isinstance(node.op, ast.USub): + return -value + if isinstance(node, ast.BinOp): + left = _eval_calculator_node(node.left) + right = _eval_calculator_node(node.right) + if isinstance(node.op, ast.Add): + return left + right + if isinstance(node.op, ast.Sub): + return left - right + if isinstance(node.op, ast.Mult): + return left * right + if isinstance(node.op, ast.Div): + return left / right + if isinstance(node.op, ast.FloorDiv): + return left // right + if isinstance(node.op, ast.Mod): + return left % right + if isinstance(node.op, ast.Pow): + if abs(right) > 8: + raise ValueError("exponent too large") + return left**right + if isinstance(node, ast.BoolOp): + values = [bool(_eval_calculator_node(value)) for value in node.values] + if 
isinstance(node.op, ast.And): + return all(values) + if isinstance(node.op, ast.Or): + return any(values) + if isinstance(node, ast.Compare): + left = _eval_calculator_node(node.left) + for op, comparator in zip(node.ops, node.comparators, strict=True): + right = _eval_calculator_node(comparator) + if isinstance(op, ast.Eq): + ok = left == right + elif isinstance(op, ast.NotEq): + ok = left != right + elif isinstance(op, ast.Lt): + ok = left < right + elif isinstance(op, ast.LtE): + ok = left <= right + elif isinstance(op, ast.Gt): + ok = left > right + elif isinstance(op, ast.GtE): + ok = left >= right + else: + raise ValueError("unsupported comparison") + if not ok: + return False + left = right + return True + raise ValueError("unsupported expression") + + +class Scratchpad: + """Persistent free-form note blob with read and replace operations.""" + + def __init__(self, max_chars: int = MAX_SCRATCHPAD_CHARS): + self.max_chars = max_chars + self._content = "" + + def read(self) -> str: + return self._content or "(empty)" + + def write_replace(self, content: str = "") -> str: + note = " ".join((content or "").strip().split()) + self._content = note[: self.max_chars] + return f"updated: {self._content or '(empty)'}" + + def reset(self) -> None: + self._content = "" + + +class QuestHistoryTool: + """Keyword search over a run-local quest step log.""" + + def __init__(self, step_log: list[dict] | None = None, history_window: int = 10): + self.step_log = step_log if step_log is not None else [] + self.history_window = history_window + + def search(self, query: str) -> str: + """Return relevant previous steps from this quest run via keyword match.""" + if not self.step_log: + return "No prior quest steps recorded yet." 
+ + tokens = set(re.findall(r"[a-zA-Z\u0400-\u04ff0-9_]{3,}", (query or "").lower())) + scored = [] + for entry in self.step_log: + haystack = " ".join( + [ + entry.get("observation", ""), + " ".join(entry.get("choices", [])), + entry.get("selected_choice", ""), + ] + ).lower() + score = sum(1 for token in tokens if token in haystack) + scored.append((score, entry)) + + scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True) + best = [entry for score, entry in scored if score > 0][: self.history_window] + if not best: + best = [entry for _, entry in scored[: self.history_window]] + + lines = [] + for entry in best: + choices = entry.get("choices", []) + choices_text = choices if isinstance(choices, str) else "; ".join(choices) + lines.append( + f"Step {entry.get('step', '?')}: obs={entry.get('observation', '')} | " + f"choices={choices_text} | picked={entry.get('selected_choice', 'n/a')}" + ) + return "\n".join(lines) diff --git a/llm_quest_benchmark/players/__init__.py b/llm_quest_benchmark/players/__init__.py new file mode 100644 index 0000000..aa71d5b --- /dev/null +++ b/llm_quest_benchmark/players/__init__.py @@ -0,0 +1,17 @@ +__all__ = ["QuestPlayer", "HumanPlayer", "RandomPlayer"] + + +def __getattr__(name): + if name == "QuestPlayer": + from .base import QuestPlayer + + return QuestPlayer + if name == "HumanPlayer": + from .human import HumanPlayer + + return HumanPlayer + if name == "RandomPlayer": + from .random import RandomPlayer + + return RandomPlayer + raise AttributeError(name) diff --git a/llm_quest_benchmark/agents/base.py b/llm_quest_benchmark/players/base.py similarity index 92% rename from llm_quest_benchmark/agents/base.py rename to llm_quest_benchmark/players/base.py index eed7609..9e53750 100644 --- a/llm_quest_benchmark/agents/base.py +++ b/llm_quest_benchmark/players/base.py @@ -1,4 +1,4 @@ -"""Base classes for quest players (both human and LLM)""" +"""Base class for quest players and harnesses.""" from abc import ABC, 
abstractmethod from typing import Any @@ -13,7 +13,7 @@ def __init__(self, skip_single: bool = False): """Initialize player with skip_single option""" self.skip_single = skip_single self._last_response: LLMResponse = None - self.agent_id = "base_agent" # Default agent ID + self.agent_id = "base_player" def get_action(self, observation: str, choices: list) -> int: """Get action number from observation and choices @@ -55,7 +55,7 @@ def _get_action_impl(self, observation: str, choices: list) -> int: pass def get_last_response(self) -> LLMResponse: - """Get the last response from the agent""" + """Get the last response from the player or harness.""" return self._last_response @abstractmethod diff --git a/llm_quest_benchmark/agents/human_player.py b/llm_quest_benchmark/players/human.py similarity index 91% rename from llm_quest_benchmark/agents/human_player.py rename to llm_quest_benchmark/players/human.py index 721c43d..b5d74f4 100644 --- a/llm_quest_benchmark/agents/human_player.py +++ b/llm_quest_benchmark/players/human.py @@ -3,7 +3,7 @@ import logging from typing import Any -from llm_quest_benchmark.agents.base import QuestPlayer +from llm_quest_benchmark.players.base import QuestPlayer class HumanPlayer(QuestPlayer): @@ -15,7 +15,7 @@ def __init__(self, skip_single: bool = False, debug: bool = False): self.logger = logging.getLogger(__name__) if debug: self.logger.setLevel(logging.DEBUG) - # Set agent_id for database records + # Keep the persisted identifier stable for existing result artifacts. 
self.agent_id = "human" def _get_action_impl(self, observation: str, choices: list) -> int: diff --git a/llm_quest_benchmark/agents/random_agent.py b/llm_quest_benchmark/players/random.py similarity index 75% rename from llm_quest_benchmark/agents/random_agent.py rename to llm_quest_benchmark/players/random.py index e428353..a8fea29 100644 --- a/llm_quest_benchmark/agents/random_agent.py +++ b/llm_quest_benchmark/players/random.py @@ -1,17 +1,19 @@ -"""Random agent for testing quests""" +"""Random player for testing quests""" import logging import random -from llm_quest_benchmark.agents.base import QuestPlayer +from llm_quest_benchmark.players.base import QuestPlayer -class RandomAgent(QuestPlayer): - """Agent that randomly selects from available choices. - Used for testing quests and finding edge cases.""" +class RandomPlayer(QuestPlayer): + """Player that randomly selects from available choices. + + Used for testing quests and finding edge cases. + """ def __init__(self, seed: int = None, debug: bool = False, skip_single: bool = False): - """Initialize random agent. + """Initialize random player. Args: seed (int, optional): Random seed for reproducibility. Defaults to None. @@ -24,7 +26,7 @@ def __init__(self, seed: int = None, debug: bool = False, skip_single: bool = Fa if debug: self.logger.setLevel(logging.DEBUG) self.rng = random.Random(seed) - # Set agent_id for database records + # Keep the persisted identifier stable for existing result artifacts. 
self.agent_id = f"random_{seed}" if seed is not None else "random" def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> int: @@ -43,5 +45,5 @@ def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> i return self.rng.randint(1, len(choices)) def reset(self) -> None: - """Reset agent state - nothing to reset for random agent""" + """Reset player state; nothing to reset for random choice.""" pass diff --git a/llm_quest_benchmark/prompt_templates/consequence_scan.jinja b/llm_quest_benchmark/prompt_templates/consequence_scan.jinja deleted file mode 100644 index 55ce54b..0000000 --- a/llm_quest_benchmark/prompt_templates/consequence_scan.jinja +++ /dev/null @@ -1,18 +0,0 @@ -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Mission objective: complete the quest successfully. - -Decision method: -1. For each action, estimate immediate consequence in 5 words max. -2. Prefer actions that preserve progress and gather information. -3. Avoid options that abandon, surrender, or end the mission early. -4. If uncertain, choose the lowest-risk progress action. - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja b/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja deleted file mode 100644 index 7fd4236..0000000 --- a/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja +++ /dev/null @@ -1,19 +0,0 @@ -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Mission objective: complete the quest successfully. - -Decision method: -1. Use any provided memo from prior turns to stay consistent. -2. For each action, estimate immediate consequence in 5 words max. -3. 
Prefer actions that preserve progress and gather information. -4. Avoid options that abandon, surrender, or end the mission early. -5. If uncertain, choose the lowest-risk progress action. - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","memo":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/light_hints.jinja b/llm_quest_benchmark/prompt_templates/light_hints.jinja deleted file mode 100644 index eb3ab60..0000000 --- a/llm_quest_benchmark/prompt_templates/light_hints.jinja +++ /dev/null @@ -1,18 +0,0 @@ -General hints for this type of quest: -- Read the scene literally. Win/loss constraints are usually stated directly in the text. -- Preparation, study, negotiation, and reconnaissance are often safer than direct combat or bravado. -- Prefer actions that gather clues or unlock safer options before committing to irreversible moves. -- Avoid choices that abandon the mission, surrender, or waste scarce time/resources for no gain. -- If a scene repeats, the last branch did not help - try a different action. -- Prioritize the core objective over optional heroic detours. - -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja b/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja deleted file mode 100644 index 38a9343..0000000 --- a/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja +++ /dev/null @@ -1,19 +0,0 @@ -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Mission objective: complete the quest successfully. - -Decision policy: -1. Prefer actions that preserve progress and avoid premature failure. -2. 
Use Status/context hints (stats, resources, relationships) to reduce obvious risks. -3. If this scene appears repeated, avoid repeating the same action that did not progress. -4. When uncertain, choose the safest reversible action that keeps the mission alive. -5. Do not surrender/quit unless it is clearly required for success. - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/objective_guard.jinja b/llm_quest_benchmark/prompt_templates/objective_guard.jinja deleted file mode 100644 index b80d482..0000000 --- a/llm_quest_benchmark/prompt_templates/objective_guard.jinja +++ /dev/null @@ -1,18 +0,0 @@ -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Choose the action that best supports mission completion. - -Guardrails: -1. Keep the run alive unless ending is clearly successful. -2. Penalize actions that look like quitting, escaping, or self-sabotage. -3. Prefer actions that unlock clues, credentials, access, or progression gates. -4. Resolve ambiguity by selecting the most reversible safe option. - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/strategic.jinja b/llm_quest_benchmark/prompt_templates/strategic.jinja deleted file mode 100644 index 1668c41..0000000 --- a/llm_quest_benchmark/prompt_templates/strategic.jinja +++ /dev/null @@ -1,32 +0,0 @@ -{# Tier 3: Strategic agent prompt #} -{# Contextual State Tracker #} -{% if state_tracker %} -Historical context: -{% for entry in state_tracker %} -- {{ entry }} -{% endfor %} -{% endif %} - -Current Situation: -{{ observation }} - -Available Actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Analysis Framework: -1. Immediate Context [<50 words] -2. 
Plausible Hypotheses [2-3 possibilities] -3. Action Impact Forecast [short/long-term] -4. Confidence Estimate [High/Medium/Low] - -Response format: -```json -{ - "hypotheses": ["...", "..."], - "reasoning": "", - "choice": , - "confidence": "" -} -``` diff --git a/llm_quest_benchmark/prompt_templates/system_role_completion.jinja b/llm_quest_benchmark/prompt_templates/system_role_completion.jinja deleted file mode 100644 index 918a695..0000000 --- a/llm_quest_benchmark/prompt_templates/system_role_completion.jinja +++ /dev/null @@ -1,11 +0,0 @@ -You are a mission-completion specialist for interactive fiction quests. - -Core behavior: -1. Infer the current objective from narrative clues. -2. Prioritize actions that maintain progress and optionality. -3. Avoid premature terminal outcomes unless success is explicit. -4. Prefer evidence-based choices over stylistic roleplay. - -When the state is ambiguous: -- choose the safest action that still advances the mission. -- avoid speculative high-risk branches without support in the text. diff --git a/llm_quest_benchmark/prompt_templates/system_role_risk.jinja b/llm_quest_benchmark/prompt_templates/system_role_risk.jinja deleted file mode 100644 index ea19c36..0000000 --- a/llm_quest_benchmark/prompt_templates/system_role_risk.jinja +++ /dev/null @@ -1,16 +0,0 @@ -{# Enhanced system role for interactive fiction #} -You are an experienced interactive fiction player. Your capabilities include: - -1. Dynamic Goal Recognition: Infer objectives from narrative context -2. Clue Chaining: Connect information across scenes -3. Consequence Forecasting: Predict 2-3 steps ahead for each action -4. 
Narrative Consistency: Maintain character/story logic - -Follow these principles: -- Treat each choice as part of an unfolding mystery -- Track objects/characters/relationships as state components -- Consider both practical and thematic implications -- Admit uncertainty when clues are ambiguous -- Flag potential contradictions in story logic - -Any bad move can fail the quest, so prefer robust low-risk progress over flashy but uncertain options. diff --git a/llm_quest_benchmark/renderers/factory.py b/llm_quest_benchmark/renderers/factory.py index 8b18218..0a8f3e5 100644 --- a/llm_quest_benchmark/renderers/factory.py +++ b/llm_quest_benchmark/renderers/factory.py @@ -1,7 +1,7 @@ """Factory for creating appropriate renderers based on agent type and mode""" -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.agents.human_player import HumanPlayer +from llm_quest_benchmark.players.base import QuestPlayer +from llm_quest_benchmark.players.human import HumanPlayer from llm_quest_benchmark.renderers.base import BaseRenderer from llm_quest_benchmark.renderers.null import NoRenderer from llm_quest_benchmark.renderers.progress import ProgressRenderer @@ -25,7 +25,7 @@ def create_renderer( The factory follows these rules: 1. In debug mode, always use NoRenderer 2. For human players, use RichRenderer - 3. For automated agents (LLM, Random): + 3. 
For automated players (LLM, Random): - In benchmark mode (total_quests provided), use ProgressRenderer - Otherwise, use NoRenderer """ diff --git a/llm_quest_benchmark/renderers/progress.py b/llm_quest_benchmark/renderers/progress.py index 9d2cde9..a5097d2 100644 --- a/llm_quest_benchmark/renderers/progress.py +++ b/llm_quest_benchmark/renderers/progress.py @@ -45,23 +45,23 @@ def __init__(self, total_quests: int, total_runs: int): self.console.print("\n[bold cyan]Benchmark Progress[/]") def render_game_state(self, state: dict[str, Any]) -> None: - """No game state rendering needed for automated agents""" + """No game state rendering needed for automated players""" pass def render_title(self) -> None: - """No title rendering needed for automated agents""" + """No title rendering needed for automated players""" pass def render_quest_text(self, text: str) -> None: - """No quest text rendering needed for automated agents""" + """No quest text rendering needed for automated players""" pass def render_choices(self, choices: list) -> None: - """No choices rendering needed for automated agents""" + """No choices rendering needed for automated players""" pass def render_parameters(self, params: list) -> None: - """No parameters rendering needed for automated agents""" + """No parameters rendering needed for automated players""" pass def render_error(self, message: str) -> None: diff --git a/llm_quest_benchmark/schemas/__init__.py b/llm_quest_benchmark/schemas/__init__.py index 34fee08..cb0338f 100644 --- a/llm_quest_benchmark/schemas/__init__.py +++ b/llm_quest_benchmark/schemas/__init__.py @@ -1,9 +1,16 @@ """Schema exports for LLM Quest Benchmark""" -__all__ = ["QMState", "AgentState", "LLMResponse", "QMBridgeState", "BenchmarkConfig", "AgentConfig"] +__all__ = [ + "QMState", + "AgentState", + "LLMResponse", + "QMBridgeState", + "BenchmarkConfig", + "HarnessConfig", +] # Import directly from the schema modules using relative imports from .bridge import QMBridgeState 
-from .config import AgentConfig, BenchmarkConfig +from .config import BenchmarkConfig, HarnessConfig from .response import LLMResponse from .state import AgentState, QMState diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py index 6a030b2..5cd93b2 100644 --- a/llm_quest_benchmark/schemas/config.py +++ b/llm_quest_benchmark/schemas/config.py @@ -8,7 +8,6 @@ from llm_quest_benchmark.constants import ( DEFAULT_MODEL, DEFAULT_TEMPERATURE, - DEFAULT_TEMPLATE, MODEL_CHOICES, SYSTEM_ROLE_TEMPLATE, normalize_template_name, @@ -18,8 +17,8 @@ DEFAULT_BENCHMARK_CONFIG = { "quests": ["quests/Boat.qm"], "agents": [ - {"model": "random_choice", "skip_single": True, "temperature": 0.0, "template": "reasoning.jinja"}, - {"model": "gpt-5-mini", "skip_single": True, "temperature": 0.4, "template": "reasoning.jinja"}, + {"model": "random_choice", "skip_single": True, "temperature": 0.0, "harness": "random_choice"}, + {"model": "gpt-5-mini", "skip_single": True, "temperature": 0.4, "harness": "reasoning_recent"}, ], "debug": False, "quest_timeout": 30, @@ -27,6 +26,18 @@ "name": "Default Benchmark", } +COMPACTION_HARNESSES = { + "memo_compact", + "hinted_compact", + "tool_compact", + "tool_hinted", + "planner", + "compaction_no_memo", + "memo_cot", + "memo_extended", + "memo_structured", +} + def get_default_benchmark_yaml() -> str: """Get the default benchmark configuration from default.yaml file""" @@ -43,8 +54,9 @@ def get_default_benchmark_yaml() -> str: - quests/Boat.qm agents: - model: random_choice + harness: random_choice - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent debug: true # One worker per agent will be used automatically output_dir: results/benchmarks""" @@ -55,25 +67,71 @@ def get_default_benchmark_yaml() -> str: @dataclass -class AgentConfig: - """Configuration for a single agent in benchmark""" +class HarnessConfig: + """Configuration for a single harness in benchmark""" model: str = DEFAULT_MODEL 
system_template: str = SYSTEM_ROLE_TEMPLATE - action_template: str = DEFAULT_TEMPLATE + harness: str = "reasoning_recent" temperature: float = DEFAULT_TEMPERATURE runs: int = 1 skip_single: bool = False debug: bool = False benchmark_id: str | None = None - memory_mode: str = "default" - compaction_interval: int = 10 + compaction_interval: int = 50 + + def __init__( + self, + model: str = DEFAULT_MODEL, + system_template: str = SYSTEM_ROLE_TEMPLATE, + harness: str = "reasoning_recent", + temperature: float = DEFAULT_TEMPERATURE, + runs: int = 1, + skip_single: bool = False, + debug: bool = False, + benchmark_id: str | None = None, + compaction_interval: int = 50, + **legacy_keys, + ): + if "template" in legacy_keys or "action_template" in legacy_keys: + raise ValueError("Use harness: key instead of template:") + if "memory_mode" in legacy_keys: + raise ValueError("Use harness: key instead of memory_mode:") + if legacy_keys: + unexpected = ", ".join(sorted(legacy_keys)) + raise TypeError(f"Unexpected HarnessConfig key(s): {unexpected}") + + self.model = model + self.system_template = system_template + self.harness = harness + self.temperature = temperature + self.runs = runs + self.skip_single = skip_single + self.debug = debug + self.benchmark_id = benchmark_id + self.compaction_interval = compaction_interval + self.__post_init__() def __post_init__(self): self.system_template = normalize_template_name(self.system_template) - self.action_template = normalize_template_name(self.action_template) - if self.model not in ("random_choice", "human"): - # Keep parser compatibility for legacy names while UI remains clean. + from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, SPECIAL_HARNESSES, is_random_choice_harness + + if ( + self.harness not in HARNESS_REGISTRY + and self.harness != "human" + and not is_random_choice_harness(self.harness) + ): + valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] + raise ValueError(f"Invalid harness: {self.harness}. 
Supported harnesses: {valid}") + if self.harness == "human" and self.model != "human": + raise ValueError("Use model: human with harness: human") + if self.model == "human" and self.harness != "human": + raise ValueError("Use harness: human with model: human") + if is_random_choice_harness(self.harness) and self.model != "random_choice": + raise ValueError("Use model: random_choice with random_choice harnesses") + if is_random_choice_harness(self.model) and not is_random_choice_harness(self.harness): + raise ValueError("Use harness: random_choice with model: random_choice") + if self.model not in ("human",) and not is_random_choice_harness(self.model): from llm_quest_benchmark.llm.client import is_supported_model_name if not is_supported_model_name(self.model): @@ -82,20 +140,23 @@ def __post_init__(self): raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}") if self.runs < 1: raise ValueError(f"runs must be >= 1, got {self.runs}") - if self.memory_mode not in ("default", "full_transcript", "compaction"): - raise ValueError(f"Invalid memory_mode: {self.memory_mode}") - if self.memory_mode == "compaction" and self.compaction_interval < 1: + if self.compaction_interval < 1: raise ValueError(f"compaction_interval must be >= 1, got {self.compaction_interval}") @property - def agent_id(self) -> str: - """Generate a unique agent ID based on configuration values""" + def harness_id(self) -> str: + """Generate a stable harness ID based on configuration values""" import hashlib - interval_tag = f"_ci{self.compaction_interval}" if self.memory_mode == "compaction" else "" - config_str = f"{self.model}_{self.temperature}_{self.system_template}_{self.action_template}_{self.memory_mode}{interval_tag}" + interval_tag = f"_ci{self.compaction_interval}" if self.harness in COMPACTION_HARNESSES else "" + config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.system_template}{interval_tag}" hash_val = 
hashlib.md5(config_str.encode()).hexdigest()[:8] - return f"{self.model}_t{self.temperature}_{hash_val}" + return f"{self.model}_t{self.temperature}_{self.harness}_{hash_val}" + + @property + def agent_id(self) -> str: + """DB-compatible alias for harness_id""" + return self.harness_id @dataclass @@ -103,7 +164,7 @@ class BenchmarkConfig: """Configuration for benchmark run""" quests: list[str] # List of quest files or directories - agents: list[AgentConfig] # List of agent configurations to test + agents: list[HarnessConfig] # List of harness configurations to test debug: bool = False quest_timeout: int = 60 # Timeout per quest benchmark_timeout: int | None = None # Total timeout for all quests, defaults to quest_timeout * num_quests @@ -137,10 +198,11 @@ def from_yaml(cls, yaml_path: str) -> "BenchmarkConfig": if "agents" in data: agents = [] for agent in data["agents"]: - # Handle 'template' key which maps to action_template in AgentConfig if "template" in agent: - agent["action_template"] = agent.pop("template") - agents.append(AgentConfig(**agent)) + raise ValueError("Use harness: key instead of template:") + if "memory_mode" in agent: + raise ValueError("Use harness: key instead of memory_mode:") + agents.append(HarnessConfig(**agent)) data["agents"] = agents return cls(**data) diff --git a/llm_quest_benchmark/tests/agents/test_mode_agents.py b/llm_quest_benchmark/tests/agents/test_mode_agents.py deleted file mode 100644 index c650127..0000000 --- a/llm_quest_benchmark/tests/agents/test_mode_agents.py +++ /dev/null @@ -1,257 +0,0 @@ -"""Tests for planner and tool-augmented agent modes.""" - -from unittest.mock import Mock - -from llm_quest_benchmark.agents.agent_factory import create_agent -from llm_quest_benchmark.agents.llm_agent import LLMAgent -from llm_quest_benchmark.agents.planner_agent import PlannerAgent -from llm_quest_benchmark.agents.tool_agent import ToolAgent - - -def test_create_agent_uses_planner_template_alias(): - agent = 
create_agent(model="gpt-5-mini", action_template="planner") - assert isinstance(agent, PlannerAgent) - - -def test_create_agent_uses_tool_template_alias(): - agent = create_agent(model="gpt-5-mini", action_template="tool_augmented") - assert isinstance(agent, ToolAgent) - - -def test_create_agent_propagates_memory_mode_to_planner_and_tool_agents(): - planner = create_agent( - model="gpt-5-mini", - action_template="planner", - memory_mode="compaction", - compaction_interval=50, - ) - tool = create_agent( - model="gpt-5-mini", - action_template="tool_augmented", - memory_mode="compaction", - compaction_interval=50, - ) - - assert isinstance(planner, PlannerAgent) - assert isinstance(tool, ToolAgent) - assert planner._memory_mode == "compaction" - assert planner._compaction_interval == 50 - assert tool._memory_mode == "compaction" - assert tool._compaction_interval == 50 - - -def test_create_agent_uses_light_hints_template_with_standard_llm_agent(): - agent = create_agent(model="gpt-5-mini", action_template="light_hints") - assert isinstance(agent, LLMAgent) - assert not isinstance(agent, (PlannerAgent, ToolAgent)) - - -def test_light_hints_template_injects_general_mechanics(): - agent = LLMAgent(model_name="gpt-5-mini", action_template="light_hints") - - prompt = agent._format_prompt("A sealed vault blocks the route.", [{"text": "Study the vault"}]) - - assert "General hints for this type of quest" in prompt - assert "Preparation, study, negotiation" in prompt - - -def test_planner_agent_first_turn_generates_plan_then_acts(): - agent = PlannerAgent(model_name="gpt-5-mini") - mocked_llm = Mock() - mocked_llm.get_completion.side_effect = [ - "Gather clues first. Avoid direct fights. 
Preserve resources.", - '{"analysis":"plan says scout","reasoning":"safer branch","result":2}', - ] - mocked_llm.get_last_usage.side_effect = [ - {"prompt_tokens": 30, "completion_tokens": 12, "total_tokens": 42, "estimated_cost_usd": 0.001}, - {"prompt_tokens": 20, "completion_tokens": 8, "total_tokens": 28, "estimated_cost_usd": 0.0007}, - ] - agent.llm = mocked_llm - - action = agent.get_action("You enter a pirate station.", [{"text": "Scout ahead"}, {"text": "Attack now"}]) - - assert action == 2 - assert agent.current_plan is not None - assert "Avoid direct fights" in agent.current_plan - assert mocked_llm.get_completion.call_count == 2 - assert agent.get_last_response().total_tokens == 70 - - -def test_planner_agent_reuses_plan_when_state_is_stable(): - agent = PlannerAgent(model_name="gpt-5-mini") - agent.current_plan = "Keep moving carefully and avoid a direct fight." - agent._observation_history = ["Quiet corridor."] - mocked_llm = Mock() - mocked_llm.get_completion.return_value = '{"analysis":"plan still fits","reasoning":"careful progress","result":1}' - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 18, - "completion_tokens": 7, - "total_tokens": 25, - "estimated_cost_usd": 0.0005, - } - agent.llm = mocked_llm - - action = agent.get_action("Quiet corridor.", [{"text": "Open the door"}, {"text": "Run"}]) - - assert action == 1 - assert mocked_llm.get_completion.call_count == 1 - - -def test_planner_agent_uses_contextual_memory_state(): - agent = PlannerAgent(model_name="gpt-5-mini", memory_mode="compaction", compaction_interval=50) - agent._quest_briefing = "Original mission: win the election." 
- agent._transcript = [ - { - "step": 1, - "observation": "You learned Maloqs value strength.", - "choice_text": "Ask about Maloqs", - "memo": "Maloqs value strength", - "action": 1, - } - ] - agent._steps_since_compaction = 1 - mocked_llm = Mock() - mocked_llm.get_completion.side_effect = [ - "Use the remembered cultural clue.", - '{"analysis":"use clue","reasoning":"fits plan","result":1}', - ] - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 1, - "completion_tokens": 1, - "total_tokens": 2, - "estimated_cost_usd": 0.0, - } - agent.llm = mocked_llm - - agent.get_action("Current banquet scene.", [{"text": "Greet like a warrior"}]) - - first_prompt = mocked_llm.get_completion.call_args_list[0].args[0] - assert "Quest briefing" in first_prompt - assert "RECENT STEPS" in first_prompt - assert "Maloqs value strength" in first_prompt - - -def test_tool_agent_can_use_quest_history(): - agent = ToolAgent(model_name="gpt-5-mini") - agent._step_log = [ - { - "step": 1, - "observation": "Merchant mentioned low fuel.", - "choices": ["Buy fuel", "Keep flying"], - "selected_choice": "Buy fuel", - } - ] - mocked_llm = Mock() - mocked_llm.get_completion.side_effect = [ - '{"analysis":"need history","tool_calls":[{"tool":"quest_history","input":"fuel merchant"}],"result":null}', - '{"analysis":"fuel clue matters","reasoning":"play safe","result":1}', - ] - mocked_llm.get_last_usage.side_effect = [ - {"prompt_tokens": 24, "completion_tokens": 10, "total_tokens": 34, "estimated_cost_usd": 0.0008}, - {"prompt_tokens": 22, "completion_tokens": 9, "total_tokens": 31, "estimated_cost_usd": 0.0007}, - ] - agent.llm = mocked_llm - - action = agent.get_action("Your fuel gauge is blinking.", [{"text": "Refuel"}, {"text": "Attack pirates"}]) - - assert action == 1 - assert mocked_llm.get_completion.call_count == 2 - assert agent.get_last_response().total_tokens == 65 - assert len(agent._step_log) == 2 - - -def test_tool_agent_calculator_supports_arithmetic_and_comparisons(): 
- assert ToolAgent.calculator("55 + 12 - 5") == "55 + 12 - 5 = 62" - assert ToolAgent.calculator("60 >= 55 and 62 >= 80") == "60 >= 55 and 62 >= 80 = False" - assert ToolAgent.calculator("__import__('os')").startswith("error:") - - -def test_tool_agent_scratchpad_read_write_and_reset(): - agent = ToolAgent(model_name="gpt-5-mini") - - assert agent.scratchpad("read") == "(empty)" - assert ( - agent.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") == "updated: Board: W B _ ; failed door 2" - ) - assert agent.scratchpad("read") == "Board: W B _ ; failed door 2" - - agent.reset() - - assert agent.scratchpad("read") == "(empty)" - - -def test_tool_agent_can_use_calculator_and_records_tool_metadata(): - agent = ToolAgent(model_name="gpt-5-mini") - mocked_llm = Mock() - mocked_llm.get_completion.side_effect = [ - '{"memo":"Need mix math","analysis":"calculate target","tool_calls":[{"tool":"calculator","input":"50 + 3 >= 55"}],"result":null}', - '{"memo":"Need more strength","analysis":"math failed","reasoning":"choose strength","result":2}', - ] - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 10, - "completion_tokens": 5, - "total_tokens": 15, - "estimated_cost_usd": 0.0, - } - agent.llm = mocked_llm - - action = agent.get_action("Strength is 50. Need at least 55.", [{"text": "Add water"}, {"text": "Add repusator"}]) - - response = agent.get_last_response() - assert action == 2 - assert response.tool_calls == [{"tool": "calculator", "input": "50 + 3 >= 55", "operation": "", "content": ""}] - assert response.tool_results == ["calculator(50 + 3 >= 55) => 50 + 3 >= 55 = False"] - assert response.memo == "Need more strength" - - -def test_tool_agent_uses_contextual_memory_state(): - agent = ToolAgent(model_name="gpt-5-mini", memory_mode="compaction", compaction_interval=50) - agent._quest_briefing = "Original mission: pass pilot certification." 
- agent._transcript = [ - { - "step": 1, - "observation": "Hogger is greedy.", - "choice_text": "Bribe Hogger", - "memo": "Hogger is greedy", - "action": 1, - } - ] - agent._steps_since_compaction = 1 - mocked_llm = Mock() - mocked_llm.get_completion.return_value = ( - '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}' - ) - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 10, - "completion_tokens": 5, - "total_tokens": 15, - "estimated_cost_usd": 0.0, - } - agent.llm = mocked_llm - - agent.get_action("Current exam room.", [{"text": "Offer a bribe"}]) - - prompt = mocked_llm.get_completion.call_args.args[0] - assert "Quest briefing" in prompt - assert "RECENT STEPS" in prompt - assert "Hogger is greedy" in prompt - - -def test_tool_agent_can_finish_without_tools_in_one_call(): - agent = ToolAgent(model_name="gpt-5-mini") - mocked_llm = Mock() - mocked_llm.get_completion.return_value = ( - '{"analysis":"no tools needed","tool_calls":[],"reasoning":"direct clue","result":2}' - ) - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 15, - "completion_tokens": 6, - "total_tokens": 21, - "estimated_cost_usd": 0.0004, - } - agent.llm = mocked_llm - - action = agent.get_action("A guard points at the safe exit.", [{"text": "Fight"}, {"text": "Leave"}]) - - assert action == 2 - assert mocked_llm.get_completion.call_count == 1 diff --git a/llm_quest_benchmark/tests/executors/cli/test_commands.py b/llm_quest_benchmark/tests/executors/cli/test_commands.py index db0daf1..1bd972e 100644 --- a/llm_quest_benchmark/tests/executors/cli/test_commands.py +++ b/llm_quest_benchmark/tests/executors/cli/test_commands.py @@ -19,8 +19,11 @@ def test_version(): def test_run_quest(): - """Test running a quest with random agent""" - result = runner.invoke(app, ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--debug"]) + """Test running a quest with random player""" + result = runner.invoke( + app, + ["run", 
"--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--harness", "random_choice", "--debug"], + ) assert result.exit_code in [0, 1, 2] @@ -31,7 +34,9 @@ def test_run_quest_invalid_args(): assert result.exit_code == 2 # Test missing quest file - result = runner.invoke(app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice"]) + result = runner.invoke( + app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice", "--harness", "random_choice"] + ) assert result.exit_code == 2 diff --git a/llm_quest_benchmark/tests/harnesses/__init__.py b/llm_quest_benchmark/tests/harnesses/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llm_quest_benchmark/tests/agents/test_anthropic.py b/llm_quest_benchmark/tests/harnesses/test_anthropic.py similarity index 50% rename from llm_quest_benchmark/tests/agents/test_anthropic.py rename to llm_quest_benchmark/tests/harnesses/test_anthropic.py index 5dd1f95..ba60f97 100644 --- a/llm_quest_benchmark/tests/agents/test_anthropic.py +++ b/llm_quest_benchmark/tests/harnesses/test_anthropic.py @@ -1,15 +1,15 @@ -"""Deterministic tests for Anthropic-backed agent behavior.""" +"""Deterministic tests for Anthropic-backed harness behavior.""" from unittest.mock import Mock, patch import pytest -from llm_quest_benchmark.agents.agent_factory import create_agent +from llm_quest_benchmark.harnesses.factory import create_harness @patch("llm_quest_benchmark.llm.client.anthropic.Anthropic") -def test_anthropic_agent_mocked_completion(mock_anthropic_cls): - """Agent should parse a mocked Anthropic completion without network calls.""" +def test_anthropic_harness_mocked_completion(mock_anthropic_cls): + """Harness should parse a mocked Anthropic completion without network calls.""" mock_client = Mock() mock_response = Mock() mock_block = Mock() @@ -18,15 +18,15 @@ def test_anthropic_agent_mocked_completion(mock_anthropic_cls): mock_client.messages.create.return_value = mock_response 
mock_anthropic_cls.return_value = mock_client - agent = create_agent("claude-sonnet-4-5") - action = agent.get_action("Test prompt", [{"text": "A"}, {"text": "B"}]) + harness = create_harness("minimal", model="claude-sonnet-4-5") + action = harness.get_action("Test prompt", [{"text": "A"}, {"text": "B"}]) assert action == 2 assert mock_client.messages.create.call_count == 1 -def test_anthropic_agent_empty_choices_raises(): +def test_anthropic_harness_empty_choices_raises(): """Base player contract should reject empty choices.""" - agent = create_agent("claude-sonnet-4-5") + harness = create_harness("minimal", model="claude-sonnet-4-5") with pytest.raises(ValueError, match="No choices provided"): - agent.get_action("Test prompt", []) + harness.get_action("Test prompt", []) diff --git a/llm_quest_benchmark/tests/agents/test_llm_agent.py b/llm_quest_benchmark/tests/harnesses/test_base.py similarity index 64% rename from llm_quest_benchmark/tests/agents/test_llm_agent.py rename to llm_quest_benchmark/tests/harnesses/test_base.py index 06ff32f..280fd0a 100644 --- a/llm_quest_benchmark/tests/agents/test_llm_agent.py +++ b/llm_quest_benchmark/tests/harnesses/test_base.py @@ -1,10 +1,11 @@ -"""Tests for LLM agent""" +"""Tests for the base LLM harness behavior.""" from unittest.mock import Mock, patch import pytest -from llm_quest_benchmark.agents.llm_agent import LLMAgent, parse_llm_response +from llm_quest_benchmark.harnesses.base import parse_llm_response +from llm_quest_benchmark.harnesses.minimal import MinimalHarness from llm_quest_benchmark.schemas.response import LLMResponse @@ -20,8 +21,8 @@ def example_choices(): @pytest.mark.timeout(5) # Quick unit test @patch("llm_quest_benchmark.llm.client.OpenAI") -def test_agent_basic_flow(mock_openai, monkeypatch): - """Test basic agent functionality with mocked LLM""" +def test_harness_basic_flow(mock_openai, monkeypatch): + """Test basic harness functionality with mocked LLM""" monkeypatch.setenv("OPENAI_API_KEY", 
"test-key") # Setup mock mock_chat = Mock() @@ -41,14 +42,14 @@ def test_agent_basic_flow(mock_openai, monkeypatch): observation = "You are at a trading station." choices = [{"id": "1", "text": "Talk to merchant"}, {"id": "2", "text": "Leave station"}] - # Create agent and test - agent = LLMAgent(model_name="gpt-5-mini") - result = agent.get_action(observation, choices) + # Create harness and test + harness = MinimalHarness(model_name="gpt-5-mini") + result = harness.get_action(observation, choices) # Verify results assert result == 1 # Expect an integer assert mock_chat.completions.create.call_count == 1 - last_response = agent.get_last_response() + last_response = harness.get_last_response() assert last_response.prompt_tokens == 9 assert last_response.completion_tokens == 2 assert last_response.total_tokens == 11 @@ -56,47 +57,54 @@ def test_agent_basic_flow(mock_openai, monkeypatch): def test_template_rendering(): """Test that templates are rendered correctly""" - agent = LLMAgent() + harness = MinimalHarness() observation = "Test observation" choices = [{"text": "Option 1"}, {"text": "Option 2"}] # Test that prompt is rendered correctly - prompt = agent.prompt_renderer.render_action_prompt(observation, choices) + prompt = harness.prompt_renderer.render_action_prompt(observation, choices) assert "Test observation" in prompt assert "Option 1" in prompt assert "Option 2" in prompt -def test_agent_initialization_without_api_key(monkeypatch): - """Agent construction should not require provider API keys before inference.""" +def test_harness_initialization_without_api_key(monkeypatch): + """Harness construction should not require provider API keys before inference.""" monkeypatch.delenv("OPENAI_API_KEY", raising=False) monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) - agent = LLMAgent(model_name="gpt-5-mini") - assert agent.llm is None + harness = MinimalHarness(model_name="gpt-5-mini") + assert harness.llm is None def 
test_gemini_prompt_uses_selected_template(): - agent = LLMAgent(model_name="gemini-2.5-flash") - prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}]) + harness = MinimalHarness(model_name="gemini-2.5-flash", action_template="reasoning.jinja") + prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) assert "Return ONLY valid JSON" in prompt assert "A" in prompt assert "B" in prompt def test_non_gemini_prompt_uses_selected_template(): - agent = LLMAgent(model_name="gpt-5-mini", action_template="stub.jinja") - prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}]) + harness = MinimalHarness(model_name="gpt-5-mini", action_template="stub.jinja") + prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) assert "IMPORTANT: Please respond with ONLY a single number" in prompt +def test_formatted_user_prompt_does_not_duplicate_system_prompt(): + harness = MinimalHarness(model_name="gpt-5-mini", action_template="stub.jinja") + prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) + + assert "experienced interactive fiction player" not in prompt + + def test_template_alias_without_suffix_is_supported(): - agent = LLMAgent(model_name="gpt-5-mini", action_template="reasoning") - prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}]) + harness = MinimalHarness(model_name="gpt-5-mini", action_template="reasoning") + prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) assert '"result"' in prompt def test_gpt5_force_numeric_retry_path(): - agent = LLMAgent(model_name="gpt-5-mini") + harness = MinimalHarness(model_name="gpt-5-mini") mocked_llm = Mock() mocked_llm.get_completion.side_effect = ["```json\n{", "```json\n{", "2"] mocked_llm.get_last_usage.side_effect = [ @@ -104,58 +112,57 @@ def test_gpt5_force_numeric_retry_path(): {"prompt_tokens": 6, "completion_tokens": 1, "total_tokens": 7, "estimated_cost_usd": 0.0005}, {"prompt_tokens": 4, 
"completion_tokens": 1, "total_tokens": 5, "estimated_cost_usd": 0.0003}, ] - agent.llm = mocked_llm + harness.llm = mocked_llm - action = agent.get_action("state", [{"text": "A"}, {"text": "B"}]) + action = harness.get_action("state", [{"text": "A"}, {"text": "B"}]) assert action == 2 assert mocked_llm.get_completion.call_count == 3 - last = agent.get_last_response() + last = harness.get_last_response() assert last.total_tokens == 24 assert last.estimated_cost_usd == pytest.approx(0.0018) assert last.parse_mode == "force_retry_number_only" def test_contextual_state_includes_previous_observations(): - agent = LLMAgent(model_name="gpt-5-mini") - agent._remember_observation("Previous hint") - agent._remember_observation("Current state") - contextual = agent._build_contextual_state("Current state") + harness = MinimalHarness(model_name="gpt-5-mini") + harness.memory_module.update({"observation": "Previous hint"}) + harness.memory_module.update({"observation": "Current state"}) + contextual = harness._build_contextual_state("Current state") assert "Recent context from previous steps" in contextual assert "Previous hint" in contextual def test_contextual_state_includes_recent_decisions(): - agent = LLMAgent(model_name="gpt-5-mini") - agent._decision_history = [ - {"action": 2, "choice": "Inspect the terminal", "parse_mode": "json_direct"}, - {"action": 1, "choice": "Ask for access", "parse_mode": "retry_json_repaired"}, - ] - contextual = agent._build_contextual_state("Current state") + harness = MinimalHarness(model_name="gpt-5-mini") + harness.memory_module.update({"observation": "Previous state"}) + harness.memory_module.update({"action": 2, "choice": "Inspect the terminal", "parse_mode": "json_direct"}) + harness.memory_module.update({"action": 1, "choice": "Ask for access", "parse_mode": "retry_json_repaired"}) + contextual = harness._build_contextual_state("Current state") assert "Recent selected actions" in contextual assert "Inspect the terminal" in contextual 
assert "parse=json_direct" in contextual def test_safety_filter_prefers_lower_risk_choice(): - agent = LLMAgent(model_name="gpt-5-mini") + harness = MinimalHarness(model_name="gpt-5-mini") choices = [ {"text": "Пойти в космопорт и улететь, чтобы завтра не позориться"}, {"text": "Постараться пройти мимо"}, ] - assert agent._apply_safety_filter(1, choices) == 2 + assert harness._apply_safety_filter(choices, 1) == 2 def test_get_last_response_uses_skip_single_result(): - agent = LLMAgent(model_name="gpt-5-mini", skip_single=True) - agent.history.append(LLMResponse(action=2, is_default=False)) - agent._last_response = LLMResponse(action=2, is_default=False) + harness = MinimalHarness(model_name="gpt-5-mini", skip_single=True) + harness.history.append(LLMResponse(action=2, is_default=False)) + harness._last_response = LLMResponse(action=2, is_default=False) - action = agent.get_action("state", [{"id": "1", "text": "Only option"}]) + action = harness.get_action("state", [{"id": "1", "text": "Only option"}]) assert action == 1 - assert agent.get_last_response().action == 1 - assert agent.get_last_response().reasoning == "auto_single_choice" + assert harness.get_last_response().action == 1 + assert harness.get_last_response().reasoning == "auto_single_choice" def test_parse_llm_response_number_only_tracks_parse_mode(): @@ -194,7 +201,7 @@ def test_parse_llm_response_uses_analysis_as_reasoning_when_truncated(): def test_llm_error_default_response_keeps_reasoning_marker(): - agent = LLMAgent(model_name="gemini-2.5-flash") + harness = MinimalHarness(model_name="gemini-2.5-flash") mocked_llm = Mock() mocked_llm.get_completion.side_effect = RuntimeError("provider returned empty message") mocked_llm.get_last_usage.return_value = { @@ -203,20 +210,20 @@ def test_llm_error_default_response_keeps_reasoning_marker(): "total_tokens": 0, "estimated_cost_usd": None, } - agent.llm = mocked_llm + harness.llm = mocked_llm - action = agent.get_action("state", [{"text": "A"}, {"text": 
"B"}]) + action = harness.get_action("state", [{"text": "A"}, {"text": "B"}]) assert action == 1 - last = agent.get_last_response() + last = harness.get_last_response() assert last.is_default is True assert last.reasoning is not None assert "llm_call_error" in last.reasoning def test_retry_prompt_requests_json_payload(): - agent = LLMAgent(model_name="gemini-2.5-flash") - prompt = agent._format_retry_prompt("state", [{"text": "A"}, {"text": "B"}]) + harness = MinimalHarness(model_name="gemini-2.5-flash") + prompt = harness._format_retry_prompt("state", [{"text": "A"}, {"text": "B"}]) assert "Return valid JSON only" in prompt assert '"analysis"' in prompt assert '"reasoning"' in prompt @@ -224,7 +231,7 @@ def test_retry_prompt_requests_json_payload(): def test_retry_preserves_reasoning_from_first_attempt(): - agent = LLMAgent(model_name="gemini-2.5-flash") + harness = MinimalHarness(model_name="gemini-2.5-flash") mocked_llm = Mock() mocked_llm.get_completion.side_effect = [ "Analysis: low oxygen\nReasoning: safer move first\n```json\n{", @@ -244,12 +251,12 @@ def test_retry_preserves_reasoning_from_first_attempt(): "estimated_cost_usd": 0.0002, }, ] - agent.llm = mocked_llm + harness.llm = mocked_llm - action = agent.get_action("state", [{"text": "A"}, {"text": "B"}]) + action = harness.get_action("state", [{"text": "A"}, {"text": "B"}]) assert action == 2 - last = agent.get_last_response() + last = harness.get_last_response() assert last.analysis is not None assert "low oxygen" in last.analysis assert last.reasoning is not None diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py new file mode 100644 index 0000000..49062fe --- /dev/null +++ b/llm_quest_benchmark/tests/harnesses/test_factory.py @@ -0,0 +1,203 @@ +import pytest + +from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness +from llm_quest_benchmark.harnesses.memo import MemoCompactHarness +from 
llm_quest_benchmark.harnesses.minimal import MinimalHarness +from llm_quest_benchmark.players.human import HumanPlayer +from llm_quest_benchmark.players.random import RandomPlayer +from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig + + +def test_create_minimal_harness(): + harness = create_harness("minimal", model="gpt-5-mini") + + assert isinstance(harness, MinimalHarness) + + +def test_all_harness_names_instantiate(): + for harness_name, harness_cls in HARNESS_REGISTRY.items(): + harness = create_harness(harness_name, model="gpt-5-mini") + + assert isinstance(harness, harness_cls) + + +def test_create_human_harness(): + harness = create_harness("human") + + assert isinstance(harness, HumanPlayer) + + +def test_create_random_choice_harness(): + harness = create_harness("random_choice") + + assert isinstance(harness, RandomPlayer) + + +def test_create_seeded_random_choice_harness(): + harness = create_harness("random_choice_123", model="random_choice") + + assert isinstance(harness, RandomPlayer) + assert harness.agent_id == "random_123" + + +def test_create_bad_harness_name_raises(): + with pytest.raises(ValueError, match="minimal"): + create_harness("bad_name", model="gpt-5-mini") + + +def test_create_bad_random_choice_seed_raises(): + with pytest.raises(ValueError, match="random_choice_"): + create_harness("random_choice_bad") + + +def test_random_choice_model_does_not_hide_bad_harness(): + with pytest.raises(ValueError, match="bad_name"): + create_harness("bad_name", model="random_choice_123") + + +def test_random_choice_model_requires_random_harness(): + with pytest.raises(ValueError, match="harness='random_choice'"): + create_harness("minimal", model="random_choice") + + +def test_seeded_random_model_is_rejected(): + with pytest.raises(ValueError, match="Encode random seeds in harness"): + create_harness("random_choice", model="random_choice_123") + + +def test_human_model_requires_human_harness(): + with pytest.raises(ValueError, 
match="harness='human'"): + create_harness("minimal", model="human") + + +def test_harness_config_stable_harness_id(): + config = HarnessConfig(harness="memo_compact", model="gpt-5-mini") + + assert isinstance(config.harness_id, str) + assert config.harness_id == HarnessConfig(harness="memo_compact", model="gpt-5-mini").harness_id + + +def test_harness_config_system_template_affects_harness_id(): + first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role.jinja") + second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="custom_system_role.jinja") + + assert first.harness_id != second.harness_id + + +def test_non_compaction_harness_id_ignores_compaction_interval(): + first = HarnessConfig(harness="reasoning_recent", model="gpt-5-mini", compaction_interval=10) + second = HarnessConfig(harness="reasoning_recent", model="gpt-5-mini", compaction_interval=99) + + assert first.harness_id == second.harness_id + + +def test_compaction_harness_id_includes_compaction_interval(): + first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", compaction_interval=10) + second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", compaction_interval=99) + + assert first.harness_id != second.harness_id + + +def test_harness_config_allows_seeded_random_choice_harness(): + config = HarnessConfig(harness="random_choice_123", model="random_choice") + + assert config.harness == "random_choice_123" + + +def test_harness_config_rejects_llm_model_with_random_harness(): + with pytest.raises(ValueError, match="model: random_choice"): + HarnessConfig(harness="random_choice", model="gpt-5-mini") + + +def test_harness_config_rejects_llm_model_with_human_harness(): + with pytest.raises(ValueError, match="model: human"): + HarnessConfig(harness="human", model="gpt-5-mini") + + +def test_harness_config_rejects_random_model_with_llm_harness(): + with pytest.raises(ValueError, match="harness: random_choice"): + 
HarnessConfig(harness="minimal", model="random_choice") + + +def test_harness_config_rejects_human_model_with_llm_harness(): + with pytest.raises(ValueError, match="harness: human"): + HarnessConfig(harness="minimal", model="human") + + +def test_harness_config_allows_retired_exp4_aliases(): + for harness_name in ("compaction_no_memo", "memo_cot", "memo_extended", "memo_structured"): + config = HarnessConfig(harness=harness_name, model="gpt-5-mini") + + assert config.harness == harness_name + + +def test_harness_config_rejects_old_template_key(): + with pytest.raises(ValueError, match="Use harness: key instead of template:"): + HarnessConfig(model="gpt-5-mini", template="reasoning.jinja") + + +def test_harness_config_rejects_old_memory_mode_key(): + with pytest.raises(ValueError, match="Use harness: key instead of memory_mode:"): + HarnessConfig(model="gpt-5-mini", harness="memo_compact", memory_mode="compaction") + + +def test_benchmark_config_from_yaml_parses_harness(tmp_path): + quest_path = tmp_path / "quest.qm" + quest_path.write_text("", encoding="utf-8") + config_path = tmp_path / "benchmark.yaml" + config_path.write_text( + f""" +quests: + - {quest_path} +agents: + - model: gpt-5-mini + harness: memo_compact +""", + encoding="utf-8", + ) + + config = BenchmarkConfig.from_yaml(str(config_path)) + + assert len(config.agents) == 1 + assert isinstance(config.agents[0], HarnessConfig) + assert isinstance(create_harness(config.agents[0].harness, model=config.agents[0].model), MemoCompactHarness) + assert config.agents[0].harness == "memo_compact" + + +def test_benchmark_config_from_yaml_rejects_template(tmp_path): + quest_path = tmp_path / "quest.qm" + quest_path.write_text("", encoding="utf-8") + config_path = tmp_path / "benchmark.yaml" + config_path.write_text( + f""" +quests: + - {quest_path} +agents: + - model: gpt-5-mini + template: reasoning.jinja +""", + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="Use harness: key instead of 
template:"): + BenchmarkConfig.from_yaml(str(config_path)) + + +def test_benchmark_config_from_yaml_rejects_memory_mode(tmp_path): + quest_path = tmp_path / "quest.qm" + quest_path.write_text("", encoding="utf-8") + config_path = tmp_path / "benchmark.yaml" + config_path.write_text( + f""" +quests: + - {quest_path} +agents: + - model: gpt-5-mini + harness: memo_compact + memory_mode: compaction +""", + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="Use harness: key instead of memory_mode:"): + BenchmarkConfig.from_yaml(str(config_path)) diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py new file mode 100644 index 0000000..efa03bb --- /dev/null +++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py @@ -0,0 +1,374 @@ +"""Comprehensive tests for concrete harness behavior.""" + +from unittest.mock import Mock + +from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness +from llm_quest_benchmark.harnesses.memo import ( + CompactionNoMemoHarness, + HintedCompactHarness, + MemoCompactHarness, + MemoCotHarness, + MemoExtendedHarness, + MemoStructuredHarness, +) +from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory +from llm_quest_benchmark.harnesses.minimal import MinimalHarness +from llm_quest_benchmark.harnesses.planner import PlannerHarness +from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness +from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness + +HARNESS_SPECS = { + "minimal": (MinimalHarness, "stub.jinja", DefaultMemory), + "reasoning_recent": (ReasoningRecentHarness, "reasoning.jinja", DefaultMemory), + "reasoning_full": (ReasoningFullTranscriptHarness, "reasoning.jinja", FullTranscriptMemory), + "memo_compact": (MemoCompactHarness, "stateful_compact.jinja", CompactionMemory), + "hinted_compact": 
(HintedCompactHarness, "stateful_compact_hints.jinja", CompactionMemory), + "tool_compact": (ToolCompactHarness, "tool_augmented.jinja", CompactionMemory), + "tool_hinted": (ToolHintedHarness, "tool_augmented_hints.jinja", CompactionMemory), + "planner": (PlannerHarness, "planner.jinja", CompactionMemory), + "compaction_no_memo": (CompactionNoMemoHarness, "reasoning.jinja", CompactionMemory), + "memo_cot": (MemoCotHarness, "memo_cot.jinja", CompactionMemory), + "memo_extended": (MemoExtendedHarness, "memo_extended.jinja", CompactionMemory), + "memo_structured": (MemoStructuredHarness, "memo_structured.jinja", CompactionMemory), +} + + +def assert_harness_configuration(harness_name: str) -> None: + expected_class, expected_template, expected_memory_class = HARNESS_SPECS[harness_name] + + harness = create_harness(harness_name, model="gpt-5-mini") + + assert isinstance(harness, expected_class) + assert harness.harness_name == harness_name + assert harness.action_template == expected_template + assert isinstance(harness.memory_module, expected_memory_class) + + +def test_minimal_harness_configuration(): + assert_harness_configuration("minimal") + + +def test_reasoning_recent_harness_configuration(): + assert_harness_configuration("reasoning_recent") + + +def test_reasoning_full_harness_configuration(): + assert_harness_configuration("reasoning_full") + + +def test_memo_compact_harness_configuration(): + assert_harness_configuration("memo_compact") + + +def test_hinted_compact_harness_configuration(): + assert_harness_configuration("hinted_compact") + + +def test_tool_compact_harness_configuration(): + assert_harness_configuration("tool_compact") + + +def test_tool_hinted_harness_configuration(): + assert_harness_configuration("tool_hinted") + + +def test_planner_harness_configuration(): + assert_harness_configuration("planner") + + +def test_exp4_retired_harness_configuration(): + assert_harness_configuration("compaction_no_memo") + 
assert_harness_configuration("memo_cot") + assert_harness_configuration("memo_extended") + assert_harness_configuration("memo_structured") + + +def test_all_registry_harnesses_have_configuration_specs(): + assert set(HARNESS_REGISTRY) == set(HARNESS_SPECS) + + +def test_all_registry_harnesses_instantiate_with_expected_names(): + for harness_name in HARNESS_REGISTRY: + harness = create_harness(harness_name, model="gpt-5-mini") + + assert harness.harness_name == harness_name + + +def test_memo_compact_mocked_llm_returns_action_and_reuses_memo_context(): + harness = MemoCompactHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + '{"memo":"Merchant needs fuel payment","analysis":"pay first","reasoning":"quest clue","result":2}', + '{"memo":"Paid fuel merchant","analysis":"memo says paid","reasoning":"continue","result":1}', + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + first_action = harness.get_action("A merchant offers fuel for a fee.", [{"text": "Leave"}, {"text": "Pay"}]) + second_action = harness.get_action("The fuel gauge still blinks.", [{"text": "Check receipt"}, {"text": "Leave"}]) + + assert first_action == 2 + assert second_action == 1 + assert harness.get_last_response().memo == "Paid fuel merchant" + second_prompt = mocked_llm.get_completion.call_args_list[1].args[0] + assert "Merchant needs fuel payment" in second_prompt + + +def test_compaction_memory_receives_existing_llm_client(): + harness = MemoCompactHarness(model_name="gpt-5-mini", compaction_interval=1) + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + '{"memo":"Paid fuel merchant","analysis":"pay first","reasoning":"quest clue","result":2}', + "Summary: paid the fuel merchant and should keep receipt.", + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 
5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + action = harness.get_action("A merchant offers fuel for a fee.", [{"text": "Leave"}, {"text": "Pay"}]) + + assert action == 2 + assert harness.memory_module.llm_client is mocked_llm + assert harness.memory_module._compaction_summary == "Summary: paid the fuel merchant and should keep receipt." + assert harness.memory_module.steps_since_compaction == 0 + + +def test_planner_harness_first_turn_generates_plan_then_acts(): + harness = PlannerHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + "Gather clues first. Avoid direct fights. Preserve resources.", + '{"analysis":"plan says scout","reasoning":"safer branch","result":2}', + ] + mocked_llm.get_last_usage.side_effect = [ + {"prompt_tokens": 30, "completion_tokens": 12, "total_tokens": 42, "estimated_cost_usd": 0.001}, + {"prompt_tokens": 20, "completion_tokens": 8, "total_tokens": 28, "estimated_cost_usd": 0.0007}, + ] + harness.llm = mocked_llm + + action = harness.get_action("You enter a pirate station.", [{"text": "Scout ahead"}, {"text": "Attack now"}]) + + assert action == 2 + assert harness.current_plan is not None + assert "Avoid direct fights" in harness.current_plan + assert mocked_llm.get_completion.call_count == 2 + assert harness.get_last_response().total_tokens == 70 + + +def test_planner_harness_reuses_plan_when_state_is_stable(): + harness = PlannerHarness(model_name="gpt-5-mini") + harness.current_plan = "Keep moving carefully and avoid a direct fight." 
+ harness._observation_history = ["Quiet corridor."] + mocked_llm = Mock() + mocked_llm.get_completion.return_value = '{"analysis":"plan still fits","reasoning":"careful progress","result":1}' + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 18, + "completion_tokens": 7, + "total_tokens": 25, + "estimated_cost_usd": 0.0005, + } + harness.llm = mocked_llm + + action = harness.get_action("Quiet corridor.", [{"text": "Open the door"}, {"text": "Run"}]) + + assert action == 1 + assert mocked_llm.get_completion.call_count == 1 + + +def test_planner_harness_uses_contextual_memory_state(): + harness = PlannerHarness(model_name="gpt-5-mini", compaction_interval=50) + harness.memory_module.set_quest_briefing("Original mission: win the election.") + harness.memory_module.transcript = [ + { + "step": 1, + "observation": "You learned Maloqs value strength.", + "choice_text": "Ask about Maloqs", + "memo": "Maloqs value strength", + "action": 1, + } + ] + harness.memory_module.steps_since_compaction = 1 + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + "Use the remembered cultural clue.", + '{"analysis":"use clue","reasoning":"fits plan","result":1}', + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 1, + "completion_tokens": 1, + "total_tokens": 2, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + harness.get_action("Current banquet scene.", [{"text": "Greet like a warrior"}]) + + first_prompt = mocked_llm.get_completion.call_args_list[0].args[0] + assert "Quest briefing" in first_prompt + assert "RECENT STEPS" in first_prompt + assert "Maloqs value strength" in first_prompt + + +def test_tool_compact_harness_can_use_quest_history(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + harness._step_log = [ + { + "step": 1, + "observation": "Merchant mentioned low fuel.", + "choices": ["Buy fuel", "Keep flying"], + "selected_choice": "Buy fuel", + } + ] + harness._history_tool.step_log = harness._step_log + 
mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + '{"analysis":"need history","tool_calls":[{"tool":"quest_history","input":"fuel merchant"}],"result":null}', + '{"analysis":"fuel clue matters","reasoning":"play safe","result":1}', + ] + mocked_llm.get_last_usage.side_effect = [ + {"prompt_tokens": 24, "completion_tokens": 10, "total_tokens": 34, "estimated_cost_usd": 0.0008}, + {"prompt_tokens": 22, "completion_tokens": 9, "total_tokens": 31, "estimated_cost_usd": 0.0007}, + ] + harness.llm = mocked_llm + + action = harness.get_action("Your fuel gauge is blinking.", [{"text": "Refuel"}, {"text": "Attack pirates"}]) + + assert action == 1 + assert mocked_llm.get_completion.call_count == 2 + assert harness.get_last_response().total_tokens == 65 + assert len(harness._step_log) == 2 + assert harness.get_last_response().tool_results + assert "Merchant mentioned low fuel" in harness.get_last_response().tool_results[0] + + +def test_tool_compact_calculator_supports_arithmetic_and_comparisons(): + assert ToolCompactHarness.calculator("55 + 12 - 5") == "55 + 12 - 5 = 62" + assert ToolCompactHarness.calculator("60 >= 55 and 62 >= 80") == "60 >= 55 and 62 >= 80 = False" + assert ToolCompactHarness.calculator("__import__('os')").startswith("error:") + + +def test_tool_compact_scratchpad_read_write_and_reset(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + + assert harness.scratchpad("read") == "(empty)" + assert ( + harness.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") == "updated: Board: W B _ ; failed door 2" + ) + assert harness.scratchpad("read") == "Board: W B _ ; failed door 2" + + harness.reset() + + assert harness.scratchpad("read") == "(empty)" + + +def test_tool_compact_harness_can_use_calculator_and_records_tool_metadata(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + '{"memo":"Need mix math","analysis":"calculate 
target","tool_calls":[{"tool":"calculator","input":"50 + 3 >= 55"}],"result":null}', + '{"memo":"Need more strength","analysis":"math failed","reasoning":"choose strength","result":2}', + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + action = harness.get_action("Strength is 50. Need at least 55.", [{"text": "Add water"}, {"text": "Add repusator"}]) + + response = harness.get_last_response() + assert action == 2 + assert response.tool_calls == [{"tool": "calculator", "input": "50 + 3 >= 55", "operation": "", "content": ""}] + assert response.tool_results == ["calculator(50 + 3 >= 55) => 50 + 3 >= 55 = False"] + assert response.memo == "Need more strength" + + +def test_tool_compact_harness_can_use_scratchpad_tool_call(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + ( + '{"analysis":"save board","tool_calls":[{"tool":"scratchpad",' + '"operation":"write_replace","content":"Board: red blue blank"}],"result":null}' + ), + '{"analysis":"note saved","reasoning":"use saved board","result":1}', + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + action = harness.get_action("A colored board blocks the hall.", [{"text": "Use red-blue order"}]) + + assert action == 1 + assert harness.scratchpad("read") == "Board: red blue blank" + assert harness.get_last_response().tool_results == [ + "scratchpad(write_replace, Board: red blue blank) => updated: Board: red blue blank" + ] + + +def test_tool_compact_harness_uses_contextual_memory_state(): + harness = ToolCompactHarness(model_name="gpt-5-mini", compaction_interval=50) + harness.memory_module.set_quest_briefing("Original mission: pass pilot certification.") + 
harness.memory_module.transcript = [ + { + "step": 1, + "observation": "Hogger is greedy.", + "choice_text": "Bribe Hogger", + "memo": "Hogger is greedy", + "action": 1, + } + ] + harness.memory_module.steps_since_compaction = 1 + mocked_llm = Mock() + mocked_llm.get_completion.return_value = ( + '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}' + ) + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + harness.get_action("Current exam room.", [{"text": "Offer a bribe"}]) + + prompt = mocked_llm.get_completion.call_args.args[0] + assert "Quest briefing" in prompt + assert "RECENT STEPS" in prompt + assert "Hogger is greedy" in prompt + + +def test_tool_compact_harness_can_finish_without_tools_in_one_call(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.return_value = ( + '{"analysis":"no tools needed","tool_calls":[],"reasoning":"direct clue","result":2}' + ) + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 15, + "completion_tokens": 6, + "total_tokens": 21, + "estimated_cost_usd": 0.0004, + } + harness.llm = mocked_llm + + action = harness.get_action("A guard points at the safe exit.", [{"text": "Fight"}, {"text": "Leave"}]) + + assert action == 2 + assert mocked_llm.get_completion.call_count == 1 diff --git a/llm_quest_benchmark/tests/integration/test_benchmark.py b/llm_quest_benchmark/tests/integration/test_benchmark.py index ee0704e..1c56d35 100644 --- a/llm_quest_benchmark/tests/integration/test_benchmark.py +++ b/llm_quest_benchmark/tests/integration/test_benchmark.py @@ -5,11 +5,11 @@ import pytest -from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, SYSTEM_ROLE_TEMPLATE +from llm_quest_benchmark.constants import SYSTEM_ROLE_TEMPLATE from llm_quest_benchmark.environments.state import QuestOutcome from 
llm_quest_benchmark.executors import benchmark as benchmark_module from llm_quest_benchmark.executors.benchmark import run_benchmark -from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig +from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig def _fake_task_for_parallel_test(task, result_queue): @@ -58,10 +58,10 @@ def test_benchmark_e2e(caplog, tmp_path): config = BenchmarkConfig( quests=[str(quest_path)], agents=[ - AgentConfig( + HarnessConfig( model="random_choice", # Use random_choice for testing + harness="random_choice", system_template=SYSTEM_ROLE_TEMPLATE, - action_template=DEFAULT_TEMPLATE, temperature=0.0, skip_single=True, ) @@ -83,9 +83,9 @@ def test_benchmark_e2e(caplog, tmp_path): # Check first result result = results[0] assert result["quest"] == str(quest_path) - assert result["model"] == "random_choice" + assert result["model"] == "random_policy" assert result["temperature"] == 0.0 - assert result["template"] == DEFAULT_TEMPLATE + assert result["template"] == "reasoning.jinja" assert result["attempt"] == 1 assert "agent_id" in result assert "outcome" in result @@ -122,9 +122,9 @@ def test_benchmark_supports_multiple_runs_per_agent(tmp_path): config = BenchmarkConfig( quests=[str(quest_path)], agents=[ - AgentConfig( + HarnessConfig( model="random_choice", - action_template="reasoning", + harness="random_choice", temperature=0.0, runs=2, skip_single=True, @@ -154,7 +154,7 @@ def test_benchmark_uses_max_workers(monkeypatch, tmp_path): config = BenchmarkConfig( quests=[str(quest_path)], - agents=[AgentConfig(model="random_choice", runs=4)], + agents=[HarnessConfig(model="random_choice", harness="random_choice", runs=4)], quest_timeout=5, max_workers=2, output_dir=str(tmp_path), @@ -187,7 +187,7 @@ def test_benchmark_enforces_child_process_timeout(monkeypatch, tmp_path): config = BenchmarkConfig( quests=[str(quest_path)], - agents=[AgentConfig(model="random_choice", runs=1)], + 
agents=[HarnessConfig(model="random_choice", harness="random_choice", runs=1)], quest_timeout=1, max_workers=1, output_dir=str(tmp_path), diff --git a/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py b/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py index 5563ca2..2ceeaca 100644 --- a/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py +++ b/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py @@ -1,12 +1,12 @@ -"""Integration tests for planner/tool modes on real quest execution loops.""" +"""Integration tests for planner/tool harness modes on real quest execution loops.""" from pathlib import Path import pytest -from llm_quest_benchmark.agents.agent_factory import create_agent from llm_quest_benchmark.core.runner import run_quest_with_timeout from llm_quest_benchmark.environments.state import QuestOutcome +from llm_quest_benchmark.harnesses.factory import create_harness QUEST_PATHS = [ "quests/Boat.qm", @@ -38,18 +38,18 @@ def get_last_usage(self): @pytest.mark.timeout(15) @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded") -def test_planner_agent_runs_three_quests_across_openai_and_anthropic_models(monkeypatch): +def test_planner_harness_runs_three_quests_across_openai_and_anthropic_models(monkeypatch): requested_models = [] def fake_get_llm_client(model_name, **kwargs): requested_models.append(model_name) return FakeLLM("planner") - monkeypatch.setattr("llm_quest_benchmark.agents.llm_agent.get_llm_client", fake_get_llm_client) + monkeypatch.setattr("llm_quest_benchmark.harnesses.base.get_llm_client", fake_get_llm_client) for model_name in ["gpt-5-mini", "claude-sonnet-4-5"]: for quest_path in QUEST_PATHS: - agent = create_agent(model=model_name, action_template="planner", skip_single=True) + agent = create_harness("planner", model=model_name, skip_single=True) outcome = run_quest_with_timeout(quest_path, agent, timeout=10) assert outcome in {QuestOutcome.SUCCESS, 
QuestOutcome.FAILURE, QuestOutcome.TIMEOUT} assert outcome != QuestOutcome.ERROR @@ -60,14 +60,14 @@ def fake_get_llm_client(model_name, **kwargs): @pytest.mark.timeout(15) @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded") -def test_tool_agent_runs_three_quests(monkeypatch): +def test_tool_harness_runs_three_quests(monkeypatch): monkeypatch.setattr( - "llm_quest_benchmark.agents.llm_agent.get_llm_client", + "llm_quest_benchmark.harnesses.base.get_llm_client", lambda model_name, **kwargs: FakeLLM("tool"), ) for quest_path in QUEST_PATHS: - agent = create_agent(model="gpt-5-mini", action_template="tool_augmented", skip_single=True) + agent = create_harness("tool_compact", model="gpt-5-mini", skip_single=True) outcome = run_quest_with_timeout(quest_path, agent, timeout=10) assert outcome in {QuestOutcome.SUCCESS, QuestOutcome.FAILURE, QuestOutcome.TIMEOUT} assert outcome != QuestOutcome.ERROR @@ -75,9 +75,9 @@ def test_tool_agent_runs_three_quests(monkeypatch): @pytest.mark.timeout(15) @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded") -def test_reused_mode_agents_reset_between_quest_runs(): +def test_reused_mode_harnesses_reset_between_quest_runs(): quest_path = "quests/sr_2_1_2121_eng/Borzukhan_eng.qm" - planner_agent = create_agent(model="gpt-5-mini", action_template="planner", skip_single=True) + planner_agent = create_harness("planner", model="gpt-5-mini", skip_single=True) planner_agent.llm = FakeLLM("planner") first_outcome = run_quest_with_timeout(quest_path, planner_agent, timeout=10) @@ -92,7 +92,7 @@ def test_reused_mode_agents_reset_between_quest_runs(): assert "stale plan from previous run" not in planner_agent._plan_history assert "stale observation" not in planner_agent._observation_history - tool_agent = create_agent(model="gpt-5-mini", action_template="tool_augmented", skip_single=True) + tool_agent = create_harness("tool_compact", model="gpt-5-mini", skip_single=True) 
tool_agent.llm = FakeLLM("tool") first_outcome = run_quest_with_timeout(quest_path, tool_agent, timeout=10) diff --git a/llm_quest_benchmark/tests/integration/test_quest_e2e.py b/llm_quest_benchmark/tests/integration/test_quest_e2e.py index 8ebfb91..3d02d1a 100644 --- a/llm_quest_benchmark/tests/integration/test_quest_e2e.py +++ b/llm_quest_benchmark/tests/integration/test_quest_e2e.py @@ -5,10 +5,10 @@ import pytest -from llm_quest_benchmark.agents.agent_factory import create_agent -from llm_quest_benchmark.constants import DEFAULT_QUEST, DEFAULT_TEMPLATE, SYSTEM_ROLE_TEMPLATE +from llm_quest_benchmark.constants import DEFAULT_QUEST, SYSTEM_ROLE_TEMPLATE from llm_quest_benchmark.core.runner import run_quest_with_timeout from llm_quest_benchmark.environments.state import QuestOutcome +from llm_quest_benchmark.harnesses.factory import create_harness TIMEOUT = 20 # 20s should be enough for test quests to complete @@ -19,11 +19,11 @@ def test_quest_run_with_llm(caplog): """Test that quest runs with LLM agent and reaches a final state""" caplog.set_level(logging.DEBUG) # Show all logs in test output - # Create LLM agent - agent = create_agent( + # Create random harness + agent = create_harness( + harness="random_choice", model="random_choice", # Use random for testing system_template=SYSTEM_ROLE_TEMPLATE, - action_template=DEFAULT_TEMPLATE, temperature=0.0, skip_single=False, debug=True, @@ -63,13 +63,13 @@ def mock_callback(event: str, data: Any) -> None: @pytest.mark.e2e @pytest.mark.timeout(TIMEOUT) -def test_random_agent_on_test_quest(caplog): - """Test that random agent can complete a test quest""" +def test_random_player_on_test_quest(caplog): + """Test that random player can complete a test quest""" caplog.set_level(logging.DEBUG) # Show all logs in test output - # Create random agent - agent = create_agent("random_choice", skip_single=True, debug=True) - assert agent is not None, "Failed to create random agent" + # Create random player + agent = 
create_harness("random_choice", skip_single=True, debug=True) + assert agent is not None, "Failed to create random player" # Mock callback for testing def mock_callback(event: str, data: Any) -> None: @@ -80,7 +80,7 @@ def mock_callback(event: str, data: Any) -> None: elif event == "error": caplog.error(f"Error: {data}") - # Run quest with random agent + # Run quest with random player try: outcome = run_quest_with_timeout( quest_path=str(DEFAULT_QUEST), diff --git a/llm_quest_benchmark/tests/agents/test_human_player.py b/llm_quest_benchmark/tests/players/test_human_player.py similarity index 95% rename from llm_quest_benchmark/tests/agents/test_human_player.py rename to llm_quest_benchmark/tests/players/test_human_player.py index 8334ebd..7108f78 100644 --- a/llm_quest_benchmark/tests/agents/test_human_player.py +++ b/llm_quest_benchmark/tests/players/test_human_player.py @@ -4,7 +4,7 @@ import pytest -from llm_quest_benchmark.agents.human_player import HumanPlayer +from llm_quest_benchmark.players.human import HumanPlayer def test_human_player_initialization(): diff --git a/llm_quest_benchmark/tests/test_benchmark_with_directory.py b/llm_quest_benchmark/tests/test_benchmark_with_directory.py index 7661f3d..87b2221 100644 --- a/llm_quest_benchmark/tests/test_benchmark_with_directory.py +++ b/llm_quest_benchmark/tests/test_benchmark_with_directory.py @@ -6,20 +6,20 @@ import pytest +from llm_quest_benchmark.executors.benchmark import _result_entry, run_benchmark +from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig + # Configure logging logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) -from llm_quest_benchmark.executors.benchmark import run_benchmark -from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig - def create_test_config(): """Create a test benchmark configuration with directory path""" return { "name": "Directory 
Benchmark Test", "quests": ["quests/sr_2_1_2121_eng"], - "agents": [{"model": "random_choice", "skip_single": True, "temperature": 0.7}], + "agents": [{"model": "random_choice", "harness": "random_choice", "skip_single": True, "temperature": 0.7}], "quest_timeout": 4, # Keep runtime below pytest global timeout "max_quests": 1, "debug": True, @@ -27,6 +27,26 @@ def create_test_config(): } +def test_result_entry_logs_random_harness_model_as_random_policy(): + """Random harness results should not be attributed to the default LLM model.""" + agent_config = HarnessConfig(harness="random_choice", model="random_choice") + + result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE") + + assert result["model"] == "random_policy" + assert result["harness"] == "random_choice" + + +def test_result_entry_logs_human_harness_model_as_human(): + """Human harness results should not be attributed to the default LLM model.""" + agent_config = HarnessConfig(harness="human", model="human") + + result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE") + + assert result["model"] == "human" + assert result["harness"] == "human" + + @pytest.mark.skipif(not Path("quests/sr_2_1_2121_eng").exists(), reason="Quest files not downloaded") def test_benchmark_with_directory(): """Test running a benchmark with a directory path""" @@ -34,8 +54,8 @@ def test_benchmark_with_directory(): config_dict = create_test_config() logger.info(f"Created test config: {json.dumps(config_dict, indent=2)}") - # Convert agent dictionaries to AgentConfig objects first - config_dict["agents"] = [AgentConfig(**agent_dict) for agent_dict in config_dict["agents"]] + # Convert agent dictionaries to HarnessConfig objects first + config_dict["agents"] = [HarnessConfig(**agent_dict) for agent_dict in config_dict["agents"]] config = BenchmarkConfig(**config_dict) logger.info("Config validation passed") diff --git a/llm_quest_benchmark/tests/test_database.py b/llm_quest_benchmark/tests/test_database.py 
index a04f53d..d00c6b1 100644 --- a/llm_quest_benchmark/tests/test_database.py +++ b/llm_quest_benchmark/tests/test_database.py @@ -249,8 +249,8 @@ def test_run_summary_export_tracks_repetition_rate(tmp_path, monkeypatch, quest_ assert exported["metrics"]["bad_decision_rate"] == 0.0 -def test_random_agent_does_not_export_json(tmp_path, monkeypatch, quest_logger): - """Random agent runs should not create result artifacts in results/.""" +def test_random_player_does_not_export_json(tmp_path, monkeypatch, quest_logger): + """Random player runs should not create result artifacts in results/.""" monkeypatch.setattr(logging_module, "RESULTS_DIR", tmp_path) quest_logger.agent = "random_choice" diff --git a/llm_quest_benchmark/tests/test_leaderboard.py b/llm_quest_benchmark/tests/test_leaderboard.py index aa22296..46407cf 100644 --- a/llm_quest_benchmark/tests/test_leaderboard.py +++ b/llm_quest_benchmark/tests/test_leaderboard.py @@ -243,6 +243,73 @@ def test_generate_leaderboard_filters_public_slice(tmp_path, monkeypatch): assert {row["model"] for row in leaderboard["results"]} == {"model-a", "model-b", "model-c"} +def test_generate_leaderboard_excludes_retired_exp4_variants(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + + active_dir = Path("results/benchmarks/active") + active_dir.mkdir(parents=True, exist_ok=True) + retired_dir = Path("results/benchmarks/retired") + retired_dir.mkdir(parents=True, exist_ok=True) + + active_row = { + "quest": "quests/Core.qm", + "model": "gpt-5-mini", + "template": "stateful_compact.jinja", + "harness": "memo_compact", + "agent_id": "active", + "attempt": 1, + "outcome": "SUCCESS", + } + retired_rows = [ + { + "quest": "quests/Core.qm", + "model": "gpt-5-mini", + "template": "reasoning.jinja", + "harness": "compaction_no_memo", + "agent_id": "retired-no-memo", + "attempt": 1, + "outcome": "FAILURE", + }, + { + "quest": "quests/Core.qm", + "model": "gpt-5-mini", + "template": "memo_extended.jinja", + "harness": "memo_extended", 
+ "agent_id": "retired-extended", + "attempt": 1, + "outcome": "FAILURE", + }, + ] + + (active_dir / "benchmark_summary.json").write_text( + json.dumps({"benchmark_id": "active", "name": "active", "agents": [], "results": [active_row], "db_runs": []}), + encoding="utf-8", + ) + (retired_dir / "benchmark_summary.json").write_text( + json.dumps( + { + "benchmark_id": "retired", + "name": "exp4_compaction_no_memo", + "agents": [], + "results": retired_rows, + "db_runs": [], + } + ), + encoding="utf-8", + ) + + leaderboard = generate_leaderboard( + [str(active_dir), str(retired_dir)], + "site/leaderboard.json", + min_runs=0, + public_model_ids=None, + ) + + assert len(leaderboard["results"]) == 1 + assert leaderboard["results"][0]["mode"] == "compact_memory_memo" + assert leaderboard["results"][0]["runs"] == 1 + + def test_generate_leaderboard_matches_db_runs_by_identifiers(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) @@ -310,3 +377,44 @@ def test_generate_leaderboard_matches_db_runs_by_identifiers(tmp_path, monkeypat rows = {(row["quest"], row["mode"]): row for row in leaderboard["results"]} assert rows[("Alpha", "compact_memory_memo")]["avg_steps"] == 10.0 assert rows[("Beta", "full_history_reasoning")]["avg_steps"] == 20.0 + + +def test_generate_leaderboard_uses_result_row_memory_mode_without_db_config(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + benchmark_dir = Path("results/benchmarks/bench_result_memory_mode") + benchmark_dir.mkdir(parents=True, exist_ok=True) + results = [ + { + "quest": "quests/Beta.qm", + "model": "gpt-5-mini", + "template": "reasoning.jinja", + "memory_mode": "full_transcript", + "agent_id": "harness_gpt-5-mini", + "outcome": "SUCCESS", + } + ] + db_runs = [ + { + "id": 20, + "quest_file": "quests/Beta.qm", + "quest_name": "Beta", + "agent_id": "harness_gpt-5-mini", + "agent_config": json.dumps({"model": "gpt-5-mini", "harness": "reasoning_full"}), + "outcome": "SUCCESS", + } + ] + (benchmark_dir / 
"benchmark_summary.json").write_text( + json.dumps( + {"benchmark_id": "bench_result_memory_mode", "harnesses": [], "results": results, "db_runs": db_runs} + ), + encoding="utf-8", + ) + + leaderboard = generate_leaderboard( + [str(benchmark_dir)], + "site/leaderboard.json", + min_runs=0, + public_model_ids=None, + ) + + assert leaderboard["results"][0]["mode"] == "full_history_reasoning"