From a95908fb4fbf7b728fb4e43eb782f37c0291403d Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 13:51:09 +0400 Subject: [PATCH 01/24] harnesses: add package skeleton with base, memory, tools Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- llm_quest_benchmark/harnesses/__init__.py | 3 + llm_quest_benchmark/harnesses/base.py | 269 ++++++++++++++++++ llm_quest_benchmark/harnesses/memory.py | 317 ++++++++++++++++++++++ llm_quest_benchmark/harnesses/tools.py | 171 ++++++++++++ 4 files changed, 760 insertions(+) create mode 100644 llm_quest_benchmark/harnesses/__init__.py create mode 100644 llm_quest_benchmark/harnesses/base.py create mode 100644 llm_quest_benchmark/harnesses/memory.py create mode 100644 llm_quest_benchmark/harnesses/tools.py diff --git a/llm_quest_benchmark/harnesses/__init__.py b/llm_quest_benchmark/harnesses/__init__.py new file mode 100644 index 0000000..75cef22 --- /dev/null +++ b/llm_quest_benchmark/harnesses/__init__.py @@ -0,0 +1,3 @@ +from llm_quest_benchmark.harnesses.base import BaseHarness + +__all__ = ["BaseHarness"] diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py new file mode 100644 index 0000000..2ae3e16 --- /dev/null +++ b/llm_quest_benchmark/harnesses/base.py @@ -0,0 +1,269 @@ +"""Base harness class for quest benchmark experiments.""" + +import logging +from abc import abstractmethod +from typing import Any + +from llm_quest_benchmark.agents.base import QuestPlayer +from llm_quest_benchmark.agents.llm_agent import ( + RISKY_CHOICE_KEYWORDS, + SAFE_CHOICE_KEYWORDS, + _is_numeric_raw_reasoning, + _raw_reasoning_fallback, + parse_llm_response, +) +from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, normalize_template_name +from llm_quest_benchmark.llm.client import get_llm_client, parse_model_name +from llm_quest_benchmark.llm.prompt import PromptRenderer +from 
from llm_quest_benchmark.schemas.response import LLMResponse


class BaseHarness(QuestPlayer):
    """Abstract LLM harness base class.

    Bundles what concrete harnesses share: prompt rendering, a lazily created
    LLM client, response parsing with retries, a keyword-based safety filter
    and token-usage bookkeeping. Subclasses provide ``_get_action_impl`` and
    ``reset``.
    """

    # How many times ``_call_llm`` tries the provider before giving up.
    _MAX_LLM_ATTEMPTS = 3

    def __init__(
        self,
        model_name,
        system_template,
        temperature,
        skip_single,
        debug,
        memory_module=None,
        tools=None,
    ):
        """Store the configuration and build the prompt renderer.

        Args:
            model_name: Model identifier; lower-cased before use.
            system_template: System template name, normalized with
                ``normalize_template_name``.
            temperature: Sampling temperature forwarded to the LLM client.
            skip_single: Forwarded to ``QuestPlayer.__init__``.
            debug: Enables debug logging and warning output on failed calls.
            memory_module: Optional memory object; ``reset()`` is forwarded to it.
            tools: Optional list of tools; defaults to an empty list.
        """
        super().__init__(skip_single=skip_single)
        self.debug = debug
        self.model_name = model_name.lower()
        self.system_template = normalize_template_name(system_template)
        self.action_template = DEFAULT_TEMPLATE
        self.temperature = temperature
        self.harness_name = ""
        self.agent_id = f"harness_{self.model_name}"
        self.memory_module = memory_module
        self.tools = tools or []
        self.model_spec = parse_model_name(self.model_name)
        self.logger = logging.getLogger(self.__class__.__name__)
        # NOTE(review): indentation was lost in the patch this was recovered
        # from; the handler setup is assumed to be debug-only - confirm.
        if self.debug:
            self.logger.setLevel(logging.DEBUG)
            self.logger.propagate = False
            # Loggers are shared per class name, so tag our handler and only
            # attach it once no matter how many harnesses are created.
            if not any(getattr(h, "_llm_quest_handler", False) for h in self.logger.handlers):
                handler = logging.StreamHandler()
                handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
                handler._llm_quest_handler = True
                self.logger.addHandler(handler)

        self.prompt_renderer = PromptRenderer(
            None,
            system_template=self.system_template,
            action_template=self.action_template,
        )
        self.llm = None
        self.history: list[LLMResponse] = []
        self._use_safety_filter = True
        self._last_response = LLMResponse(action=1, is_default=True)

    def _ensure_llm(self) -> None:
        """Lazily create the provider client only when inference is needed."""
        if self.llm is None:
            self.llm = get_llm_client(
                self.model_name,
                system_prompt=self.prompt_renderer.render_system_prompt(),
                temperature=self.temperature,
            )

    @abstractmethod
    def _get_action_impl(self, observation, choices) -> int:
        """Return the selected 1-based action number."""

    @abstractmethod
    def reset(self) -> None:
        """Reset harness state between episodes.

        Declared abstract, yet it carries the shared reset logic so that
        overrides can reuse it through ``super().reset()``.
        """
        super().reset()
        self.history = []
        self._last_response = LLMResponse(action=1, is_default=True)
        if self.memory_module is not None:
            self.memory_module.reset()

    def _format_prompt(self, observation, choices, memo=None, context=None) -> str:
        """Render system and action Jinja templates for the current decision.

        Returns the system prompt followed by the action prompt, or only the
        action prompt when the system prompt renders empty.
        """
        system_prompt = self.prompt_renderer.render_system_prompt(
            observation=observation,
            choices=choices,
            memo=memo,
            context=context,
        ).strip()
        action_prompt = self.prompt_renderer.action_template.render(
            observation=observation,
            choices=[{"text": c.get("text", "")} for c in choices],
            memo=memo,
            context=context,
        ).strip()
        if system_prompt:
            return f"{system_prompt}\n\n{action_prompt}".strip()
        return action_prompt

    def _parse_llm_response(self, response, num_choices) -> LLMResponse:
        """Parse an LLM response into a structured response object."""
        return parse_llm_response(response, num_choices, self.debug, self.logger)

    def _call_llm(self, prompt, system_prompt=None) -> str:
        """Call the LLM client with lightweight retry handling.

        A ``TypeError`` raised while passing ``system_prompt`` is taken to mean
        the client does not accept that keyword; the prompt is then sent
        without it. That fallback now runs inside the retry loop, so a
        transient failure on it is retried like any other.

        Raises:
            TypeError: If the client raises it for a call made without
                ``system_prompt``.
            Exception: The last provider error once all attempts failed.
        """
        self._ensure_llm()
        send_system_prompt = system_prompt is not None
        last_error: Exception | None = None
        attempt = 0
        while attempt < self._MAX_LLM_ATTEMPTS:
            try:
                if send_system_prompt:
                    return self.llm.get_completion(prompt, system_prompt=system_prompt)
                return self.llm.get_completion(prompt)
            except TypeError:
                if not send_system_prompt:
                    raise
                # Drop the unsupported keyword and try again without spending
                # an attempt; this branch can run at most once.
                send_system_prompt = False
                continue
            except Exception as exc:
                last_error = exc
                if self.debug:
                    self.logger.warning("LLM call failed on attempt %d: %s", attempt + 1, exc)
            attempt += 1
        raise last_error or RuntimeError("LLM call failed")

    def _choice_risk_score(self, choice_text: str) -> int:
        """Score a choice: +2 per risky keyword found, -1 per safe keyword found."""
        text = (choice_text or "").lower()
        risky_hits = sum(1 for keyword in RISKY_CHOICE_KEYWORDS if keyword in text)
        safe_hits = sum(1 for keyword in SAFE_CHOICE_KEYWORDS if keyword in text)
        return 2 * risky_hits - safe_hits

    def _apply_safety_filter(self, choices, preferred_action) -> int:
        """Replace obviously risky actions when a clearly safer alternative exists.

        The preferred action is kept unless some other choice scores at least
        two risk points lower; out-of-range actions are returned untouched.
        """
        if not self._use_safety_filter or len(choices) < 2:
            return preferred_action

        current_idx = preferred_action - 1
        if current_idx < 0 or current_idx >= len(choices):
            return preferred_action

        scored = [(idx + 1, self._choice_risk_score(c.get("text", ""))) for idx, c in enumerate(choices)]
        # min() keeps the first of equally safe choices, like a stable sort would.
        best_action, best_score = min(scored, key=lambda item: item[1])
        current_score = scored[current_idx][1]
        if current_score - best_score >= 2:
            if self.debug:
                self.logger.debug(
                    "Safety filter override: %s -> %s (risk %s -> %s)",
                    preferred_action,
                    best_action,
                    current_score,
                    best_score,
                )
            return best_action
        return preferred_action

    @staticmethod
    def _normalize_usage(usage: dict[str, Any] | None) -> dict[str, Any]:
        """Return a usage dict with integer token counts and an optional float cost.

        Missing or falsy counts become 0; a missing total is derived from
        prompt + completion tokens; a missing cost stays ``None``.
        """
        usage = usage or {}
        prompt_tokens = int(usage.get("prompt_tokens") or 0)
        completion_tokens = int(usage.get("completion_tokens") or 0)
        total_tokens = int(usage.get("total_tokens") or (prompt_tokens + completion_tokens))
        estimated_cost_usd = usage.get("estimated_cost_usd")
        if estimated_cost_usd is not None:
            estimated_cost_usd = float(estimated_cost_usd)
        return {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
            "estimated_cost_usd": estimated_cost_usd,
        }

    @classmethod
    def _merge_usage(cls, first: dict[str, Any] | None, second: dict[str, Any] | None) -> dict[str, Any]:
        """Add two usage payloads; the cost stays ``None`` only if both lack one."""
        a = cls._normalize_usage(first)
        b = cls._normalize_usage(second)
        merged_cost = None
        if a["estimated_cost_usd"] is not None or b["estimated_cost_usd"] is not None:
            merged_cost = (a["estimated_cost_usd"] or 0.0) + (b["estimated_cost_usd"] or 0.0)
        return {
            "prompt_tokens": a["prompt_tokens"] + b["prompt_tokens"],
            "completion_tokens": a["completion_tokens"] + b["completion_tokens"],
            "total_tokens": a["total_tokens"] + b["total_tokens"],
            "estimated_cost_usd": merged_cost,
        }

    def _format_retry_prompt(self, state: str, choices: list[dict[str, str]]) -> str:
        """Build the compact JSON-only prompt used after an unparseable answer.

        The state is clipped to 500 characters and each choice to 160.
        """
        clipped_state = (state or "").strip()
        if len(clipped_state) > 500:
            clipped_state = clipped_state[:500] + "..."
        choices_text = "\n".join(f"{i + 1}. {(c.get('text', '') or '')[:160]}" for i, c in enumerate(choices))
        # NOTE(review): the JSON example below has empty values as recovered
        # from the patch; placeholder text may have been stripped - confirm.
        return f"""Choose the best action.
State: {clipped_state}
Actions:
{choices_text}

Return valid JSON only:
{{
  "analysis": "",
  "reasoning": "",
  "result":
}}"""

    def _format_force_numeric_retry_prompt(self, choices: list[dict[str, str]]) -> str:
        """Build the last-resort prompt that asks for a bare action number."""
        choices_text = "\n".join(f"{i + 1}. {(c.get('text', '') or '')[:110]}" for i, c in enumerate(choices))
        return f"""Pick one action number.
{choices_text}
Reply with one integer only: 1 to {len(choices)}."""

    def _needs_force_numeric_retry(self) -> bool:
        """Return True for OpenAI models whose id starts with ``gpt-5`` or ``o``."""
        return self.model_spec.provider == "openai" and self.model_spec.model_id.startswith(("gpt-5", "o"))

    def _parse_with_retries(self, prompt: str, observation: str, choices: list[dict[str, str]]) -> LLMResponse:
        """Call the model, parse, and retry on invalid/default output.

        One JSON retry is made when the first answer parses to the default
        response; if that also fails and the model qualifies, a second,
        number-only retry follows. Usage from every call is summed, the safety
        filter is applied, and the usage totals are written onto the response.
        """
        llm_response = self._call_llm(prompt)
        llm_usage = self.llm.get_last_usage()
        first_response = self._parse_llm_response(llm_response, len(choices))
        parsed_response = first_response

        if parsed_response.is_default:
            retry_response = self._call_llm(self._format_retry_prompt(observation, choices))
            retry_usage = self.llm.get_last_usage()
            llm_usage = self._merge_usage(llm_usage, retry_usage)
            retry_parsed = self._parse_llm_response(retry_response, len(choices))
            if not retry_parsed.is_default:
                retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
                parsed_response = retry_parsed
            elif self._needs_force_numeric_retry():
                force_retry_response = self._call_llm(self._format_force_numeric_retry_prompt(choices))
                force_retry_usage = self.llm.get_last_usage()
                llm_usage = self._merge_usage(llm_usage, force_retry_usage)
                force_retry_parsed = self._parse_llm_response(force_retry_response, len(choices))
                if not force_retry_parsed.is_default:
                    force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}"
                    parsed_response = force_retry_parsed

        if parsed_response is not first_response:
            # A retry won: salvage analysis/reasoning from the first answer so
            # the retry's terse output does not erase them.
            if parsed_response.analysis is None and first_response.analysis is not None:
                parsed_response.analysis = first_response.analysis
            if _is_numeric_raw_reasoning(parsed_response.reasoning):
                if first_response.reasoning and not _is_numeric_raw_reasoning(first_response.reasoning):
                    parsed_response.reasoning = first_response.reasoning
                else:
                    first_raw_reasoning = _raw_reasoning_fallback(llm_response)
                    if first_raw_reasoning and not _is_numeric_raw_reasoning(first_raw_reasoning):
                        parsed_response.reasoning = first_raw_reasoning

        action_before_policy = parsed_response.action
        parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
        if parsed_response.action != action_before_policy and not parsed_response.reasoning:
            parsed_response.reasoning = "policy_safety_override"

        usage_payload = self._normalize_usage(llm_usage)
        parsed_response.prompt_tokens = usage_payload["prompt_tokens"]
        parsed_response.completion_tokens = usage_payload["completion_tokens"]
        parsed_response.total_tokens = usage_payload["total_tokens"]
        parsed_response.estimated_cost_usd = usage_payload["estimated_cost_usd"]
        return parsed_response
+ pass + + def set_quest_briefing(self, briefing: str) -> None: + pass + + +class DefaultMemory(MemoryModule): + """Recent N observations window (no compaction).""" + + def __init__(self, context_window: int = 3, context_chars: int = 220, decision_window: int = 5): + self.context_window = context_window + self.context_chars = context_chars + self.decision_window = decision_window + self._quest_briefing: str | None = None + self._observations: list[str] = [] + self._decisions: list[dict[str, Any]] = [] + + def set_quest_briefing(self, briefing: str) -> None: + clean = (briefing or "").strip() + self._quest_briefing = clean or None + + def get_context(self, step: int) -> str: + blocks: list[str] = [] + current = self._observations[-1] if self._observations else "" + + briefing = self._briefing_block(current) + if briefing: + blocks.append(briefing) + + if len(self._observations) > 1: + previous = self._observations[:-1][-self.context_window :] + if previous: + snippets = [] + for idx, text in enumerate(previous, start=1): + clipped = text if len(text) <= self.context_chars else text[: self.context_chars] + "..." 
+ snippets.append(f"[Previous {idx}] {clipped}") + blocks.append("Recent context from previous steps:\n" + "\n\n".join(snippets)) + + if self._decisions: + recent_memos = [] + for item in self._decisions[-self.decision_window :]: + memo = (item.get("memo") or "").strip() + if not memo: + continue + if recent_memos and recent_memos[-1] == memo: + continue + recent_memos.append(memo) + if recent_memos: + lines = [f"[Memo {idx}] {memo}" for idx, memo in enumerate(recent_memos, start=1)] + blocks.append("State memo (recent):\n" + "\n".join(lines)) + + decision_lines = [] + for idx, item in enumerate(self._decisions[-self.decision_window :], start=1): + choice = item.get("choice") or item.get("choice_text", "") + parse_mode = item.get("parse_mode", "unknown") + memo_val = item.get("memo") + memo_suffix = f" | memo: {memo_val}" if memo_val else "" + decision_lines.append( + f"[Decision {idx}] action {item.get('action')}: {choice} (parse={parse_mode}){memo_suffix}" + ) + blocks.append("Recent selected actions:\n" + "\n".join(decision_lines)) + + return "\n\n".join(blocks) + + def update(self, step_data: dict) -> None: + observation = (step_data.get("observation") or step_data.get("state") or "").strip() + if observation: + if self._quest_briefing is None: + self._quest_briefing = observation + self._observations.append(observation) + if len(self._observations) > 20: + self._observations = self._observations[-20:] + + if any(key in step_data for key in ("action", "choice", "choice_text", "memo")): + memo = (step_data.get("memo") or "").strip()[:350] or None + self._decisions.append( + { + "action": step_data.get("action"), + "choice": step_data.get("choice") or step_data.get("choice_text", ""), + "parse_mode": step_data.get("parse_mode", "unknown"), + "memo": memo, + } + ) + if len(self._decisions) > 40: + self._decisions = self._decisions[-40:] + + def reset(self) -> None: + self._quest_briefing = None + self._observations = [] + self._decisions = [] + + def 
_briefing_block(self, current_state: str) -> str | None: + if not self._quest_briefing: + return None + if current_state.strip() == self._quest_briefing: + return None + briefing = self._quest_briefing + if len(briefing) > 800: + briefing = briefing[:800] + "..." + return f"Quest briefing (your mission):\n{briefing}" + + +class FullTranscriptMemory(MemoryModule): + """Unbounded full transcript in context.""" + + def __init__(self): + self._quest_briefing: str | None = None + self._transcript: list[dict[str, Any]] = [] + + def set_quest_briefing(self, briefing: str) -> None: + clean = (briefing or "").strip() + self._quest_briefing = clean or None + + def get_context(self, step: int) -> str: + blocks: list[str] = [] + current_state = self._transcript[-1].get("observation", "") if self._transcript else "" + briefing = self._briefing_block(current_state) + if briefing: + blocks.append(briefing) + + if self._transcript: + lines = [] + for entry in self._transcript: + step_value = entry.get("step", "?") + obs = entry.get("observation", "") + if len(obs) > 400: + obs = obs[:400] + "..." 
+ chosen = entry.get("choice_text") or entry.get("choice", "") + reasoning = entry.get("reasoning", "") + line = f"Step {step_value}: {obs}" + if chosen: + line += f"\n You chose: {chosen}" + if reasoning: + line += f"\n Reasoning: {reasoning[:800]}" + state_notes = entry.get("memo", "") + if state_notes: + line += f"\n State: {state_notes[:350]}" + lines.append(line) + blocks.append("=== QUEST TRANSCRIPT ===\n" + "\n\n".join(lines)) + + return "\n\n".join(blocks) + + def update(self, step_data: dict) -> None: + observation = (step_data.get("observation") or step_data.get("state") or "").strip() + if observation and self._quest_briefing is None: + self._quest_briefing = observation + entry = dict(step_data) + entry["observation"] = observation + entry["step"] = entry.get("step") or len(self._transcript) + 1 + self._transcript.append(entry) + + def reset(self) -> None: + self._quest_briefing = None + self._transcript = [] + + def _briefing_block(self, current_state: str) -> str | None: + if not self._quest_briefing: + return None + if current_state.strip() == self._quest_briefing: + return None + briefing = self._quest_briefing + if len(briefing) > 800: + briefing = briefing[:800] + "..." 
+ return f"Quest briefing (your mission):\n{briefing}" + + +class CompactionMemory(MemoryModule): + """Periodic LLM summarization + 20-word memo field.""" + + def __init__(self, compaction_interval: int = 50, llm_client=None): + self.compaction_interval = compaction_interval + self.llm_client = llm_client + self._quest_briefing: str | None = None + self._transcript: list[dict[str, Any]] = [] + self._compaction_summary: str | None = None + self._steps_since_compaction = 0 + + def set_quest_briefing(self, briefing: str) -> None: + clean = (briefing or "").strip() + self._quest_briefing = clean or None + + def get_context(self, step: int) -> str: + blocks: list[str] = [] + current_state = self._transcript[-1].get("observation", "") if self._transcript else "" + briefing = self._briefing_block(current_state) + if briefing: + blocks.append(briefing) + + if self._compaction_summary: + compacted_at = max(0, step - self._steps_since_compaction) + blocks.append(f"=== QUEST MEMORY (compacted at step {compacted_at}) ===\n{self._compaction_summary}") + + recent = self._transcript[-self._steps_since_compaction :] if self._steps_since_compaction > 0 else [] + if recent: + lines = [] + for entry in recent: + step_value = entry.get("step", "?") + obs = entry.get("observation", "") + if len(obs) > 400: + obs = obs[:400] + "..." 
+ chosen = entry.get("choice_text") or entry.get("choice", "") + line = f"Step {step_value}: {obs}" + if chosen: + line += f"\n You chose: {chosen}" + state_notes = entry.get("memo", "") + if state_notes: + line += f"\n State: {state_notes[:350]}" + lines.append(line) + blocks.append("=== RECENT STEPS ===\n" + "\n\n".join(lines)) + + return "\n\n".join(blocks) + + def update(self, step_data: dict) -> None: + observation = (step_data.get("observation") or step_data.get("state") or "").strip() + if observation and self._quest_briefing is None: + self._quest_briefing = observation + entry = dict(step_data) + entry["observation"] = observation[:400] + entry["step"] = entry.get("step") or len(self._transcript) + 1 + if entry.get("memo"): + entry["memo"] = self._twenty_word_memo(str(entry["memo"])) + self._transcript.append(entry) + self._steps_since_compaction += 1 + self._maybe_compact() + + def reset(self) -> None: + self._quest_briefing = None + self._transcript = [] + self._compaction_summary = None + self._steps_since_compaction = 0 + + def _maybe_compact(self) -> None: + if self._steps_since_compaction < self.compaction_interval: + return + transcript_text = self._format_transcript_for_compaction() + if not transcript_text: + return + + prompt_parts = ["You are summarizing an agent's progress through a text quest."] + if self._quest_briefing: + prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}") + if self._compaction_summary: + prompt_parts.append(f"\nPREVIOUS SUMMARY:\n{self._compaction_summary}") + prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}") + prompt_parts.append( + "\nSummarize the agent's progress. 
Include:\n" + "- Current objective (what the agent should do next)\n" + "- Progress so far (what has been accomplished)\n" + "- Key facts (NPCs, items, locations, deadlines discovered)\n" + "- Failed approaches (actions/paths that didn't work)\n" + "- Map knowledge (locations visited and connections)\n\n" + "Write a concise summary in plain text, max 300 words." + ) + + summary = "" + if self.llm_client is not None: + summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip() + if summary: + self._compaction_summary = summary + self._transcript = [] + self._steps_since_compaction = 0 + + def _format_transcript_for_compaction(self) -> str: + recent = ( + self._transcript[-self._steps_since_compaction :] + if self._steps_since_compaction > 0 + else self._transcript[-self.compaction_interval :] + ) + lines = [] + for entry in recent: + step = entry.get("step", "?") + obs = entry.get("observation", "") + if len(obs) > 400: + obs = obs[:400] + "..." + chosen = entry.get("choice_text") or entry.get("choice", "") + reasoning = entry.get("reasoning", "") + state_notes = entry.get("memo", "") + line = f"Step {step}: {obs}" + if chosen: + line += f"\n Chose: {chosen}" + if state_notes: + line += f"\n State: {state_notes[:350]}" + if reasoning: + line += f"\n Reasoning: {reasoning[:800]}" + lines.append(line) + return "\n\n".join(lines) + + def _briefing_block(self, current_state: str) -> str | None: + if not self._quest_briefing: + return None + if current_state.strip() == self._quest_briefing: + return None + briefing = self._quest_briefing + if len(briefing) > 800: + briefing = briefing[:800] + "..." 
+ return f"Quest briefing (your mission):\n{briefing}" + + @staticmethod + def _twenty_word_memo(memo: str) -> str: + return " ".join(memo.split()[:20]) diff --git a/llm_quest_benchmark/harnesses/tools.py b/llm_quest_benchmark/harnesses/tools.py new file mode 100644 index 0000000..5386d6d --- /dev/null +++ b/llm_quest_benchmark/harnesses/tools.py @@ -0,0 +1,171 @@ +"""Reusable tools for harness-based quest agents.""" + +import ast +import re + +MAX_SCRATCHPAD_CHARS = 1200 + + +def calculator(expression: str) -> str: + """Evaluate a restricted arithmetic/comparison expression.""" + expr = (expression or "").strip() + if not expr: + return "error: empty expression" + if len(expr) > 240: + return "error: expression too long" + if not re.fullmatch(r"[0-9a-zA-Z\s+\-*/().,<>=!%]+", expr): + return "error: unsupported characters" + + allowed_nodes = ( + ast.Expression, + ast.Constant, + ast.UnaryOp, + ast.UAdd, + ast.USub, + ast.BinOp, + ast.Add, + ast.Sub, + ast.Mult, + ast.Div, + ast.FloorDiv, + ast.Mod, + ast.Pow, + ast.Compare, + ast.Eq, + ast.NotEq, + ast.Lt, + ast.LtE, + ast.Gt, + ast.GtE, + ast.BoolOp, + ast.And, + ast.Or, + ) + try: + tree = ast.parse(expr, mode="eval") + for node in ast.walk(tree): + if not isinstance(node, allowed_nodes): + return f"error: unsupported expression element {node.__class__.__name__}" + if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, bool)): + return "error: constants must be numeric or boolean" + result = _eval_calculator_node(tree.body) + except Exception as exc: + return f"error: {exc}" + return f"{expr} = {result}" + + +def _eval_calculator_node(node: ast.AST) -> int | float | bool: + if isinstance(node, ast.Constant) and isinstance(node.value, (int, float, bool)): + return node.value + if isinstance(node, ast.UnaryOp): + value = _eval_calculator_node(node.operand) + if isinstance(node.op, ast.UAdd): + return +value + if isinstance(node.op, ast.USub): + return -value + if isinstance(node, ast.BinOp): 
+ left = _eval_calculator_node(node.left) + right = _eval_calculator_node(node.right) + if isinstance(node.op, ast.Add): + return left + right + if isinstance(node.op, ast.Sub): + return left - right + if isinstance(node.op, ast.Mult): + return left * right + if isinstance(node.op, ast.Div): + return left / right + if isinstance(node.op, ast.FloorDiv): + return left // right + if isinstance(node.op, ast.Mod): + return left % right + if isinstance(node.op, ast.Pow): + if abs(right) > 8: + raise ValueError("exponent too large") + return left**right + if isinstance(node, ast.BoolOp): + values = [bool(_eval_calculator_node(value)) for value in node.values] + if isinstance(node.op, ast.And): + return all(values) + if isinstance(node.op, ast.Or): + return any(values) + if isinstance(node, ast.Compare): + left = _eval_calculator_node(node.left) + for op, comparator in zip(node.ops, node.comparators, strict=True): + right = _eval_calculator_node(comparator) + if isinstance(op, ast.Eq): + ok = left == right + elif isinstance(op, ast.NotEq): + ok = left != right + elif isinstance(op, ast.Lt): + ok = left < right + elif isinstance(op, ast.LtE): + ok = left <= right + elif isinstance(op, ast.Gt): + ok = left > right + elif isinstance(op, ast.GtE): + ok = left >= right + else: + raise ValueError("unsupported comparison") + if not ok: + return False + left = right + return True + raise ValueError("unsupported expression") + + +class Scratchpad: + """Persistent free-form note blob with read and replace operations.""" + + def __init__(self, max_chars: int = MAX_SCRATCHPAD_CHARS): + self.max_chars = max_chars + self._content = "" + + def read(self) -> str: + return self._content or "(empty)" + + def write_replace(self, content: str = "") -> str: + note = " ".join((content or "").strip().split()) + self._content = note[: self.max_chars] + return f"updated: {self._content or '(empty)'}" + + def reset(self) -> None: + self._content = "" + + +class QuestHistoryTool: + """Keyword search 
over a run-local quest step log.""" + + def __init__(self, step_log: list[dict] | None = None, history_window: int = 10): + self.step_log = step_log if step_log is not None else [] + self.history_window = history_window + + def search(self, query: str) -> str: + """Return relevant previous steps from this quest run via keyword match.""" + if not self.step_log: + return "No prior quest steps recorded yet." + + tokens = set(re.findall(r"[a-zA-Z\u0400-\u04ff0-9_]{3,}", (query or "").lower())) + scored = [] + for entry in self.step_log: + haystack = " ".join( + [ + entry.get("observation", ""), + " ".join(entry.get("choices", [])), + entry.get("selected_choice", ""), + ] + ).lower() + score = sum(1 for token in tokens if token in haystack) + scored.append((score, entry)) + + scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True) + best = [entry for score, entry in scored if score > 0][: self.history_window] + if not best: + best = [entry for _, entry in scored[-self.history_window :]] + + lines = [] + for entry in best: + lines.append( + f"Step {entry['step']}: obs={entry['observation']} | " + f"choices={'; '.join(entry['choices'])} | picked={entry.get('selected_choice', 'n/a')}" + ) + return "\n".join(lines) From 1c63c3c317187b40e7488252327685c464ac50b4 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 14:00:31 +0400 Subject: [PATCH 02/24] harnesses: implement 8 concrete harness classes --- llm_quest_benchmark/agents/__init__.py | 44 +- llm_quest_benchmark/agents/agent_factory.py | 28 +- llm_quest_benchmark/agents/planner_agent.py | 236 +---------- llm_quest_benchmark/agents/strategic_agent.py | 95 +---- llm_quest_benchmark/agents/tool_agent.py | 385 +----------------- llm_quest_benchmark/harnesses/base.py | 127 +++++- llm_quest_benchmark/harnesses/factory.py | 53 +++ llm_quest_benchmark/harnesses/memo.py | 62 +++ llm_quest_benchmark/harnesses/minimal.py | 61 +++ 
__all__ = ["create_agent", "QuestPlayer", "RandomAgent", "LLMAgent", "PlannerAgent", "ToolAgent"]

# Public name -> submodule (relative to this package) that defines it.
_LAZY_EXPORTS = {
    "create_agent": ".agent_factory",
    "QuestPlayer": ".base",
    "RandomAgent": ".random_agent",
    "LLMAgent": ".llm_agent",
    "PlannerAgent": ".planner_agent",
    "ToolAgent": ".tool_agent",
}


def __getattr__(name):
    """Import a public name from its submodule on first access.

    Keeps ``import llm_quest_benchmark.agents`` from importing every agent
    module up front; each submodule is only loaded once one of its names is
    requested.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    submodule = _LAZY_EXPORTS.get(name)
    if submodule is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from importlib import import_module

    value = getattr(import_module(submodule, __name__), name)
    # Cache on the module so later lookups bypass __getattr__ entirely.
    globals()[name] = value
    return value
a/llm_quest_benchmark/agents/agent_factory.py +++ b/llm_quest_benchmark/agents/agent_factory.py @@ -5,9 +5,7 @@ from llm_quest_benchmark.agents.base import QuestPlayer from llm_quest_benchmark.agents.human_player import HumanPlayer from llm_quest_benchmark.agents.llm_agent import LLMAgent -from llm_quest_benchmark.agents.planner_agent import PlannerAgent from llm_quest_benchmark.agents.random_agent import RandomAgent -from llm_quest_benchmark.agents.tool_agent import ToolAgent from llm_quest_benchmark.constants import ( DEFAULT_MODEL, DEFAULT_TEMPERATURE, @@ -66,27 +64,29 @@ def create_agent( return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) if resolved_action_template == "planner.jinja": - return PlannerAgent( - debug=debug, - model_name=model, - system_template=system_template, - action_template=resolved_action_template, + from llm_quest_benchmark.harnesses.factory import create_harness + + return create_harness( + harness="planner", + model=model, temperature=temperature, skip_single=skip_single, - memory_mode=memory_mode, + debug=debug, compaction_interval=compaction_interval, + system_template=system_template, ) if resolved_action_template in ("tool_augmented.jinja", "tool_augmented_hints.jinja"): - return ToolAgent( - debug=debug, - model_name=model, - system_template=system_template, - action_template=resolved_action_template, + from llm_quest_benchmark.harnesses.factory import create_harness + + return create_harness( + harness="tool_hinted" if resolved_action_template == "tool_augmented_hints.jinja" else "tool_compact", + model=model, temperature=temperature, skip_single=skip_single, - memory_mode=memory_mode, + debug=debug, compaction_interval=compaction_interval, + system_template=system_template, ) # Default to LLM agent diff --git a/llm_quest_benchmark/agents/planner_agent.py b/llm_quest_benchmark/agents/planner_agent.py index 1999afd..cd20e0d 100644 --- a/llm_quest_benchmark/agents/planner_agent.py +++ 
b/llm_quest_benchmark/agents/planner_agent.py @@ -1,235 +1,9 @@ -"""Planner agent with a lightweight plan-maintain-act loop.""" +"""Deprecated compatibility wrapper for the planner harness.""" -import logging -import re -from typing import Any +import warnings -from llm_quest_benchmark.agents.llm_agent import LLMAgent, LLMResponse, parse_llm_response +from llm_quest_benchmark.harnesses.planner import PlannerHarness as PlannerAgent +warnings.warn("planner_agent is deprecated, use harnesses.planner", DeprecationWarning, stacklevel=2) -class PlannerAgent(LLMAgent): - """LLM agent that maintains a short plan and re-plans on notable changes.""" - - def __init__( - self, - *args, - action_template: str = "planner.jinja", - **kwargs, - ): - super().__init__(*args, action_template=action_template, **kwargs) - self.agent_id = f"planner_{self.model_name}" - self.current_plan: str | None = None - self._plan_history: list[str] = [] - - def _recent_actions(self) -> list[str]: - entries = [] - for item in self._decision_history[-3:]: - choice = (item.get("choice") or "").strip() - if not choice: - continue - entries.append(f"{item.get('action')}. 
{choice}") - return entries - - @staticmethod - def _normalize_plan(raw_plan: str) -> str: - compact = " ".join((raw_plan or "").strip().split()) - if not compact: - return "" - - sentences = re.split(r"(?<=[.!?])\s+", compact) - sentences = [sentence.strip() for sentence in sentences if sentence.strip()] - if len(sentences) >= 5: - return " ".join(sentences[:5]) - return compact - - def _build_planner_prompt( - self, - observation: str, - choices: list[dict[str, str]], - prompt_kind: str, - replan_reason: str | None = None, - ) -> str: - template = self.prompt_renderer.get_template(self.action_template) - return template.render( - prompt_kind=prompt_kind, - observation=observation, - choices=[{"text": choice.get("text", "")} for choice in choices], - current_plan=self.current_plan, - replan_reason=replan_reason, - recent_actions=self._recent_actions(), - ).strip() - - def _observation_changed_significantly(self, observation: str) -> bool: - """Check if the observation differs enough from the previous one to warrant re-planning. - - Uses token-level overlap ratio: if less than 50% of tokens are shared, - the scene has changed significantly. - """ - if len(self._observation_history) < 2: - return False - - prev_tokens = set(self._observation_history[-2].lower().split()) - curr_tokens = set((observation or "").lower().split()) - if not prev_tokens or not curr_tokens: - return True - overlap = len(prev_tokens & curr_tokens) / max(len(prev_tokens), len(curr_tokens)) - return overlap < 0.5 - - def _should_replan(self, observation: str, state_signature: str) -> tuple[bool, str | None]: - if not self.current_plan: - return True, "No plan exists yet." - - if any(self._state_action_counts.get(state_signature, {}).values()): - return True, "This state has repeated, so a previous action already failed to progress." - - if self._observation_changed_significantly(observation): - return True, "The scene changed significantly from the previous observation." 
- - return False, None - - def _update_plan( - self, - observation: str, - choices: list[dict[str, str]], - replan_reason: str | None, - ) -> dict[str, Any]: - self._ensure_llm() - prompt = self._build_planner_prompt( - observation, - choices, - prompt_kind="plan", - replan_reason=replan_reason, - ) - plan_response = self.llm.get_completion(prompt) - usage = self.llm.get_last_usage() - plan = self._normalize_plan(plan_response) - if not plan: - if self.current_plan: - plan = self.current_plan - else: - plan = ( - "Gather clues, protect resources, and avoid obvious traps while " - "advancing toward the main objective." - ) - self.current_plan = plan - self._plan_history.append(plan) - if len(self._plan_history) > 10: - self._plan_history = self._plan_history[-10:] - return usage - - def _choose_action_with_plan( - self, - observation: str, - choices: list[dict[str, str]], - replan_reason: str | None, - ) -> tuple[LLMResponse, dict[str, Any]]: - prompt = self._build_planner_prompt( - observation, - choices, - prompt_kind="act", - replan_reason=replan_reason, - ) - llm_response = self.llm.get_completion(prompt) - llm_usage = self.llm.get_last_usage() - parsed_response = parse_llm_response(llm_response, len(choices), self.debug, self.logger) - - if parsed_response.is_default: - retry_response = self.llm.get_completion(self._format_retry_prompt(observation, choices)) - retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, retry_usage) - retry_parsed = parse_llm_response( - retry_response, - len(choices), - self.debug, - self.logger, - ) - if not retry_parsed.is_default: - retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}" - parsed_response = retry_parsed - elif self._needs_force_numeric_retry(): - force_retry_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices)) - force_retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, force_retry_usage) - 
force_retry_parsed = parse_llm_response( - force_retry_response, - len(choices), - self.debug, - self.logger, - ) - if not force_retry_parsed.is_default: - force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}" - parsed_response = force_retry_parsed - - return parsed_response, llm_usage - - def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: - if self.debug: - self.logger.debug("PlannerAgent evaluating state with %s choices", len(choices)) - try: - state_signature = self._state_signature(state, choices) - contextual_state = self._build_contextual_state(state) - should_replan, replan_reason = self._should_replan(state, state_signature) - plan_usage = None - if should_replan: - plan_usage = self._update_plan(contextual_state, choices, replan_reason) - - parsed_response, action_usage = self._choose_action_with_plan( - contextual_state, - choices, - replan_reason if should_replan else None, - ) - action_before_policy = parsed_response.action - parsed_response.action = self._apply_safety_filter(parsed_response.action, choices) - if parsed_response.action != action_before_policy and not parsed_response.reasoning: - parsed_response.reasoning = "policy_safety_override" - - total_usage = ( - self._merge_usage(plan_usage, action_usage) if plan_usage else self._normalize_usage(action_usage) - ) - if plan_usage: - total_usage = self._normalize_usage(total_usage) - - parsed_response.prompt_tokens = total_usage["prompt_tokens"] - parsed_response.completion_tokens = total_usage["completion_tokens"] - parsed_response.total_tokens = total_usage["total_tokens"] - parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"] - - self.history.append(parsed_response) - self._last_response = parsed_response - self._remember_decision(state, choices, state_signature, parsed_response) - - if parsed_response.action < 1 or parsed_response.action > len(choices): - self.logger.error( - "INVALID ACTION DETECTED: %s not in 
range 1-%s", - parsed_response.action, - len(choices), - ) - parsed_response.action = 1 - - return parsed_response.action - except Exception as exc: - self.logger.error("Planner agent error during LLM call: %s", exc) - default_response = LLMResponse( - action=1, - is_default=True, - parse_mode="error_default", - reasoning=f"planner_error: {exc}", - ) - self.history.append(default_response) - self._last_response = default_response - return 1 - - def reset(self) -> None: - super().reset() - self.current_plan = None - self._plan_history = [] - - def on_game_start(self) -> None: - super().on_game_start() - self.current_plan = None - self._plan_history = [] - - def on_game_end(self, final_state: dict[str, Any]) -> None: - if self.debug: - logging.getLogger(self.__class__.__name__).debug("Planner finished with plan: %s", self.current_plan) - super().on_game_end(final_state) +__all__ = ["PlannerAgent"] diff --git a/llm_quest_benchmark/agents/strategic_agent.py b/llm_quest_benchmark/agents/strategic_agent.py index 387c650..a4cc4e7 100644 --- a/llm_quest_benchmark/agents/strategic_agent.py +++ b/llm_quest_benchmark/agents/strategic_agent.py @@ -1,94 +1,3 @@ -"""Strategic agent decorator that adds analysis capabilities""" +"""Deprecated strategic agent module.""" -import logging -from typing import Any - -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.llm.prompt import PromptRenderer - - -class StrategicAgent(QuestPlayer): - """Decorator that adds strategic thinking to any quest player""" - - def __init__(self, base_agent: QuestPlayer, debug: bool = False, template: str = "advanced.jinja"): - """Initialize strategic agent wrapper - - Args: - base_agent: Base agent to wrap (usually LLMAgent) - debug: Enable debug logging - template: Template to use for enhanced prompts - """ - super().__init__(skip_single=base_agent.skip_single) - self.agent = base_agent - self.debug = debug - self.history = [] - - # Setup logging - self.logger = 
logging.getLogger(self.__class__.__name__) - if self.debug: - self.logger.setLevel(logging.DEBUG) - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter("%(name)s - %(message)s")) - self.logger.addHandler(handler) - - # Initialize prompt renderer - self.prompt_renderer = PromptRenderer(None, template=template) - - def _get_action_impl(self, observation: str, choices: list) -> str: - """Implementation of action selection logic with strategic analysis""" - if hasattr(self.agent, "llm"): - # First, get situation analysis - if self.debug: - self.logger.debug(f"\nObservation:\n{observation}") - - analysis = self.agent.llm( - "Analyze this situation and explain your thinking step-by-step instead of choosing an action:\n" - + observation - ) - - if self.debug: - self.logger.debug(f"\nAnalysis:\n{analysis}") - - # Store analysis in history - self.history.append({"observation": observation, "analysis": analysis}) - - # Get enhanced context with history - enhanced_context = self.get_enhanced_context(observation, choices) - if self.debug: - self.logger.debug(f"\nEnhanced Context:\n{enhanced_context}") - - # Then make the actual choice with analysis context - return self.agent.get_action(enhanced_context, choices) - else: - # If agent doesn't have LLM capability, just pass through - return self.agent.get_action(observation, choices) - - def get_enhanced_context(self, observation: str, choices: list) -> str: - """Build context for advanced prompt with historical analysis""" - context = [ - f"Turn {len(self.history) + 1}: {entry['analysis']}" - for entry in self.history[-3:] # Last 3 analyses - ] - return self.prompt_renderer.render_action_prompt( - observation=observation, choices=choices, state_tracker=context - ) - - def reset(self) -> None: - """Reset both strategic and base agent state""" - self.history = [] - self.agent.reset() - - def on_game_start(self) -> None: - """Pass through to base agent""" - if self.debug: - self.logger.debug("Starting new 
game with strategic analysis") - self.agent.on_game_start() - - def on_game_end(self, final_state: dict[str, Any]) -> None: - """Pass through to base agent and log analysis history""" - self.agent.on_game_end(final_state) - if self.debug: - self.logger.debug("Final Analysis History:") - for entry in self.history: - self.logger.debug(f"\nObservation: {entry['observation']}") - self.logger.debug(f"Analysis: {entry['analysis']}") +raise ImportError("strategic_agent is deprecated; use llm_quest_benchmark.harnesses instead") diff --git a/llm_quest_benchmark/agents/tool_agent.py b/llm_quest_benchmark/agents/tool_agent.py index 694d1ac..659a747 100644 --- a/llm_quest_benchmark/agents/tool_agent.py +++ b/llm_quest_benchmark/agents/tool_agent.py @@ -1,384 +1,9 @@ -"""Tool-augmented agent with lightweight structured prompting.""" +"""Deprecated compatibility wrapper for the tool harness.""" -import ast -import re -from typing import Any +import warnings -from llm_quest_benchmark.agents.llm_agent import ( - LLMAgent, - LLMResponse, - _parse_json_response, - parse_llm_response, -) +from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness as ToolAgent +warnings.warn("tool_agent is deprecated, use harnesses.tool_harness", DeprecationWarning, stacklevel=2) -class ToolAgent(LLMAgent): - """LLM agent with generic run-local tools for history, math, and state notes.""" - - DEFAULT_HISTORY_WINDOW = 10 - MAX_SCRATCHPAD_CHARS = 1200 - MAX_TOOL_INPUT_CHARS = 500 - - def __init__( - self, - *args, - action_template: str = "tool_augmented.jinja", - history_window: int | None = None, - **kwargs, - ): - super().__init__(*args, action_template=action_template, **kwargs) - self.agent_id = f"tool_{self.model_name}" - self._step_log: list[dict[str, Any]] = [] - self._history_window = history_window or self.DEFAULT_HISTORY_WINDOW - self._scratchpad = "" - - def _recent_steps(self) -> list[str]: - snippets = [] - for entry in self._step_log[-self._history_window :]: - 
snippets.append(f"Step {entry['step']}: {entry['observation']} -> {entry.get('selected_choice', 'n/a')}") - return snippets - - def _tool_descriptions(self) -> list[str]: - return [ - "quest_history(query): search earlier observations and chosen actions in this quest.", - "calculator(expression): evaluate arithmetic and simple comparisons.", - "scratchpad(operation, content): read or replace one persistent note. operation is read or write_replace.", - ] - - def quest_history(self, query: str) -> str: - """Return relevant previous steps from this quest run via keyword match.""" - if not self._step_log: - return "No prior quest steps recorded yet." - - tokens = set(re.findall(r"[a-zA-Z\u0400-\u04ff0-9_]{3,}", (query or "").lower())) - scored = [] - for entry in self._step_log: - haystack = " ".join( - [ - entry.get("observation", ""), - " ".join(entry.get("choices", [])), - entry.get("selected_choice", ""), - ] - ).lower() - score = sum(1 for token in tokens if token in haystack) - scored.append((score, entry)) - - scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True) - best = [entry for s, entry in scored if s > 0][: self._history_window] - if not best: - best = [entry for _, entry in scored[-self._history_window :]] - - lines = [] - for entry in best: - lines.append( - f"Step {entry['step']}: obs={entry['observation']} | " - f"choices={'; '.join(entry['choices'])} | picked={entry.get('selected_choice', 'n/a')}" - ) - return "\n".join(lines) - - @staticmethod - def calculator(expression: str) -> str: - """Evaluate a restricted arithmetic/comparison expression.""" - expr = (expression or "").strip() - if not expr: - return "error: empty expression" - if len(expr) > 240: - return "error: expression too long" - if not re.fullmatch(r"[0-9a-zA-Z\s+\-*/().,<>=!%]+", expr): - return "error: unsupported characters" - - allowed_nodes = ( - ast.Expression, - ast.Constant, - ast.UnaryOp, - ast.UAdd, - ast.USub, - ast.BinOp, - ast.Add, - ast.Sub, - 
ast.Mult, - ast.Div, - ast.FloorDiv, - ast.Mod, - ast.Pow, - ast.Compare, - ast.Eq, - ast.NotEq, - ast.Lt, - ast.LtE, - ast.Gt, - ast.GtE, - ast.BoolOp, - ast.And, - ast.Or, - ) - try: - tree = ast.parse(expr, mode="eval") - for node in ast.walk(tree): - if not isinstance(node, allowed_nodes): - return f"error: unsupported expression element {node.__class__.__name__}" - if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, bool)): - return "error: constants must be numeric or boolean" - result = ToolAgent._eval_calculator_node(tree.body) - except Exception as exc: - return f"error: {exc}" - return f"{expr} = {result}" - - @staticmethod - def _eval_calculator_node(node: ast.AST) -> int | float | bool: - if isinstance(node, ast.Constant) and isinstance(node.value, (int, float, bool)): - return node.value - if isinstance(node, ast.UnaryOp): - value = ToolAgent._eval_calculator_node(node.operand) - if isinstance(node.op, ast.UAdd): - return +value - if isinstance(node.op, ast.USub): - return -value - if isinstance(node, ast.BinOp): - left = ToolAgent._eval_calculator_node(node.left) - right = ToolAgent._eval_calculator_node(node.right) - if isinstance(node.op, ast.Add): - return left + right - if isinstance(node.op, ast.Sub): - return left - right - if isinstance(node.op, ast.Mult): - return left * right - if isinstance(node.op, ast.Div): - return left / right - if isinstance(node.op, ast.FloorDiv): - return left // right - if isinstance(node.op, ast.Mod): - return left % right - if isinstance(node.op, ast.Pow): - if abs(right) > 8: - raise ValueError("exponent too large") - return left**right - if isinstance(node, ast.BoolOp): - values = [bool(ToolAgent._eval_calculator_node(value)) for value in node.values] - if isinstance(node.op, ast.And): - return all(values) - if isinstance(node.op, ast.Or): - return any(values) - if isinstance(node, ast.Compare): - left = ToolAgent._eval_calculator_node(node.left) - for op, comparator in zip(node.ops, 
node.comparators, strict=True): - right = ToolAgent._eval_calculator_node(comparator) - if isinstance(op, ast.Eq): - ok = left == right - elif isinstance(op, ast.NotEq): - ok = left != right - elif isinstance(op, ast.Lt): - ok = left < right - elif isinstance(op, ast.LtE): - ok = left <= right - elif isinstance(op, ast.Gt): - ok = left > right - elif isinstance(op, ast.GtE): - ok = left >= right - else: - raise ValueError("unsupported comparison") - if not ok: - return False - left = right - return True - raise ValueError("unsupported expression") - - def scratchpad(self, operation: str, content: str = "") -> str: - """Read or replace one persistent free-form note blob.""" - op = (operation or "").strip().lower() - if op == "read": - return self._scratchpad or "(empty)" - if op == "write_replace": - note = " ".join((content or "").strip().split()) - self._scratchpad = note[: self.MAX_SCRATCHPAD_CHARS] - return f"updated: {self._scratchpad or '(empty)'}" - return "error: operation must be read or write_replace" - - def _build_tool_prompt( - self, - observation: str, - choices: list[dict[str, str]], - prompt_kind: str, - tool_results: list[str] | None = None, - ) -> str: - template = self.prompt_renderer.get_template(self.action_template) - return template.render( - prompt_kind=prompt_kind, - observation=observation, - choices=[{"text": choice.get("text", "")} for choice in choices], - tool_descriptions=self._tool_descriptions(), - tool_results=tool_results or [], - recent_steps=self._recent_steps(), - scratchpad_note=self._scratchpad, - ).strip() - - @staticmethod - def _extract_tool_calls(response: str) -> list[dict[str, Any]]: - payload, _ = _parse_json_response(response) - if not isinstance(payload, dict): - return [] - - tool_calls = payload.get("tool_calls") - if not isinstance(tool_calls, list): - return [] - - normalized = [] - for item in tool_calls[:1]: - if not isinstance(item, dict): - continue - tool_name = str(item.get("tool") or "").strip() - 
tool_input = item.get("input") - operation = str(item.get("operation") or "").strip() - content = str(item.get("content") or "").strip() - if isinstance(tool_input, dict): - operation = operation or str(tool_input.get("operation") or "").strip() - content = content or str(tool_input.get("content") or "").strip() - tool_input = tool_input.get("expression") or tool_input.get("query") or tool_input.get("content") or "" - tool_input = str(tool_input or "").strip() - if len(tool_input) > ToolAgent.MAX_TOOL_INPUT_CHARS: - tool_input = tool_input[: ToolAgent.MAX_TOOL_INPUT_CHARS] - if len(content) > ToolAgent.MAX_TOOL_INPUT_CHARS: - content = content[: ToolAgent.MAX_TOOL_INPUT_CHARS] - if tool_name: - normalized.append( - { - "tool": tool_name, - "input": tool_input, - "operation": operation, - "content": content, - } - ) - return normalized - - def _execute_tool_calls(self, tool_calls: list[dict[str, Any]]) -> list[str]: - results = [] - for tc in tool_calls: - name, inp = tc["tool"], tc.get("input", "") - if name == "quest_history": - result = self.quest_history(inp) - elif name == "calculator": - result = self.calculator(inp) - elif name == "scratchpad": - operation = tc.get("operation") or inp - result = self.scratchpad(str(operation), str(tc.get("content") or "")) - else: - result = f"unknown tool: {name}" - call_repr = inp - if name == "scratchpad": - call_repr = f"{tc.get('operation') or inp}, {tc.get('content') or ''}".strip(", ") - results.append(f"{name}({call_repr}) => {result}") - return results - - def _final_choice( - self, - observation: str, - choices: list[dict[str, str]], - tool_results: list[str] | None = None, - ) -> tuple[LLMResponse, dict[str, Any]]: - prompt = self._build_tool_prompt( - observation, - choices, - prompt_kind="final", - tool_results=tool_results, - ) - llm_response = self.llm.get_completion(prompt) - llm_usage = self.llm.get_last_usage() - parsed_response = parse_llm_response(llm_response, len(choices), self.debug, self.logger) - - if 
parsed_response.is_default: - retry_response = self.llm.get_completion(self._format_retry_prompt(observation, choices)) - retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, retry_usage) - retry_parsed = parse_llm_response(retry_response, len(choices), self.debug, self.logger) - if not retry_parsed.is_default: - retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}" - parsed_response = retry_parsed - elif self._needs_force_numeric_retry(): - force_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices)) - force_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, force_usage) - force_parsed = parse_llm_response(force_response, len(choices), self.debug, self.logger) - if not force_parsed.is_default: - force_parsed.parse_mode = f"force_retry_{force_parsed.parse_mode or 'parsed'}" - parsed_response = force_parsed - - return parsed_response, llm_usage - - def _log_step(self, observation: str, choices: list[dict[str, str]], response: LLMResponse) -> None: - selected = "" - if 1 <= response.action <= len(choices): - selected = choices[response.action - 1].get("text", "") - - clipped = " ".join((observation or "").strip().split()) - if len(clipped) > 180: - clipped = clipped[:180] + "..." 
- - self._step_log.append( - { - "step": len(self._step_log) + 1, - "observation": clipped, - "choices": [c.get("text", "") for c in choices], - "selected_choice": selected, - } - ) - - def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: - try: - state_signature = self._state_signature(state, choices) - contextual_state = self._build_contextual_state(state) - self._ensure_llm() - - selection_prompt = self._build_tool_prompt(contextual_state, choices, prompt_kind="select") - selection_response = self.llm.get_completion(selection_prompt) - selection_usage = self.llm.get_last_usage() - tool_calls = self._extract_tool_calls(selection_response) - parsed_response = parse_llm_response(selection_response, len(choices), self.debug, self.logger) - tool_results: list[str] = [] - - total_usage = self._normalize_usage(selection_usage) - if tool_calls: - tool_results = self._execute_tool_calls(tool_calls) - parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results) - total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage)) - elif parsed_response.is_default: - parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[]) - total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage)) - - action_before_policy = parsed_response.action - parsed_response.action = self._apply_safety_filter(parsed_response.action, choices) - if parsed_response.action != action_before_policy and not parsed_response.reasoning: - parsed_response.reasoning = "policy_safety_override" - - parsed_response.prompt_tokens = total_usage["prompt_tokens"] - parsed_response.completion_tokens = total_usage["completion_tokens"] - parsed_response.total_tokens = total_usage["total_tokens"] - parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"] - parsed_response.tool_calls = tool_calls or None - parsed_response.tool_results = tool_results or None - - 
self.history.append(parsed_response) - self._last_response = parsed_response - self._remember_decision(state, choices, state_signature, parsed_response) - self._log_step(state, choices, parsed_response) - return parsed_response.action - except Exception as exc: - self.logger.error("Tool agent error during LLM call: %s", exc) - default_response = LLMResponse( - action=1, - is_default=True, - parse_mode="error_default", - reasoning=f"tool_agent_error: {exc}", - ) - self.history.append(default_response) - self._last_response = default_response - return 1 - - def reset(self) -> None: - super().reset() - self._step_log = [] - self._scratchpad = "" - - def on_game_start(self) -> None: - super().on_game_start() - self._step_log = [] - self._scratchpad = "" +__all__ = ["ToolAgent"] diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py index 2ae3e16..ec70b55 100644 --- a/llm_quest_benchmark/harnesses/base.py +++ b/llm_quest_benchmark/harnesses/base.py @@ -1,6 +1,8 @@ """Base harness class for quest benchmark experiments.""" +import hashlib import logging +import re from abc import abstractmethod from typing import Any @@ -30,14 +32,15 @@ def __init__( debug, memory_module=None, tools=None, + action_template=DEFAULT_TEMPLATE, ): super().__init__(skip_single=skip_single) self.debug = debug self.model_name = model_name.lower() self.system_template = normalize_template_name(system_template) - self.action_template = DEFAULT_TEMPLATE + self.action_template = normalize_template_name(action_template) self.temperature = temperature - self.harness_name = "" + self.harness_name = getattr(self.__class__, "harness_name", "") self.agent_id = f"harness_{self.model_name}" self.memory_module = memory_module self.tools = tools or [] @@ -61,6 +64,10 @@ def __init__( self.history: list[LLMResponse] = [] self._use_safety_filter = True self._last_response = LLMResponse(action=1, is_default=True) + self._observation_history: list[str] = [] + 
self._decision_history: list[dict[str, Any]] = [] + self._state_action_counts: dict[str, dict[int, int]] = {} + self._step_count = 0 def _ensure_llm(self) -> None: """Lazily create the provider client only when inference is needed.""" @@ -82,9 +89,125 @@ def reset(self) -> None: super().reset() self.history = [] self._last_response = LLMResponse(action=1, is_default=True) + self._observation_history = [] + self._decision_history = [] + self._state_action_counts = {} + self._step_count = 0 if self.memory_module is not None: self.memory_module.reset() + def get_action(self, observation: str, choices: list[dict[str, str]]) -> int: + clean = (observation or "").strip() + if clean: + self._observation_history.append(clean) + if len(self._observation_history) > 20: + self._observation_history = self._observation_history[-20:] + if self.memory_module is not None: + self.memory_module.update({"observation": clean, "step": self._step_count + 1}) + return super().get_action(observation, choices) + + def on_game_start(self) -> None: + super().on_game_start() + self.reset() + + def on_game_end(self, final_state: dict[str, Any]) -> None: + if self.debug: + self.logger.debug("Game ended with state: %s", final_state) + + def get_last_response(self) -> LLMResponse | None: + return self._last_response + + @property + def _quest_briefing(self) -> str | None: + return getattr(self.memory_module, "_quest_briefing", None) + + @_quest_briefing.setter + def _quest_briefing(self, value: str | None) -> None: + if self.memory_module is not None: + self.memory_module._quest_briefing = value + + @property + def _transcript(self) -> list[dict[str, Any]]: + return getattr(self.memory_module, "_transcript", []) + + @_transcript.setter + def _transcript(self, value: list[dict[str, Any]]) -> None: + if self.memory_module is not None: + self.memory_module._transcript = value + + @property + def _steps_since_compaction(self) -> int: + return getattr(self.memory_module, "_steps_since_compaction", 0) 
+ + @_steps_since_compaction.setter + def _steps_since_compaction(self, value: int) -> None: + if self.memory_module is not None: + self.memory_module._steps_since_compaction = value + + def _build_contextual_state(self, state: str) -> str: + if self.memory_module is None: + return state + context = self.memory_module.get_context(self._step_count + 1) + if not context: + return state + return f"{context}\n\nCurrent story state:\n{state}" + + @staticmethod + def _normalize_for_signature(value: str, max_len: int = 320) -> str: + text = (value or "").lower() + text = re.sub(r"\s+", " ", text).strip() + return text[:max_len] if len(text) > max_len else text + + def _state_signature(self, state: str, choices: list[dict[str, str]]) -> str: + normalized_state = self._normalize_for_signature(state, max_len=420) + normalized_choices = "|".join( + self._normalize_for_signature(choice.get("text", ""), max_len=110) for choice in choices + ) + raw_signature = f"{normalized_state}||{normalized_choices}" + return hashlib.sha1(raw_signature.encode("utf-8", errors="ignore")).hexdigest()[:20] + + def _remember_decision( + self, + state: str, + choices: list[dict[str, str]], + state_signature: str, + response: LLMResponse, + ) -> None: + action = int(response.action) + counts = self._state_action_counts.setdefault(state_signature, {}) + counts[action] = counts.get(action, 0) + 1 + + selected_text = "" + if 1 <= action <= len(choices): + selected_text = choices[action - 1].get("text", "") + state_snippet = (state or "").strip() + if len(state_snippet) > 220: + state_snippet = state_snippet[:220] + "..." 
+ + decision = { + "state": state_snippet, + "action": action, + "choice": selected_text, + "choice_text": selected_text, + "parse_mode": response.parse_mode or "unknown", + "memo": (response.memo or "").strip()[:350] or None, + "reasoning": (response.reasoning or "")[:800], + } + self._decision_history.append(decision) + if len(self._decision_history) > 40: + self._decision_history = self._decision_history[-40:] + + self._step_count += 1 + if self.memory_module is not None: + self.memory_module.update( + { + "step": self._step_count, + "observation": state, + "choices": [c.get("text", "") for c in choices], + **decision, + } + ) + def _format_prompt(self, observation, choices, memo=None, context=None) -> str: """Render system and action Jinja templates for the current decision.""" system_prompt = self.prompt_renderer.render_system_prompt( diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py new file mode 100644 index 0000000..58d2546 --- /dev/null +++ b/llm_quest_benchmark/harnesses/factory.py @@ -0,0 +1,53 @@ +"""Factory for creating harness-based quest players.""" + +from llm_quest_benchmark.agents.base import QuestPlayer +from llm_quest_benchmark.agents.human_player import HumanPlayer +from llm_quest_benchmark.agents.random_agent import RandomAgent +from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness +from llm_quest_benchmark.harnesses.minimal import MinimalHarness +from llm_quest_benchmark.harnesses.planner import PlannerHarness +from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness +from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness + +HARNESS_REGISTRY = { + "minimal": MinimalHarness, + "reasoning_recent": ReasoningRecentHarness, + "reasoning_full": ReasoningFullTranscriptHarness, + "memo_compact": MemoCompactHarness, + "hinted_compact": HintedCompactHarness, + "tool_compact": 
ToolCompactHarness, + "tool_hinted": ToolHintedHarness, + "planner": PlannerHarness, +} + + +def create_harness( + harness: str, + model: str, + temperature: float = 0.4, + skip_single: bool = False, + debug: bool = False, + compaction_interval: int = 50, + system_template: str = "system_role.jinja", +) -> QuestPlayer: + if harness == "human": + return HumanPlayer(skip_single=skip_single) + if harness.startswith("random_choice"): + seed = None + if "_" in harness[13:]: + try: + seed = int(harness.split("_")[-1]) + except ValueError: + pass + return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) + if harness not in HARNESS_REGISTRY: + raise ValueError(f"Unknown harness '{harness}'. Valid: {sorted(HARNESS_REGISTRY)}") + cls = HARNESS_REGISTRY[harness] + return cls( + model_name=model, + temperature=temperature, + skip_single=skip_single, + debug=debug, + compaction_interval=compaction_interval, + system_template=system_template, + ) diff --git a/llm_quest_benchmark/harnesses/memo.py b/llm_quest_benchmark/harnesses/memo.py new file mode 100644 index 0000000..764f206 --- /dev/null +++ b/llm_quest_benchmark/harnesses/memo.py @@ -0,0 +1,62 @@ +"""Compacted-memory harness variants.""" + +from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE +from llm_quest_benchmark.harnesses.memory import CompactionMemory +from llm_quest_benchmark.harnesses.minimal import MinimalHarness + + +class MemoCompactHarness(MinimalHarness): + harness_name = "memo_compact" + + def __init__( + self, + model_name: str = DEFAULT_MODEL, + system_template: str = SYSTEM_ROLE_TEMPLATE, + action_template: str = "stateful_compact.jinja", + temperature: float = DEFAULT_TEMPERATURE, + skip_single: bool = False, + debug: bool = False, + compaction_interval: int = 50, + memory_module=None, + **kwargs, + ): + super().__init__( + model_name=model_name, + system_template=system_template, + action_template=action_template, + temperature=temperature, + 
class HintedCompactHarness(MemoCompactHarness):
    """MemoCompactHarness variant that defaults to the hinted action template."""

    harness_name = "hinted_compact"

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        system_template: str = SYSTEM_ROLE_TEMPLATE,
        action_template: str = "stateful_compact_hints.jinja",
        temperature: float = DEFAULT_TEMPERATURE,
        skip_single: bool = False,
        debug: bool = False,
        compaction_interval: int = 50,
        memory_module=None,
        **kwargs,
    ):
        """Forward every argument unchanged to ``MemoCompactHarness.__init__``."""
        # The named parameters can never also appear in **kwargs, so merging
        # them into one mapping is equivalent to passing them as keywords.
        forwarded = {
            **kwargs,
            "model_name": model_name,
            "system_template": system_template,
            "action_template": action_template,
            "temperature": temperature,
            "skip_single": skip_single,
            "debug": debug,
            "compaction_interval": compaction_interval,
            "memory_module": memory_module,
        }
        super().__init__(**forwarded)
def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> int:
    """Ask the LLM for an action and return a valid 1-based choice number.

    The prompt is rendered from the contextual state, parsed with retries,
    and the resulting response is stored in ``history``, ``_last_response``
    and the decision memory.

    Args:
        observation: Current quest text.
        choices: Available choices; only their count is used here.

    Returns:
        The chosen action in ``1..len(choices)``. Falls back to ``1`` when the
        parsed action is out of range or when any step raises.
    """
    try:
        state_signature = self._state_signature(observation, choices)
        prompt = self._format_prompt(self._build_contextual_state(observation), choices)
        parsed_response = self._parse_with_retries(prompt, observation, choices)

        # Clamp BEFORE recording: history and decision memory must describe the
        # action that is actually returned, not an out-of-range one.
        if not 1 <= parsed_response.action <= len(choices):
            parsed_response.action = 1

        self.history.append(parsed_response)
        self._last_response = parsed_response
        self._remember_decision(observation, choices, state_signature, parsed_response)
        return parsed_response.action
    except Exception as exc:
        # Best-effort boundary: a failed LLM call must not abort the episode.
        self.logger.error("Harness error during LLM call: %s", exc)
        default_response = LLMResponse(
            action=1,
            is_default=True,
            parse_mode="error_default",
            reasoning=f"llm_call_error: {exc}",
        )
        self.history.append(default_response)
        self._last_response = default_response
        return 1
str = SYSTEM_ROLE_TEMPLATE, + action_template: str = "planner.jinja", + temperature: float = DEFAULT_TEMPERATURE, + skip_single: bool = False, + debug: bool = False, + compaction_interval: int = 50, + memory_module=None, + **_, + ): + super().__init__( + model_name=model_name, + system_template=system_template, + action_template=action_template, + temperature=temperature, + skip_single=skip_single, + debug=debug, + memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval), + ) + self.agent_id = f"planner_{self.model_name}" + self.current_plan: str | None = None + self._plan_history: list[str] = [] + self._memory_mode = "compaction" + self._compaction_interval = compaction_interval + + def _recent_actions(self) -> list[str]: + entries = [] + for item in self._decision_history[-3:]: + choice = (item.get("choice") or "").strip() + if choice: + entries.append(f"{item.get('action')}. {choice}") + return entries + + @staticmethod + def _normalize_plan(raw_plan: str) -> str: + compact = " ".join((raw_plan or "").strip().split()) + if not compact: + return "" + sentences = re.split(r"(?<=[.!?])\s+", compact) + sentences = [sentence.strip() for sentence in sentences if sentence.strip()] + if len(sentences) >= 5: + return " ".join(sentences[:5]) + return compact + + def _build_planner_prompt( + self, + observation: str, + choices: list[dict[str, str]], + prompt_kind: str, + replan_reason: str | None = None, + ) -> str: + template = self.prompt_renderer.get_template(self.action_template) + return template.render( + prompt_kind=prompt_kind, + observation=observation, + choices=[{"text": choice.get("text", "")} for choice in choices], + current_plan=self.current_plan, + replan_reason=replan_reason, + recent_actions=self._recent_actions(), + ).strip() + + def _observation_changed_significantly(self, observation: str) -> bool: + if len(self._observation_history) < 2: + return False + prev_tokens = set(self._observation_history[-2].lower().split()) + 
curr_tokens = set((observation or "").lower().split()) + if not prev_tokens or not curr_tokens: + return True + overlap = len(prev_tokens & curr_tokens) / max(len(prev_tokens), len(curr_tokens)) + return overlap < 0.5 + + def _should_replan(self, observation: str, state_signature: str) -> tuple[bool, str | None]: + if not self.current_plan: + return True, "No plan exists yet." + if any(self._state_action_counts.get(state_signature, {}).values()): + return True, "This state has repeated, so a previous action already failed to progress." + if self._observation_changed_significantly(observation): + return True, "The scene changed significantly from the previous observation." + return False, None + + def _update_plan( + self, + observation: str, + choices: list[dict[str, str]], + replan_reason: str | None, + ) -> dict[str, Any]: + self._ensure_llm() + prompt = self._build_planner_prompt(observation, choices, prompt_kind="plan", replan_reason=replan_reason) + plan_response = self._call_llm(prompt) + usage = self.llm.get_last_usage() + plan = self._normalize_plan(plan_response) + if not plan: + plan = self.current_plan or ( + "Gather clues, protect resources, and avoid obvious traps while advancing toward the main objective." 
def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
    """Run one plan-maintain-act step and return a valid 1-based action.

    The plan is refreshed when ``_should_replan`` says so, the action is then
    chosen with the current plan, passed through the safety filter, and the
    token usage of the planning and acting calls is combined onto the response.

    Args:
        state: Current quest text.
        choices: Available choices for this step.

    Returns:
        The chosen action in ``1..len(choices)``. Falls back to ``1`` when the
        action is out of range or when any step raises.
    """
    if self.debug:
        self.logger.debug("PlannerHarness evaluating state with %s choices", len(choices))
    try:
        state_signature = self._state_signature(state, choices)
        contextual_state = self._build_contextual_state(state)
        should_replan, replan_reason = self._should_replan(state, state_signature)
        plan_usage = None
        if should_replan:
            plan_usage = self._update_plan(contextual_state, choices, replan_reason)

        parsed_response, action_usage = self._choose_action_with_plan(
            contextual_state,
            choices,
            replan_reason if should_replan else None,
        )

        action_before_policy = parsed_response.action
        parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
        if parsed_response.action != action_before_policy and not parsed_response.reasoning:
            parsed_response.reasoning = "policy_safety_override"

        # Clamp BEFORE recording: history and decision memory must describe the
        # action that is actually returned, not an out-of-range one.
        if not 1 <= parsed_response.action <= len(choices):
            parsed_response.action = 1

        total_usage = (
            self._merge_usage(plan_usage, action_usage) if plan_usage else self._normalize_usage(action_usage)
        )
        total_usage = self._normalize_usage(total_usage)
        parsed_response.prompt_tokens = total_usage["prompt_tokens"]
        parsed_response.completion_tokens = total_usage["completion_tokens"]
        parsed_response.total_tokens = total_usage["total_tokens"]
        parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]

        self.history.append(parsed_response)
        self._last_response = parsed_response
        self._remember_decision(state, choices, state_signature, parsed_response)
        return parsed_response.action
    except Exception as exc:
        # Best-effort boundary: a failed planner step must not abort the episode.
        self.logger.error("Planner harness error during LLM call: %s", exc)
        default_response = LLMResponse(
            action=1,
            is_default=True,
            parse_mode="error_default",
            reasoning=f"planner_error: {exc}",
        )
        self.history.append(default_response)
        self._last_response = default_response
        return 1
class ReasoningFullTranscriptHarness(MinimalHarness):
    """Reasoning-template harness that defaults to a FullTranscriptMemory."""

    harness_name = "reasoning_full"

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        system_template: str = SYSTEM_ROLE_TEMPLATE,
        action_template: str = "reasoning.jinja",
        temperature: float = DEFAULT_TEMPERATURE,
        skip_single: bool = False,
        debug: bool = False,
        memory_module=None,
        **kwargs,
    ):
        """Forward the arguments to ``MinimalHarness.__init__``.

        A falsy ``memory_module`` is replaced by a fresh ``FullTranscriptMemory``.
        """
        transcript_memory = memory_module or FullTranscriptMemory()
        super().__init__(
            model_name=model_name,
            system_template=system_template,
            action_template=action_template,
            temperature=temperature,
            skip_single=skip_single,
            debug=debug,
            memory_module=transcript_memory,
            **kwargs,
        )
def _recent_steps(self) -> list[str]:
    """Format the last ``_history_window`` step-log entries as one line each.

    An entry without ``selected_choice`` is shown with ``n/a``.
    """
    window = self._step_log[-self._history_window :]
    lines = []
    for record in window:
        chosen = record.get("selected_choice", "n/a")
        lines.append(f"Step {record['step']}: {record['observation']} -> {chosen}")
    return lines
operation is read or write_replace.", + ] + + def quest_history(self, query: str) -> str: + return self._history_tool.search(query) + + @staticmethod + def calculator(expression: str) -> str: + return calculator(expression) + + def scratchpad(self, operation: str, content: str = "") -> str: + op = (operation or "").strip().lower() + if op == "read": + return self._scratchpad_tool.read() + if op == "write_replace": + return self._scratchpad_tool.write_replace(content) + return "error: operation must be read or write_replace" + + def _build_tool_prompt( + self, + observation: str, + choices: list[dict[str, str]], + prompt_kind: str, + tool_results: list[str] | None = None, + ) -> str: + template = self.prompt_renderer.get_template(self.action_template) + return template.render( + prompt_kind=prompt_kind, + observation=observation, + choices=[{"text": choice.get("text", "")} for choice in choices], + tool_descriptions=self._tool_descriptions(), + tool_results=tool_results or [], + recent_steps=self._recent_steps(), + scratchpad_note=self._scratchpad_tool.read() if self._scratchpad_tool.read() != "(empty)" else "", + ).strip() + + @staticmethod + def _extract_tool_calls(response: str) -> list[dict[str, Any]]: + payload, _ = _parse_json_response(response) + if not isinstance(payload, dict): + return [] + tool_calls = payload.get("tool_calls") + if not isinstance(tool_calls, list): + return [] + + normalized = [] + for item in tool_calls[:1]: + if not isinstance(item, dict): + continue + tool_name = str(item.get("tool") or "").strip() + tool_input = item.get("input") + operation = str(item.get("operation") or "").strip() + content = str(item.get("content") or "").strip() + if isinstance(tool_input, dict): + operation = operation or str(tool_input.get("operation") or "").strip() + content = content or str(tool_input.get("content") or "").strip() + tool_input = tool_input.get("expression") or tool_input.get("query") or tool_input.get("content") or "" + tool_input = 
def _execute_tool_calls(self, tool_calls: list[dict[str, Any]]) -> list[str]:
    """Run each normalized tool call and return one summary line per call.

    Each line has the form ``name(args) => result``. Unknown tool names are
    not executed and produce an ``unknown tool`` result. For the scratchpad
    the operation falls back to the call's ``input`` when ``operation`` is
    empty.
    """
    summaries = []
    for call in tool_calls:
        tool_name = call["tool"]
        raw_input = call.get("input", "")
        shown_args = raw_input

        if tool_name == "scratchpad":
            op = call.get("operation") or raw_input
            note = call.get("content") or ""
            outcome = self.scratchpad(str(op), str(note))
            shown_args = f"{op}, {note}".strip(", ")
        elif tool_name == "quest_history":
            outcome = self.quest_history(raw_input)
        elif tool_name == "calculator":
            outcome = self.calculator(raw_input)
        else:
            outcome = f"unknown tool: {tool_name}"

        summaries.append(f"{tool_name}({shown_args}) => {outcome}")
    return summaries
def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
    """Run the two-phase tool loop and return a valid 1-based action.

    Phase one asks the model to select tools. If it requested any, they are
    executed and a final-choice prompt is issued with their results. If it
    requested none and the first reply parsed only to a default response, a
    final-choice prompt is issued without tool results. Otherwise the first
    reply is used as the decision. Usage from every call is combined onto
    the response.

    Args:
        state: Current quest text.
        choices: Available choices for this step.

    Returns:
        The chosen action in ``1..len(choices)``. Falls back to ``1`` when the
        action is out of range or when any step raises.
    """
    try:
        state_signature = self._state_signature(state, choices)
        contextual_state = self._build_contextual_state(state)
        self._ensure_llm()

        selection_prompt = self._build_tool_prompt(contextual_state, choices, prompt_kind="select")
        selection_response = self._call_llm(selection_prompt)
        selection_usage = self.llm.get_last_usage()
        tool_calls = self._extract_tool_calls(selection_response)
        parsed_response = self._parse_llm_response(selection_response, len(choices))
        tool_results: list[str] = []

        total_usage = self._normalize_usage(selection_usage)
        if tool_calls:
            tool_results = self._execute_tool_calls(tool_calls)
            parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results)
            total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
        elif parsed_response.is_default:
            parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[])
            total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))

        action_before_policy = parsed_response.action
        parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
        if parsed_response.action != action_before_policy and not parsed_response.reasoning:
            parsed_response.reasoning = "policy_safety_override"

        # Same guard as the Minimal and Planner harnesses: never return or
        # record an action outside 1..len(choices).
        if not 1 <= parsed_response.action <= len(choices):
            parsed_response.action = 1

        parsed_response.prompt_tokens = total_usage["prompt_tokens"]
        parsed_response.completion_tokens = total_usage["completion_tokens"]
        parsed_response.total_tokens = total_usage["total_tokens"]
        parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
        parsed_response.tool_calls = tool_calls or None
        parsed_response.tool_results = tool_results or None

        self.history.append(parsed_response)
        self._last_response = parsed_response
        self._remember_decision(state, choices, state_signature, parsed_response)
        self._log_step(state, choices, parsed_response)
        return parsed_response.action
    except Exception as exc:
        # Best-effort boundary: a failed tool/LLM step must not abort the episode.
        self.logger.error("Tool harness error during LLM call: %s", exc)
        default_response = LLMResponse(
            action=1,
            is_default=True,
            parse_mode="error_default",
            reasoning=f"tool_agent_error: {exc}",
        )
        self.history.append(default_response)
        self._last_response = default_response
        return 1
@dataclass
class HarnessConfig:
    """Configuration for a single harness in benchmark.

    Field values are normalized and validated in ``__post_init__``; an invalid
    harness name, temperature, run count or compaction interval raises
    ``ValueError``.
    """

    model: str = DEFAULT_MODEL
    system_template: str = SYSTEM_ROLE_TEMPLATE
    harness: str = "reasoning_recent"
    temperature: float = DEFAULT_TEMPERATURE
    runs: int = 1
    skip_single: bool = False
    debug: bool = False
    benchmark_id: str | None = None
    compaction_interval: int = 50

    def __post_init__(self):
        """Normalize the system template name and validate every constrained field."""
        self.system_template = normalize_template_name(self.system_template)
        # Imported here rather than at module top — presumably to avoid an
        # import cycle between schemas and harnesses; confirm before moving.
        from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY

        if self.harness not in HARNESS_REGISTRY:
            raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {sorted(HARNESS_REGISTRY)}")
        if not (0.0 <= self.temperature <= 2.0):
            raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}")
        if self.runs < 1:
            raise ValueError(f"runs must be >= 1, got {self.runs}")
        if self.compaction_interval < 1:
            raise ValueError(f"compaction_interval must be >= 1, got {self.compaction_interval}")

    @property
    def harness_id(self) -> str:
        """Generate a stable harness ID based on configuration values.

        The ID combines model, temperature and harness name with the first
        eight hex digits of an MD5 digest over model, temperature, harness and
        compaction interval.
        """
        import hashlib

        config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.compaction_interval}"
        # MD5 is only a short fingerprint here, not a security primitive;
        # usedforsecurity=False keeps it usable on FIPS-restricted builds and
        # does not change the digest.
        hash_val = hashlib.md5(config_str.encode(), usedforsecurity=False).hexdigest()[:8]
        return f"{self.model}_t{self.temperature}_{self.harness}_{hash_val}"

    @property
    def agent_id(self) -> str:
        """DB-compatible alias for harness_id."""
        return self.harness_id
def test_all_harness_names_instantiate():
    """Every registered harness name builds an instance of its registered class."""
    for registered_name in HARNESS_REGISTRY:
        expected_cls = HARNESS_REGISTRY[registered_name]
        built = create_harness(registered_name, model="gpt-5-mini")

        assert isinstance(built, expected_cls)
HarnessConfig(harness="memo_compact", model="gpt-5-mini") + + assert isinstance(config.harness_id, str) + assert config.harness_id == HarnessConfig(harness="memo_compact", model="gpt-5-mini").harness_id + + +def test_benchmark_config_from_yaml_parses_harness(tmp_path): + quest_path = tmp_path / "quest.qm" + quest_path.write_text("", encoding="utf-8") + config_path = tmp_path / "benchmark.yaml" + config_path.write_text( + f""" +quests: + - {quest_path} +agents: + - model: gpt-5-mini + harness: memo_compact +""", + encoding="utf-8", + ) + + config = BenchmarkConfig.from_yaml(str(config_path)) + + assert len(config.agents) == 1 + assert isinstance(config.agents[0], HarnessConfig) + assert isinstance(create_harness(config.agents[0].harness, model=config.agents[0].model), MemoCompactHarness) + assert config.agents[0].harness == "memo_compact" + + +def test_benchmark_config_from_yaml_rejects_template(tmp_path): + quest_path = tmp_path / "quest.qm" + quest_path.write_text("", encoding="utf-8") + config_path = tmp_path / "benchmark.yaml" + config_path.write_text( + f""" +quests: + - {quest_path} +agents: + - model: gpt-5-mini + template: reasoning.jinja +""", + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="Use 'harness:' instead of 'template:'"): + BenchmarkConfig.from_yaml(str(config_path)) From 1bb312158af2df23acd49039203b70a597699de7 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 14:15:44 +0400 Subject: [PATCH 04/24] harnesses: implement 8 concrete harness classes, retire agents/ --- llm_quest_benchmark/agents/llm_agent.py | 981 ++---------------- llm_quest_benchmark/harnesses/base.py | 240 ++++- llm_quest_benchmark/harnesses/tool_harness.py | 3 +- 3 files changed, 292 insertions(+), 932 deletions(-) diff --git a/llm_quest_benchmark/agents/llm_agent.py b/llm_quest_benchmark/agents/llm_agent.py index 64ff0cc..7b6d352 100644 --- a/llm_quest_benchmark/agents/llm_agent.py +++ 
b/llm_quest_benchmark/agents/llm_agent.py @@ -1,272 +1,30 @@ -"""LLM agent for Space Rangers quests""" +"""Deprecated compatibility wrapper for harness-based LLM agents.""" -import hashlib -import json -import logging -import re -from typing import Any +import warnings -from json_repair import repair_json - -from llm_quest_benchmark.agents.base import QuestPlayer from llm_quest_benchmark.constants import ( DEFAULT_MODEL, DEFAULT_TEMPERATURE, DEFAULT_TEMPLATE, MODEL_CHOICES, SYSTEM_ROLE_TEMPLATE, - normalize_template_name, -) -from llm_quest_benchmark.llm.client import ( - get_llm_client, - is_supported_model_name, - parse_model_name, -) -from llm_quest_benchmark.llm.prompt import PromptRenderer -from llm_quest_benchmark.schemas.response import LLMResponse - -RISKY_CHOICE_KEYWORDS = ( - "улететь", - "сдаться", - "отказ", - "провал", - "убежать", - "surrender", - "give up", ) - -SAFE_CHOICE_KEYWORDS = ( - "пройти мимо", - "избежать", - "подготов", - "библиотек", - "изуч", - "wait", - "avoid", - "study", +from llm_quest_benchmark.harnesses.base import ( + RISKY_CHOICE_KEYWORDS, + SAFE_CHOICE_KEYWORDS, + _is_numeric_raw_reasoning, + _parse_json_response, + _raw_reasoning_fallback, + parse_llm_response, ) +from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory +from llm_quest_benchmark.harnesses.minimal import MinimalHarness +warnings.warn("llm_agent is deprecated, use llm_quest_benchmark.harnesses", DeprecationWarning, stacklevel=2) -def _parse_json_response( - response: str, - debug: bool = False, - logger: logging.Logger | None = None, -) -> tuple[dict[str, Any] | None, str | None]: - """Try to parse response as JSON, with repair attempt if needed.""" - cleaned_response = (response or "").strip() - if not cleaned_response: - return None, None - - try: - # Extract JSON from response if there are backticks - if "```json" in cleaned_response: - # Find the start and end of the JSON block - start = 
cleaned_response.find("```json") + 7 - end = cleaned_response.find("```", start) - if end > start: - json_str = cleaned_response[start:end].strip() - if debug and logger: - logger.debug(f"Extracted JSON: {json_str}") - result = json.loads(json_str) - if debug and logger: - logger.debug(f"Parsed JSON: {result}") - return result, "json_fenced" - - # Extract a probable JSON object from free-form text. - embedded_json = re.search(r"\{[\s\S]*\}", cleaned_response) - if embedded_json: - candidate = embedded_json.group(0).strip() - if candidate and candidate != cleaned_response: - try: - result = json.loads(candidate) - if debug and logger: - logger.debug(f"Parsed embedded JSON: {result}") - return result, "json_embedded" - except json.JSONDecodeError: - pass - - # Try to parse directly - result = json.loads(cleaned_response) - if debug and logger: - logger.debug(f"Direct JSON parse successful: {result}") - return result, "json_direct" - except json.JSONDecodeError: - if debug and logger: - logger.debug("Initial JSON parse failed, attempting repair") - try: - repaired = repair_json(cleaned_response) - if debug and logger: - logger.debug(f"Repaired JSON: {repaired}") - result = json.loads(repaired) - if debug and logger: - logger.debug(f"Parse of repaired JSON successful: {result}") - return result, "json_repaired" - except Exception as e: - if debug and logger: - logger.error(f"JSON repair failed: {e}") - return None, None - - -def _validate_action_number( - action: int, num_choices: int, debug: bool = False, logger: logging.Logger | None = None -) -> bool: - """Validate that action number is within valid range""" - if 1 <= action <= num_choices: - return True - if debug and logger: - logger.error(f"Action number {action} out of range [1, {num_choices}]") - return False - - -def _extract_action_from_text(response: str, num_choices: int) -> int | None: - """Extract a candidate action from free-form text.""" - for match in re.finditer(r"\b(\d+)\b", response): - action = 
int(match.group(1)) - if 1 <= action <= num_choices: - return action - return None - - -def _extract_field_from_text(response: str, field: str) -> str | None: - """Best-effort extraction of analysis/reasoning from loosely formatted output.""" - if not response: - return None - - # JSON-like field forms: "analysis": "...", 'analysis': '...' - json_pattern = re.compile( - rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P.*?)['"]""", - re.IGNORECASE | re.DOTALL, - ) - match = json_pattern.search(response) - if match: - value = " ".join(match.group("value").strip().split()) - if value: - return value - - # Partial JSON field forms without a closing quote in truncated outputs. - partial_json_pattern = re.compile( - rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P[^"\n\r]+)""", - re.IGNORECASE, - ) - match = partial_json_pattern.search(response) - if match: - value = " ".join(match.group("value").strip().split()) - if value: - return value - - # Label forms: Analysis: ..., Reasoning - ... - label_pattern = re.compile( - rf"""(?im)^\s*{re.escape(field)}\s*[:\-]\s*(?P.+?)\s*$""", - ) - match = label_pattern.search(response) - if match: - value = " ".join(match.group("value").strip().split()) - if value: - return value - - return None - - -def _raw_reasoning_fallback(response: str) -> str | None: - compact = " ".join((response or "").strip().split()) - if not compact: - return None - if len(compact) > 240: - compact = compact[:237] + "..." 
- return f"raw_response: {compact}" - - -def _is_numeric_raw_reasoning(reasoning: str | None) -> bool: - if not reasoning: - return False - if not reasoning.startswith("raw_response:"): - return False - payload = reasoning.split(":", 1)[1].strip() - return payload.isdigit() - - -def parse_llm_response( - response: str, num_choices: int, debug: bool = False, logger: logging.Logger | None = None -) -> LLMResponse: - """Parse LLM response and return structured response object.""" - if debug and logger: - logger.debug(f"Raw LLM response: {response}") - - extracted_analysis = _extract_field_from_text(response, "analysis") - extracted_reasoning = _extract_field_from_text(response, "reasoning") - raw_reasoning = _raw_reasoning_fallback(response) - - # Try parsing as JSON first - response_json, json_parse_mode = _parse_json_response(response, debug, logger) - if response_json and isinstance(response_json, dict): - analysis = response_json.get("analysis") or extracted_analysis - reasoning = response_json.get("reasoning") or response_json.get("thinking") or extracted_reasoning - if not reasoning and analysis: - reasoning = analysis - if not analysis and not reasoning: - reasoning = raw_reasoning - - memo_raw = response_json.get("memo") - memo = str(memo_raw) if memo_raw is not None else None - - # Check for either 'action' or 'result' field - action_value = response_json.get("action") or response_json.get("result") or response_json.get("choice") - if action_value is not None: - try: - action = int(action_value) - if _validate_action_number(action, num_choices, debug, logger): - return LLMResponse( - action=action, - reasoning=reasoning, - analysis=analysis, - memo=memo, - is_default=False, - parse_mode=json_parse_mode or "json", - ) - except (ValueError, TypeError): - if debug and logger: - logger.error(f"Invalid action value in JSON: {action_value}") - - # Try parsing as plain number - try: - action = int(response.strip()) - if _validate_action_number(action, num_choices, 
debug, logger): - return LLMResponse( - action=action, - reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, - analysis=extracted_analysis, - is_default=False, - parse_mode="number_only", - ) - except ValueError: - if debug and logger: - logger.error(f"Could not parse response as number: {response}") - - # Fallback: extract first valid integer from text. - extracted_action = _extract_action_from_text(response, num_choices) - if extracted_action is not None: - return LLMResponse( - action=extracted_action, - reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, - analysis=extracted_analysis, - is_default=False, - parse_mode="number_extracted", - ) - - # Default to first choice if all parsing attempts fail - if debug and logger: - logger.error(f"Error during response parsing, defaulting to first choice. Response: {response[:100]}...") - return LLMResponse( - action=1, - reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, - analysis=extracted_analysis, - is_default=True, - parse_mode="default_first", - ) - -class LLMAgent(QuestPlayer): - """LLM-powered agent for Space Rangers quests""" +class LLMAgent(MinimalHarness): + """Backward-compatible LLMAgent facade backed by concrete harness classes.""" SUPPORTED_MODELS = MODEL_CHOICES @@ -281,688 +39,65 @@ def __init__( memory_mode: str = "default", compaction_interval: int = 10, ): - super().__init__(skip_single=skip_single) - self.debug = debug - self.model_name = model_name.lower() - self.system_template = normalize_template_name(system_template) - self.action_template = normalize_template_name(action_template) - self.temperature = temperature - # Set agent_id for database records - self.agent_id = f"llm_{self.model_name}" - - if not is_supported_model_name(self.model_name): - raise ValueError(f"Unsupported model: {model_name}. 
Supported models are: {self.SUPPORTED_MODELS}") - - self.model_spec = parse_model_name(self.model_name) - self.logger = logging.getLogger(self.__class__.__name__) - if self.debug: - self.logger.setLevel(logging.DEBUG) - self.logger.propagate = False - if not any(getattr(h, "_llm_quest_handler", False) for h in self.logger.handlers): - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter("%(name)s - %(message)s")) - handler._llm_quest_handler = True - self.logger.addHandler(handler) + if memory_mode == "default": + memory_module = DefaultMemory() + elif memory_mode == "full_transcript": + memory_module = FullTranscriptMemory() + elif memory_mode == "compaction": + memory_module = CompactionMemory(compaction_interval=compaction_interval) + else: + raise ValueError(f"Invalid memory_mode: {memory_mode}") - # Initialize prompt renderer - self.prompt_renderer = PromptRenderer( - None, system_template=self.system_template, action_template=self.action_template + super().__init__( + model_name=model_name, + system_template=system_template, + action_template=action_template, + temperature=temperature, + skip_single=skip_single, + debug=debug, + memory_module=memory_module, ) - - # Delay API client creation so template-only flows and tests do not require API keys. 
- self.llm = None - self.history: list[LLMResponse] = [] - self._observation_history: list[str] = [] - self._decision_history: list[dict[str, Any]] = [] - self._state_action_counts: dict[str, dict[int, int]] = {} - self._context_window = 3 - self._context_chars = 220 - self._decision_window = 5 - self._max_state_signatures = 200 - self._use_safety_filter = True - self._last_response = LLMResponse(action=1, is_default=True) - - # Quest briefing: pinned first observation (mission goal) - self._quest_briefing: str | None = None - - # Memory mode: "default", "full_transcript", "compaction" - if memory_mode not in ("default", "full_transcript", "compaction"): - raise ValueError(f"Invalid memory_mode: {memory_mode}") + self.agent_id = f"llm_{self.model_name}" self._memory_mode = memory_mode - self._transcript: list[dict[str, Any]] = [] self._compaction_interval = compaction_interval - self._compaction_summary: str | None = None - self._steps_since_compaction = 0 - self._step_count = 0 - - def _ensure_llm(self): - """Lazily create the provider client only when inference is needed.""" - if self.llm is None: - self.llm = get_llm_client( - self.model_name, - system_prompt=self.prompt_renderer.render_system_prompt(), - temperature=self.temperature, - ) - - def get_last_response(self) -> LLMResponse | None: - """Get the last LLM response from history""" - return self._last_response - - def get_action(self, observation: str, choices: list[dict[str, str]]) -> int: - """Track observation history for context, then delegate base action flow.""" - self._remember_observation(observation) - return super().get_action(observation, choices) def _remember_observation(self, observation: str) -> None: + """Compatibility hook used by legacy tests and callers.""" clean = (observation or "").strip() if not clean: return - if self._quest_briefing is None: - self._quest_briefing = clean self._observation_history.append(clean) if len(self._observation_history) > 20: self._observation_history = 
self._observation_history[-20:] + if self.memory_module is not None: + self.memory_module.update({"observation": clean, "step": self._step_count + 1}) def _build_contextual_state(self, state: str) -> str: - """Build context-augmented state based on memory mode.""" - if self._memory_mode == "full_transcript": - return self._build_full_transcript_state(state) - if self._memory_mode == "compaction": - return self._build_compaction_state(state) - return self._build_default_state(state) - - def _briefing_block(self, state: str) -> str | None: - """Return quest briefing block if available and not redundant with current state.""" - if not self._quest_briefing: - return None - if state.strip() == self._quest_briefing: - return None - briefing = self._quest_briefing - if len(briefing) > 800: - briefing = briefing[:800] + "..." - return f"Quest briefing (your mission):\n{briefing}" - - def _build_default_state(self, state: str) -> str: - """Original sliding-window context, now with pinned briefing.""" - blocks: list[str] = [] - - briefing = self._briefing_block(state) - if briefing: - blocks.append(briefing) - - if len(self._observation_history) > 1: - previous = self._observation_history[:-1][-self._context_window :] - if previous: - snippets = [] - for idx, text in enumerate(previous, start=1): - clipped = text if len(text) <= self._context_chars else text[: self._context_chars] + "..." 
- snippets.append(f"[Previous {idx}] {clipped}") - blocks.append("Recent context from previous steps:\n" + "\n\n".join(snippets)) - - if self._decision_history: - recent_memos = [] - for item in self._decision_history[-self._decision_window :]: - m = (item.get("memo") or "").strip() - if not m: - continue - if recent_memos and recent_memos[-1] == m: - continue - recent_memos.append(m) - if recent_memos: - lines = [f"[Memo {idx}] {m}" for idx, m in enumerate(recent_memos, start=1)] - blocks.append("State memo (recent):\n" + "\n".join(lines)) - - recent_decisions = self._decision_history[-self._decision_window :] - decision_lines = [] - for idx, item in enumerate(recent_decisions, start=1): - choice = item.get("choice", "") - parse_mode = item.get("parse_mode", "unknown") - memo_val = item.get("memo") - memo_suffix = f" | memo: {memo_val}" if memo_val else "" - decision_lines.append( - f"[Decision {idx}] action {item.get('action')}: {choice} (parse={parse_mode}){memo_suffix}" - ) - blocks.append("Recent selected actions:\n" + "\n".join(decision_lines)) - - if not blocks: - return state - - sep = "\n\n" - return f"{sep.join(blocks)}\n\nCurrent story state:\n{state}" - - def _build_full_transcript_state(self, state: str) -> str: - """Full decision transcript with pinned briefing.""" - blocks: list[str] = [] - - briefing = self._briefing_block(state) - if briefing: - blocks.append(briefing) - - if self._transcript: - lines = [] - entries = self._transcript - # Budget: keep first 3 + last N that fit under ~40 entries total - if len(entries) > 40: - entries = entries[:3] + [{"_gap": len(entries) - 40}] + entries[-(40 - 3) :] - for entry in entries: - if "_gap" in entry: - lines.append(f" ... ({entry['_gap']} steps omitted) ...") - continue - step = entry.get("step", "?") - obs = entry.get("observation", "") - if len(obs) > 400: - obs = obs[:400] + "..." 
- chosen = entry.get("choice_text", "") - reasoning = entry.get("reasoning", "") - line = f"Step {step}: {obs}" - if chosen: - line += f"\n You chose: {chosen}" - if reasoning: - line += f"\n Reasoning: {reasoning[:800]}" - state_notes = entry.get("memo", "") - if state_notes: - line += f"\n State: {state_notes[:350]}" - lines.append(line) - blocks.append("=== QUEST TRANSCRIPT ===\n" + "\n\n".join(lines)) - - blocks.append(f"Step {self._step_count} (CURRENT):\n{state}") - return "\n\n".join(blocks) - - def _build_compaction_state(self, state: str) -> str: - """Compacted memory summary + recent steps since last compaction.""" - blocks: list[str] = [] - - briefing = self._briefing_block(state) - if briefing: - blocks.append(briefing) - - if self._compaction_summary: - blocks.append( - f"=== QUEST MEMORY (compacted at step {self._step_count - self._steps_since_compaction}) ===\n{self._compaction_summary}" - ) - - if self._transcript: - recent = self._transcript[-self._steps_since_compaction :] if self._steps_since_compaction > 0 else [] - if recent: - lines = [] - for entry in recent: - step = entry.get("step", "?") - obs = entry.get("observation", "") - if len(obs) > 400: - obs = obs[:400] + "..." - chosen = entry.get("choice_text", "") - line = f"Step {step}: {obs}" - if chosen: - line += f"\n You chose: {chosen}" - state_notes = entry.get("memo", "") - if state_notes: - line += f"\n State: {state_notes[:350]}" - lines.append(line) - blocks.append("=== RECENT STEPS ===\n" + "\n\n".join(lines)) - - blocks.append(f"Step {self._step_count} (CURRENT):\n{state}") - return "\n\n".join(blocks) - - def _maybe_compact(self) -> None: - """Run compaction if interval reached. 
Called after recording a decision.""" - if self._memory_mode != "compaction": - return - if self._steps_since_compaction < self._compaction_interval: - return - - transcript_text = self._format_transcript_for_compaction() - if not transcript_text: - return - - prompt_parts = [] - prompt_parts.append("You are summarizing an agent's progress through a text quest.") - if self._quest_briefing: - prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}") - if self._compaction_summary: - prompt_parts.append(f"\nPREVIOUS SUMMARY:\n{self._compaction_summary}") - prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}") - prompt_parts.append( - "\nSummarize the agent's progress. Include:\n" - "- Current objective (what the agent should do next)\n" - "- Progress so far (what has been accomplished)\n" - "- Key facts (NPCs, items, locations, deadlines discovered)\n" - "- Failed approaches (actions/paths that didn't work)\n" - "- Map knowledge (locations visited and connections)\n\n" - "Write a concise summary in plain text, max 300 words." 
- ) - - compaction_prompt = "\n".join(prompt_parts) - try: - self._ensure_llm() - summary = self.llm.get_completion(compaction_prompt) - compaction_usage = self.llm.get_last_usage() or {} - if compaction_usage: - pt = int( - compaction_usage.get("prompt_tokens", 0) - if isinstance(compaction_usage, dict) - else getattr(compaction_usage, "prompt_tokens", 0) - ) - ct = int( - compaction_usage.get("completion_tokens", 0) - if isinstance(compaction_usage, dict) - else getattr(compaction_usage, "completion_tokens", 0) - ) - self._record_compaction_usage(pt, ct) - stripped = (summary or "").strip() - if not stripped: - if self.debug: - self.logger.warning("Compaction returned empty summary at step %d", self._step_count) - self._steps_since_compaction = max(0, self._compaction_interval // 2) - return - self._compaction_summary = stripped - self._transcript = [] - self._steps_since_compaction = 0 - if self.debug: - self.logger.debug( - "Compaction completed at step %d: %s", self._step_count, self._compaction_summary[:200] - ) - except Exception as e: - if self.debug: - self.logger.warning("Compaction failed at step %d: %s", self._step_count, e) - self._steps_since_compaction = max(0, self._compaction_interval // 2) - - def _record_compaction_usage(self, prompt_tokens: int, completion_tokens: int) -> None: - """Record token usage from compaction calls into agent history.""" - compaction_response = LLMResponse( - action=0, - is_default=True, - parse_mode="compaction", - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - self.history.append(compaction_response) - - def _format_transcript_for_compaction(self) -> str: - """Format recent transcript entries for the compaction prompt.""" - recent = ( - self._transcript[-self._steps_since_compaction :] - if self._steps_since_compaction > 0 - else self._transcript[-self._compaction_interval :] - ) - lines = [] - for entry in recent: - step = entry.get("step", 
"?") - obs = entry.get("observation", "") - if len(obs) > 400: - obs = obs[:400] + "..." - chosen = entry.get("choice_text", "") - reasoning = entry.get("reasoning", "") - state_notes = entry.get("memo", "") - line = f"Step {step}: {obs}" - if chosen: - line += f"\n Chose: {chosen}" - if state_notes: - line += f"\n State: {state_notes[:350]}" - if reasoning: - line += f"\n Reasoning: {reasoning[:800]}" - lines.append(line) - return "\n\n".join(lines) - - @staticmethod - def _normalize_for_signature(value: str, max_len: int = 320) -> str: - text = (value or "").lower() - text = re.sub(r"\s+", " ", text).strip() - if len(text) > max_len: - return text[:max_len] - return text - - def _state_signature(self, state: str, choices: list[dict[str, str]]) -> str: - normalized_state = self._normalize_for_signature(state, max_len=420) - normalized_choices = "|".join( - self._normalize_for_signature(choice.get("text", ""), max_len=110) for choice in choices - ) - raw_signature = f"{normalized_state}||{normalized_choices}" - return hashlib.sha1(raw_signature.encode("utf-8", errors="ignore")).hexdigest()[:20] - - def _remember_decision( - self, - state: str, - choices: list[dict[str, str]], - state_signature: str, - response: LLMResponse, - ) -> None: - action = int(response.action) - counts = self._state_action_counts.setdefault(state_signature, {}) - counts[action] = counts.get(action, 0) + 1 - - if len(self._state_action_counts) > self._max_state_signatures: - oldest_key = next(iter(self._state_action_counts.keys())) - if oldest_key != state_signature: - self._state_action_counts.pop(oldest_key, None) - - selected_text = "" - if 1 <= action <= len(choices): - selected_text = choices[action - 1].get("text", "") - state_snippet = state.strip() - if len(state_snippet) > self._context_chars: - state_snippet = state_snippet[: self._context_chars] + "..." 
- - self._decision_history.append( - { - "state": state_snippet, - "action": action, - "choice": selected_text, - "parse_mode": response.parse_mode or "unknown", - "memo": (response.memo or "").strip()[:350] or None, - } - ) - if len(self._decision_history) > 40: - self._decision_history = self._decision_history[-40:] - - # Transcript for full_transcript and compaction modes - if self._memory_mode in ("full_transcript", "compaction"): - self._step_count += 1 - self._steps_since_compaction += 1 - self._transcript.append( - { - "step": self._step_count, - "observation": state_snippet if self._memory_mode == "compaction" else state.strip()[:400], - "choice_text": selected_text, - "reasoning": (response.reasoning or "")[:800], - "memo": (response.memo or "").strip()[:350] or None, - "action": action, - } - ) - self._maybe_compact() - - def _choice_risk_score(self, choice_text: str) -> int: - text = (choice_text or "").lower() - score = 0 - for keyword in RISKY_CHOICE_KEYWORDS: - if keyword in text: - score += 2 - for keyword in SAFE_CHOICE_KEYWORDS: - if keyword in text: - score -= 1 - return score - - def _apply_safety_filter(self, action: int, choices: list[dict[str, str]]) -> int: - """Replace obviously risky actions when a clearly safer alternative exists.""" - if not self._use_safety_filter or len(choices) < 2: - return action - - current_idx = action - 1 - if current_idx < 0 or current_idx >= len(choices): - return action - - scored = [(idx + 1, self._choice_risk_score(c.get("text", ""))) for idx, c in enumerate(choices)] - scored.sort(key=lambda item: item[1]) - - best_action, best_score = scored[0] - current_score = self._choice_risk_score(choices[current_idx].get("text", "")) - - # Only override when the chosen action is materially riskier than the best option. 
- if current_score - best_score >= 2: - if self.debug: - self.logger.debug( - "Safety filter override: %s -> %s (risk %s -> %s)", - action, - best_action, - current_score, - best_score, - ) - return best_action - return action - - @staticmethod - def _state_fingerprint(state: str) -> str: - """Create a stable fingerprint for loop detection.""" - compact = " ".join((state or "").lower().split()) - if len(compact) > 500: - compact = compact[:500] - return compact - - def _apply_loop_escape( - self, - state_key: str, - action: int, - choices: list[dict[str, str]], - ) -> tuple[int, bool]: - """Diversify action when the same state repeats with no apparent progress.""" - if len(choices) <= 1: - return action, False - - counts = self._state_action_counts.get(state_key, {}) - total_visits = sum(counts.values()) - if total_visits < 3: - return action, False - - current_count = counts.get(action, 0) - if current_count < 2: - return action, False - all_actions = list(range(1, len(choices) + 1)) - ranked = sorted( - all_actions, - key=lambda a: ( - counts.get(a, 0), - self._choice_risk_score(choices[a - 1].get("text", "")), - ), - ) - best_action = ranked[0] - - if best_action != action and counts.get(best_action, 0) < current_count: - return best_action, True - if total_visits >= 5 and current_count >= 3 and best_action != action: - return best_action, True - return action, False - - @staticmethod - def _normalize_usage(usage: dict[str, Any] | None) -> dict[str, Any]: - usage = usage or {} - prompt_tokens = int(usage.get("prompt_tokens") or 0) - completion_tokens = int(usage.get("completion_tokens") or 0) - total_tokens = int(usage.get("total_tokens") or (prompt_tokens + completion_tokens)) - estimated_cost_usd = usage.get("estimated_cost_usd") - if estimated_cost_usd is not None: - estimated_cost_usd = float(estimated_cost_usd) - return { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": total_tokens, - "estimated_cost_usd": 
estimated_cost_usd, - } - - @classmethod - def _merge_usage(cls, first: dict[str, Any] | None, second: dict[str, Any] | None) -> dict[str, Any]: - a = cls._normalize_usage(first) - b = cls._normalize_usage(second) - merged_cost = None - if a["estimated_cost_usd"] is not None or b["estimated_cost_usd"] is not None: - merged_cost = (a["estimated_cost_usd"] or 0.0) + (b["estimated_cost_usd"] or 0.0) - return { - "prompt_tokens": a["prompt_tokens"] + b["prompt_tokens"], - "completion_tokens": a["completion_tokens"] + b["completion_tokens"], - "total_tokens": a["total_tokens"] + b["total_tokens"], - "estimated_cost_usd": merged_cost, - } - - def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: - """Implementation of action selection logic. - - Args: - state (str): Current game state text - choices (List[Dict[str, str]]): List of available choices - - Returns: - int: Selected action number (1-based) - """ - if self.debug: - self.logger.debug(f"Getting action for state with {len(choices)} choices available") - for i, choice in enumerate(choices): - self.logger.debug(f"Choice {i + 1}: {choice.get('text', 'NO TEXT')}") - try: - state_signature = self._state_signature(state, choices) - # Format prompt - prompt = self._format_prompt(self._build_contextual_state(state), choices) - if self.debug: - self.logger.debug(f"\nPrompt:\n{prompt}") - - # Get LLM response - self._ensure_llm() - llm_response = self.llm.get_completion(prompt) - llm_usage = self.llm.get_last_usage() - if self.debug: - self.logger.debug(f"LLM response: {llm_response}") - choices_debug = [] - for i, c in enumerate(choices): - choices_debug.append(f"{i + 1}: {c['text']}") - self.logger.debug(f"Available choices: {choices_debug}") - - # Parse response - first_response = parse_llm_response( - llm_response, - len(choices), - self.debug, - self.logger, - ) - parsed_response = first_response - - if parsed_response.is_default: - retry_response = 
self.llm.get_completion(self._format_retry_prompt(state, choices)) - retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, retry_usage) - retry_parsed = parse_llm_response(retry_response, len(choices), self.debug, self.logger) - if not retry_parsed.is_default: - retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}" - parsed_response = retry_parsed - elif self._needs_force_numeric_retry(): - # GPT-5/o models occasionally return empty visible text on long prompts. - # Use a tiny final retry that asks for number-only output. - force_retry_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices)) - force_retry_usage = self.llm.get_last_usage() - llm_usage = self._merge_usage(llm_usage, force_retry_usage) - force_retry_parsed = parse_llm_response( - force_retry_response, - len(choices), - self.debug, - self.logger, - ) - if not force_retry_parsed.is_default: - force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}" - parsed_response = force_retry_parsed - - action_before_policy = parsed_response.action - if parsed_response is not first_response: - if parsed_response.analysis is None and first_response.analysis is not None: - parsed_response.analysis = first_response.analysis - if _is_numeric_raw_reasoning(parsed_response.reasoning): - if first_response.reasoning and not _is_numeric_raw_reasoning(first_response.reasoning): - parsed_response.reasoning = first_response.reasoning - else: - first_raw_reasoning = _raw_reasoning_fallback(llm_response) - if first_raw_reasoning and not _is_numeric_raw_reasoning(first_raw_reasoning): - parsed_response.reasoning = first_raw_reasoning - - parsed_response.action = self._apply_safety_filter(parsed_response.action, choices) - if parsed_response.action != action_before_policy and not parsed_response.reasoning: - parsed_response.reasoning = "policy_safety_override" - usage_payload = self._normalize_usage(llm_usage) - 
parsed_response.prompt_tokens = usage_payload["prompt_tokens"] - parsed_response.completion_tokens = usage_payload["completion_tokens"] - parsed_response.total_tokens = usage_payload["total_tokens"] - parsed_response.estimated_cost_usd = usage_payload["estimated_cost_usd"] - - if self.debug: - self.logger.debug(f"Parsed LLM response: {parsed_response}") - self.logger.debug(f"Final action to be returned: {parsed_response.action}") - - # Store response in history - self.history.append(parsed_response) - self._last_response = parsed_response - self._remember_decision(state, choices, state_signature, parsed_response) - - # Check that action is within valid range before returning - if parsed_response.action < 1 or parsed_response.action > len(choices): - self.logger.error(f"INVALID ACTION DETECTED: {parsed_response.action} not in range 1-{len(choices)}") - # Use default first action instead - parsed_response.action = 1 - self.logger.warning("Defaulting to action 1 instead") - - return parsed_response.action - - except Exception as e: - self.logger.error(f"Error during LLM call: {e}") - default_response = LLMResponse( - action=1, - is_default=True, - parse_mode="error_default", - reasoning=_raw_reasoning_fallback(f"llm_call_error: {e}"), - ) - self.history.append(default_response) - self._last_response = default_response - return 1 # Default to first choice on error - - def reset(self) -> None: - """Reset agent state""" - self.history = [] - self._observation_history = [] - self._decision_history = [] - self._state_action_counts = {} - self._last_response = LLMResponse(action=1, is_default=True) - self._quest_briefing = None - self._transcript = [] - self._compaction_summary = None - self._steps_since_compaction = 0 - self._step_count = 0 - - def on_game_start(self) -> None: - """Called when game starts""" - super().on_game_start() - self._observation_history = [] - self._decision_history = [] - self._state_action_counts = {} - self._last_response = LLMResponse(action=1, 
is_default=True) - self._quest_briefing = None - self._transcript = [] - self._compaction_summary = None - self._steps_since_compaction = 0 - self._step_count = 0 - - def on_game_end(self, final_state: dict[str, Any]) -> None: - """Log final state for analysis""" - if self.debug: - self.logger.debug(f"Game ended with state: {final_state}") + """Build context while honoring legacy direct history mutation.""" + if isinstance(self.memory_module, DefaultMemory): + self.memory_module._observations = list(self._observation_history) + self.memory_module._decisions = list(self._decision_history) + return super()._build_contextual_state(state) + + def _apply_safety_filter(self, action_or_choices, choices_or_action) -> int: + """Accept both legacy (action, choices) and harness (choices, action) argument order.""" + if isinstance(action_or_choices, list): + return super()._apply_safety_filter(action_or_choices, choices_or_action) + return super()._apply_safety_filter(choices_or_action, action_or_choices) def __str__(self) -> str: - """String representation of the agent""" - return f"LLMAgent(model={self.model_name}, system_template={self.system_template}, action_template={self.action_template}, temperature={self.temperature})" - - def _format_prompt(self, state: str, choices: list[dict[str, str]]) -> str: - """Format the prompt for the LLM""" - return self.prompt_renderer.render_action_prompt(state, choices).strip() - - def _format_retry_prompt(self, state: str, choices: list[dict[str, str]]) -> str: - """Fallback prompt that still preserves reasoning for log analysis.""" - clipped_state = (state or "").strip() - if len(clipped_state) > 500: - clipped_state = clipped_state[:500] + "..." - choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:160]}" for i, c in enumerate(choices)]) - return f"""Choose the best action. 
-State: {clipped_state} -Actions: -{choices_text} - -Return valid JSON only: -{{ - "analysis": "", - "reasoning": "", - "result": -}}""" + return ( + f"LLMAgent(model={self.model_name}, system_template={self.system_template}, " + f"action_template={self.action_template}, temperature={self.temperature})" + ) - def _format_force_numeric_retry_prompt(self, choices: list[dict[str, str]]) -> str: - """Very short retry prompt used for models that return empty visible output.""" - choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:110]}" for i, c in enumerate(choices)]) - return f"""Pick one action number. -{choices_text} -Reply with one integer only: 1 to {len(choices)}.""" - def _needs_force_numeric_retry(self) -> bool: - return self.model_spec.provider == "openai" and ( - self.model_spec.model_id.startswith("gpt-5") or self.model_spec.model_id.startswith("o") - ) +__all__ = [ + "LLMAgent", + "parse_llm_response", + "_parse_json_response", + "_raw_reasoning_fallback", + "_is_numeric_raw_reasoning", + "RISKY_CHOICE_KEYWORDS", + "SAFE_CHOICE_KEYWORDS", +] diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py index ec70b55..6fa8afd 100644 --- a/llm_quest_benchmark/harnesses/base.py +++ b/llm_quest_benchmark/harnesses/base.py @@ -1,24 +1,250 @@ """Base harness class for quest benchmark experiments.""" import hashlib +import json import logging import re from abc import abstractmethod from typing import Any +from json_repair import repair_json + from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.agents.llm_agent import ( - RISKY_CHOICE_KEYWORDS, - SAFE_CHOICE_KEYWORDS, - _is_numeric_raw_reasoning, - _raw_reasoning_fallback, - parse_llm_response, -) from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, normalize_template_name from llm_quest_benchmark.llm.client import get_llm_client, parse_model_name from llm_quest_benchmark.llm.prompt import PromptRenderer from 
llm_quest_benchmark.schemas.response import LLMResponse +RISKY_CHOICE_KEYWORDS = ( + "улететь", + "сдаться", + "отказ", + "провал", + "убежать", + "surrender", + "give up", +) + +SAFE_CHOICE_KEYWORDS = ( + "пройти мимо", + "избежать", + "подготов", + "библиотек", + "изуч", + "wait", + "avoid", + "study", +) + + +def _parse_json_response( + response: str, + debug: bool = False, + logger: logging.Logger | None = None, +) -> tuple[dict[str, Any] | None, str | None]: + """Try to parse response as JSON, with repair attempt if needed.""" + cleaned_response = (response or "").strip() + if not cleaned_response: + return None, None + + try: + if "```json" in cleaned_response: + start = cleaned_response.find("```json") + 7 + end = cleaned_response.find("```", start) + if end > start: + json_str = cleaned_response[start:end].strip() + if debug and logger: + logger.debug("Extracted JSON: %s", json_str) + result = json.loads(json_str) + if debug and logger: + logger.debug("Parsed JSON: %s", result) + return result, "json_fenced" + + embedded_json = re.search(r"\{[\s\S]*\}", cleaned_response) + if embedded_json: + candidate = embedded_json.group(0).strip() + if candidate and candidate != cleaned_response: + try: + result = json.loads(candidate) + if debug and logger: + logger.debug("Parsed embedded JSON: %s", result) + return result, "json_embedded" + except json.JSONDecodeError: + pass + + result = json.loads(cleaned_response) + if debug and logger: + logger.debug("Direct JSON parse successful: %s", result) + return result, "json_direct" + except json.JSONDecodeError: + if debug and logger: + logger.debug("Initial JSON parse failed, attempting repair") + try: + repaired = repair_json(cleaned_response) + if debug and logger: + logger.debug("Repaired JSON: %s", repaired) + result = json.loads(repaired) + if debug and logger: + logger.debug("Parse of repaired JSON successful: %s", result) + return result, "json_repaired" + except Exception as exc: + if debug and logger: + 
logger.error("JSON repair failed: %s", exc) + return None, None + + +def _validate_action_number( + action: int, + num_choices: int, + debug: bool = False, + logger: logging.Logger | None = None, +) -> bool: + """Validate that action number is within valid range.""" + if 1 <= action <= num_choices: + return True + if debug and logger: + logger.error("Action number %s out of range [1, %s]", action, num_choices) + return False + + +def _extract_action_from_text(response: str, num_choices: int) -> int | None: + """Extract a candidate action from free-form text.""" + for match in re.finditer(r"\b(\d+)\b", response): + action = int(match.group(1)) + if 1 <= action <= num_choices: + return action + return None + + +def _extract_field_from_text(response: str, field: str) -> str | None: + """Best-effort extraction of analysis/reasoning from loosely formatted output.""" + if not response: + return None + + json_pattern = re.compile( + rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>.*?)['"]""", + re.IGNORECASE | re.DOTALL, + ) + match = json_pattern.search(response) + if match: + value = " ".join(match.group("value").strip().split()) + if value: + return value + + partial_json_pattern = re.compile( + rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>[^"\n\r]+)""", + re.IGNORECASE, + ) + match = partial_json_pattern.search(response) + if match: + value = " ".join(match.group("value").strip().split()) + if value: + return value + + label_pattern = re.compile( + rf"""(?im)^\s*{re.escape(field)}\s*[:\-]\s*(?P<value>.+?)\s*$""", + ) + match = label_pattern.search(response) + if match: + value = " ".join(match.group("value").strip().split()) + if value: + return value + + return None + + +def _raw_reasoning_fallback(response: str) -> str | None: + compact = " ".join((response or "").strip().split()) + if not compact: + return None + if len(compact) > 240: + compact = compact[:237] + "..." 
+ return f"raw_response: {compact}" + + +def _is_numeric_raw_reasoning(reasoning: str | None) -> bool: + if not reasoning or not reasoning.startswith("raw_response:"): + return False + payload = reasoning.split(":", 1)[1].strip() + return payload.isdigit() + + +def parse_llm_response( + response: str, + num_choices: int, + debug: bool = False, + logger: logging.Logger | None = None, +) -> LLMResponse: + """Parse an LLM response and return a structured response object.""" + if debug and logger: + logger.debug("Raw LLM response: %s", response) + + extracted_analysis = _extract_field_from_text(response, "analysis") + extracted_reasoning = _extract_field_from_text(response, "reasoning") + raw_reasoning = _raw_reasoning_fallback(response) + + response_json, json_parse_mode = _parse_json_response(response, debug, logger) + if response_json and isinstance(response_json, dict): + analysis = response_json.get("analysis") or extracted_analysis + reasoning = response_json.get("reasoning") or response_json.get("thinking") or extracted_reasoning + if not reasoning and analysis: + reasoning = analysis + if not analysis and not reasoning: + reasoning = raw_reasoning + + memo_raw = response_json.get("memo") + memo = str(memo_raw) if memo_raw is not None else None + action_value = response_json.get("action") or response_json.get("result") or response_json.get("choice") + if action_value is not None: + try: + action = int(action_value) + if _validate_action_number(action, num_choices, debug, logger): + return LLMResponse( + action=action, + reasoning=reasoning, + analysis=analysis, + memo=memo, + is_default=False, + parse_mode=json_parse_mode or "json", + ) + except (ValueError, TypeError): + if debug and logger: + logger.error("Invalid action value in JSON: %s", action_value) + + try: + action = int(response.strip()) + if _validate_action_number(action, num_choices, debug, logger): + return LLMResponse( + action=action, + reasoning=extracted_reasoning or extracted_analysis or 
raw_reasoning, + analysis=extracted_analysis, + is_default=False, + parse_mode="number_only", + ) + except ValueError: + if debug and logger: + logger.error("Could not parse response as number: %s", response) + + extracted_action = _extract_action_from_text(response, num_choices) + if extracted_action is not None: + return LLMResponse( + action=extracted_action, + reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, + analysis=extracted_analysis, + is_default=False, + parse_mode="number_extracted", + ) + + if debug and logger: + logger.error("Error during response parsing, defaulting to first choice. Response: %s...", response[:100]) + return LLMResponse( + action=1, + reasoning=extracted_reasoning or extracted_analysis or raw_reasoning, + analysis=extracted_analysis, + is_default=True, + parse_mode="default_first", + ) + class BaseHarness(QuestPlayer): """Abstract LLM harness base class.""" diff --git a/llm_quest_benchmark/harnesses/tool_harness.py b/llm_quest_benchmark/harnesses/tool_harness.py index e89e7c1..a398bfe 100644 --- a/llm_quest_benchmark/harnesses/tool_harness.py +++ b/llm_quest_benchmark/harnesses/tool_harness.py @@ -2,9 +2,8 @@ from typing import Any -from llm_quest_benchmark.agents.llm_agent import _parse_json_response from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE -from llm_quest_benchmark.harnesses.base import BaseHarness +from llm_quest_benchmark.harnesses.base import BaseHarness, _parse_json_response from llm_quest_benchmark.harnesses.memory import CompactionMemory from llm_quest_benchmark.harnesses.tools import QuestHistoryTool, Scratchpad, calculator from llm_quest_benchmark.schemas.response import LLMResponse From 78babe8558fab338463a0971aafadbadc0c4163a Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 14:38:19 +0400 Subject: [PATCH 05/24] harnesses: HarnessConfig schema + factory Co-authored-by: 
factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- llm_quest_benchmark/harnesses/factory.py | 37 +++++++++--- llm_quest_benchmark/schemas/__init__.py | 12 +++- llm_quest_benchmark/schemas/config.py | 43 ++++++++++++-- .../tests/harnesses/test_factory.py | 57 ++++++++++++++++++- 4 files changed, 131 insertions(+), 18 deletions(-) diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py index e423783..b46f5dc 100644 --- a/llm_quest_benchmark/harnesses/factory.py +++ b/llm_quest_benchmark/harnesses/factory.py @@ -21,6 +21,22 @@ "planner": PlannerHarness, } +SPECIAL_HARNESSES = ("human", "random_choice", "random_choice_") + + +def _parse_random_choice_seed(identifier: str) -> tuple[bool, int | None]: + if identifier == "random_choice": + return True, None + prefix = "random_choice_" + if identifier.startswith(prefix) and identifier[len(prefix) :].isdigit(): + return True, int(identifier[len(prefix) :]) + return False, None + + +def is_random_choice_harness(identifier: str) -> bool: + is_random, _ = _parse_random_choice_seed(identifier) + return is_random + def create_harness( harness: str, @@ -31,18 +47,21 @@ def create_harness( compaction_interval: int = 50, system_template: str = "system_role.jinja", ) -> QuestPlayer: + valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] + is_random_harness, seed = _parse_random_choice_seed(harness) + if is_random_harness: + return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) + if harness.startswith("random_choice"): + raise ValueError(f"Unknown harness '{harness}'. 
Valid: {valid}") if harness == "human": return HumanPlayer(skip_single=skip_single) - if harness.startswith("random_choice"): - seed = None - if "_" in harness[13:]: - try: - seed = int(harness.split("_")[-1]) - except ValueError: - pass - return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) if harness not in HARNESS_REGISTRY: - raise ValueError(f"Unknown harness '{harness}'. Valid: {sorted(HARNESS_REGISTRY)}") + raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}") + is_random_model, seed = _parse_random_choice_seed(model) + if is_random_model: + return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) + if model.startswith("random_choice"): + raise ValueError(f"Unknown random_choice model '{model}'. Valid: {valid}") cls = HARNESS_REGISTRY[harness] return cls( model_name=model, diff --git a/llm_quest_benchmark/schemas/__init__.py b/llm_quest_benchmark/schemas/__init__.py index 34fee08..0cb4242 100644 --- a/llm_quest_benchmark/schemas/__init__.py +++ b/llm_quest_benchmark/schemas/__init__.py @@ -1,9 +1,17 @@ """Schema exports for LLM Quest Benchmark""" -__all__ = ["QMState", "AgentState", "LLMResponse", "QMBridgeState", "BenchmarkConfig", "AgentConfig"] +__all__ = [ + "QMState", + "AgentState", + "LLMResponse", + "QMBridgeState", + "BenchmarkConfig", + "HarnessConfig", + "AgentConfig", +] # Import directly from the schema modules using relative imports from .bridge import QMBridgeState -from .config import AgentConfig, BenchmarkConfig +from .config import AgentConfig, BenchmarkConfig, HarnessConfig from .response import LLMResponse from .state import AgentState, QMState diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py index 74799bd..c658729 100644 --- a/llm_quest_benchmark/schemas/config.py +++ b/llm_quest_benchmark/schemas/config.py @@ -68,12 +68,45 @@ class HarnessConfig: benchmark_id: str | None = None compaction_interval: int = 50 + def __init__( + self, + model: str = DEFAULT_MODEL, 
+ system_template: str = SYSTEM_ROLE_TEMPLATE, + harness: str = "reasoning_recent", + temperature: float = DEFAULT_TEMPERATURE, + runs: int = 1, + skip_single: bool = False, + debug: bool = False, + benchmark_id: str | None = None, + compaction_interval: int = 50, + **legacy_keys, + ): + if "template" in legacy_keys or "action_template" in legacy_keys: + raise ValueError("Use harness: key instead of template:") + if "memory_mode" in legacy_keys: + raise ValueError("Use harness: key instead of memory_mode:") + if legacy_keys: + unexpected = ", ".join(sorted(legacy_keys)) + raise TypeError(f"Unexpected HarnessConfig key(s): {unexpected}") + + self.model = model + self.system_template = system_template + self.harness = harness + self.temperature = temperature + self.runs = runs + self.skip_single = skip_single + self.debug = debug + self.benchmark_id = benchmark_id + self.compaction_interval = compaction_interval + self.__post_init__() + def __post_init__(self): self.system_template = normalize_template_name(self.system_template) - from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY + from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, SPECIAL_HARNESSES, is_random_choice_harness - if self.harness not in HARNESS_REGISTRY: - raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {sorted(HARNESS_REGISTRY)}") + if self.harness not in HARNESS_REGISTRY and self.harness != "human" and not is_random_choice_harness(self.harness): + valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] + raise ValueError(f"Invalid harness: {self.harness}. 
Supported harnesses: {valid}") if not (0.0 <= self.temperature <= 2.0): raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}") if self.runs < 1: @@ -180,9 +213,9 @@ def from_yaml(cls, yaml_path: str) -> "BenchmarkConfig": agents = [] for agent in data["agents"]: if "template" in agent: - raise ValueError("Use 'harness:' instead of 'template:'") + raise ValueError("Use harness: key instead of template:") if "memory_mode" in agent: - raise ValueError("Use 'harness:' instead of 'memory_mode:'") + raise ValueError("Use harness: key instead of memory_mode:") agents.append(HarnessConfig(**agent)) data["agents"] = agents diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py index 5b7bfa0..187f4d3 100644 --- a/llm_quest_benchmark/tests/harnesses/test_factory.py +++ b/llm_quest_benchmark/tests/harnesses/test_factory.py @@ -33,11 +33,28 @@ def test_create_random_choice_harness(): assert isinstance(harness, RandomAgent) +def test_create_seeded_random_choice_harness(): + harness = create_harness("random_choice_123") + + assert isinstance(harness, RandomAgent) + assert harness.agent_id == "random_123" + + def test_create_bad_harness_name_raises(): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="minimal"): create_harness("bad_name", model="gpt-5-mini") +def test_create_bad_random_choice_seed_raises(): + with pytest.raises(ValueError, match="random_choice_"): + create_harness("random_choice_bad") + + +def test_random_choice_model_does_not_hide_bad_harness(): + with pytest.raises(ValueError, match="bad_name"): + create_harness("bad_name", model="random_choice_123") + + def test_harness_config_stable_harness_id(): config = HarnessConfig(harness="memo_compact", model="gpt-5-mini") @@ -45,6 +62,22 @@ def test_harness_config_stable_harness_id(): assert config.harness_id == HarnessConfig(harness="memo_compact", model="gpt-5-mini").harness_id +def 
test_harness_config_allows_seeded_random_choice_harness(): + config = HarnessConfig(harness="random_choice_123", model="gpt-5-mini") + + assert config.harness == "random_choice_123" + + +def test_harness_config_rejects_old_template_key(): + with pytest.raises(ValueError, match="Use harness: key instead of template:"): + HarnessConfig(model="gpt-5-mini", template="reasoning.jinja") + + +def test_harness_config_rejects_old_memory_mode_key(): + with pytest.raises(ValueError, match="Use harness: key instead of memory_mode:"): + HarnessConfig(model="gpt-5-mini", harness="memo_compact", memory_mode="compaction") + + def test_benchmark_config_from_yaml_parses_harness(tmp_path): quest_path = tmp_path / "quest.qm" quest_path.write_text("", encoding="utf-8") @@ -83,5 +116,25 @@ def test_benchmark_config_from_yaml_rejects_template(tmp_path): encoding="utf-8", ) - with pytest.raises(ValueError, match="Use 'harness:' instead of 'template:'"): + with pytest.raises(ValueError, match="Use harness: key instead of template:"): + BenchmarkConfig.from_yaml(str(config_path)) + + +def test_benchmark_config_from_yaml_rejects_memory_mode(tmp_path): + quest_path = tmp_path / "quest.qm" + quest_path.write_text("", encoding="utf-8") + config_path = tmp_path / "benchmark.yaml" + config_path.write_text( + f""" +quests: + - {quest_path} +agents: + - model: gpt-5-mini + harness: memo_compact + memory_mode: compaction +""", + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="Use harness: key instead of memory_mode:"): BenchmarkConfig.from_yaml(str(config_path)) From eeac9f355398267adf727649816100f351434d37 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 14:42:43 +0400 Subject: [PATCH 06/24] harnesses: wire CLI, benchmark, YAML configs Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- configs/benchmarks/exp3_no_loop_breaker.yaml | 3 +- 
configs/benchmarks/exp3_stateful_compact.yaml | 3 +- .../benchmarks/exp4_compaction_no_memo.yaml | 3 +- configs/benchmarks/exp4_memo_cot.yaml | 3 +- configs/benchmarks/exp4_memo_extended.yaml | 3 +- configs/benchmarks/exp4_memo_structured.yaml | 3 +- .../exp5_stateful_compact_variance.yaml | 3 +- configs/benchmarks/exp6_prompt_hints.yaml | 3 +- configs/benchmarks/exp6_tools.yaml | 3 +- configs/benchmarks/exp6_tools_hints.yaml | 3 +- .../benchmarks/exp6_unified_tools_screen.yaml | 3 +- configs/benchmarks/exp7_deepseek.yaml | 3 +- configs/benchmarks/exp7_haiku.yaml | 3 +- configs/benchmarks/exp7_llama.yaml | 3 +- configs/benchmarks/exp7_mistral.yaml | 3 +- configs/benchmarks/exp7_qwen.yaml | 3 +- configs/benchmarks/exp7b_model_upgrades.yaml | 9 ++--- configs/benchmarks/memory_compaction.yaml | 18 +++------ .../benchmarks/memory_full_transcript.yaml | 9 ++--- configs/benchmarks/memory_modes_pilot.yaml | 12 ++---- configs/benchmarks/openrouter_smoke_test.yaml | 10 ++--- llm_quest_benchmark/executors/benchmark.py | 39 ++++++++++++++----- llm_quest_benchmark/executors/cli/commands.py | 31 +++++++++------ 23 files changed, 87 insertions(+), 89 deletions(-) diff --git a/configs/benchmarks/exp3_no_loop_breaker.yaml b/configs/benchmarks/exp3_no_loop_breaker.yaml index 64240fe..57e7124 100644 --- a/configs/benchmarks/exp3_no_loop_breaker.yaml +++ b/configs/benchmarks/exp3_no_loop_breaker.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 2 - memory_mode: full_transcript debug: false quest_timeout: 600 max_workers: 2 diff --git a/configs/benchmarks/exp3_stateful_compact.yaml b/configs/benchmarks/exp3_stateful_compact.yaml index b43fc6b..bb9973c 100644 --- a/configs/benchmarks/exp3_stateful_compact.yaml +++ b/configs/benchmarks/exp3_stateful_compact.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm 
agents: - model: "openrouter:google/gemini-3-flash-preview" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp4_compaction_no_memo.yaml b/configs/benchmarks/exp4_compaction_no_memo.yaml index 5ef4130..896dd60 100644 --- a/configs/benchmarks/exp4_compaction_no_memo.yaml +++ b/configs/benchmarks/exp4_compaction_no_memo.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp4_memo_cot.yaml b/configs/benchmarks/exp4_memo_cot.yaml index fe97bca..9bfe382 100644 --- a/configs/benchmarks/exp4_memo_cot.yaml +++ b/configs/benchmarks/exp4_memo_cot.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: memo_cot + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp4_memo_extended.yaml b/configs/benchmarks/exp4_memo_extended.yaml index 66d1bf4..25e5620 100644 --- a/configs/benchmarks/exp4_memo_extended.yaml +++ b/configs/benchmarks/exp4_memo_extended.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: memo_extended + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp4_memo_structured.yaml b/configs/benchmarks/exp4_memo_structured.yaml index 83502c7..96e5daf 100644 --- a/configs/benchmarks/exp4_memo_structured.yaml +++ b/configs/benchmarks/exp4_memo_structured.yaml 
@@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: memo_structured + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp5_stateful_compact_variance.yaml b/configs/benchmarks/exp5_stateful_compact_variance.yaml index 6f99f29..89cc80b 100644 --- a/configs/benchmarks/exp5_stateful_compact_variance.yaml +++ b/configs/benchmarks/exp5_stateful_compact_variance.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 5 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp6_prompt_hints.yaml b/configs/benchmarks/exp6_prompt_hints.yaml index 098b1db..4c70e61 100644 --- a/configs/benchmarks/exp6_prompt_hints.yaml +++ b/configs/benchmarks/exp6_prompt_hints.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: stateful_compact_hints + harness: hinted_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp6_tools.yaml b/configs/benchmarks/exp6_tools.yaml index 8630bb0..b254005 100644 --- a/configs/benchmarks/exp6_tools.yaml +++ b/configs/benchmarks/exp6_tools.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: tool_augmented + harness: tool_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp6_tools_hints.yaml b/configs/benchmarks/exp6_tools_hints.yaml index b7949fc..0c0c3b6 100644 --- 
a/configs/benchmarks/exp6_tools_hints.yaml +++ b/configs/benchmarks/exp6_tools_hints.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: tool_augmented_hints + harness: tool_hinted temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp6_unified_tools_screen.yaml b/configs/benchmarks/exp6_unified_tools_screen.yaml index 0c43290..b80f8c0 100644 --- a/configs/benchmarks/exp6_unified_tools_screen.yaml +++ b/configs/benchmarks/exp6_unified_tools_screen.yaml @@ -24,10 +24,9 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - template: tool_augmented + harness: tool_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_deepseek.yaml b/configs/benchmarks/exp7_deepseek.yaml index 1b82664..6971569 100644 --- a/configs/benchmarks/exp7_deepseek.yaml +++ b/configs/benchmarks/exp7_deepseek.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "openrouter:deepseek/deepseek-chat-v3-0324" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_haiku.yaml b/configs/benchmarks/exp7_haiku.yaml index 72cd6c2..8546c80 100644 --- a/configs/benchmarks/exp7_haiku.yaml +++ b/configs/benchmarks/exp7_haiku.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "anthropic:claude-3-5-haiku-latest" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_llama.yaml b/configs/benchmarks/exp7_llama.yaml index 
27eda5a..61e156c 100644 --- a/configs/benchmarks/exp7_llama.yaml +++ b/configs/benchmarks/exp7_llama.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "openrouter:meta-llama/llama-4-scout" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_mistral.yaml b/configs/benchmarks/exp7_mistral.yaml index 76f1a40..f570882 100644 --- a/configs/benchmarks/exp7_mistral.yaml +++ b/configs/benchmarks/exp7_mistral.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "openrouter:mistralai/mistral-small-2603" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7_qwen.yaml b/configs/benchmarks/exp7_qwen.yaml index 572d7a6..27496cc 100644 --- a/configs/benchmarks/exp7_qwen.yaml +++ b/configs/benchmarks/exp7_qwen.yaml @@ -7,10 +7,9 @@ quests: - quests/sr_2_1_2121_eng/Robots_eng.qm agents: - model: "openrouter:qwen/qwen3-30b-a3b" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/exp7b_model_upgrades.yaml b/configs/benchmarks/exp7b_model_upgrades.yaml index 4c35c8b..22da91b 100644 --- a/configs/benchmarks/exp7b_model_upgrades.yaml +++ b/configs/benchmarks/exp7b_model_upgrades.yaml @@ -20,22 +20,19 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:deepseek/deepseek-v4-flash" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 - model: "openrouter:qwen/qwen3.6-flash" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 
50 - model: "claude:claude-haiku-4-5-20251001" - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 2 - memory_mode: compaction compaction_interval: 50 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/memory_compaction.yaml b/configs/benchmarks/memory_compaction.yaml index 1bb10a8..c403665 100644 --- a/configs/benchmarks/memory_compaction.yaml +++ b/configs/benchmarks/memory_compaction.yaml @@ -18,45 +18,39 @@ quests: agents: # Gemini 3 Flash - compaction interval 10 - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 10 # Gemini 3 Flash - compaction interval 20 - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 20 # GPT-5.4 Mini - compaction interval 10 - model: "openrouter:openai/gpt-5.4-mini" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 10 # GPT-5.4 Mini - compaction interval 20 - model: "openrouter:openai/gpt-5.4-mini" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 20 # DeepSeek V3.2 - compaction interval 10 - model: "openrouter:deepseek/deepseek-v3.2" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 10 # DeepSeek V3.2 - compaction interval 20 - model: "openrouter:deepseek/deepseek-v3.2" - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 20 debug: false quest_timeout: 600 diff --git a/configs/benchmarks/memory_full_transcript.yaml b/configs/benchmarks/memory_full_transcript.yaml index 04ad152..9fc82a4 100644 --- a/configs/benchmarks/memory_full_transcript.yaml +++ b/configs/benchmarks/memory_full_transcript.yaml @@ 
-18,22 +18,19 @@ quests: agents: # Gemini 3 Flash - full transcript - model: "openrouter:google/gemini-3-flash-preview" - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript # GPT-5.4 Mini - full transcript - model: "openrouter:openai/gpt-5.4-mini" - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript # DeepSeek V3.2 - full transcript - model: "openrouter:deepseek/deepseek-v3.2" - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript debug: false quest_timeout: 600 max_workers: 2 diff --git a/configs/benchmarks/memory_modes_pilot.yaml b/configs/benchmarks/memory_modes_pilot.yaml index 2e4d862..db6aa23 100644 --- a/configs/benchmarks/memory_modes_pilot.yaml +++ b/configs/benchmarks/memory_modes_pilot.yaml @@ -5,31 +5,27 @@ quests: agents: # Short-context reasoning - default memory (3 obs, 5 decisions) - model: openrouter:google/gemini-3-flash-preview - template: reasoning + harness: reasoning_recent temperature: 0.4 runs: 3 - memory_mode: default # Short-context reasoning - loop-aware template - model: openrouter:google/gemini-3-flash-preview - template: loop_aware_reasoning + harness: reasoning_recent temperature: 0.4 runs: 3 - memory_mode: default # Full-history reasoning - model: openrouter:google/gemini-3-flash-preview - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript # Compact memory / memo (compact every 10 steps) - model: openrouter:google/gemini-3-flash-preview - template: reasoning + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 10 debug: false diff --git a/configs/benchmarks/openrouter_smoke_test.yaml b/configs/benchmarks/openrouter_smoke_test.yaml index 6194df3..2fb50be 100644 --- a/configs/benchmarks/openrouter_smoke_test.yaml +++ b/configs/benchmarks/openrouter_smoke_test.yaml @@ -3,23 +3,23 @@ quests: - 
quests/Boat.qm agents: - model: "openrouter:anthropic/claude-sonnet-4-6" - template: stub + harness: minimal temperature: 0.4 runs: 1 - model: "openrouter:openai/gpt-5.4-mini" - template: stub + harness: minimal temperature: 0.4 runs: 1 - model: "openrouter:google/gemini-2.5-flash" - template: stub + harness: minimal temperature: 0.4 runs: 1 - model: "openrouter:deepseek/deepseek-chat" - template: stub + harness: minimal temperature: 0.4 runs: 1 - model: "openrouter:qwen/qwen3-235b-a22b" - template: stub + harness: minimal temperature: 0.4 runs: 1 debug: false diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py index 0b78062..69e7d22 100644 --- a/llm_quest_benchmark/executors/benchmark.py +++ b/llm_quest_benchmark/executors/benchmark.py @@ -12,10 +12,10 @@ from pathlib import Path from typing import Any -from llm_quest_benchmark.agents.agent_factory import create_agent from llm_quest_benchmark.core.logging import DEFAULT_DB_PATH from llm_quest_benchmark.core.runner import run_quest_with_timeout from llm_quest_benchmark.environments.state import QuestOutcome +from llm_quest_benchmark.harnesses.factory import create_harness from llm_quest_benchmark.llm import tracing from llm_quest_benchmark.schemas.config import BenchmarkConfig @@ -34,6 +34,28 @@ logger = logging.getLogger(__name__) +def _agent_harness(agent_config) -> str: + """Return harness name for new configs, with legacy AgentConfig fallback.""" + if hasattr(agent_config, "harness"): + return agent_config.harness + + template = getattr(agent_config, "action_template", "reasoning.jinja") + memory_mode = getattr(agent_config, "memory_mode", "default") + template = template.removesuffix(".jinja") + legacy_mapping = { + ("stub", "default"): "minimal", + ("reasoning", "default"): "reasoning_recent", + ("reasoning", "full_transcript"): "reasoning_full", + ("reasoning", "compaction"): "memo_compact", + ("stateful_compact", "compaction"): "memo_compact", + 
("stateful_compact_hints", "compaction"): "hinted_compact", + ("tool_augmented", "compaction"): "tool_compact", + ("tool_augmented_hints", "compaction"): "tool_hinted", + ("planner", "compaction"): "planner", + } + return legacy_mapping.get((template, memory_mode), "reasoning_recent") + + def _result_entry( quest: str, agent_config, @@ -46,8 +68,8 @@ def _result_entry( "quest": quest, "model": agent_config.model, "temperature": agent_config.temperature, - "template": agent_config.action_template, - "agent_id": agent_config.agent_id, + "harness": _agent_harness(agent_config), + "agent_id": agent_config.harness_id if hasattr(agent_config, "harness_id") else agent_config.agent_id, "attempt": attempt, "outcome": outcome, "reward": reward, @@ -132,15 +154,14 @@ def callback(event: str, data: Any = None) -> None: ) try: - agent = create_agent( + agent = create_harness( + harness=_agent_harness(agent_config), model=agent_config.model, temperature=agent_config.temperature, - system_template=agent_config.system_template, - action_template=agent_config.action_template, skip_single=agent_config.skip_single, debug=agent_config.debug, - memory_mode=agent_config.memory_mode, compaction_interval=agent_config.compaction_interval, + system_template=agent_config.system_template, ) outcome = run_quest_with_timeout( quest, @@ -254,7 +275,7 @@ def _write_benchmark_artifacts(config: BenchmarkConfig, results: list[dict[str, "temperature": agent.temperature, "runs": agent.runs, "system_template": agent.system_template, - "action_template": agent.action_template, + "harness": _agent_harness(agent), } for agent in config.agents ], @@ -281,7 +302,7 @@ def _write_benchmark_artifacts(config: BenchmarkConfig, results: list[dict[str, { "model": agent.model, "system_template": agent.system_template, - "action_template": agent.action_template, + "harness": _agent_harness(agent), "temperature": agent.temperature, "runs": agent.runs, "skip_single": agent.skip_single, diff --git 
a/llm_quest_benchmark/executors/cli/commands.py b/llm_quest_benchmark/executors/cli/commands.py index e3cecbd..4b029bd 100644 --- a/llm_quest_benchmark/executors/cli/commands.py +++ b/llm_quest_benchmark/executors/cli/commands.py @@ -8,6 +8,7 @@ from pathlib import Path from typing import Any +import click from dotenv import load_dotenv # Initialize quest registry early @@ -18,13 +19,10 @@ import typer -from llm_quest_benchmark.agents.agent_factory import create_agent -from llm_quest_benchmark.agents.human_player import HumanPlayer from llm_quest_benchmark.constants import ( DEFAULT_MODEL, DEFAULT_QUEST, DEFAULT_TEMPERATURE, - DEFAULT_TEMPLATE, INFINITE_TIMEOUT, MODEL_CHOICES, SYSTEM_ROLE_TEMPLATE, @@ -40,9 +38,10 @@ print_summary, run_benchmark, ) +from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness from llm_quest_benchmark.llm import tracing from llm_quest_benchmark.renderers.terminal import RichRenderer -from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig +from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig # Initialize logging log_manager = LogManager() @@ -53,6 +52,8 @@ rich_markup_mode="rich", ) +HARNESS_CHOICES = list(HARNESS_REGISTRY.keys()) + def version_callback(value: bool): if value: @@ -348,7 +349,13 @@ def run( model: str = typer.Option(DEFAULT_MODEL, help=f"Model for the LLM agent (choices: {', '.join(MODEL_CHOICES)})."), temperature: float = typer.Option(DEFAULT_TEMPERATURE, help="Temperature for LLM sampling"), system_template: str = typer.Option(SYSTEM_ROLE_TEMPLATE, help="Template to use for system instructions."), - action_template: str = typer.Option(DEFAULT_TEMPLATE, help="Template to use for action prompts."), + harness: str = typer.Option( + "reasoning_recent", + "--harness", + help="Harness to use for quest decisions.", + click_type=click.Choice(HARNESS_CHOICES), + ), + compaction_interval: int = typer.Option(50, help="Advanced override for compaction 
interval."), timeout: int = typer.Option(60, help="Timeout in seconds for run (0 for no timeout)."), skip: bool = typer.Option(True, help="Auto-select single choices without asking agent."), debug: bool = typer.Option(False, help="Enable debug logging and output, remove terminal UI."), @@ -365,23 +372,25 @@ def run( log_manager.setup(debug) # Create agent config - agent_config = AgentConfig( + agent_config = HarnessConfig( model=model, system_template=system_template, - action_template=action_template, + harness=harness, temperature=temperature, skip_single=skip, debug=debug, + compaction_interval=compaction_interval, ) # Create agent - agent = create_agent( + agent = create_harness( + harness=harness, model=model, system_template=system_template, - action_template=action_template, temperature=temperature, skip_single=skip, debug=debug, + compaction_interval=compaction_interval, ) log.warning(f"Starting quest run with agent {str(agent)}") @@ -458,7 +467,7 @@ def play( log.debug(f"Quest file: {quest}") # Create interactive player - player = HumanPlayer(skip_single=skip, debug=debug) + player = create_harness(harness="human", skip_single=skip, debug=debug) # Run quest in interactive mode result = run_quest_with_timeout(quest_path=str(quest), agent=player, timeout=INFINITE_TIMEOUT, debug=debug) @@ -952,7 +961,7 @@ def benchmark( This command runs benchmark evaluation using a YAML configuration file that specifies: - quests: list of quest files or directories to test - - agents: list of agents with their model, template, and temperature settings + - agents: list of harnesses with their model, harness, and temperature settings - other settings: debug, timeout, workers, etc. 
Example: From 664452d3ca54d82ba4a146d8ae566280c8ee70ae Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 14:52:33 +0400 Subject: [PATCH 07/24] tests: migrate to harness API --- .../tests/agents/test_anthropic.py | 18 +- .../tests/agents/test_llm_agent.py | 104 +++--- .../tests/agents/test_mode_agents.py | 260 +------------- .../tests/harnesses/test_harnesses.py | 335 ++++++++++++++++++ .../tests/integration/test_mode_agents_e2e.py | 22 +- .../tests/integration/test_quest_e2e.py | 12 +- 6 files changed, 417 insertions(+), 334 deletions(-) create mode 100644 llm_quest_benchmark/tests/harnesses/test_harnesses.py diff --git a/llm_quest_benchmark/tests/agents/test_anthropic.py b/llm_quest_benchmark/tests/agents/test_anthropic.py index 5dd1f95..ba60f97 100644 --- a/llm_quest_benchmark/tests/agents/test_anthropic.py +++ b/llm_quest_benchmark/tests/agents/test_anthropic.py @@ -1,15 +1,15 @@ -"""Deterministic tests for Anthropic-backed agent behavior.""" +"""Deterministic tests for Anthropic-backed harness behavior.""" from unittest.mock import Mock, patch import pytest -from llm_quest_benchmark.agents.agent_factory import create_agent +from llm_quest_benchmark.harnesses.factory import create_harness @patch("llm_quest_benchmark.llm.client.anthropic.Anthropic") -def test_anthropic_agent_mocked_completion(mock_anthropic_cls): - """Agent should parse a mocked Anthropic completion without network calls.""" +def test_anthropic_harness_mocked_completion(mock_anthropic_cls): + """Harness should parse a mocked Anthropic completion without network calls.""" mock_client = Mock() mock_response = Mock() mock_block = Mock() @@ -18,15 +18,15 @@ def test_anthropic_agent_mocked_completion(mock_anthropic_cls): mock_client.messages.create.return_value = mock_response mock_anthropic_cls.return_value = mock_client - agent = create_agent("claude-sonnet-4-5") - action = agent.get_action("Test prompt", [{"text": "A"}, {"text": 
"B"}]) + harness = create_harness("minimal", model="claude-sonnet-4-5") + action = harness.get_action("Test prompt", [{"text": "A"}, {"text": "B"}]) assert action == 2 assert mock_client.messages.create.call_count == 1 -def test_anthropic_agent_empty_choices_raises(): +def test_anthropic_harness_empty_choices_raises(): """Base player contract should reject empty choices.""" - agent = create_agent("claude-sonnet-4-5") + harness = create_harness("minimal", model="claude-sonnet-4-5") with pytest.raises(ValueError, match="No choices provided"): - agent.get_action("Test prompt", []) + harness.get_action("Test prompt", []) diff --git a/llm_quest_benchmark/tests/agents/test_llm_agent.py b/llm_quest_benchmark/tests/agents/test_llm_agent.py index 06ff32f..1f3b99c 100644 --- a/llm_quest_benchmark/tests/agents/test_llm_agent.py +++ b/llm_quest_benchmark/tests/agents/test_llm_agent.py @@ -1,10 +1,11 @@ -"""Tests for LLM agent""" +"""Tests for the base LLM harness behavior.""" from unittest.mock import Mock, patch import pytest -from llm_quest_benchmark.agents.llm_agent import LLMAgent, parse_llm_response +from llm_quest_benchmark.harnesses.base import parse_llm_response +from llm_quest_benchmark.harnesses.minimal import MinimalHarness from llm_quest_benchmark.schemas.response import LLMResponse @@ -20,8 +21,8 @@ def example_choices(): @pytest.mark.timeout(5) # Quick unit test @patch("llm_quest_benchmark.llm.client.OpenAI") -def test_agent_basic_flow(mock_openai, monkeypatch): - """Test basic agent functionality with mocked LLM""" +def test_harness_basic_flow(mock_openai, monkeypatch): + """Test basic harness functionality with mocked LLM""" monkeypatch.setenv("OPENAI_API_KEY", "test-key") # Setup mock mock_chat = Mock() @@ -41,14 +42,14 @@ def test_agent_basic_flow(mock_openai, monkeypatch): observation = "You are at a trading station." 
choices = [{"id": "1", "text": "Talk to merchant"}, {"id": "2", "text": "Leave station"}] - # Create agent and test - agent = LLMAgent(model_name="gpt-5-mini") - result = agent.get_action(observation, choices) + # Create harness and test + harness = MinimalHarness(model_name="gpt-5-mini") + result = harness.get_action(observation, choices) # Verify results assert result == 1 # Expect an integer assert mock_chat.completions.create.call_count == 1 - last_response = agent.get_last_response() + last_response = harness.get_last_response() assert last_response.prompt_tokens == 9 assert last_response.completion_tokens == 2 assert last_response.total_tokens == 11 @@ -56,47 +57,47 @@ def test_agent_basic_flow(mock_openai, monkeypatch): def test_template_rendering(): """Test that templates are rendered correctly""" - agent = LLMAgent() + harness = MinimalHarness() observation = "Test observation" choices = [{"text": "Option 1"}, {"text": "Option 2"}] # Test that prompt is rendered correctly - prompt = agent.prompt_renderer.render_action_prompt(observation, choices) + prompt = harness.prompt_renderer.render_action_prompt(observation, choices) assert "Test observation" in prompt assert "Option 1" in prompt assert "Option 2" in prompt -def test_agent_initialization_without_api_key(monkeypatch): - """Agent construction should not require provider API keys before inference.""" +def test_harness_initialization_without_api_key(monkeypatch): + """Harness construction should not require provider API keys before inference.""" monkeypatch.delenv("OPENAI_API_KEY", raising=False) monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) - agent = LLMAgent(model_name="gpt-5-mini") - assert agent.llm is None + harness = MinimalHarness(model_name="gpt-5-mini") + assert harness.llm is None def test_gemini_prompt_uses_selected_template(): - agent = LLMAgent(model_name="gemini-2.5-flash") - prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}]) + harness = 
MinimalHarness(model_name="gemini-2.5-flash", action_template="reasoning.jinja") + prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) assert "Return ONLY valid JSON" in prompt assert "A" in prompt assert "B" in prompt def test_non_gemini_prompt_uses_selected_template(): - agent = LLMAgent(model_name="gpt-5-mini", action_template="stub.jinja") - prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}]) + harness = MinimalHarness(model_name="gpt-5-mini", action_template="stub.jinja") + prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) assert "IMPORTANT: Please respond with ONLY a single number" in prompt def test_template_alias_without_suffix_is_supported(): - agent = LLMAgent(model_name="gpt-5-mini", action_template="reasoning") - prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}]) + harness = MinimalHarness(model_name="gpt-5-mini", action_template="reasoning") + prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) assert '"result"' in prompt def test_gpt5_force_numeric_retry_path(): - agent = LLMAgent(model_name="gpt-5-mini") + harness = MinimalHarness(model_name="gpt-5-mini") mocked_llm = Mock() mocked_llm.get_completion.side_effect = ["```json\n{", "```json\n{", "2"] mocked_llm.get_last_usage.side_effect = [ @@ -104,58 +105,57 @@ def test_gpt5_force_numeric_retry_path(): {"prompt_tokens": 6, "completion_tokens": 1, "total_tokens": 7, "estimated_cost_usd": 0.0005}, {"prompt_tokens": 4, "completion_tokens": 1, "total_tokens": 5, "estimated_cost_usd": 0.0003}, ] - agent.llm = mocked_llm + harness.llm = mocked_llm - action = agent.get_action("state", [{"text": "A"}, {"text": "B"}]) + action = harness.get_action("state", [{"text": "A"}, {"text": "B"}]) assert action == 2 assert mocked_llm.get_completion.call_count == 3 - last = agent.get_last_response() + last = harness.get_last_response() assert last.total_tokens == 24 assert last.estimated_cost_usd == 
pytest.approx(0.0018) assert last.parse_mode == "force_retry_number_only" def test_contextual_state_includes_previous_observations(): - agent = LLMAgent(model_name="gpt-5-mini") - agent._remember_observation("Previous hint") - agent._remember_observation("Current state") - contextual = agent._build_contextual_state("Current state") + harness = MinimalHarness(model_name="gpt-5-mini") + harness.memory_module.update({"observation": "Previous hint"}) + harness.memory_module.update({"observation": "Current state"}) + contextual = harness._build_contextual_state("Current state") assert "Recent context from previous steps" in contextual assert "Previous hint" in contextual def test_contextual_state_includes_recent_decisions(): - agent = LLMAgent(model_name="gpt-5-mini") - agent._decision_history = [ - {"action": 2, "choice": "Inspect the terminal", "parse_mode": "json_direct"}, - {"action": 1, "choice": "Ask for access", "parse_mode": "retry_json_repaired"}, - ] - contextual = agent._build_contextual_state("Current state") + harness = MinimalHarness(model_name="gpt-5-mini") + harness.memory_module.update({"observation": "Previous state"}) + harness.memory_module.update({"action": 2, "choice": "Inspect the terminal", "parse_mode": "json_direct"}) + harness.memory_module.update({"action": 1, "choice": "Ask for access", "parse_mode": "retry_json_repaired"}) + contextual = harness._build_contextual_state("Current state") assert "Recent selected actions" in contextual assert "Inspect the terminal" in contextual assert "parse=json_direct" in contextual def test_safety_filter_prefers_lower_risk_choice(): - agent = LLMAgent(model_name="gpt-5-mini") + harness = MinimalHarness(model_name="gpt-5-mini") choices = [ {"text": "Пойти в космопорт и улететь, чтобы завтра не позориться"}, {"text": "Постараться пройти мимо"}, ] - assert agent._apply_safety_filter(1, choices) == 2 + assert harness._apply_safety_filter(choices, 1) == 2 def test_get_last_response_uses_skip_single_result(): - 
agent = LLMAgent(model_name="gpt-5-mini", skip_single=True) - agent.history.append(LLMResponse(action=2, is_default=False)) - agent._last_response = LLMResponse(action=2, is_default=False) + harness = MinimalHarness(model_name="gpt-5-mini", skip_single=True) + harness.history.append(LLMResponse(action=2, is_default=False)) + harness._last_response = LLMResponse(action=2, is_default=False) - action = agent.get_action("state", [{"id": "1", "text": "Only option"}]) + action = harness.get_action("state", [{"id": "1", "text": "Only option"}]) assert action == 1 - assert agent.get_last_response().action == 1 - assert agent.get_last_response().reasoning == "auto_single_choice" + assert harness.get_last_response().action == 1 + assert harness.get_last_response().reasoning == "auto_single_choice" def test_parse_llm_response_number_only_tracks_parse_mode(): @@ -194,7 +194,7 @@ def test_parse_llm_response_uses_analysis_as_reasoning_when_truncated(): def test_llm_error_default_response_keeps_reasoning_marker(): - agent = LLMAgent(model_name="gemini-2.5-flash") + harness = MinimalHarness(model_name="gemini-2.5-flash") mocked_llm = Mock() mocked_llm.get_completion.side_effect = RuntimeError("provider returned empty message") mocked_llm.get_last_usage.return_value = { @@ -203,20 +203,20 @@ def test_llm_error_default_response_keeps_reasoning_marker(): "total_tokens": 0, "estimated_cost_usd": None, } - agent.llm = mocked_llm + harness.llm = mocked_llm - action = agent.get_action("state", [{"text": "A"}, {"text": "B"}]) + action = harness.get_action("state", [{"text": "A"}, {"text": "B"}]) assert action == 1 - last = agent.get_last_response() + last = harness.get_last_response() assert last.is_default is True assert last.reasoning is not None assert "llm_call_error" in last.reasoning def test_retry_prompt_requests_json_payload(): - agent = LLMAgent(model_name="gemini-2.5-flash") - prompt = agent._format_retry_prompt("state", [{"text": "A"}, {"text": "B"}]) + harness = 
MinimalHarness(model_name="gemini-2.5-flash") + prompt = harness._format_retry_prompt("state", [{"text": "A"}, {"text": "B"}]) assert "Return valid JSON only" in prompt assert '"analysis"' in prompt assert '"reasoning"' in prompt @@ -224,7 +224,7 @@ def test_retry_prompt_requests_json_payload(): def test_retry_preserves_reasoning_from_first_attempt(): - agent = LLMAgent(model_name="gemini-2.5-flash") + harness = MinimalHarness(model_name="gemini-2.5-flash") mocked_llm = Mock() mocked_llm.get_completion.side_effect = [ "Analysis: low oxygen\nReasoning: safer move first\n```json\n{", @@ -244,12 +244,12 @@ def test_retry_preserves_reasoning_from_first_attempt(): "estimated_cost_usd": 0.0002, }, ] - agent.llm = mocked_llm + harness.llm = mocked_llm - action = agent.get_action("state", [{"text": "A"}, {"text": "B"}]) + action = harness.get_action("state", [{"text": "A"}, {"text": "B"}]) assert action == 2 - last = agent.get_last_response() + last = harness.get_last_response() assert last.analysis is not None assert "low oxygen" in last.analysis assert last.reasoning is not None diff --git a/llm_quest_benchmark/tests/agents/test_mode_agents.py b/llm_quest_benchmark/tests/agents/test_mode_agents.py index c650127..a41a11a 100644 --- a/llm_quest_benchmark/tests/agents/test_mode_agents.py +++ b/llm_quest_benchmark/tests/agents/test_mode_agents.py @@ -1,257 +1,5 @@ -"""Tests for planner and tool-augmented agent modes.""" +"""Legacy agent-mode tests retired. 
-from unittest.mock import Mock - -from llm_quest_benchmark.agents.agent_factory import create_agent -from llm_quest_benchmark.agents.llm_agent import LLMAgent -from llm_quest_benchmark.agents.planner_agent import PlannerAgent -from llm_quest_benchmark.agents.tool_agent import ToolAgent - - -def test_create_agent_uses_planner_template_alias(): - agent = create_agent(model="gpt-5-mini", action_template="planner") - assert isinstance(agent, PlannerAgent) - - -def test_create_agent_uses_tool_template_alias(): - agent = create_agent(model="gpt-5-mini", action_template="tool_augmented") - assert isinstance(agent, ToolAgent) - - -def test_create_agent_propagates_memory_mode_to_planner_and_tool_agents(): - planner = create_agent( - model="gpt-5-mini", - action_template="planner", - memory_mode="compaction", - compaction_interval=50, - ) - tool = create_agent( - model="gpt-5-mini", - action_template="tool_augmented", - memory_mode="compaction", - compaction_interval=50, - ) - - assert isinstance(planner, PlannerAgent) - assert isinstance(tool, ToolAgent) - assert planner._memory_mode == "compaction" - assert planner._compaction_interval == 50 - assert tool._memory_mode == "compaction" - assert tool._compaction_interval == 50 - - -def test_create_agent_uses_light_hints_template_with_standard_llm_agent(): - agent = create_agent(model="gpt-5-mini", action_template="light_hints") - assert isinstance(agent, LLMAgent) - assert not isinstance(agent, (PlannerAgent, ToolAgent)) - - -def test_light_hints_template_injects_general_mechanics(): - agent = LLMAgent(model_name="gpt-5-mini", action_template="light_hints") - - prompt = agent._format_prompt("A sealed vault blocks the route.", [{"text": "Study the vault"}]) - - assert "General hints for this type of quest" in prompt - assert "Preparation, study, negotiation" in prompt - - -def test_planner_agent_first_turn_generates_plan_then_acts(): - agent = PlannerAgent(model_name="gpt-5-mini") - mocked_llm = Mock() - 
mocked_llm.get_completion.side_effect = [ - "Gather clues first. Avoid direct fights. Preserve resources.", - '{"analysis":"plan says scout","reasoning":"safer branch","result":2}', - ] - mocked_llm.get_last_usage.side_effect = [ - {"prompt_tokens": 30, "completion_tokens": 12, "total_tokens": 42, "estimated_cost_usd": 0.001}, - {"prompt_tokens": 20, "completion_tokens": 8, "total_tokens": 28, "estimated_cost_usd": 0.0007}, - ] - agent.llm = mocked_llm - - action = agent.get_action("You enter a pirate station.", [{"text": "Scout ahead"}, {"text": "Attack now"}]) - - assert action == 2 - assert agent.current_plan is not None - assert "Avoid direct fights" in agent.current_plan - assert mocked_llm.get_completion.call_count == 2 - assert agent.get_last_response().total_tokens == 70 - - -def test_planner_agent_reuses_plan_when_state_is_stable(): - agent = PlannerAgent(model_name="gpt-5-mini") - agent.current_plan = "Keep moving carefully and avoid a direct fight." - agent._observation_history = ["Quiet corridor."] - mocked_llm = Mock() - mocked_llm.get_completion.return_value = '{"analysis":"plan still fits","reasoning":"careful progress","result":1}' - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 18, - "completion_tokens": 7, - "total_tokens": 25, - "estimated_cost_usd": 0.0005, - } - agent.llm = mocked_llm - - action = agent.get_action("Quiet corridor.", [{"text": "Open the door"}, {"text": "Run"}]) - - assert action == 1 - assert mocked_llm.get_completion.call_count == 1 - - -def test_planner_agent_uses_contextual_memory_state(): - agent = PlannerAgent(model_name="gpt-5-mini", memory_mode="compaction", compaction_interval=50) - agent._quest_briefing = "Original mission: win the election." 
- agent._transcript = [ - { - "step": 1, - "observation": "You learned Maloqs value strength.", - "choice_text": "Ask about Maloqs", - "memo": "Maloqs value strength", - "action": 1, - } - ] - agent._steps_since_compaction = 1 - mocked_llm = Mock() - mocked_llm.get_completion.side_effect = [ - "Use the remembered cultural clue.", - '{"analysis":"use clue","reasoning":"fits plan","result":1}', - ] - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 1, - "completion_tokens": 1, - "total_tokens": 2, - "estimated_cost_usd": 0.0, - } - agent.llm = mocked_llm - - agent.get_action("Current banquet scene.", [{"text": "Greet like a warrior"}]) - - first_prompt = mocked_llm.get_completion.call_args_list[0].args[0] - assert "Quest briefing" in first_prompt - assert "RECENT STEPS" in first_prompt - assert "Maloqs value strength" in first_prompt - - -def test_tool_agent_can_use_quest_history(): - agent = ToolAgent(model_name="gpt-5-mini") - agent._step_log = [ - { - "step": 1, - "observation": "Merchant mentioned low fuel.", - "choices": ["Buy fuel", "Keep flying"], - "selected_choice": "Buy fuel", - } - ] - mocked_llm = Mock() - mocked_llm.get_completion.side_effect = [ - '{"analysis":"need history","tool_calls":[{"tool":"quest_history","input":"fuel merchant"}],"result":null}', - '{"analysis":"fuel clue matters","reasoning":"play safe","result":1}', - ] - mocked_llm.get_last_usage.side_effect = [ - {"prompt_tokens": 24, "completion_tokens": 10, "total_tokens": 34, "estimated_cost_usd": 0.0008}, - {"prompt_tokens": 22, "completion_tokens": 9, "total_tokens": 31, "estimated_cost_usd": 0.0007}, - ] - agent.llm = mocked_llm - - action = agent.get_action("Your fuel gauge is blinking.", [{"text": "Refuel"}, {"text": "Attack pirates"}]) - - assert action == 1 - assert mocked_llm.get_completion.call_count == 2 - assert agent.get_last_response().total_tokens == 65 - assert len(agent._step_log) == 2 - - -def test_tool_agent_calculator_supports_arithmetic_and_comparisons(): 
- assert ToolAgent.calculator("55 + 12 - 5") == "55 + 12 - 5 = 62" - assert ToolAgent.calculator("60 >= 55 and 62 >= 80") == "60 >= 55 and 62 >= 80 = False" - assert ToolAgent.calculator("__import__('os')").startswith("error:") - - -def test_tool_agent_scratchpad_read_write_and_reset(): - agent = ToolAgent(model_name="gpt-5-mini") - - assert agent.scratchpad("read") == "(empty)" - assert ( - agent.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") == "updated: Board: W B _ ; failed door 2" - ) - assert agent.scratchpad("read") == "Board: W B _ ; failed door 2" - - agent.reset() - - assert agent.scratchpad("read") == "(empty)" - - -def test_tool_agent_can_use_calculator_and_records_tool_metadata(): - agent = ToolAgent(model_name="gpt-5-mini") - mocked_llm = Mock() - mocked_llm.get_completion.side_effect = [ - '{"memo":"Need mix math","analysis":"calculate target","tool_calls":[{"tool":"calculator","input":"50 + 3 >= 55"}],"result":null}', - '{"memo":"Need more strength","analysis":"math failed","reasoning":"choose strength","result":2}', - ] - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 10, - "completion_tokens": 5, - "total_tokens": 15, - "estimated_cost_usd": 0.0, - } - agent.llm = mocked_llm - - action = agent.get_action("Strength is 50. Need at least 55.", [{"text": "Add water"}, {"text": "Add repusator"}]) - - response = agent.get_last_response() - assert action == 2 - assert response.tool_calls == [{"tool": "calculator", "input": "50 + 3 >= 55", "operation": "", "content": ""}] - assert response.tool_results == ["calculator(50 + 3 >= 55) => 50 + 3 >= 55 = False"] - assert response.memo == "Need more strength" - - -def test_tool_agent_uses_contextual_memory_state(): - agent = ToolAgent(model_name="gpt-5-mini", memory_mode="compaction", compaction_interval=50) - agent._quest_briefing = "Original mission: pass pilot certification." 
- agent._transcript = [ - { - "step": 1, - "observation": "Hogger is greedy.", - "choice_text": "Bribe Hogger", - "memo": "Hogger is greedy", - "action": 1, - } - ] - agent._steps_since_compaction = 1 - mocked_llm = Mock() - mocked_llm.get_completion.return_value = ( - '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}' - ) - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 10, - "completion_tokens": 5, - "total_tokens": 15, - "estimated_cost_usd": 0.0, - } - agent.llm = mocked_llm - - agent.get_action("Current exam room.", [{"text": "Offer a bribe"}]) - - prompt = mocked_llm.get_completion.call_args.args[0] - assert "Quest briefing" in prompt - assert "RECENT STEPS" in prompt - assert "Hogger is greedy" in prompt - - -def test_tool_agent_can_finish_without_tools_in_one_call(): - agent = ToolAgent(model_name="gpt-5-mini") - mocked_llm = Mock() - mocked_llm.get_completion.return_value = ( - '{"analysis":"no tools needed","tool_calls":[],"reasoning":"direct clue","result":2}' - ) - mocked_llm.get_last_usage.return_value = { - "prompt_tokens": 15, - "completion_tokens": 6, - "total_tokens": 21, - "estimated_cost_usd": 0.0004, - } - agent.llm = mocked_llm - - action = agent.get_action("A guard points at the safe exit.", [{"text": "Fight"}, {"text": "Leave"}]) - - assert action == 2 - assert mocked_llm.get_completion.call_count == 1 +Planner/tool/memo behavior now lives in +``llm_quest_benchmark.tests.harnesses.test_harnesses``. 
+""" diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py new file mode 100644 index 0000000..030648b --- /dev/null +++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py @@ -0,0 +1,335 @@ +"""Comprehensive tests for concrete harness behavior.""" + +from unittest.mock import Mock + +from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness +from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory +from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness +from llm_quest_benchmark.harnesses.minimal import MinimalHarness +from llm_quest_benchmark.harnesses.planner import PlannerHarness +from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness +from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness + + +HARNESS_SPECS = { + "minimal": (MinimalHarness, "stub.jinja", DefaultMemory), + "reasoning_recent": (ReasoningRecentHarness, "reasoning.jinja", DefaultMemory), + "reasoning_full": (ReasoningFullTranscriptHarness, "reasoning.jinja", FullTranscriptMemory), + "memo_compact": (MemoCompactHarness, "stateful_compact.jinja", CompactionMemory), + "hinted_compact": (HintedCompactHarness, "stateful_compact_hints.jinja", CompactionMemory), + "tool_compact": (ToolCompactHarness, "tool_augmented.jinja", CompactionMemory), + "tool_hinted": (ToolHintedHarness, "tool_augmented_hints.jinja", CompactionMemory), + "planner": (PlannerHarness, "planner.jinja", CompactionMemory), +} + + +def assert_harness_configuration(harness_name: str) -> None: + expected_class, expected_template, expected_memory_class = HARNESS_SPECS[harness_name] + + harness = create_harness(harness_name, model="gpt-5-mini") + + assert isinstance(harness, expected_class) + assert harness.harness_name == harness_name + assert harness.action_template == 
expected_template + assert isinstance(harness.memory_module, expected_memory_class) + + +def test_minimal_harness_configuration(): + assert_harness_configuration("minimal") + + +def test_reasoning_recent_harness_configuration(): + assert_harness_configuration("reasoning_recent") + + +def test_reasoning_full_harness_configuration(): + assert_harness_configuration("reasoning_full") + + +def test_memo_compact_harness_configuration(): + assert_harness_configuration("memo_compact") + + +def test_hinted_compact_harness_configuration(): + assert_harness_configuration("hinted_compact") + + +def test_tool_compact_harness_configuration(): + assert_harness_configuration("tool_compact") + + +def test_tool_hinted_harness_configuration(): + assert_harness_configuration("tool_hinted") + + +def test_planner_harness_configuration(): + assert_harness_configuration("planner") + + +def test_all_registry_harnesses_have_configuration_specs(): + assert set(HARNESS_REGISTRY) == set(HARNESS_SPECS) + + +def test_all_registry_harnesses_instantiate_with_expected_names(): + for harness_name in HARNESS_REGISTRY: + harness = create_harness(harness_name, model="gpt-5-mini") + + assert harness.harness_name == harness_name + + +def test_memo_compact_mocked_llm_returns_action_and_reuses_memo_context(): + harness = MemoCompactHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + '{"memo":"Merchant needs fuel payment","analysis":"pay first","reasoning":"quest clue","result":2}', + '{"memo":"Paid fuel merchant","analysis":"memo says paid","reasoning":"continue","result":1}', + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + first_action = harness.get_action("A merchant offers fuel for a fee.", [{"text": "Leave"}, {"text": "Pay"}]) + second_action = harness.get_action("The fuel gauge still blinks.", [{"text": "Check 
receipt"}, {"text": "Leave"}]) + + assert first_action == 2 + assert second_action == 1 + assert harness.get_last_response().memo == "Paid fuel merchant" + second_prompt = mocked_llm.get_completion.call_args_list[1].args[0] + assert "Merchant needs fuel payment" in second_prompt + + +def test_planner_harness_first_turn_generates_plan_then_acts(): + harness = PlannerHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + "Gather clues first. Avoid direct fights. Preserve resources.", + '{"analysis":"plan says scout","reasoning":"safer branch","result":2}', + ] + mocked_llm.get_last_usage.side_effect = [ + {"prompt_tokens": 30, "completion_tokens": 12, "total_tokens": 42, "estimated_cost_usd": 0.001}, + {"prompt_tokens": 20, "completion_tokens": 8, "total_tokens": 28, "estimated_cost_usd": 0.0007}, + ] + harness.llm = mocked_llm + + action = harness.get_action("You enter a pirate station.", [{"text": "Scout ahead"}, {"text": "Attack now"}]) + + assert action == 2 + assert harness.current_plan is not None + assert "Avoid direct fights" in harness.current_plan + assert mocked_llm.get_completion.call_count == 2 + assert harness.get_last_response().total_tokens == 70 + + +def test_planner_harness_reuses_plan_when_state_is_stable(): + harness = PlannerHarness(model_name="gpt-5-mini") + harness.current_plan = "Keep moving carefully and avoid a direct fight." 
+ harness._observation_history = ["Quiet corridor."] + mocked_llm = Mock() + mocked_llm.get_completion.return_value = '{"analysis":"plan still fits","reasoning":"careful progress","result":1}' + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 18, + "completion_tokens": 7, + "total_tokens": 25, + "estimated_cost_usd": 0.0005, + } + harness.llm = mocked_llm + + action = harness.get_action("Quiet corridor.", [{"text": "Open the door"}, {"text": "Run"}]) + + assert action == 1 + assert mocked_llm.get_completion.call_count == 1 + + +def test_planner_harness_uses_contextual_memory_state(): + harness = PlannerHarness(model_name="gpt-5-mini", compaction_interval=50) + harness._quest_briefing = "Original mission: win the election." + harness._transcript = [ + { + "step": 1, + "observation": "You learned Maloqs value strength.", + "choice_text": "Ask about Maloqs", + "memo": "Maloqs value strength", + "action": 1, + } + ] + harness._steps_since_compaction = 1 + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + "Use the remembered cultural clue.", + '{"analysis":"use clue","reasoning":"fits plan","result":1}', + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 1, + "completion_tokens": 1, + "total_tokens": 2, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + harness.get_action("Current banquet scene.", [{"text": "Greet like a warrior"}]) + + first_prompt = mocked_llm.get_completion.call_args_list[0].args[0] + assert "Quest briefing" in first_prompt + assert "RECENT STEPS" in first_prompt + assert "Maloqs value strength" in first_prompt + + +def test_tool_compact_harness_can_use_quest_history(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + harness._step_log = [ + { + "step": 1, + "observation": "Merchant mentioned low fuel.", + "choices": ["Buy fuel", "Keep flying"], + "selected_choice": "Buy fuel", + } + ] + harness._history_tool.step_log = harness._step_log + mocked_llm = Mock() + 
mocked_llm.get_completion.side_effect = [ + '{"analysis":"need history","tool_calls":[{"tool":"quest_history","input":"fuel merchant"}],"result":null}', + '{"analysis":"fuel clue matters","reasoning":"play safe","result":1}', + ] + mocked_llm.get_last_usage.side_effect = [ + {"prompt_tokens": 24, "completion_tokens": 10, "total_tokens": 34, "estimated_cost_usd": 0.0008}, + {"prompt_tokens": 22, "completion_tokens": 9, "total_tokens": 31, "estimated_cost_usd": 0.0007}, + ] + harness.llm = mocked_llm + + action = harness.get_action("Your fuel gauge is blinking.", [{"text": "Refuel"}, {"text": "Attack pirates"}]) + + assert action == 1 + assert mocked_llm.get_completion.call_count == 2 + assert harness.get_last_response().total_tokens == 65 + assert len(harness._step_log) == 2 + assert harness.get_last_response().tool_results + assert "Merchant mentioned low fuel" in harness.get_last_response().tool_results[0] + + +def test_tool_compact_calculator_supports_arithmetic_and_comparisons(): + assert ToolCompactHarness.calculator("55 + 12 - 5") == "55 + 12 - 5 = 62" + assert ToolCompactHarness.calculator("60 >= 55 and 62 >= 80") == "60 >= 55 and 62 >= 80 = False" + assert ToolCompactHarness.calculator("__import__('os')").startswith("error:") + + +def test_tool_compact_scratchpad_read_write_and_reset(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + + assert harness.scratchpad("read") == "(empty)" + assert ( + harness.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") + == "updated: Board: W B _ ; failed door 2" + ) + assert harness.scratchpad("read") == "Board: W B _ ; failed door 2" + + harness.reset() + + assert harness.scratchpad("read") == "(empty)" + + +def test_tool_compact_harness_can_use_calculator_and_records_tool_metadata(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + '{"memo":"Need mix math","analysis":"calculate 
target","tool_calls":[{"tool":"calculator","input":"50 + 3 >= 55"}],"result":null}', + '{"memo":"Need more strength","analysis":"math failed","reasoning":"choose strength","result":2}', + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + action = harness.get_action("Strength is 50. Need at least 55.", [{"text": "Add water"}, {"text": "Add repusator"}]) + + response = harness.get_last_response() + assert action == 2 + assert response.tool_calls == [{"tool": "calculator", "input": "50 + 3 >= 55", "operation": "", "content": ""}] + assert response.tool_results == ["calculator(50 + 3 >= 55) => 50 + 3 >= 55 = False"] + assert response.memo == "Need more strength" + + +def test_tool_compact_harness_can_use_scratchpad_tool_call(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + ( + '{"analysis":"save board","tool_calls":[{"tool":"scratchpad",' + '"operation":"write_replace","content":"Board: red blue blank"}],"result":null}' + ), + '{"analysis":"note saved","reasoning":"use saved board","result":1}', + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + action = harness.get_action("A colored board blocks the hall.", [{"text": "Use red-blue order"}]) + + assert action == 1 + assert harness.scratchpad("read") == "Board: red blue blank" + assert harness.get_last_response().tool_results == [ + "scratchpad(write_replace, Board: red blue blank) => updated: Board: red blue blank" + ] + + +def test_tool_compact_harness_uses_contextual_memory_state(): + harness = ToolCompactHarness(model_name="gpt-5-mini", compaction_interval=50) + harness._quest_briefing = "Original mission: pass pilot certification." 
+ harness._transcript = [ + { + "step": 1, + "observation": "Hogger is greedy.", + "choice_text": "Bribe Hogger", + "memo": "Hogger is greedy", + "action": 1, + } + ] + harness._steps_since_compaction = 1 + mocked_llm = Mock() + mocked_llm.get_completion.return_value = ( + '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}' + ) + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + harness.get_action("Current exam room.", [{"text": "Offer a bribe"}]) + + prompt = mocked_llm.get_completion.call_args.args[0] + assert "Quest briefing" in prompt + assert "RECENT STEPS" in prompt + assert "Hogger is greedy" in prompt + + +def test_tool_compact_harness_can_finish_without_tools_in_one_call(): + harness = ToolCompactHarness(model_name="gpt-5-mini") + mocked_llm = Mock() + mocked_llm.get_completion.return_value = ( + '{"analysis":"no tools needed","tool_calls":[],"reasoning":"direct clue","result":2}' + ) + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 15, + "completion_tokens": 6, + "total_tokens": 21, + "estimated_cost_usd": 0.0004, + } + harness.llm = mocked_llm + + action = harness.get_action("A guard points at the safe exit.", [{"text": "Fight"}, {"text": "Leave"}]) + + assert action == 2 + assert mocked_llm.get_completion.call_count == 1 diff --git a/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py b/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py index 5563ca2..2ceeaca 100644 --- a/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py +++ b/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py @@ -1,12 +1,12 @@ -"""Integration tests for planner/tool modes on real quest execution loops.""" +"""Integration tests for planner/tool harness modes on real quest execution loops.""" from pathlib import Path import pytest -from 
llm_quest_benchmark.agents.agent_factory import create_agent from llm_quest_benchmark.core.runner import run_quest_with_timeout from llm_quest_benchmark.environments.state import QuestOutcome +from llm_quest_benchmark.harnesses.factory import create_harness QUEST_PATHS = [ "quests/Boat.qm", @@ -38,18 +38,18 @@ def get_last_usage(self): @pytest.mark.timeout(15) @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded") -def test_planner_agent_runs_three_quests_across_openai_and_anthropic_models(monkeypatch): +def test_planner_harness_runs_three_quests_across_openai_and_anthropic_models(monkeypatch): requested_models = [] def fake_get_llm_client(model_name, **kwargs): requested_models.append(model_name) return FakeLLM("planner") - monkeypatch.setattr("llm_quest_benchmark.agents.llm_agent.get_llm_client", fake_get_llm_client) + monkeypatch.setattr("llm_quest_benchmark.harnesses.base.get_llm_client", fake_get_llm_client) for model_name in ["gpt-5-mini", "claude-sonnet-4-5"]: for quest_path in QUEST_PATHS: - agent = create_agent(model=model_name, action_template="planner", skip_single=True) + agent = create_harness("planner", model=model_name, skip_single=True) outcome = run_quest_with_timeout(quest_path, agent, timeout=10) assert outcome in {QuestOutcome.SUCCESS, QuestOutcome.FAILURE, QuestOutcome.TIMEOUT} assert outcome != QuestOutcome.ERROR @@ -60,14 +60,14 @@ def fake_get_llm_client(model_name, **kwargs): @pytest.mark.timeout(15) @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded") -def test_tool_agent_runs_three_quests(monkeypatch): +def test_tool_harness_runs_three_quests(monkeypatch): monkeypatch.setattr( - "llm_quest_benchmark.agents.llm_agent.get_llm_client", + "llm_quest_benchmark.harnesses.base.get_llm_client", lambda model_name, **kwargs: FakeLLM("tool"), ) for quest_path in QUEST_PATHS: - agent = create_agent(model="gpt-5-mini", action_template="tool_augmented", skip_single=True) + 
agent = create_harness("tool_compact", model="gpt-5-mini", skip_single=True) outcome = run_quest_with_timeout(quest_path, agent, timeout=10) assert outcome in {QuestOutcome.SUCCESS, QuestOutcome.FAILURE, QuestOutcome.TIMEOUT} assert outcome != QuestOutcome.ERROR @@ -75,9 +75,9 @@ def test_tool_agent_runs_three_quests(monkeypatch): @pytest.mark.timeout(15) @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded") -def test_reused_mode_agents_reset_between_quest_runs(): +def test_reused_mode_harnesses_reset_between_quest_runs(): quest_path = "quests/sr_2_1_2121_eng/Borzukhan_eng.qm" - planner_agent = create_agent(model="gpt-5-mini", action_template="planner", skip_single=True) + planner_agent = create_harness("planner", model="gpt-5-mini", skip_single=True) planner_agent.llm = FakeLLM("planner") first_outcome = run_quest_with_timeout(quest_path, planner_agent, timeout=10) @@ -92,7 +92,7 @@ def test_reused_mode_agents_reset_between_quest_runs(): assert "stale plan from previous run" not in planner_agent._plan_history assert "stale observation" not in planner_agent._observation_history - tool_agent = create_agent(model="gpt-5-mini", action_template="tool_augmented", skip_single=True) + tool_agent = create_harness("tool_compact", model="gpt-5-mini", skip_single=True) tool_agent.llm = FakeLLM("tool") first_outcome = run_quest_with_timeout(quest_path, tool_agent, timeout=10) diff --git a/llm_quest_benchmark/tests/integration/test_quest_e2e.py b/llm_quest_benchmark/tests/integration/test_quest_e2e.py index 8ebfb91..a0e376d 100644 --- a/llm_quest_benchmark/tests/integration/test_quest_e2e.py +++ b/llm_quest_benchmark/tests/integration/test_quest_e2e.py @@ -5,10 +5,10 @@ import pytest -from llm_quest_benchmark.agents.agent_factory import create_agent -from llm_quest_benchmark.constants import DEFAULT_QUEST, DEFAULT_TEMPLATE, SYSTEM_ROLE_TEMPLATE +from llm_quest_benchmark.constants import DEFAULT_QUEST, SYSTEM_ROLE_TEMPLATE from 
llm_quest_benchmark.core.runner import run_quest_with_timeout from llm_quest_benchmark.environments.state import QuestOutcome +from llm_quest_benchmark.harnesses.factory import create_harness TIMEOUT = 20 # 20s should be enough for test quests to complete @@ -19,11 +19,11 @@ def test_quest_run_with_llm(caplog): """Test that quest runs with LLM agent and reaches a final state""" caplog.set_level(logging.DEBUG) # Show all logs in test output - # Create LLM agent - agent = create_agent( + # Create LLM harness + agent = create_harness( + harness="minimal", model="random_choice", # Use random for testing system_template=SYSTEM_ROLE_TEMPLATE, - action_template=DEFAULT_TEMPLATE, temperature=0.0, skip_single=False, debug=True, @@ -68,7 +68,7 @@ def test_random_agent_on_test_quest(caplog): caplog.set_level(logging.DEBUG) # Show all logs in test output # Create random agent - agent = create_agent("random_choice", skip_single=True, debug=True) + agent = create_harness("random_choice", skip_single=True, debug=True) assert agent is not None, "Failed to create random agent" # Mock callback for testing From 1ac851ebb3c9ba5bcf87d46af1c78374233c7ee5 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 14:57:20 +0400 Subject: [PATCH 08/24] docs: reframe as agent harness benchmark --- docs/ARCHITECTURE.md | 114 +++++++++++++++++++++++------------- docs/EXPERIMENTS_LOG.md | 14 +++++ docs/HARNESS_ENGINEERING.md | 64 ++++++++++++++++++++ docs/SPEC.md | 37 ++++++------ 4 files changed, 171 insertions(+), 58 deletions(-) create mode 100644 docs/HARNESS_ENGINEERING.md diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index c7f556d..998241a 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -1,38 +1,55 @@ # Architecture ## Overview -LLM Quest Benchmark evaluates how different agent architectures complete interactive fiction quests (Space Rangers `.qm` format). 
+ +LLM Quest Benchmark evaluates how **agent harnesses** complete interactive +fiction quests in the Space Rangers `.qm` format. The benchmark holds the quest +environment and result logging constant while varying the harness around the +model: prompt template, memory strategy, tools, and action loop. + The runtime loop is: + 1. Parse or step quest state via the TypeScript engine bridge. -2. Build an action prompt from current state and available choices. -3. Get agent choice (human/random/LLM with varying agent modes). -4. Apply choice, log step, and detect outcome. +2. Build harness context from current state, available choices, and memory. +3. Get a choice from a human, random policy, or LLM-backed harness. +4. Apply the choice, log the step, and detect the terminal outcome. 5. Persist run metrics and run summaries. ## Main Runtime Layers ### 1. Quest Engine Layer -- `space-rangers-quest/`: - TypeScript quest parser/player submodule. -- `llm_quest_benchmark/executors/ts_bridge/consoleplayer.ts`: - Node entrypoint for parse/step execution. -- `llm_quest_benchmark/executors/ts_bridge/bridge.py`: - Python subprocess bridge with startup preflight and actionable errors. + +- `space-rangers-quest/`: TypeScript quest parser/player submodule. +- `llm_quest_benchmark/executors/ts_bridge/consoleplayer.ts`: Node entrypoint + for parse/step execution. +- `llm_quest_benchmark/executors/ts_bridge/bridge.py`: Python subprocess + bridge with startup preflight and actionable errors. ### 2. Environment Layer -- `llm_quest_benchmark/environments/qm.py`: - Wraps bridge into Python environment semantics (`reset`, `step`, terminal detection). -### 3. Agent Layer -- `llm_quest_benchmark/agents/llm_agent.py`: Base LLM agent with template-driven prompts, retry logic, loop-breaking, and safety filters. -- `llm_quest_benchmark/agents/planner_agent.py`: Planner loop with observation-diff heuristic for re-planning. 
-- `llm_quest_benchmark/agents/tool_agent.py`: Tool-using scaffold with quest history tool. -- `llm_quest_benchmark/agents/agent_factory.py`: Factory that maps Prompt Template choices to agent classes. -- `llm_quest_benchmark/agents/human_player.py`, `random_agent.py`: Non-LLM agents. +- `llm_quest_benchmark/environments/qm.py`: Wraps the bridge into Python + environment semantics (`reset`, `step`, terminal detection). + +### 3. Harness Layer + +- `llm_quest_benchmark/harnesses/base.py`: `BaseHarness`, the shared + LLM-backed `QuestPlayer` implementation for prompt rendering, response + parsing, retries, contextual state, and safety filtering. +- `llm_quest_benchmark/harnesses/memory.py`: `DefaultMemory`, + `FullTranscriptMemory`, and `CompactionMemory`. +- `llm_quest_benchmark/harnesses/tools.py`: Calculator, scratchpad, and quest + history helpers used by tool harnesses. +- `llm_quest_benchmark/harnesses/factory.py`: `create_harness()` and the + canonical harness registry. +- `llm_quest_benchmark/agents/human_player.py`, + `llm_quest_benchmark/agents/random_agent.py`: Non-LLM `QuestPlayer` + implementations preserved for interactive and random baselines. -`LLMAgent` lazily initializes provider clients, so template rendering and agent construction do not require API keys. +Harness construction lazily initializes provider clients, so template rendering +and benchmark configuration parsing do not require API keys. ### 4. LLM Provider Layer + - `llm_quest_benchmark/llm/client.py`: - provider/model normalization (`provider:model` + aliases) - adapters: OpenAI, Anthropic, Google Gemini, DeepSeek @@ -40,38 +57,53 @@ The runtime loop is: - token/cost usage tracking per completion call ### 5. Execution and Analysis Layer + - `llm_quest_benchmark/core/runner.py`: Core quest run loop. -- `llm_quest_benchmark/core/analyzer.py`: Post-run analysis and benchmark summaries. +- `llm_quest_benchmark/core/analyzer.py`: Post-run analysis and benchmark + summaries. 
- `llm_quest_benchmark/core/benchmark_report.py`: Markdown report generator. -- `llm_quest_benchmark/core/logging.py`: Quest logger with per-run metrics (repetition_rate, bad_decision_rate). -- `llm_quest_benchmark/executors/benchmark.py`: Benchmark orchestration with parallel workers. -- `llm_quest_benchmark/executors/cli/commands.py`: CLI commands (`run`, `play`, `analyze`, `analyze-run`, `benchmark`, `benchmark-report`, `download-quests`, `cleanup`). +- `llm_quest_benchmark/core/logging.py`: Quest logger with per-run metrics + (`repetition_rate`, `bad_decision_rate`). +- `llm_quest_benchmark/executors/benchmark.py`: Benchmark orchestration with + parallel workers. +- `llm_quest_benchmark/executors/cli/commands.py`: CLI commands (`run`, `play`, + `analyze`, `analyze-run`, `benchmark`, `benchmark-report`, + `download-quests`, `cleanup`). ### 6. Prompt Templates -- `llm_quest_benchmark/prompt_templates/`: Jinja2 templates for each agent mode. + +- `llm_quest_benchmark/prompt_templates/`: Jinja2 templates referenced by + harnesses. - `stub.jinja`: Minimal prompt. - - `reasoning.jinja`, `strategic.jinja`, etc.: Short-context or full-history reasoning depending on memory mode. - - `stateful_compact.jinja`, `memo_*.jinja`: Compact memory / memo prompts. - - `light_hints.jinja`, `stateful_compact_hints.jinja`: Prompt hints. - - `planner.jinja`: Planner loop prompts. - - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with compact memory, optionally with hints. + - `reasoning.jinja`: Short-context or full-history reasoning depending on + harness memory. + - `stateful_compact.jinja`: Compact memory / 20-word memo prompt. + - `stateful_compact_hints.jinja`: Compact memo prompt with mechanics hints. + - `planner.jinja`: Planner loop prompt. + - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with + compact memory, optionally with hints. ## Persistence + - `metrics.db`: Benchmark/run metrics for CLI workflows. 
-- `results///run_/run_summary.json`: Step trace + per-step decisions + aggregated token/cost usage. +- `results///run_/run_summary.json`: Step trace, + per-step decisions, and aggregated token/cost usage. ## Configuration + - `.env` (copied from `.env.template`): Provider API keys. -- `configs/benchmarks/`: Benchmark YAML configs defining model x template x quest matrix. +- `configs/benchmarks/`: Benchmark YAML configs defining model × harness × + quest matrices. ## Public Taxonomy (Benchmark Dimension) -| Label | Template / memory source | Agent Class | Description | -|------|----------|-------------|-------------| -| Minimal prompt | stub | LLMAgent | Smallest action-selection prompt | -| Short-context reasoning | reasoning/strategic + default memory | LLMAgent | Local prompted analysis | -| Full-history reasoning | reasoning + full transcript memory | LLMAgent | Whole transcript retained in context | -| Compact memory / memo | reasoning/stateful/memo templates + compaction | LLMAgent | Summarized state instead of unbounded transcript | -| Prompt hints | light_hints/stateful_compact_hints | LLMAgent | Mechanics hints injected into prompt | -| Tools + compact memory | tool_augmented | ToolAgent | Quest history/scratchpad tools with compact context | -| Tools + hints + compact memory | tool_augmented_hints | ToolAgent | Tool scaffold plus prompt hints | -| Planner loop | planner | PlannerAgent | Plan-maintain-act loop | + +| Public label | Harness name | Template | Memory | Tools | Loop | +|---|---|---|---|---|---| +| Minimal prompt | `minimal` | `stub.jinja` | `DefaultMemory` | none | react | +| Short-context reasoning | `reasoning_recent` | `reasoning.jinja` | `DefaultMemory` | none | react | +| Full-history reasoning | `reasoning_full` | `reasoning.jinja` | `FullTranscriptMemory` | none | react | +| Compact memory / memo | `memo_compact` | `stateful_compact.jinja` | `CompactionMemory` | none | react | +| Prompt hints | `hinted_compact` | 
`stateful_compact_hints.jinja` | `CompactionMemory` | none | react | +| Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act | +| Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act | +| Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | none | plan-maintain-act | diff --git a/docs/EXPERIMENTS_LOG.md b/docs/EXPERIMENTS_LOG.md index dadef6e..0d9ce49 100644 --- a/docs/EXPERIMENTS_LOG.md +++ b/docs/EXPERIMENTS_LOG.md @@ -1,5 +1,19 @@ # Experiments Log +## Harness Name Mapping + +| Experiment arm | Old label | New harness name | +|---|---|---| +| Minimal prompt arms | `stub` | `minimal` | +| Short-context reasoning arms | `reasoning` + `default` memory | `reasoning_recent` | +| Full-history reasoning arms | `reasoning` + `full_transcript` memory | `reasoning_full` | +| Stateful compact memo arms | `stateful_compact` + compaction | `memo_compact` | +| Hinted compact memo arms | `stateful_compact_hints` + compaction | `hinted_compact` | +| Tool-augmented compact arms | `tool_augmented` + compaction | `tool_compact` | +| Tool-augmented hinted arms | `tool_augmented_hints` + compaction | `tool_hinted` | +| Planner arms | `planner` | `planner` | +| Memo variation arms | `memo_extended`, `memo_structured`, `memo_cot` | retired experiment variants, not canonical harnesses | + > Historical / non-authoritative notes. This log preserves experiment history > and branch-era shorthand. 
For the current public taxonomy and public > comparison slice, use `site/about.html`, `site/leaderboard.json`, diff --git a/docs/HARNESS_ENGINEERING.md b/docs/HARNESS_ENGINEERING.md new file mode 100644 index 0000000..5666ebc --- /dev/null +++ b/docs/HARNESS_ENGINEERING.md @@ -0,0 +1,64 @@ +# Harness Engineering + +LLM Quest Benchmark treats the **agent harness** as the primary experimental +object. An agent harness is the wrapper around a model that controls what the +model sees, what state is carried forward, what external tools are available, +and how a raw model completion is converted into a quest action. In this +project, harnesses are not incidental plumbing: they are the independent +variable. + +This framing follows the harness engineering question raised by "How Much Heavy +Lifting Can an Agent Harness Do?" (arXiv:2604.07236): how much performance comes +from the surrounding scaffold rather than the base model alone? Space Rangers +text quests are a useful testbed because they are long enough to stress memory, +planning, and state tracking, but concrete enough to score with terminal +success/failure outcomes. + +## The Eight Canonical Harnesses + +| Harness name | What varies | +|---|---| +| `minimal` | Uses the smallest action-selection prompt with recent context only. This is the low-scaffold baseline. | +| `reasoning_recent` | Adds an explicit reasoning prompt while keeping recent-window memory. | +| `reasoning_full` | Keeps the reasoning prompt but exposes the full transcript instead of a short recent window. | +| `memo_compact` | Uses compacted memory plus a constrained 20-word memo to preserve salient state. | +| `hinted_compact` | Adds mechanics hints to the compact memo harness, without tools. | +| `tool_compact` | Adds calculator, scratchpad, and quest-history tools to compact memory. | +| `tool_hinted` | Combines compact memory, tools, and mechanics hints. 
| +| `planner` | Uses a plan-maintain-act loop with compact memory instead of a pure react loop. | + +The harness names are canonical snake_case identifiers used in YAML configs, +the CLI, and documentation. Public labels can be friendlier, but experimental +records should preserve these names so runs remain comparable. + +## Difference From TextQuests and TALE-Suite + +TextQuests (arXiv:2507.23701) and TALE-Suite are closest in spirit because they +also evaluate language models on interactive text-game tasks. Their main +comparison axis is model capability under a mostly fixed evaluation scaffold: +the harness is treated as test infrastructure, and the model is varied. + +LLM Quest Benchmark flips that emphasis. We can hold a model fixed and vary the +harness to ask which context, memory, tool, and planning choices change +behavior. That makes the benchmark useful for harness engineering: it can +separate "the model cannot do the task" from "this wrapper failed to show the +model the right state, preserve the right facts, or expose the right operation." + +## Findings So Far + +The strongest pattern so far is that bigger scaffolds are not automatically +better. A concise 20-word memo produced a sweet spot: it improved over no memo +and full transcript baselines, while longer or more structured memo variants +regressed. The likely mechanism is selective pressure: the short memo forces +the harness to preserve only state that matters for future decisions. + +Tools and hints show a synergy effect. Prompt hints alone hurt, and tools alone +were modest, but tools plus hints improved outcomes because the hints pointed +the model toward quantities and morally grey quest mechanics while the +calculator, scratchpad, and history search gave it ways to act on those +signals. + +Verbosity hurts in this environment. Some newer or larger models timed out more +often because they spent too much of the quest budget generating long step +responses. 
For sequential decision tasks, a harness that elicits concise, +actionable state updates can outperform one that invites broad reasoning. diff --git a/docs/SPEC.md b/docs/SPEC.md index 99289fb..cadbb99 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -7,8 +7,11 @@ For the public narrative and interpretation of results, use the project ## Purpose LLM Quest Benchmark evaluates how LLMs make sequential choices in Space -Rangers text quests. The benchmark varies the context scaffold around a model -while holding the quest environment and result logging consistent. +Rangers text quests. The benchmark varies the agent harness around a model +while holding the quest environment and result logging consistent. A harness is +the wrapper that decides what context the model sees and how its response is +converted into an action: prompt template, memory strategy, tools, and loop +shape. The core question is practical: which kinds of context help, hurt, or expose state-tracking failures during 10-50 turn interactive fiction tasks? @@ -35,18 +38,18 @@ analysis, but the public slice is the authoritative comparison surface. 
## Current Taxonomy -Use these labels for current public descriptions of benchmark modes: +Use these labels for current public descriptions of benchmark harnesses: -| Label | Implementation source | Agent class | -|---|---|---| -| Minimal prompt | `stub.jinja` | `LLMAgent` | -| Short-context reasoning | `reasoning.jinja`, `strategic.jinja` with default/recent context | `LLMAgent` | -| Full-history reasoning | reasoning templates with `full_transcript` memory | `LLMAgent` | -| Compact memory / memo | `stateful_compact.jinja`, memo templates, compaction memory | `LLMAgent` | -| Prompt hints | `light_hints.jinja`, `stateful_compact_hints.jinja` | `LLMAgent` | -| Tools + compact memory | `tool_augmented.jinja` | `ToolAgent` | -| Tools + hints + compact memory | `tool_augmented_hints.jinja` | `ToolAgent` | -| Planner loop | `planner.jinja` | `PlannerAgent` | +| Label | Harness name | Template | Memory | Tools / loop | +|---|---|---|---|---| +| Minimal prompt | `minimal` | `stub.jinja` | `DefaultMemory` | no tools, react loop | +| Short-context reasoning | `reasoning_recent` | `reasoning.jinja` | `DefaultMemory` | no tools, react loop | +| Full-history reasoning | `reasoning_full` | `reasoning.jinja` | `FullTranscriptMemory` | no tools, react loop | +| Compact memory / memo | `memo_compact` | `stateful_compact.jinja` | `CompactionMemory` | no tools, react loop | +| Prompt hints | `hinted_compact` | `stateful_compact_hints.jinja` | `CompactionMemory` | no tools, react loop | +| Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history; tool-select-then-act loop | +| Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history; tool-select-then-act loop | +| Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | no tools, plan-maintain-act loop | Older internal experiment labels are historical and should not be presented as the current public taxonomy.
@@ -56,8 +59,8 @@ the current public taxonomy. - Quest execution uses the TypeScript `space-rangers-quest` submodule through the Python bridge in `llm_quest_benchmark/executors/ts_bridge/`. - Environment state is exposed through `llm_quest_benchmark/environments/qm.py`. -- Agents live under `llm_quest_benchmark/agents/` and are selected by template - aliases and agent factory wiring. +- Agent harnesses live under `llm_quest_benchmark/harnesses/` and are selected + by canonical snake_case harness names. - Provider calls are normalized in `llm_quest_benchmark/llm/client.py` with OpenAI-compatible, Anthropic, Google, and DeepSeek adapters. - Benchmark execution is CLI + YAML driven through `uv run llm-quest ...`. @@ -107,7 +110,7 @@ Provider API keys are required for real LLM runs. Tests and static validation should run without external credentials in a prepared checkout. Reproducible benchmark rows depend on recording the quest file, model/provider -ID, prompt templates, memory mode, run ID, outcome, and run summaries with -usage/metrics. Agent responses are parsed into a chosen action plus optional +ID, harness name, run ID, outcome, and run summaries with usage/metrics. +Harness responses are parsed into a chosen action plus optional analysis/reasoning so action validity, terminal outcome, steps, tokens/cost, and repetition diagnostics can be regenerated from stored artifacts. 
From 7cc2a21bc963204b03f7a383313a111d336f3502 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 15:02:07 +0400 Subject: [PATCH 09/24] docs: add experiment audit Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- docs/EXPERIMENT_AUDIT.md | 193 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 docs/EXPERIMENT_AUDIT.md diff --git a/docs/EXPERIMENT_AUDIT.md b/docs/EXPERIMENT_AUDIT.md new file mode 100644 index 0000000..3298d0e --- /dev/null +++ b/docs/EXPERIMENT_AUDIT.md @@ -0,0 +1,193 @@ +# Experiment Audit + +Generated: 2026-05-11 + +Sources reviewed: + +- `docs/EXPERIMENTS_LOG.md` +- `docs/ARCHITECTURE.md` +- `configs/benchmarks/*.yaml` +- `site/leaderboard.json` + +This audit uses the post-refactor harness taxonomy: `minimal`, +`reasoning_recent`, `reasoning_full`, `memo_compact`, `hinted_compact`, +`tool_compact`, `tool_hinted`, and `planner`. + +## Experiment Inventory + +| Experiment | Config / source | Harness mapping | Quest scope | Completed runs recorded in log | Audit disposition | +|---|---|---|---|---:|---| +| Exp 2: Memory Modes | `memory_full_transcript.yaml`, `memory_compaction.yaml` | `reasoning_full`, `memo_compact` | 14 historical quests including `Prison` | 126 | Unreliable for canonical comparison: loop-breaker bug era. | +| Exp 3 Arm 1: No Loop Breaker | `exp3_no_loop_breaker.yaml` | `reasoning_full` | 18 quests, excluding `Boat`/`Prison` | 36 | Use only rerun after timeout fix; pre-fix attempt is noisy/incomplete. | +| Exp 3 Arm 2: Stateful Compact | `exp3_stateful_compact.yaml` | `memo_compact` | 18 quests, excluding `Boat`/`Prison` | 36 | Canonical memo baseline, but only 2 runs/quest. | +| Exp 4: Compaction No Memo | `exp4_compaction_no_memo.yaml` | retired ablation, not canonical | 18 quests | 36 | Do not aggregate into `memo_compact`. 
| +| Exp 4: Memo Extended | `exp4_memo_extended.yaml` | retired `memo_extended` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 4: Memo Structured | `exp4_memo_structured.yaml` | retired `memo_structured` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 4: Memo CoT | `exp4_memo_cot.yaml` | retired `memo_cot` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 5: Baseline Variance | `exp5_stateful_compact_variance.yaml` | `memo_compact` | 18 quests | 90 | Canonical memo baseline variance study. | +| Exp 6: Prompt Hints | `exp6_prompt_hints.yaml` | `hinted_compact` | 18 quests | 54 | Canonical single-model harness comparison. | +| Exp 6: Tools | `exp6_tools.yaml` | `tool_compact` | 18 quests | 54 | Canonical single-model harness comparison. | +| Exp 6: Tools + Hints | `exp6_tools_hints.yaml` | `tool_hinted` | 18 quests | 54 | Canonical single-model harness comparison. | +| Exp 7: Multi-Model Comparison | `exp7_*.yaml` | `memo_compact` | 5 winnable quests | 75 | Canonical model sweep for memo harness. | +| Exp 7b: Model Upgrades | `exp7b_model_upgrades.yaml` | `memo_compact` | 18 quests | 108 | Noisy model-upgrade sweep; high timeout rates for Qwen 3.6 and Haiku 4.5. | + +## 1. Harness Coverage Matrix + +The table below is computed from `site/leaderboard.json` and counts recorded +leaderboard runs by harness and quest. `Boat` and `Prison` are retained in this +matrix because they still appear in the published leaderboard data, but they +are retired from the canonical experiment set. 
+ +| Harness | Badday | Banket | Boat | Codebox | Depth | Driver | Edelweiss | Election | Foncers | Leonardo | Ministry | Pizza | Prison | Robots | Ski | Total | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| `minimal` | 22 | 22 | 23 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 331 | +| `reasoning_recent` | 22 | 22 | 28 | 22 | 22 | 24 | 25 | 30 | 25 | 25 | 26 | 22 | 28 | 31 | 31 | 383 | +| `reasoning_full` | 17 | 17 | 9 | 17 | 17 | 15 | 17 | 17 | 17 | 17 | 16 | 17 | 6 | 14 | 14 | 227 | +| `memo_compact` | 37 | 39 | 18 | 39 | 39 | 39 | 39 | 37 | 39 | 37 | 39 | 37 | 15 | 39 | 34 | 527 | +| `hinted_compact` | 4 | 4 | 1 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 54 | +| `tool_compact` | 3 | 3 | **0** | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | **0** | 3 | 3 | 39 | +| `tool_hinted` | 3 | 3 | **0** | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | **0** | 3 | 3 | 39 | +| `planner` | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 15 | + +Leaderboard scope note: the current public JSON includes 15 quest columns and +does not include several 18-quest experiment-log quests such as `Pilot`, +`Disk`, `Player`, `Shashki`, and `Sortirovka1`. Those quests appear in the +benchmark configs and experiment log, so a future leaderboard refresh should +either add them or explicitly document why the public slice excludes them. + +## 2. Gap Analysis + +### Zero-run harness × quest cells + +All zero-run cells in the published leaderboard matrix are retired quest cells: + +- `tool_compact` × `Boat`: 0 runs. +- `tool_compact` × `Prison`: 0 runs. +- `tool_hinted` × `Boat`: 0 runs. +- `tool_hinted` × `Prison`: 0 runs. + +Because `Boat` and `Prison` are retired, these do not require new canonical +runs. They do indicate that the public leaderboard mixes active and retired +quest scopes. + +### Fewer than 3 runs + +- `hinted_compact` × `Boat`: 1 run; retired quest. +- `hinted_compact` × `Prison`: 1 run; retired quest. 
+- `planner`: 1 run on every published quest (`Badday`, `Banket`, `Boat`, + `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, + `Leonardo`, `Ministry`, `Pizza`, `Prison`, `Robots`, `Ski`). + +Canonical action item: the planner harness has insufficient variance coverage. +For active quests, it needs at least two additional runs per quest to reach the +minimum 3-run threshold. + +### Only 1 model tested + +The following harnesses have leaderboard cells where the run count may be at +least 3, but the model dimension is still only one model. These cells cannot +separate harness effects from model-specific behavior: + +- `tool_compact`: one model on all non-retired published quests + (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, + `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`). +- `tool_hinted`: one model on all non-retired published quests + (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, + `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`). +- `planner`: one model on every published quest and only one run per quest. +- `hinted_compact` on `Boat` and `Prison`: one model, but both quests are + retired. + +The stronger public comparison cells are `minimal`, `reasoning_recent`, +`reasoning_full`, and `memo_compact`, which have multi-model coverage in the +leaderboard data. However, `reasoning_full` and `memo_compact` still require +provenance filtering because early memory-mode runs overlap with the +loop-breaker bug era. + +## 3. Noise / Anomaly List + +### Loop-breaker bug era + +- Exp 2 memory-mode runs are unreliable. The experiment log documents a + number-normalization bug in `_normalize_for_signature` and aggressive loop + breaker overrides that changed correct model decisions. +- Exp 3 Arm 1 has a pre-fix/incomplete attempt affected by SDK timeout issues. + Only the rerun after the timeout fix should be considered. 
+- Any leaderboard entry whose provenance traces to Exp 2 or the Exp 3 pre-fix + attempt should be marked non-canonical until regenerated or excluded. + +### High timeout runs + +- Exp 7b `Qwen 3.6 Flash`: 17/36 timeouts (47%). +- Exp 7b `Claude Haiku 4.5`: 19/36 timeouts (53%). +- Exp 7b `DeepSeek V4 Flash`: 5/36 timeouts (14%), below the >30% threshold + but still noisy because success was 0/36. + +The Qwen 3.6 and Haiku 4.5 rows should be interpreted primarily as timeout / +verbosity failures, not clean harness-quality signals. + +### Retired quests + +- `Boat`: trivial / smoke-test-like quest; removed from canonical experiment + configs. +- `Prison`: loops endlessly; removed from canonical experiment configs. + +Both still appear in `site/leaderboard.json`, so public summaries should label +them as retired or remove them from canonical aggregates. + +### Retired harness variants + +The following Exp 4 arms are not part of the final taxonomy and should not be +merged into canonical `memo_compact` results: + +- `memo_extended` +- `memo_structured` +- `memo_cot` +- `compaction_no_memo` ablation + +Current YAML files have been migrated to the `harness:` key, so historical +variant identity must be preserved from `docs/EXPERIMENTS_LOG.md` and config +file names rather than inferred only from the post-refactor `harness` field. + +## 4. Budget Estimate + +Top-priority new runs to close actionable gaps while avoiding retired quests: + +| Priority | Harness | Quest(s) | New runs needed | Reason | +|---:|---|---|---:|---| +| 1 | `planner` | 13 active published quests (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`) | 26 | Bring 1-run planner cells up to the 3-run minimum on active leaderboard quests. | +| 2 | `planner` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest so planner effects are not single-model artifacts. 
| +| 3 | `tool_compact` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. | +| 4 | `tool_hinted` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. | +| 5 | Public leaderboard refresh | `Pilot`, `Disk`, `Player`, `Shashki`, `Sortirovka1` | Scope-dependent | These quests are present in canonical 18-quest configs/logs but absent from the current public leaderboard matrix. Backfill or explicitly exclude them. | + +Do not spend new budget on `Boat` or `Prison` unless the goal is only to +reproduce historical/public rows; both are retired from canonical analysis. + +## 5. Leaderboard Integrity + +Findings from `site/leaderboard.json`: + +1. The leaderboard uses the eight canonical public modes and does not expose + retired harness variants as separate modes. This is good, but it creates a + provenance risk if Exp 4 retired variants were ever aggregated under + `memo_compact`. +2. `Boat` and `Prison` remain in the published quest list despite being retired + from canonical experiment configs. They should be excluded from aggregate + claims or clearly labeled as retired. +3. `planner` has only one run per quest and one model. It should not be used for + reliability claims yet. +4. `tool_compact` and `tool_hinted` have three runs per active published quest, + but only one model. Their harness comparison is promising but not yet + model-robust. +5. Published `reasoning_full` / `memo_compact` rows need run-level provenance + checks before canonical use because early memory-mode experiments overlap + with the Exp 2 loop-breaker bug era. +6. Exp 7b model-upgrade entries for `Qwen 3.6 Flash` and `Claude Haiku 4.5` + should be annotated as high-timeout data if included in any leaderboard or + narrative comparison. 
+ +Recommended integrity rule: canonical leaderboard aggregates should require +non-retired quests, canonical harness names, no loop-breaker bug provenance, at +least 3 runs per harness × quest cell, and at least two models for claims about +harness effects rather than model effects. From 27328d143b9208247558949850787adac73c737e Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 15:21:03 +0400 Subject: [PATCH 10/24] fix: double memory update, compaction guard, test config migration Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- configs/default.yaml | 10 +++++----- configs/kr1.yaml | 8 ++++---- configs/kr1_micro.yaml | 4 ++-- configs/kr1_test.yaml | 4 ++-- configs/kr2_en_benchmark.yaml | 8 ++++---- configs/kr2_en_test.yaml | 2 +- configs/test/parallel_agents_test.yaml | 2 +- configs/test/temperature_test.yaml | 12 ++++++------ configs/test/test_benchmark.yaml | 2 +- llm_quest_benchmark/harnesses/base.py | 2 -- llm_quest_benchmark/harnesses/memory.py | 7 ++++--- 11 files changed, 30 insertions(+), 31 deletions(-) diff --git a/configs/default.yaml b/configs/default.yaml index d7dbe67..ff185a5 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -5,27 +5,27 @@ quests: agents: - model: random_choice - template: reasoning.jinja + harness: reasoning_recent temperature: 0.0 skip_single: true - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true diff --git a/configs/kr1.yaml b/configs/kr1.yaml index c7771e6..c31cc3b 100644 --- a/configs/kr1.yaml 
+++ b/configs/kr1.yaml @@ -5,22 +5,22 @@ quests: agents: - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true diff --git a/configs/kr1_micro.yaml b/configs/kr1_micro.yaml index c19bd1a..ac3df96 100644 --- a/configs/kr1_micro.yaml +++ b/configs/kr1_micro.yaml @@ -8,12 +8,12 @@ quests: agents: # Just 2 agents to validate the process - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent temperature: 0.6 skip_single: true diff --git a/configs/kr1_test.yaml b/configs/kr1_test.yaml index fbe843c..bb8ed98 100644 --- a/configs/kr1_test.yaml +++ b/configs/kr1_test.yaml @@ -7,12 +7,12 @@ quests: agents: # Just 2 agents to validate the process - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent temperature: 0.6 skip_single: true diff --git a/configs/kr2_en_benchmark.yaml b/configs/kr2_en_benchmark.yaml index 76b6c21..88a0fe5 100644 --- a/configs/kr2_en_benchmark.yaml +++ b/configs/kr2_en_benchmark.yaml @@ -7,23 +7,23 @@ quests: agents: # OpenAI models - model: gpt-4o - template: reasoning.jinja + harness: reasoning_recent temperature: 0.5 skip_single: true - model: gpt-4o-mini - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true # Anthropic models - model: claude-3-7-sonnet-latest - template: reasoning.jinja + harness: reasoning_recent temperature: 0.5 
skip_single: true - model: claude-3-5-sonnet-latest - template: reasoning.jinja + harness: reasoning_recent temperature: 0.6 skip_single: true diff --git a/configs/kr2_en_test.yaml b/configs/kr2_en_test.yaml index 7dbe160..0addb04 100644 --- a/configs/kr2_en_test.yaml +++ b/configs/kr2_en_test.yaml @@ -5,7 +5,7 @@ quests: agents: - model: random_choice # Use random agent for speed and reliability temperature: 0.5 - template: reasoning.jinja + harness: reasoning_recent quest_timeout: 10 # short timeout for testing debug: true output_dir: results/benchmarks diff --git a/configs/test/parallel_agents_test.yaml b/configs/test/parallel_agents_test.yaml index 37bca75..0aec1be 100644 --- a/configs/test/parallel_agents_test.yaml +++ b/configs/test/parallel_agents_test.yaml @@ -6,7 +6,7 @@ quests: agents: - model: random_choice - model: gpt-5-mini - template: reasoning.jinja + harness: reasoning_recent debug: true # No max_workers setting - we'll use one worker per agent output_dir: results/benchmarks diff --git a/configs/test/temperature_test.yaml b/configs/test/temperature_test.yaml index 8f8e0cc..d79b705 100644 --- a/configs/test/temperature_test.yaml +++ b/configs/test/temperature_test.yaml @@ -7,32 +7,32 @@ quests: agents: - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.3 skip_single: true - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: claude-sonnet-4-5 - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.3 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.4 skip_single: true - model: deepseek-3.2-chat - template: reasoning.jinja + harness: reasoning_recent temperature: 0.7 skip_single: true diff --git a/configs/test/test_benchmark.yaml 
b/configs/test/test_benchmark.yaml index 3c89dab..c20c648 100644 --- a/configs/test/test_benchmark.yaml +++ b/configs/test/test_benchmark.yaml @@ -4,7 +4,7 @@ quests: agents: - model: random_choice - model: gemini-2.5-flash - template: reasoning.jinja + harness: reasoning_recent debug: true quest_timeout: 60 max_workers: 2 diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py index 6fa8afd..0501287 100644 --- a/llm_quest_benchmark/harnesses/base.py +++ b/llm_quest_benchmark/harnesses/base.py @@ -328,8 +328,6 @@ def get_action(self, observation: str, choices: list[dict[str, str]]) -> int: self._observation_history.append(clean) if len(self._observation_history) > 20: self._observation_history = self._observation_history[-20:] - if self.memory_module is not None: - self.memory_module.update({"observation": clean, "step": self._step_count + 1}) return super().get_action(observation, choices) def on_game_start(self) -> None: diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py index ff54ff9..ab4f72c 100644 --- a/llm_quest_benchmark/harnesses/memory.py +++ b/llm_quest_benchmark/harnesses/memory.py @@ -249,6 +249,9 @@ def reset(self) -> None: def _maybe_compact(self) -> None: if self._steps_since_compaction < self.compaction_interval: return + if self.llm_client is None: + # No LLM client available for compaction; skip silently + return transcript_text = self._format_transcript_for_compaction() if not transcript_text: return @@ -269,9 +272,7 @@ def _maybe_compact(self) -> None: "Write a concise summary in plain text, max 300 words." 
) - summary = "" - if self.llm_client is not None: - summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip() + summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip() if summary: self._compaction_summary = summary self._transcript = [] From 5c2aa5cdb1f7c20e257ba0ca4cab6443e46546e5 Mon Sep 17 00:00:00 2001 From: Kirill Korikov Date: Mon, 11 May 2026 17:36:59 +0400 Subject: [PATCH 11/24] fix: preserve benchmark result compatibility --- llm_quest_benchmark/executors/benchmark.py | 36 +++++++++++++++++++ llm_quest_benchmark/schemas/config.py | 6 +++- .../tests/harnesses/test_harnesses.py | 6 ++-- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py index 69e7d22..62f5c92 100644 --- a/llm_quest_benchmark/executors/benchmark.py +++ b/llm_quest_benchmark/executors/benchmark.py @@ -56,6 +56,40 @@ def _agent_harness(agent_config) -> str: return legacy_mapping.get((template, memory_mode), "reasoning_recent") +def _agent_template(agent_config) -> str: + """Return legacy template name for result artifacts.""" + if hasattr(agent_config, "action_template"): + return agent_config.action_template + + harness_templates = { + "minimal": "stub.jinja", + "reasoning_recent": "reasoning.jinja", + "reasoning_full": "reasoning.jinja", + "memo_compact": "stateful_compact.jinja", + "hinted_compact": "stateful_compact_hints.jinja", + "tool_compact": "tool_augmented.jinja", + "tool_hinted": "tool_augmented_hints.jinja", + "planner": "planner.jinja", + } + return harness_templates.get(_agent_harness(agent_config), "reasoning.jinja") + + +def _agent_memory_mode(agent_config) -> str: + """Return legacy memory mode for result artifacts.""" + if hasattr(agent_config, "memory_mode"): + return agent_config.memory_mode + + harness_memory_modes = { + "reasoning_full": "full_transcript", + "memo_compact": "compaction", + "hinted_compact": "compaction", + 
"tool_compact": "compaction", + "tool_hinted": "compaction", + "planner": "compaction", + } + return harness_memory_modes.get(_agent_harness(agent_config), "default") + + def _result_entry( quest: str, agent_config, @@ -69,6 +103,8 @@ def _result_entry( "model": agent_config.model, "temperature": agent_config.temperature, "harness": _agent_harness(agent_config), + "template": _agent_template(agent_config), + "memory_mode": _agent_memory_mode(agent_config), "agent_id": agent_config.harness_id if hasattr(agent_config, "harness_id") else agent_config.agent_id, "attempt": attempt, "outcome": outcome, diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py index c658729..7d5c74c 100644 --- a/llm_quest_benchmark/schemas/config.py +++ b/llm_quest_benchmark/schemas/config.py @@ -104,7 +104,11 @@ def __post_init__(self): self.system_template = normalize_template_name(self.system_template) from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, SPECIAL_HARNESSES, is_random_choice_harness - if self.harness not in HARNESS_REGISTRY and self.harness != "human" and not is_random_choice_harness(self.harness): + if ( + self.harness not in HARNESS_REGISTRY + and self.harness != "human" + and not is_random_choice_harness(self.harness) + ): valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] raise ValueError(f"Invalid harness: {self.harness}. 
Supported harnesses: {valid}") if not (0.0 <= self.temperature <= 2.0): diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py index 030648b..9095a46 100644 --- a/llm_quest_benchmark/tests/harnesses/test_harnesses.py +++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py @@ -3,14 +3,13 @@ from unittest.mock import Mock from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness -from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness +from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory from llm_quest_benchmark.harnesses.minimal import MinimalHarness from llm_quest_benchmark.harnesses.planner import PlannerHarness from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness - HARNESS_SPECS = { "minimal": (MinimalHarness, "stub.jinja", DefaultMemory), "reasoning_recent": (ReasoningRecentHarness, "reasoning.jinja", DefaultMemory), @@ -221,8 +220,7 @@ def test_tool_compact_scratchpad_read_write_and_reset(): assert harness.scratchpad("read") == "(empty)" assert ( - harness.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") - == "updated: Board: W B _ ; failed door 2" + harness.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") == "updated: Board: W B _ ; failed door 2" ) assert harness.scratchpad("read") == "Board: W B _ ; failed door 2" From 930edda4ebe1cb796ae55eb2398667d7aea3d063 Mon Sep 17 00:00:00 2001 From: Kirill Korikov Date: Mon, 11 May 2026 17:57:13 +0400 Subject: [PATCH 12/24] fix: address PR review feedback --- .../benchmarks/exp4_compaction_no_memo.yaml | 2 +- configs/benchmarks/exp4_memo_cot.yaml | 2 +- 
configs/benchmarks/exp4_memo_extended.yaml | 2 +- configs/benchmarks/exp4_memo_structured.yaml | 2 +- llm_quest_benchmark/core/leaderboard.py | 51 +++++++++++++- llm_quest_benchmark/executors/benchmark.py | 29 +++----- llm_quest_benchmark/harnesses/base.py | 15 ++--- llm_quest_benchmark/harnesses/factory.py | 13 +++- llm_quest_benchmark/harnesses/memo.py | 36 ++++++++++ .../tests/agents/test_llm_agent.py | 7 ++ .../tests/harnesses/test_factory.py | 7 ++ .../tests/harnesses/test_harnesses.py | 43 +++++++++++- .../tests/integration/test_benchmark.py | 18 ++--- .../tests/test_benchmark_with_directory.py | 8 +-- llm_quest_benchmark/tests/test_leaderboard.py | 67 +++++++++++++++++++ 15 files changed, 250 insertions(+), 52 deletions(-) diff --git a/configs/benchmarks/exp4_compaction_no_memo.yaml b/configs/benchmarks/exp4_compaction_no_memo.yaml index 896dd60..4ab63e6 100644 --- a/configs/benchmarks/exp4_compaction_no_memo.yaml +++ b/configs/benchmarks/exp4_compaction_no_memo.yaml @@ -24,7 +24,7 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - harness: memo_compact + harness: compaction_no_memo temperature: 0.4 runs: 2 compaction_interval: 50 diff --git a/configs/benchmarks/exp4_memo_cot.yaml b/configs/benchmarks/exp4_memo_cot.yaml index 9bfe382..320da54 100644 --- a/configs/benchmarks/exp4_memo_cot.yaml +++ b/configs/benchmarks/exp4_memo_cot.yaml @@ -24,7 +24,7 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - harness: memo_compact + harness: memo_cot temperature: 0.4 runs: 2 compaction_interval: 50 diff --git a/configs/benchmarks/exp4_memo_extended.yaml b/configs/benchmarks/exp4_memo_extended.yaml index 25e5620..a5d6613 100644 --- a/configs/benchmarks/exp4_memo_extended.yaml +++ b/configs/benchmarks/exp4_memo_extended.yaml @@ -24,7 +24,7 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: 
"openrouter:google/gemini-3-flash-preview" - harness: memo_compact + harness: memo_extended temperature: 0.4 runs: 2 compaction_interval: 50 diff --git a/configs/benchmarks/exp4_memo_structured.yaml b/configs/benchmarks/exp4_memo_structured.yaml index 96e5daf..f70ab81 100644 --- a/configs/benchmarks/exp4_memo_structured.yaml +++ b/configs/benchmarks/exp4_memo_structured.yaml @@ -24,7 +24,7 @@ quests: - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm agents: - model: "openrouter:google/gemini-3-flash-preview" - harness: memo_compact + harness: memo_structured temperature: 0.4 runs: 2 compaction_interval: 50 diff --git a/llm_quest_benchmark/core/leaderboard.py b/llm_quest_benchmark/core/leaderboard.py index 032ad48..dc0a67b 100644 --- a/llm_quest_benchmark/core/leaderboard.py +++ b/llm_quest_benchmark/core/leaderboard.py @@ -28,9 +28,6 @@ "stub": ("minimal_prompt", TAXONOMY_MODES["minimal_prompt"]), "strategic": ("short_context_reasoning", TAXONOMY_MODES["short_context_reasoning"]), "stateful_compact": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]), - "memo_cot": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]), - "memo_extended": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]), - "memo_structured": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]), "light_hints": ("prompt_hints", TAXONOMY_MODES["prompt_hints"]), "stateful_compact_hints": ("prompt_hints", TAXONOMY_MODES["prompt_hints"]), "planner": ("planner_loop", TAXONOMY_MODES["planner_loop"]), @@ -38,6 +35,26 @@ "tool_augmented_hints": ("tools_hints_compact_memory", TAXONOMY_MODES["tools_hints_compact_memory"]), } +RETIRED_BENCHMARK_NAMES = { + "exp4_compaction_no_memo", + "exp4_memo_cot", + "exp4_memo_extended", + "exp4_memo_structured", +} + +RETIRED_HARNESSES = { + "compaction_no_memo", + "memo_cot", + "memo_extended", + "memo_structured", +} + +RETIRED_TEMPLATE_IDS = { + "memo_cot", + "memo_extended", + "memo_structured", +} + REASONING_STYLE_TEMPLATES = { 
"reasoning", "strategic", @@ -87,6 +104,25 @@ def _mode_from_template(template_name: str, memory_mode: str | None = None) -> t return TEMPLATE_TO_MODE.get(template_id, (template_id or "unknown", template_id or "unknown")) +def _is_retired_result( + source_name: str | None, + benchmark_id: str | None, + result_row: dict[str, Any], + agent_config: dict[str, Any], + template_name: str, +) -> bool: + source_names = {str(value) for value in (source_name, benchmark_id) if value} + if source_names & RETIRED_BENCHMARK_NAMES: + return True + + harness = str(result_row.get("harness") or agent_config.get("harness") or "") + if harness in RETIRED_HARNESSES: + return True + + template_id = _strip_template_suffix(template_name) + return template_id in RETIRED_TEMPLATE_IDS + + def _agent_config(db_run: dict[str, Any]) -> dict[str, Any]: raw_config = db_run.get("agent_config") if not isinstance(raw_config, str) or not raw_config: @@ -298,6 +334,7 @@ def generate_leaderboard( continue benchmark_id = summary.get("benchmark_id") + source_name = summary.get("name") if benchmark_id: benchmark_ids.append(str(benchmark_id)) @@ -349,6 +386,14 @@ def generate_leaderboard( if template_from_config: template = template_from_config memory_mode = config.get("memory_mode") + if _is_retired_result( + str(source_name) if source_name else None, + str(benchmark_id) if benchmark_id else None, + result_row, + config, + template, + ): + continue mode_id, mode_label = _mode_from_template(template, str(memory_mode) if memory_mode is not None else None) try: diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py index 62f5c92..53c430d 100644 --- a/llm_quest_benchmark/executors/benchmark.py +++ b/llm_quest_benchmark/executors/benchmark.py @@ -35,25 +35,8 @@ def _agent_harness(agent_config) -> str: - """Return harness name for new configs, with legacy AgentConfig fallback.""" - if hasattr(agent_config, "harness"): - return agent_config.harness - - template = 
getattr(agent_config, "action_template", "reasoning.jinja") - memory_mode = getattr(agent_config, "memory_mode", "default") - template = template.removesuffix(".jinja") - legacy_mapping = { - ("stub", "default"): "minimal", - ("reasoning", "default"): "reasoning_recent", - ("reasoning", "full_transcript"): "reasoning_full", - ("reasoning", "compaction"): "memo_compact", - ("stateful_compact", "compaction"): "memo_compact", - ("stateful_compact_hints", "compaction"): "hinted_compact", - ("tool_augmented", "compaction"): "tool_compact", - ("tool_augmented_hints", "compaction"): "tool_hinted", - ("planner", "compaction"): "planner", - } - return legacy_mapping.get((template, memory_mode), "reasoning_recent") + """Return the configured harness name.""" + return agent_config.harness def _agent_template(agent_config) -> str: @@ -70,6 +53,10 @@ def _agent_template(agent_config) -> str: "tool_compact": "tool_augmented.jinja", "tool_hinted": "tool_augmented_hints.jinja", "planner": "planner.jinja", + "compaction_no_memo": "reasoning.jinja", + "memo_cot": "memo_cot.jinja", + "memo_extended": "memo_extended.jinja", + "memo_structured": "memo_structured.jinja", } return harness_templates.get(_agent_harness(agent_config), "reasoning.jinja") @@ -86,6 +73,10 @@ def _agent_memory_mode(agent_config) -> str: "tool_compact": "compaction", "tool_hinted": "compaction", "planner": "compaction", + "compaction_no_memo": "compaction", + "memo_cot": "compaction", + "memo_extended": "compaction", + "memo_structured": "compaction", } return harness_memory_modes.get(_agent_harness(agent_config), "default") diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py index 0501287..440675b 100644 --- a/llm_quest_benchmark/harnesses/base.py +++ b/llm_quest_benchmark/harnesses/base.py @@ -303,6 +303,8 @@ def _ensure_llm(self) -> None: system_prompt=self.prompt_renderer.render_system_prompt(), temperature=self.temperature, ) + if self.memory_module is not None and 
hasattr(self.memory_module, "llm_client"): + self.memory_module.llm_client = self.llm @abstractmethod def _get_action_impl(self, observation, choices) -> int: @@ -433,22 +435,13 @@ def _remember_decision( ) def _format_prompt(self, observation, choices, memo=None, context=None) -> str: - """Render system and action Jinja templates for the current decision.""" - system_prompt = self.prompt_renderer.render_system_prompt( - observation=observation, - choices=choices, - memo=memo, - context=context, - ).strip() - action_prompt = self.prompt_renderer.action_template.render( + """Render the action Jinja template for the current decision.""" + return self.prompt_renderer.action_template.render( observation=observation, choices=[{"text": c.get("text", "")} for c in choices], memo=memo, context=context, ).strip() - if system_prompt: - return f"{system_prompt}\n\n{action_prompt}".strip() - return action_prompt def _parse_llm_response(self, response, num_choices) -> LLMResponse: """Parse an LLM response into a structured response object.""" diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py index b46f5dc..8ea4b84 100644 --- a/llm_quest_benchmark/harnesses/factory.py +++ b/llm_quest_benchmark/harnesses/factory.py @@ -4,7 +4,14 @@ from llm_quest_benchmark.agents.human_player import HumanPlayer from llm_quest_benchmark.agents.random_agent import RandomAgent from llm_quest_benchmark.constants import DEFAULT_MODEL -from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness +from llm_quest_benchmark.harnesses.memo import ( + CompactionNoMemoHarness, + HintedCompactHarness, + MemoCompactHarness, + MemoCotHarness, + MemoExtendedHarness, + MemoStructuredHarness, +) from llm_quest_benchmark.harnesses.minimal import MinimalHarness from llm_quest_benchmark.harnesses.planner import PlannerHarness from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness @@ -19,6 
+26,10 @@ "tool_compact": ToolCompactHarness, "tool_hinted": ToolHintedHarness, "planner": PlannerHarness, + "compaction_no_memo": CompactionNoMemoHarness, + "memo_cot": MemoCotHarness, + "memo_extended": MemoExtendedHarness, + "memo_structured": MemoStructuredHarness, } SPECIAL_HARNESSES = ("human", "random_choice", "random_choice_") diff --git a/llm_quest_benchmark/harnesses/memo.py b/llm_quest_benchmark/harnesses/memo.py index 764f206..eaab06b 100644 --- a/llm_quest_benchmark/harnesses/memo.py +++ b/llm_quest_benchmark/harnesses/memo.py @@ -60,3 +60,39 @@ def __init__( memory_module=memory_module, **kwargs, ) + + +class CompactionNoMemoHarness(MemoCompactHarness): + """Retired Exp 4 ablation: compacted transcript without memo-oriented prompting.""" + + harness_name = "compaction_no_memo" + + def __init__(self, *args, action_template: str = "reasoning.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) + + +class MemoExtendedHarness(MemoCompactHarness): + """Retired Exp 4 variant with a larger generic memo field.""" + + harness_name = "memo_extended" + + def __init__(self, *args, action_template: str = "memo_extended.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) + + +class MemoStructuredHarness(MemoCompactHarness): + """Retired Exp 4 variant with structured memo prompting.""" + + harness_name = "memo_structured" + + def __init__(self, *args, action_template: str = "memo_structured.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) + + +class MemoCotHarness(MemoCompactHarness): + """Retired Exp 4 variant with scratchpad-style memo prompting.""" + + harness_name = "memo_cot" + + def __init__(self, *args, action_template: str = "memo_cot.jinja", **kwargs): + super().__init__(*args, action_template=action_template, **kwargs) diff --git a/llm_quest_benchmark/tests/agents/test_llm_agent.py b/llm_quest_benchmark/tests/agents/test_llm_agent.py index 
1f3b99c..280fd0a 100644 --- a/llm_quest_benchmark/tests/agents/test_llm_agent.py +++ b/llm_quest_benchmark/tests/agents/test_llm_agent.py @@ -90,6 +90,13 @@ def test_non_gemini_prompt_uses_selected_template(): assert "IMPORTANT: Please respond with ONLY a single number" in prompt +def test_formatted_user_prompt_does_not_duplicate_system_prompt(): + harness = MinimalHarness(model_name="gpt-5-mini", action_template="stub.jinja") + prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) + + assert "experienced interactive fiction player" not in prompt + + def test_template_alias_without_suffix_is_supported(): harness = MinimalHarness(model_name="gpt-5-mini", action_template="reasoning") prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}]) diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py index 187f4d3..4d72b61 100644 --- a/llm_quest_benchmark/tests/harnesses/test_factory.py +++ b/llm_quest_benchmark/tests/harnesses/test_factory.py @@ -68,6 +68,13 @@ def test_harness_config_allows_seeded_random_choice_harness(): assert config.harness == "random_choice_123" +def test_harness_config_allows_retired_exp4_aliases(): + for harness_name in ("compaction_no_memo", "memo_cot", "memo_extended", "memo_structured"): + config = HarnessConfig(harness=harness_name, model="gpt-5-mini") + + assert config.harness == harness_name + + def test_harness_config_rejects_old_template_key(): with pytest.raises(ValueError, match="Use harness: key instead of template:"): HarnessConfig(model="gpt-5-mini", template="reasoning.jinja") diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py index 9095a46..3cba73e 100644 --- a/llm_quest_benchmark/tests/harnesses/test_harnesses.py +++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py @@ -3,7 +3,14 @@ from unittest.mock import Mock from llm_quest_benchmark.harnesses.factory 
import HARNESS_REGISTRY, create_harness -from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness +from llm_quest_benchmark.harnesses.memo import ( + CompactionNoMemoHarness, + HintedCompactHarness, + MemoCompactHarness, + MemoCotHarness, + MemoExtendedHarness, + MemoStructuredHarness, +) from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory from llm_quest_benchmark.harnesses.minimal import MinimalHarness from llm_quest_benchmark.harnesses.planner import PlannerHarness @@ -19,6 +26,10 @@ "tool_compact": (ToolCompactHarness, "tool_augmented.jinja", CompactionMemory), "tool_hinted": (ToolHintedHarness, "tool_augmented_hints.jinja", CompactionMemory), "planner": (PlannerHarness, "planner.jinja", CompactionMemory), + "compaction_no_memo": (CompactionNoMemoHarness, "reasoning.jinja", CompactionMemory), + "memo_cot": (MemoCotHarness, "memo_cot.jinja", CompactionMemory), + "memo_extended": (MemoExtendedHarness, "memo_extended.jinja", CompactionMemory), + "memo_structured": (MemoStructuredHarness, "memo_structured.jinja", CompactionMemory), } @@ -65,6 +76,13 @@ def test_planner_harness_configuration(): assert_harness_configuration("planner") +def test_exp4_retired_harness_configuration(): + assert_harness_configuration("compaction_no_memo") + assert_harness_configuration("memo_cot") + assert_harness_configuration("memo_extended") + assert_harness_configuration("memo_structured") + + def test_all_registry_harnesses_have_configuration_specs(): assert set(HARNESS_REGISTRY) == set(HARNESS_SPECS) @@ -101,6 +119,29 @@ def test_memo_compact_mocked_llm_returns_action_and_reuses_memo_context(): assert "Merchant needs fuel payment" in second_prompt +def test_compaction_memory_receives_existing_llm_client(): + harness = MemoCompactHarness(model_name="gpt-5-mini", compaction_interval=1) + mocked_llm = Mock() + mocked_llm.get_completion.side_effect = [ + '{"memo":"Paid fuel merchant","analysis":"pay 
first","reasoning":"quest clue","result":2}', + "Summary: paid the fuel merchant and should keep receipt.", + ] + mocked_llm.get_last_usage.return_value = { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "estimated_cost_usd": 0.0, + } + harness.llm = mocked_llm + + action = harness.get_action("A merchant offers fuel for a fee.", [{"text": "Leave"}, {"text": "Pay"}]) + + assert action == 2 + assert harness.memory_module.llm_client is mocked_llm + assert harness.memory_module._compaction_summary == "Summary: paid the fuel merchant and should keep receipt." + assert harness._steps_since_compaction == 0 + + def test_planner_harness_first_turn_generates_plan_then_acts(): harness = PlannerHarness(model_name="gpt-5-mini") mocked_llm = Mock() diff --git a/llm_quest_benchmark/tests/integration/test_benchmark.py b/llm_quest_benchmark/tests/integration/test_benchmark.py index ee0704e..c024da4 100644 --- a/llm_quest_benchmark/tests/integration/test_benchmark.py +++ b/llm_quest_benchmark/tests/integration/test_benchmark.py @@ -5,11 +5,11 @@ import pytest -from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, SYSTEM_ROLE_TEMPLATE +from llm_quest_benchmark.constants import SYSTEM_ROLE_TEMPLATE from llm_quest_benchmark.environments.state import QuestOutcome from llm_quest_benchmark.executors import benchmark as benchmark_module from llm_quest_benchmark.executors.benchmark import run_benchmark -from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig +from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig def _fake_task_for_parallel_test(task, result_queue): @@ -58,10 +58,10 @@ def test_benchmark_e2e(caplog, tmp_path): config = BenchmarkConfig( quests=[str(quest_path)], agents=[ - AgentConfig( + HarnessConfig( model="random_choice", # Use random_choice for testing + harness="random_choice", system_template=SYSTEM_ROLE_TEMPLATE, - action_template=DEFAULT_TEMPLATE, temperature=0.0, skip_single=True, ) @@ -85,7 
+85,7 @@ def test_benchmark_e2e(caplog, tmp_path): assert result["quest"] == str(quest_path) assert result["model"] == "random_choice" assert result["temperature"] == 0.0 - assert result["template"] == DEFAULT_TEMPLATE + assert result["template"] == "reasoning.jinja" assert result["attempt"] == 1 assert "agent_id" in result assert "outcome" in result @@ -122,9 +122,9 @@ def test_benchmark_supports_multiple_runs_per_agent(tmp_path): config = BenchmarkConfig( quests=[str(quest_path)], agents=[ - AgentConfig( + HarnessConfig( model="random_choice", - action_template="reasoning", + harness="random_choice", temperature=0.0, runs=2, skip_single=True, @@ -154,7 +154,7 @@ def test_benchmark_uses_max_workers(monkeypatch, tmp_path): config = BenchmarkConfig( quests=[str(quest_path)], - agents=[AgentConfig(model="random_choice", runs=4)], + agents=[HarnessConfig(model="random_choice", harness="random_choice", runs=4)], quest_timeout=5, max_workers=2, output_dir=str(tmp_path), @@ -187,7 +187,7 @@ def test_benchmark_enforces_child_process_timeout(monkeypatch, tmp_path): config = BenchmarkConfig( quests=[str(quest_path)], - agents=[AgentConfig(model="random_choice", runs=1)], + agents=[HarnessConfig(model="random_choice", harness="random_choice", runs=1)], quest_timeout=1, max_workers=1, output_dir=str(tmp_path), diff --git a/llm_quest_benchmark/tests/test_benchmark_with_directory.py b/llm_quest_benchmark/tests/test_benchmark_with_directory.py index 7661f3d..bfe4e8d 100644 --- a/llm_quest_benchmark/tests/test_benchmark_with_directory.py +++ b/llm_quest_benchmark/tests/test_benchmark_with_directory.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) from llm_quest_benchmark.executors.benchmark import run_benchmark -from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig +from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig def create_test_config(): @@ -19,7 +19,7 @@ def create_test_config(): return { "name": "Directory Benchmark 
Test", "quests": ["quests/sr_2_1_2121_eng"], - "agents": [{"model": "random_choice", "skip_single": True, "temperature": 0.7}], + "agents": [{"model": "random_choice", "harness": "random_choice", "skip_single": True, "temperature": 0.7}], "quest_timeout": 4, # Keep runtime below pytest global timeout "max_quests": 1, "debug": True, @@ -34,8 +34,8 @@ def test_benchmark_with_directory(): config_dict = create_test_config() logger.info(f"Created test config: {json.dumps(config_dict, indent=2)}") - # Convert agent dictionaries to AgentConfig objects first - config_dict["agents"] = [AgentConfig(**agent_dict) for agent_dict in config_dict["agents"]] + # Convert agent dictionaries to HarnessConfig objects first + config_dict["agents"] = [HarnessConfig(**agent_dict) for agent_dict in config_dict["agents"]] config = BenchmarkConfig(**config_dict) logger.info("Config validation passed") diff --git a/llm_quest_benchmark/tests/test_leaderboard.py b/llm_quest_benchmark/tests/test_leaderboard.py index aa22296..28a3e31 100644 --- a/llm_quest_benchmark/tests/test_leaderboard.py +++ b/llm_quest_benchmark/tests/test_leaderboard.py @@ -243,6 +243,73 @@ def test_generate_leaderboard_filters_public_slice(tmp_path, monkeypatch): assert {row["model"] for row in leaderboard["results"]} == {"model-a", "model-b", "model-c"} +def test_generate_leaderboard_excludes_retired_exp4_variants(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + + active_dir = Path("results/benchmarks/active") + active_dir.mkdir(parents=True, exist_ok=True) + retired_dir = Path("results/benchmarks/retired") + retired_dir.mkdir(parents=True, exist_ok=True) + + active_row = { + "quest": "quests/Core.qm", + "model": "gpt-5-mini", + "template": "stateful_compact.jinja", + "harness": "memo_compact", + "agent_id": "active", + "attempt": 1, + "outcome": "SUCCESS", + } + retired_rows = [ + { + "quest": "quests/Core.qm", + "model": "gpt-5-mini", + "template": "reasoning.jinja", + "harness": "compaction_no_memo", + 
"agent_id": "retired-no-memo", + "attempt": 1, + "outcome": "FAILURE", + }, + { + "quest": "quests/Core.qm", + "model": "gpt-5-mini", + "template": "memo_extended.jinja", + "harness": "memo_extended", + "agent_id": "retired-extended", + "attempt": 1, + "outcome": "FAILURE", + }, + ] + + (active_dir / "benchmark_summary.json").write_text( + json.dumps({"benchmark_id": "active", "name": "active", "agents": [], "results": [active_row], "db_runs": []}), + encoding="utf-8", + ) + (retired_dir / "benchmark_summary.json").write_text( + json.dumps( + { + "benchmark_id": "retired", + "name": "exp4_compaction_no_memo", + "agents": [], + "results": retired_rows, + "db_runs": [], + } + ), + encoding="utf-8", + ) + + leaderboard = generate_leaderboard( + [str(active_dir), str(retired_dir)], + "site/leaderboard.json", + min_runs=0, + public_model_ids=None, + ) + + assert len(leaderboard["results"]) == 1 + assert leaderboard["results"][0]["mode"] == "compact_memory_memo" + assert leaderboard["results"][0]["runs"] == 1 + + def test_generate_leaderboard_matches_db_runs_by_identifiers(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) From 9ed2679266cea2ec5a9efd275b165f2032a8ad70 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 22:10:42 +0400 Subject: [PATCH 13/24] fix: P2 harness model attribution and harness_id includes system_template Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- llm_quest_benchmark/executors/benchmark.py | 12 ++++++++- llm_quest_benchmark/schemas/config.py | 2 +- .../tests/harnesses/test_factory.py | 7 +++++ .../tests/integration/test_benchmark.py | 2 +- .../tests/test_benchmark_with_directory.py | 26 ++++++++++++++++--- 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py index 53c430d..82800b3 100644 --- 
a/llm_quest_benchmark/executors/benchmark.py +++ b/llm_quest_benchmark/executors/benchmark.py @@ -39,6 +39,16 @@ def _agent_harness(agent_config) -> str: return agent_config.harness +def _agent_model(agent_config) -> str: + """Return the result model label for the executed harness.""" + harness = _agent_harness(agent_config) + if harness == "human": + return "human" + if harness.startswith("random_choice"): + return "random_policy" + return agent_config.model + + def _agent_template(agent_config) -> str: """Return legacy template name for result artifacts.""" if hasattr(agent_config, "action_template"): @@ -91,7 +101,7 @@ def _result_entry( ) -> dict[str, Any]: return { "quest": quest, - "model": agent_config.model, + "model": _agent_model(agent_config), "temperature": agent_config.temperature, "harness": _agent_harness(agent_config), "template": _agent_template(agent_config), diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py index 7d5c74c..f32184e 100644 --- a/llm_quest_benchmark/schemas/config.py +++ b/llm_quest_benchmark/schemas/config.py @@ -123,7 +123,7 @@ def harness_id(self) -> str: """Generate a stable harness ID based on configuration values""" import hashlib - config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.compaction_interval}" + config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.system_template}_{self.compaction_interval}" hash_val = hashlib.md5(config_str.encode()).hexdigest()[:8] return f"{self.model}_t{self.temperature}_{self.harness}_{hash_val}" diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py index 4d72b61..7f6f11c 100644 --- a/llm_quest_benchmark/tests/harnesses/test_factory.py +++ b/llm_quest_benchmark/tests/harnesses/test_factory.py @@ -62,6 +62,13 @@ def test_harness_config_stable_harness_id(): assert config.harness_id == HarnessConfig(harness="memo_compact", model="gpt-5-mini").harness_id +def 
test_harness_config_system_template_affects_harness_id(): + first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role.jinja") + second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role_risk.jinja") + + assert first.harness_id != second.harness_id + + def test_harness_config_allows_seeded_random_choice_harness(): config = HarnessConfig(harness="random_choice_123", model="gpt-5-mini") diff --git a/llm_quest_benchmark/tests/integration/test_benchmark.py b/llm_quest_benchmark/tests/integration/test_benchmark.py index c024da4..1c56d35 100644 --- a/llm_quest_benchmark/tests/integration/test_benchmark.py +++ b/llm_quest_benchmark/tests/integration/test_benchmark.py @@ -83,7 +83,7 @@ def test_benchmark_e2e(caplog, tmp_path): # Check first result result = results[0] assert result["quest"] == str(quest_path) - assert result["model"] == "random_choice" + assert result["model"] == "random_policy" assert result["temperature"] == 0.0 assert result["template"] == "reasoning.jinja" assert result["attempt"] == 1 diff --git a/llm_quest_benchmark/tests/test_benchmark_with_directory.py b/llm_quest_benchmark/tests/test_benchmark_with_directory.py index bfe4e8d..c6dc855 100644 --- a/llm_quest_benchmark/tests/test_benchmark_with_directory.py +++ b/llm_quest_benchmark/tests/test_benchmark_with_directory.py @@ -6,13 +6,13 @@ import pytest +from llm_quest_benchmark.executors.benchmark import _result_entry, run_benchmark +from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig + # Configure logging logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) -from llm_quest_benchmark.executors.benchmark import run_benchmark -from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig - def create_test_config(): """Create a test benchmark configuration with directory path""" @@ -27,6 +27,26 @@ def 
create_test_config(): } +def test_result_entry_logs_random_harness_model_as_random_policy(): + """Random harness results should not be attributed to the default LLM model.""" + agent_config = HarnessConfig(harness="random_choice", model="gpt-5-mini") + + result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE") + + assert result["model"] == "random_policy" + assert result["harness"] == "random_choice" + + +def test_result_entry_logs_human_harness_model_as_human(): + """Human harness results should not be attributed to the default LLM model.""" + agent_config = HarnessConfig(harness="human", model="gpt-5-mini") + + result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE") + + assert result["model"] == "human" + assert result["harness"] == "human" + + @pytest.mark.skipif(not Path("quests/sr_2_1_2121_eng").exists(), reason="Quest files not downloaded") def test_benchmark_with_directory(): """Test running a benchmark with a directory path""" From 96704c98c14b8c088262b5967b8fbc57d8bba175 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Mon, 11 May 2026 22:44:45 +0400 Subject: [PATCH 14/24] fix: hide legacy AgentConfig public export Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- llm_quest_benchmark/core/runner.py | 4 ++-- llm_quest_benchmark/schemas/__init__.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llm_quest_benchmark/core/runner.py b/llm_quest_benchmark/core/runner.py index e5f531a..e2b9ef8 100644 --- a/llm_quest_benchmark/core/runner.py +++ b/llm_quest_benchmark/core/runner.py @@ -15,7 +15,7 @@ from llm_quest_benchmark.core.logging import LogManager, QuestLogger from llm_quest_benchmark.environments.qm import QMPlayerEnv as QuestEnvironment from llm_quest_benchmark.environments.state import QuestOutcome -from llm_quest_benchmark.schemas.config import AgentConfig +from llm_quest_benchmark.schemas.config import 
HarnessConfig from llm_quest_benchmark.schemas.state import AgentState # Configure logging @@ -27,7 +27,7 @@ def run_quest_with_timeout( quest_path: str, agent: QuestPlayer, timeout: int = DEFAULT_QUEST_TIMEOUT, - agent_config: AgentConfig | None = None, + agent_config: HarnessConfig | Any | None = None, debug: bool = False, callbacks: list[Callable[[str, Any], None]] = None, ) -> QuestOutcome | None: diff --git a/llm_quest_benchmark/schemas/__init__.py b/llm_quest_benchmark/schemas/__init__.py index 0cb4242..cb0338f 100644 --- a/llm_quest_benchmark/schemas/__init__.py +++ b/llm_quest_benchmark/schemas/__init__.py @@ -7,11 +7,10 @@ "QMBridgeState", "BenchmarkConfig", "HarnessConfig", - "AgentConfig", ] # Import directly from the schema modules using relative imports from .bridge import QMBridgeState -from .config import AgentConfig, BenchmarkConfig, HarnessConfig +from .config import BenchmarkConfig, HarnessConfig from .response import LLMResponse from .state import AgentState, QMState From cce82dc0cf6b37b5d1ede7d7e641be6c0ed2783a Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Tue, 12 May 2026 00:39:44 +0400 Subject: [PATCH 15/24] docs: consolidate harness documentation --- .../balanced_gpt5mini_all_modes.yaml | 20 +- configs/kr2_en_benchmark.yaml | 40 ---- docs/ARCHITECTURE.md | 25 +++ docs/EXPERIMENTS_LOG.md | 139 +++++++++++++ docs/EXPERIMENT_AUDIT.md | 193 ------------------ docs/HARNESS_ENGINEERING.md | 64 ------ docs/SPEC.md | 18 ++ 7 files changed, 190 insertions(+), 309 deletions(-) delete mode 100644 configs/kr2_en_benchmark.yaml delete mode 100644 docs/EXPERIMENT_AUDIT.md delete mode 100644 docs/HARNESS_ENGINEERING.md diff --git a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml index 4ab3e65..812e94a 100644 --- a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml +++ b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml @@ -21,49 +21,45 @@ 
quests: agents: # 1. Minimal prompt - model: gpt-5-mini - template: stub + harness: minimal temperature: 0.4 runs: 3 # 2. Short-context reasoning - model: gpt-5-mini - template: reasoning + harness: reasoning_recent temperature: 0.4 runs: 3 # 3. Full-history reasoning - model: gpt-5-mini - template: reasoning + harness: reasoning_full temperature: 0.4 runs: 3 - memory_mode: full_transcript # 4. Compact memory / memo - model: gpt-5-mini - template: stateful_compact + harness: memo_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 # 5. Prompt hints - model: gpt-5-mini - template: light_hints + harness: hinted_compact temperature: 0.4 runs: 3 # 6. Tools + compact memory - model: gpt-5-mini - template: tool_augmented + harness: tool_compact temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 # 7. Tools + hints + compact memory - model: gpt-5-mini - template: tool_augmented_hints + harness: tool_hinted temperature: 0.4 runs: 3 - memory_mode: compaction compaction_interval: 50 # 8. 
Planner loop - model: gpt-5-mini - template: planner + harness: planner temperature: 0.4 runs: 3 debug: false diff --git a/configs/kr2_en_benchmark.yaml b/configs/kr2_en_benchmark.yaml deleted file mode 100644 index 88a0fe5..0000000 --- a/configs/kr2_en_benchmark.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Benchmark configuration for Kr2 English quests -# Using recommended models with optimized temperature settings - -quests: - - quests/kr2_en - -agents: - # OpenAI models - - model: gpt-4o - harness: reasoning_recent - temperature: 0.5 - skip_single: true - - - model: gpt-4o-mini - harness: reasoning_recent - temperature: 0.7 - skip_single: true - - # Anthropic models - - model: claude-3-7-sonnet-latest - harness: reasoning_recent - temperature: 0.5 - skip_single: true - - - model: claude-3-5-sonnet-latest - harness: reasoning_recent - temperature: 0.6 - skip_single: true - -# Debug mode enables more detailed logging -debug: true - -# Quest timeout in seconds -quest_timeout: 120 - -# Output directory for benchmark results -output_dir: metrics/kr2_en - -# Optional name for this benchmark run -name: kr2_en_benchmark \ No newline at end of file diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 998241a..83472b3 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -15,6 +15,26 @@ The runtime loop is: 4. Apply the choice, log the step, and detect the terminal outcome. 5. Persist run metrics and run summaries. +## Harness Engineering Framing + +This project treats the **agent harness** as the primary experimental object. +An agent harness is the wrapper around a model that controls what the model +sees, what state is carried forward, what external tools are available, and how +a raw completion is converted into a quest action. In this codebase, harnesses +are not incidental plumbing: they are the independent variable. + +This follows the practical question raised by "How Much Heavy Lifting Can an +Agent Harness Do?" 
(arXiv:2604.07236): how much performance comes from the +surrounding scaffold rather than the base model alone? Space Rangers text +quests are useful because they are long enough to stress memory, planning, and +state tracking, but concrete enough to score with terminal success/failure +outcomes. + +Closest text-game benchmarks such as TextQuests and TALE-Suite usually vary +models under a mostly fixed evaluation scaffold. LLM Quest Benchmark can hold +the model fixed and vary the harness to ask which prompt, memory, tool, and +planning choices change behavior. + ## Main Runtime Layers ### 1. Quest Engine Layer @@ -107,3 +127,8 @@ and benchmark configuration parsing do not require API keys. | Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act | | Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act | | Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | none | plan-maintain-act | + +The harness names above are canonical snake_case identifiers used in YAML +configs, the CLI, result artifacts, and documentation. Public labels can be +friendlier, but experiment records should preserve the canonical names so runs +remain comparable. diff --git a/docs/EXPERIMENTS_LOG.md b/docs/EXPERIMENTS_LOG.md index 0d9ce49..6d0d7f9 100644 --- a/docs/EXPERIMENTS_LOG.md +++ b/docs/EXPERIMENTS_LOG.md @@ -21,6 +21,145 @@ Record of benchmark experiments, findings, and decisions. Keeps history out of source code. 
+## Current Coverage Audit (2026-05-11) + +Sources reviewed for this audit: + +- `docs/EXPERIMENTS_LOG.md` +- `docs/ARCHITECTURE.md` +- `configs/benchmarks/*.yaml` +- `site/leaderboard.json` + +This audit uses the post-refactor harness taxonomy: `minimal`, +`reasoning_recent`, `reasoning_full`, `memo_compact`, `hinted_compact`, +`tool_compact`, `tool_hinted`, and `planner`. + +### Experiment Inventory + +| Experiment | Config / source | Harness mapping | Quest scope | Completed runs recorded in log | Audit disposition | +|---|---|---|---|---:|---| +| Exp 2: Memory Modes | `memory_full_transcript.yaml`, `memory_compaction.yaml` | `reasoning_full`, `memo_compact` | 14 historical quests including `Prison` | 126 | Unreliable for canonical comparison: loop-breaker bug era. | +| Exp 3 Arm 1: No Loop Breaker | `exp3_no_loop_breaker.yaml` | `reasoning_full` | 18 quests, excluding `Boat`/`Prison` | 36 | Use only rerun after timeout fix; pre-fix attempt is noisy/incomplete. | +| Exp 3 Arm 2: Stateful Compact | `exp3_stateful_compact.yaml` | `memo_compact` | 18 quests, excluding `Boat`/`Prison` | 36 | Canonical memo baseline, but only 2 runs/quest. | +| Exp 4: Compaction No Memo | `exp4_compaction_no_memo.yaml` | retired ablation, not canonical | 18 quests | 36 | Do not aggregate into `memo_compact`. | +| Exp 4: Memo Extended | `exp4_memo_extended.yaml` | retired `memo_extended` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 4: Memo Structured | `exp4_memo_structured.yaml` | retired `memo_structured` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 4: Memo CoT | `exp4_memo_cot.yaml` | retired `memo_cot` variant | 18 quests | 36 | Non-canonical variant. | +| Exp 5: Baseline Variance | `exp5_stateful_compact_variance.yaml` | `memo_compact` | 18 quests | 90 | Canonical memo baseline variance study. | +| Exp 6: Prompt Hints | `exp6_prompt_hints.yaml` | `hinted_compact` | 18 quests | 54 | Canonical single-model harness comparison. 
| +| Exp 6: Tools | `exp6_tools.yaml` | `tool_compact` | 18 quests | 54 | Canonical single-model harness comparison. | +| Exp 6: Tools + Hints | `exp6_tools_hints.yaml` | `tool_hinted` | 18 quests | 54 | Canonical single-model harness comparison. | +| Exp 7: Multi-Model Comparison | `exp7_*.yaml` | `memo_compact` | 5 winnable quests | 75 | Canonical model sweep for memo harness. | +| Exp 7b: Model Upgrades | `exp7b_model_upgrades.yaml` | `memo_compact` | 18 quests | 108 | Noisy model-upgrade sweep; high timeout rates for Qwen 3.6 and Haiku 4.5. | + +### Harness Coverage Matrix + +The table below is computed from `site/leaderboard.json` and counts recorded +leaderboard runs by harness and quest. `Boat` and `Prison` are retained because +they still appear in the published leaderboard data, but they are retired from +the canonical experiment set. + +| Harness | Badday | Banket | Boat | Codebox | Depth | Driver | Edelweiss | Election | Foncers | Leonardo | Ministry | Pizza | Prison | Robots | Ski | Total | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| `minimal` | 22 | 22 | 23 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 331 | +| `reasoning_recent` | 22 | 22 | 28 | 22 | 22 | 24 | 25 | 30 | 25 | 25 | 26 | 22 | 28 | 31 | 31 | 383 | +| `reasoning_full` | 17 | 17 | 9 | 17 | 17 | 15 | 17 | 17 | 17 | 17 | 16 | 17 | 6 | 14 | 14 | 227 | +| `memo_compact` | 37 | 39 | 18 | 39 | 39 | 39 | 39 | 37 | 39 | 37 | 39 | 37 | 15 | 39 | 34 | 527 | +| `hinted_compact` | 4 | 4 | 1 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 54 | +| `tool_compact` | 3 | 3 | 0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 39 | +| `tool_hinted` | 3 | 3 | 0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 39 | +| `planner` | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 15 | + +Leaderboard scope note: the current public JSON includes 15 quest columns and +does not include several 18-quest experiment-log quests such as `Pilot`, 
+`Disk`, `Player`, `Shashki`, and `Sortirovka1`. A future leaderboard refresh +should either add them or explicitly document why the public slice excludes +them. + +### Gap Analysis + +All zero-run cells in the published leaderboard matrix are retired quest cells: + +- `tool_compact` x `Boat`: 0 runs. +- `tool_compact` x `Prison`: 0 runs. +- `tool_hinted` x `Boat`: 0 runs. +- `tool_hinted` x `Prison`: 0 runs. + +Because `Boat` and `Prison` are retired, these do not require new canonical +runs. They do indicate that the public leaderboard mixes active and retired +quest scopes. + +Cells with fewer than 3 runs: + +- `hinted_compact` x `Boat`: 1 run; retired quest. +- `hinted_compact` x `Prison`: 1 run; retired quest. +- `planner`: 1 run on every published quest. + +Canonical action item: the planner harness has insufficient variance coverage. +For active quests, it needs at least two additional runs per quest to reach the +minimum 3-run threshold. + +The following harnesses have been tested with only one model, so even cells +with at least 3 runs cannot separate harness effects from model effects: +`tool_compact`, `tool_hinted`, and `planner` (which also has only 1 run per +quest). Their comparison is promising, but not yet model-robust. + +### Noise And Anomalies + +Loop-breaker bug era: + +- Exp 2 memory-mode runs are unreliable. The experiment log documents a + number-normalization bug in `_normalize_for_signature` and aggressive loop + breaker overrides that changed correct model decisions. +- Exp 3 Arm 1 has a pre-fix/incomplete attempt affected by SDK timeout issues. + Only the rerun after the timeout fix should be considered. +- Any leaderboard entry whose provenance traces to Exp 2 or the Exp 3 pre-fix + attempt should be marked non-canonical until regenerated or excluded. + +High-timeout model-upgrade runs: + +- Exp 7b `Qwen 3.6 Flash`: 17/36 timeouts (47%). +- Exp 7b `Claude Haiku 4.5`: 19/36 timeouts (53%).
+- Exp 7b `DeepSeek V4 Flash`: 5/36 timeouts (14%), below the >30% threshold + but still noisy because success was 0/36. + +Retired quests: + +- `Boat`: trivial / smoke-test-like quest; removed from canonical experiment + configs. +- `Prison`: loops endlessly; removed from canonical experiment configs. + +Retired harness variants: + +- `memo_extended` +- `memo_structured` +- `memo_cot` +- `compaction_no_memo` ablation + +These variants should not be merged into canonical `memo_compact` results. + +### Budget Estimate + +Top-priority new runs to close actionable gaps while avoiding retired quests: + +| Priority | Harness | Quest(s) | New runs needed | Reason | +|---:|---|---|---:|---| +| 1 | `planner` | 13 active published quests (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`) | 26 | Bring 1-run planner cells up to the 3-run minimum on active leaderboard quests. | +| 2 | `planner` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest so planner effects are not single-model artifacts. | +| 3 | `tool_compact` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. | +| 4 | `tool_hinted` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. | +| 5 | Public leaderboard refresh | `Pilot`, `Disk`, `Player`, `Shashki`, `Sortirovka1` | Scope-dependent | These quests are present in canonical 18-quest configs/logs but absent from the current public leaderboard matrix. Backfill or explicitly exclude them. | + +Do not spend new budget on `Boat` or `Prison` unless the goal is only to +reproduce historical/public rows; both are retired from canonical analysis. 
+ +### Leaderboard Integrity + +Recommended integrity rule: canonical leaderboard aggregates should require +non-retired quests, canonical harness names, no loop-breaker bug provenance, at +least 3 runs per harness x quest cell, and at least two models for claims about +harness effects rather than model effects. + ## Exp 2: Memory Modes (2026-04-27) **Config**: `configs/benchmarks/memory_full_transcript.yaml`, `configs/benchmarks/memory_compaction.yaml` diff --git a/docs/EXPERIMENT_AUDIT.md b/docs/EXPERIMENT_AUDIT.md deleted file mode 100644 index 3298d0e..0000000 --- a/docs/EXPERIMENT_AUDIT.md +++ /dev/null @@ -1,193 +0,0 @@ -# Experiment Audit - -Generated: 2026-05-11 - -Sources reviewed: - -- `docs/EXPERIMENTS_LOG.md` -- `docs/ARCHITECTURE.md` -- `configs/benchmarks/*.yaml` -- `site/leaderboard.json` - -This audit uses the post-refactor harness taxonomy: `minimal`, -`reasoning_recent`, `reasoning_full`, `memo_compact`, `hinted_compact`, -`tool_compact`, `tool_hinted`, and `planner`. - -## Experiment Inventory - -| Experiment | Config / source | Harness mapping | Quest scope | Completed runs recorded in log | Audit disposition | -|---|---|---|---|---:|---| -| Exp 2: Memory Modes | `memory_full_transcript.yaml`, `memory_compaction.yaml` | `reasoning_full`, `memo_compact` | 14 historical quests including `Prison` | 126 | Unreliable for canonical comparison: loop-breaker bug era. | -| Exp 3 Arm 1: No Loop Breaker | `exp3_no_loop_breaker.yaml` | `reasoning_full` | 18 quests, excluding `Boat`/`Prison` | 36 | Use only rerun after timeout fix; pre-fix attempt is noisy/incomplete. | -| Exp 3 Arm 2: Stateful Compact | `exp3_stateful_compact.yaml` | `memo_compact` | 18 quests, excluding `Boat`/`Prison` | 36 | Canonical memo baseline, but only 2 runs/quest. | -| Exp 4: Compaction No Memo | `exp4_compaction_no_memo.yaml` | retired ablation, not canonical | 18 quests | 36 | Do not aggregate into `memo_compact`. 
| -| Exp 4: Memo Extended | `exp4_memo_extended.yaml` | retired `memo_extended` variant | 18 quests | 36 | Non-canonical variant. | -| Exp 4: Memo Structured | `exp4_memo_structured.yaml` | retired `memo_structured` variant | 18 quests | 36 | Non-canonical variant. | -| Exp 4: Memo CoT | `exp4_memo_cot.yaml` | retired `memo_cot` variant | 18 quests | 36 | Non-canonical variant. | -| Exp 5: Baseline Variance | `exp5_stateful_compact_variance.yaml` | `memo_compact` | 18 quests | 90 | Canonical memo baseline variance study. | -| Exp 6: Prompt Hints | `exp6_prompt_hints.yaml` | `hinted_compact` | 18 quests | 54 | Canonical single-model harness comparison. | -| Exp 6: Tools | `exp6_tools.yaml` | `tool_compact` | 18 quests | 54 | Canonical single-model harness comparison. | -| Exp 6: Tools + Hints | `exp6_tools_hints.yaml` | `tool_hinted` | 18 quests | 54 | Canonical single-model harness comparison. | -| Exp 7: Multi-Model Comparison | `exp7_*.yaml` | `memo_compact` | 5 winnable quests | 75 | Canonical model sweep for memo harness. | -| Exp 7b: Model Upgrades | `exp7b_model_upgrades.yaml` | `memo_compact` | 18 quests | 108 | Noisy model-upgrade sweep; high timeout rates for Qwen 3.6 and Haiku 4.5. | - -## 1. Harness Coverage Matrix - -The table below is computed from `site/leaderboard.json` and counts recorded -leaderboard runs by harness and quest. `Boat` and `Prison` are retained in this -matrix because they still appear in the published leaderboard data, but they -are retired from the canonical experiment set. 
- -| Harness | Badday | Banket | Boat | Codebox | Depth | Driver | Edelweiss | Election | Foncers | Leonardo | Ministry | Pizza | Prison | Robots | Ski | Total | -|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| -| `minimal` | 22 | 22 | 23 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 331 | -| `reasoning_recent` | 22 | 22 | 28 | 22 | 22 | 24 | 25 | 30 | 25 | 25 | 26 | 22 | 28 | 31 | 31 | 383 | -| `reasoning_full` | 17 | 17 | 9 | 17 | 17 | 15 | 17 | 17 | 17 | 17 | 16 | 17 | 6 | 14 | 14 | 227 | -| `memo_compact` | 37 | 39 | 18 | 39 | 39 | 39 | 39 | 37 | 39 | 37 | 39 | 37 | 15 | 39 | 34 | 527 | -| `hinted_compact` | 4 | 4 | 1 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 54 | -| `tool_compact` | 3 | 3 | **0** | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | **0** | 3 | 3 | 39 | -| `tool_hinted` | 3 | 3 | **0** | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | **0** | 3 | 3 | 39 | -| `planner` | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 15 | - -Leaderboard scope note: the current public JSON includes 15 quest columns and -does not include several 18-quest experiment-log quests such as `Pilot`, -`Disk`, `Player`, `Shashki`, and `Sortirovka1`. Those quests appear in the -benchmark configs and experiment log, so a future leaderboard refresh should -either add them or explicitly document why the public slice excludes them. - -## 2. Gap Analysis - -### Zero-run harness × quest cells - -All zero-run cells in the published leaderboard matrix are retired quest cells: - -- `tool_compact` × `Boat`: 0 runs. -- `tool_compact` × `Prison`: 0 runs. -- `tool_hinted` × `Boat`: 0 runs. -- `tool_hinted` × `Prison`: 0 runs. - -Because `Boat` and `Prison` are retired, these do not require new canonical -runs. They do indicate that the public leaderboard mixes active and retired -quest scopes. - -### Fewer than 3 runs - -- `hinted_compact` × `Boat`: 1 run; retired quest. -- `hinted_compact` × `Prison`: 1 run; retired quest. 
-- `planner`: 1 run on every published quest (`Badday`, `Banket`, `Boat`, - `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, - `Leonardo`, `Ministry`, `Pizza`, `Prison`, `Robots`, `Ski`). - -Canonical action item: the planner harness has insufficient variance coverage. -For active quests, it needs at least two additional runs per quest to reach the -minimum 3-run threshold. - -### Only 1 model tested - -The following harnesses have leaderboard cells where the run count may be at -least 3, but the model dimension is still only one model. These cells cannot -separate harness effects from model-specific behavior: - -- `tool_compact`: one model on all non-retired published quests - (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, - `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`). -- `tool_hinted`: one model on all non-retired published quests - (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, - `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`). -- `planner`: one model on every published quest and only one run per quest. -- `hinted_compact` on `Boat` and `Prison`: one model, but both quests are - retired. - -The stronger public comparison cells are `minimal`, `reasoning_recent`, -`reasoning_full`, and `memo_compact`, which have multi-model coverage in the -leaderboard data. However, `reasoning_full` and `memo_compact` still require -provenance filtering because early memory-mode runs overlap with the loop- -breaker bug era. - -## 3. Noise / Anomaly List - -### Loop-breaker bug era - -- Exp 2 memory-mode runs are unreliable. The experiment log documents a - number-normalization bug in `_normalize_for_signature` and aggressive loop - breaker overrides that changed correct model decisions. -- Exp 3 Arm 1 has a pre-fix/incomplete attempt affected by SDK timeout issues. - Only the rerun after the timeout fix should be considered. 
-- Any leaderboard entry whose provenance traces to Exp 2 or the Exp 3 pre-fix - attempt should be marked non-canonical until regenerated or excluded. - -### High timeout runs - -- Exp 7b `Qwen 3.6 Flash`: 17/36 timeouts (47%). -- Exp 7b `Claude Haiku 4.5`: 19/36 timeouts (53%). -- Exp 7b `DeepSeek V4 Flash`: 5/36 timeouts (14%), below the >30% threshold - but still noisy because success was 0/36. - -The Qwen 3.6 and Haiku 4.5 rows should be interpreted primarily as timeout / -verbosity failures, not clean harness-quality signals. - -### Retired quests - -- `Boat`: trivial / smoke-test-like quest; removed from canonical experiment - configs. -- `Prison`: loops endlessly; removed from canonical experiment configs. - -Both still appear in `site/leaderboard.json`, so public summaries should label -them as retired or remove them from canonical aggregates. - -### Retired harness variants - -The following Exp 4 arms are not part of the final taxonomy and should not be -merged into canonical `memo_compact` results: - -- `memo_extended` -- `memo_structured` -- `memo_cot` -- `compaction_no_memo` ablation - -Current YAML files have been migrated to the `harness:` key, so historical -variant identity must be preserved from `docs/EXPERIMENTS_LOG.md` and config -file names rather than inferred only from the post-refactor `harness` field. - -## 4. Budget Estimate - -Top-priority new runs to close actionable gaps while avoiding retired quests: - -| Priority | Harness | Quest(s) | New runs needed | Reason | -|---:|---|---|---:|---| -| 1 | `planner` | 13 active published quests (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`) | 26 | Bring 1-run planner cells up to the 3-run minimum on active leaderboard quests. | -| 2 | `planner` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest so planner effects are not single-model artifacts. 
| -| 3 | `tool_compact` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. | -| 4 | `tool_hinted` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. | -| 5 | Public leaderboard refresh | `Pilot`, `Disk`, `Player`, `Shashki`, `Sortirovka1` | Scope-dependent | These quests are present in canonical 18-quest configs/logs but absent from the current public leaderboard matrix. Backfill or explicitly exclude them. | - -Do not spend new budget on `Boat` or `Prison` unless the goal is only to -reproduce historical/public rows; both are retired from canonical analysis. - -## 5. Leaderboard Integrity - -Findings from `site/leaderboard.json`: - -1. The leaderboard uses the eight canonical public modes and does not expose - retired harness variants as separate modes. This is good, but it creates a - provenance risk if Exp 4 retired variants were ever aggregated under - `memo_compact`. -2. `Boat` and `Prison` remain in the published quest list despite being retired - from canonical experiment configs. They should be excluded from aggregate - claims or clearly labeled as retired. -3. `planner` has only one run per quest and one model. It should not be used for - reliability claims yet. -4. `tool_compact` and `tool_hinted` have three runs per active published quest, - but only one model. Their harness comparison is promising but not yet - model-robust. -5. Published `reasoning_full` / `memo_compact` rows need run-level provenance - checks before canonical use because early memory-mode experiments overlap - with the Exp 2 loop-breaker bug era. -6. Exp 7b model-upgrade entries for `Qwen 3.6 Flash` and `Claude Haiku 4.5` - should be annotated as high-timeout data if included in any leaderboard or - narrative comparison. 
- -Recommended integrity rule: canonical leaderboard aggregates should require -non-retired quests, canonical harness names, no loop-breaker bug provenance, at -least 3 runs per harness × quest cell, and at least two models for claims about -harness effects rather than model effects. diff --git a/docs/HARNESS_ENGINEERING.md b/docs/HARNESS_ENGINEERING.md deleted file mode 100644 index 5666ebc..0000000 --- a/docs/HARNESS_ENGINEERING.md +++ /dev/null @@ -1,64 +0,0 @@ -# Harness Engineering - -LLM Quest Benchmark treats the **agent harness** as the primary experimental -object. An agent harness is the wrapper around a model that controls what the -model sees, what state is carried forward, what external tools are available, -and how a raw model completion is converted into a quest action. In this -project, harnesses are not incidental plumbing: they are the independent -variable. - -This framing follows the harness engineering question raised by "How Much Heavy -Lifting Can an Agent Harness Do?" (arXiv:2604.07236): how much performance comes -from the surrounding scaffold rather than the base model alone? Space Rangers -text quests are a useful testbed because they are long enough to stress memory, -planning, and state tracking, but concrete enough to score with terminal -success/failure outcomes. - -## The Eight Canonical Harnesses - -| Harness name | What varies | -|---|---| -| `minimal` | Uses the smallest action-selection prompt with recent context only. This is the low-scaffold baseline. | -| `reasoning_recent` | Adds an explicit reasoning prompt while keeping recent-window memory. | -| `reasoning_full` | Keeps the reasoning prompt but exposes the full transcript instead of a short recent window. | -| `memo_compact` | Uses compacted memory plus a constrained 20-word memo to preserve salient state. | -| `hinted_compact` | Adds mechanics hints to the compact memo harness, without tools. 
| -| `tool_compact` | Adds calculator, scratchpad, and quest-history tools to compact memory. | -| `tool_hinted` | Combines compact memory, tools, and mechanics hints. | -| `planner` | Uses a plan-maintain-act loop with compact memory instead of a pure react loop. | - -The harness names are canonical snake_case identifiers used in YAML configs, -the CLI, and documentation. Public labels can be friendlier, but experimental -records should preserve these names so runs remain comparable. - -## Difference From TextQuests and TALE-Suite - -TextQuests (arXiv:2507.23701) and TALE-Suite are closest in spirit because they -also evaluate language models on interactive text-game tasks. Their main -comparison axis is model capability under a mostly fixed evaluation scaffold: -the harness is treated as test infrastructure, and the model is varied. - -LLM Quest Benchmark flips that emphasis. We can hold a model fixed and vary the -harness to ask which context, memory, tool, and planning choices change -behavior. That makes the benchmark useful for harness engineering: it can -separate "the model cannot do the task" from "this wrapper failed to show the -model the right state, preserve the right facts, or expose the right operation." - -## Findings So Far - -The strongest pattern so far is that bigger scaffolds are not automatically -better. A concise 20-word memo produced a sweet spot: it improved over no memo -and full transcript baselines, while longer or more structured memo variants -regressed. The likely mechanism is selective pressure: the short memo forces -the harness to preserve only state that matters for future decisions. - -Tools and hints show a synergy effect. Prompt hints alone hurt, and tools alone -were modest, but tools plus hints improved outcomes because the hints pointed -the model toward quantities and morally grey quest mechanics while the -calculator, scratchpad, and history search gave it ways to act on those -signals. 
- -Verbosity hurts in this environment. Some newer or larger models timed out more -often because they spent too much of the quest budget generating long step -responses. For sequential decision tasks, a harness that elicits concise, -actionable state updates can outperform one that invites broad reasoning. diff --git a/docs/SPEC.md b/docs/SPEC.md index cadbb99..44ef498 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -54,6 +54,24 @@ Use these labels for current public descriptions of benchmark harnesses: Older internal experiment labels are historical and should not be presented as the current public taxonomy. +## Current Interpretation + +The strongest pattern so far is that bigger scaffolds are not automatically +better. A concise 20-word memo produced a useful sweet spot: it improved over +no-memo and full-transcript baselines, while longer or more structured memo +variants regressed. The likely mechanism is selective pressure: the short memo +forces the harness to preserve only state that matters for future decisions. + +Tools and hints showed a synergy effect. Prompt hints alone hurt, and tools +alone were modest, but tools plus hints improved outcomes because the hints +pointed the model toward quantities and quest mechanics while the calculator, +scratchpad, and history search gave it ways to act on those signals. + +Verbosity is a recurring failure mode. Some newer or larger models timed out +more often because they spent too much of the quest budget generating long step +responses. For sequential decision tasks, a harness that elicits concise, +actionable state updates can outperform one that invites broad reasoning. 
+ ## Implemented Runtime - Quest execution uses the TypeScript `space-rangers-quest` submodule through From 2276639d94832abb3176727b7f20d7d30bcb8251 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Tue, 12 May 2026 00:43:01 +0400 Subject: [PATCH 16/24] fix: address harness review feedback --- configs/benchmarks/exp7b_model_upgrades.yaml | 2 +- llm_quest_benchmark/agents/agent_factory.py | 5 +++++ llm_quest_benchmark/executors/benchmark.py | 21 ++++++++++++------- llm_quest_benchmark/executors/cli/commands.py | 2 -- llm_quest_benchmark/harnesses/memo.py | 6 +++++- llm_quest_benchmark/harnesses/memory.py | 10 ++++++++- llm_quest_benchmark/harnesses/minimal.py | 4 ++-- llm_quest_benchmark/harnesses/planner.py | 4 ++-- llm_quest_benchmark/harnesses/tools.py | 8 ++++--- llm_quest_benchmark/schemas/config.py | 5 +++++ 10 files changed, 47 insertions(+), 20 deletions(-) diff --git a/configs/benchmarks/exp7b_model_upgrades.yaml b/configs/benchmarks/exp7b_model_upgrades.yaml index 22da91b..80ab53c 100644 --- a/configs/benchmarks/exp7b_model_upgrades.yaml +++ b/configs/benchmarks/exp7b_model_upgrades.yaml @@ -29,7 +29,7 @@ agents: temperature: 0.4 runs: 2 compaction_interval: 50 - - model: "claude:claude-haiku-4-5-20251001" + - model: "anthropic:claude-haiku-4-5-20251001" harness: memo_compact temperature: 0.4 runs: 2 diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py index 6d2ff42..09e2607 100644 --- a/llm_quest_benchmark/agents/agent_factory.py +++ b/llm_quest_benchmark/agents/agent_factory.py @@ -48,6 +48,11 @@ def create_agent( """ logger.debug(f"Creating agent for model: {model}") resolved_action_template = normalize_template_name(action_template) + harness_routed_templates = {"planner.jinja", "tool_augmented.jinja", "tool_augmented_hints.jinja"} + if resolved_action_template in harness_routed_templates and memory_mode != "default": + raise ValueError( + 
"memory_mode is not supported for planner/tool harness templates; configure memory via harness selection." + ) # Human player if model == "human": diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py index 82800b3..14dacaf 100644 --- a/llm_quest_benchmark/executors/benchmark.py +++ b/llm_quest_benchmark/executors/benchmark.py @@ -49,6 +49,11 @@ def _agent_model(agent_config) -> str: return agent_config.model +def _agent_id(agent_config) -> str: + """Return the stable result identifier for legacy and harness configs.""" + return getattr(agent_config, "harness_id", None) or agent_config.agent_id + + def _agent_template(agent_config) -> str: """Return legacy template name for result artifacts.""" if hasattr(agent_config, "action_template"): @@ -106,7 +111,7 @@ def _result_entry( "harness": _agent_harness(agent_config), "template": _agent_template(agent_config), "memory_mode": _agent_memory_mode(agent_config), - "agent_id": agent_config.harness_id if hasattr(agent_config, "harness_id") else agent_config.agent_id, + "agent_id": _agent_id(agent_config), "attempt": attempt, "outcome": outcome, "reward": reward, @@ -137,7 +142,7 @@ def _mark_run_timeout(run_id: int | None, quest: str, agent_config, benchmark_id WHERE id = ? 
""", ( - agent_config.agent_id, + _agent_id(agent_config), agent_config_json, benchmark_id, QuestOutcome.TIMEOUT.name, @@ -160,7 +165,7 @@ def _mark_run_timeout(run_id: int | None, quest: str, agent_config, benchmark_id Path(quest).stem, end_time, end_time, - agent_config.agent_id, + _agent_id(agent_config), agent_config_json, QuestOutcome.TIMEOUT.name, 0.0, @@ -404,7 +409,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ logger.info( "Queued agent %s quest %s (attempt %s/%s)", - agent_config.agent_id, + _agent_id(agent_config), quest_name, attempt, agent_config.runs, @@ -436,7 +441,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ } logger.info( "Agent %s running quest %s (attempt %s/%s)", - agent_config.agent_id, + _agent_id(agent_config), task["quest_name"], task["attempt"], agent_config.runs, @@ -449,7 +454,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ "total_runs": total_runs, "quest": task["quest"], "quest_name": task["quest_name"], - "agent_id": agent_config.agent_id, + "agent_id": _agent_id(agent_config), "model": agent_config.model, "attempt": task["attempt"], }, @@ -484,7 +489,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ "total_runs": total_runs, "quest": task["quest"], "quest_name": task["quest_name"], - "agent_id": agent_config.agent_id, + "agent_id": _agent_id(agent_config), "model": agent_config.model, "attempt": task["attempt"], "outcome": result["outcome"], @@ -529,7 +534,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[ "total_runs": total_runs, "quest": task["quest"], "quest_name": task["quest_name"], - "agent_id": agent_config.agent_id, + "agent_id": _agent_id(agent_config), "model": agent_config.model, "attempt": task["attempt"], "outcome": QuestOutcome.TIMEOUT.name, diff --git a/llm_quest_benchmark/executors/cli/commands.py 
b/llm_quest_benchmark/executors/cli/commands.py index 4b029bd..d554f70 100644 --- a/llm_quest_benchmark/executors/cli/commands.py +++ b/llm_quest_benchmark/executors/cli/commands.py @@ -8,7 +8,6 @@ from pathlib import Path from typing import Any -import click from dotenv import load_dotenv # Initialize quest registry early @@ -353,7 +352,6 @@ def run( "reasoning_recent", "--harness", help="Harness to use for quest decisions.", - click_type=click.Choice(HARNESS_CHOICES), ), compaction_interval: int = typer.Option(50, help="Advanced override for compaction interval."), timeout: int = typer.Option(60, help="Timeout in seconds for run (0 for no timeout)."), diff --git a/llm_quest_benchmark/harnesses/memo.py b/llm_quest_benchmark/harnesses/memo.py index eaab06b..63bfb60 100644 --- a/llm_quest_benchmark/harnesses/memo.py +++ b/llm_quest_benchmark/harnesses/memo.py @@ -27,7 +27,11 @@ def __init__( temperature=temperature, skip_single=skip_single, debug=debug, - memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval), + memory_module=( + memory_module + if memory_module is not None + else CompactionMemory(compaction_interval=compaction_interval) + ), **kwargs, ) self._memory_mode = "compaction" diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py index ab4f72c..22581fa 100644 --- a/llm_quest_benchmark/harnesses/memory.py +++ b/llm_quest_benchmark/harnesses/memory.py @@ -251,9 +251,11 @@ def _maybe_compact(self) -> None: return if self.llm_client is None: # No LLM client available for compaction; skip silently + self._steps_since_compaction = 0 return transcript_text = self._format_transcript_for_compaction() if not transcript_text: + self._steps_since_compaction = 0 return prompt_parts = ["You are summarizing an agent's progress through a text quest."] @@ -272,11 +274,17 @@ def _maybe_compact(self) -> None: "Write a concise summary in plain text, max 300 words." 
) - summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip() + try: + summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip() + except Exception: + self._steps_since_compaction = 0 + return if summary: self._compaction_summary = summary self._transcript = [] self._steps_since_compaction = 0 + else: + self._steps_since_compaction = 0 def _format_transcript_for_compaction(self) -> str: recent = ( diff --git a/llm_quest_benchmark/harnesses/minimal.py b/llm_quest_benchmark/harnesses/minimal.py index 8fa8ba0..8fdd944 100644 --- a/llm_quest_benchmark/harnesses/minimal.py +++ b/llm_quest_benchmark/harnesses/minimal.py @@ -39,11 +39,11 @@ def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> i state_signature = self._state_signature(observation, choices) prompt = self._format_prompt(self._build_contextual_state(observation), choices) parsed_response = self._parse_with_retries(prompt, observation, choices) + if parsed_response.action < 1 or parsed_response.action > len(choices): + parsed_response.action = 1 self.history.append(parsed_response) self._last_response = parsed_response self._remember_decision(observation, choices, state_signature, parsed_response) - if parsed_response.action < 1 or parsed_response.action > len(choices): - parsed_response.action = 1 return parsed_response.action except Exception as exc: self.logger.error("Harness error during LLM call: %s", exc) diff --git a/llm_quest_benchmark/harnesses/planner.py b/llm_quest_benchmark/harnesses/planner.py index efb77a9..810440c 100644 --- a/llm_quest_benchmark/harnesses/planner.py +++ b/llm_quest_benchmark/harnesses/planner.py @@ -164,11 +164,11 @@ def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: parsed_response.total_tokens = total_usage["total_tokens"] parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"] + if parsed_response.action < 1 or parsed_response.action > len(choices): + 
parsed_response.action = 1 self.history.append(parsed_response) self._last_response = parsed_response self._remember_decision(state, choices, state_signature, parsed_response) - if parsed_response.action < 1 or parsed_response.action > len(choices): - parsed_response.action = 1 return parsed_response.action except Exception as exc: self.logger.error("Planner harness error during LLM call: %s", exc) diff --git a/llm_quest_benchmark/harnesses/tools.py b/llm_quest_benchmark/harnesses/tools.py index 5386d6d..63edcd8 100644 --- a/llm_quest_benchmark/harnesses/tools.py +++ b/llm_quest_benchmark/harnesses/tools.py @@ -160,12 +160,14 @@ def search(self, query: str) -> str: scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True) best = [entry for score, entry in scored if score > 0][: self.history_window] if not best: - best = [entry for _, entry in scored[-self.history_window :]] + best = [entry for _, entry in scored[: self.history_window]] lines = [] for entry in best: + choices = entry.get("choices", []) + choices_text = choices if isinstance(choices, str) else "; ".join(choices) lines.append( - f"Step {entry['step']}: obs={entry['observation']} | " - f"choices={'; '.join(entry['choices'])} | picked={entry.get('selected_choice', 'n/a')}" + f"Step {entry.get('step', '?')}: obs={entry.get('observation', '')} | " + f"choices={choices_text} | picked={entry.get('selected_choice', 'n/a')}" ) return "\n".join(lines) diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py index f32184e..05053cd 100644 --- a/llm_quest_benchmark/schemas/config.py +++ b/llm_quest_benchmark/schemas/config.py @@ -111,6 +111,11 @@ def __post_init__(self): ): valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] raise ValueError(f"Invalid harness: {self.harness}. 
Supported harnesses: {valid}") + if self.model not in ("human",) and not is_random_choice_harness(self.model): + from llm_quest_benchmark.llm.client import is_supported_model_name + + if not is_supported_model_name(self.model): + raise ValueError(f"Invalid model: {self.model}. Supported models: {MODEL_CHOICES}") if not (0.0 <= self.temperature <= 2.0): raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}") if self.runs < 1: From 42a89b1d8ecadcbfddb6e3e9623914d54f89f203 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Tue, 12 May 2026 12:03:53 +0400 Subject: [PATCH 17/24] fix: preserve legacy agent compatibility --- llm_quest_benchmark/agents/agent_factory.py | 2 +- llm_quest_benchmark/agents/strategic_agent.py | 80 ++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py index 09e2607..43ad273 100644 --- a/llm_quest_benchmark/agents/agent_factory.py +++ b/llm_quest_benchmark/agents/agent_factory.py @@ -49,7 +49,7 @@ def create_agent( logger.debug(f"Creating agent for model: {model}") resolved_action_template = normalize_template_name(action_template) harness_routed_templates = {"planner.jinja", "tool_augmented.jinja", "tool_augmented_hints.jinja"} - if resolved_action_template in harness_routed_templates and memory_mode != "default": + if resolved_action_template in harness_routed_templates and memory_mode not in ("default", "compaction"): raise ValueError( "memory_mode is not supported for planner/tool harness templates; configure memory via harness selection." 
) diff --git a/llm_quest_benchmark/agents/strategic_agent.py b/llm_quest_benchmark/agents/strategic_agent.py index a4cc4e7..edd656f 100644 --- a/llm_quest_benchmark/agents/strategic_agent.py +++ b/llm_quest_benchmark/agents/strategic_agent.py @@ -1,3 +1,79 @@ -"""Deprecated strategic agent module.""" +"""Deprecated compatibility wrapper for strategic agents.""" -raise ImportError("strategic_agent is deprecated; use llm_quest_benchmark.harnesses instead") +import logging +import warnings +from typing import Any + +from llm_quest_benchmark.agents.base import QuestPlayer +from llm_quest_benchmark.llm.prompt import PromptRenderer + +warnings.warn("strategic_agent is deprecated, use llm_quest_benchmark.harnesses", DeprecationWarning, stacklevel=2) + + +class StrategicAgent(QuestPlayer): + """Backward-compatible strategic analysis decorator.""" + + def __init__(self, base_agent: QuestPlayer, debug: bool = False, template: str = "advanced.jinja"): + super().__init__(skip_single=base_agent.skip_single) + self.agent = base_agent + self.debug = debug + self.history = [] + + self.logger = logging.getLogger(self.__class__.__name__) + if self.debug: + self.logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter("%(name)s - %(message)s")) + self.logger.addHandler(handler) + + self.prompt_renderer = PromptRenderer(None, template=template) + + def _get_action_impl(self, observation: str, choices: list) -> int: + if hasattr(self.agent, "llm"): + if self.debug: + self.logger.debug("\nObservation:\n%s", observation) + + analysis = self.agent.llm( + "Analyze this situation and explain your thinking step-by-step instead of choosing an action:\n" + + observation + ) + + if self.debug: + self.logger.debug("\nAnalysis:\n%s", analysis) + + self.history.append({"observation": observation, "analysis": analysis}) + enhanced_context = self.get_enhanced_context(observation, choices) + if self.debug: + self.logger.debug("\nEnhanced 
Context:\n%s", enhanced_context) + + return self.agent.get_action(enhanced_context, choices) + + return self.agent.get_action(observation, choices) + + def get_enhanced_context(self, observation: str, choices: list) -> str: + context = [f"Turn {len(self.history) + 1}: {entry['analysis']}" for entry in self.history[-3:]] + return self.prompt_renderer.render_action_prompt( + observation=observation, + choices=choices, + state_tracker=context, + ) + + def reset(self) -> None: + self.history = [] + self.agent.reset() + + def on_game_start(self) -> None: + if self.debug: + self.logger.debug("Starting new game with strategic analysis") + self.agent.on_game_start() + + def on_game_end(self, final_state: dict[str, Any]) -> None: + self.agent.on_game_end(final_state) + if self.debug: + self.logger.debug("Final Analysis History:") + for entry in self.history: + self.logger.debug("\nObservation: %s", entry["observation"]) + self.logger.debug("Analysis: %s", entry["analysis"]) + + +__all__ = ["StrategicAgent"] From d9a0f224205f98f61f6fe09ec872cc98426bb431 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Tue, 12 May 2026 12:56:55 +0400 Subject: [PATCH 18/24] fix: validate special harness models --- configs/default.yaml | 2 +- configs/kr2_en_test.yaml | 2 +- configs/test/parallel_agents_test.yaml | 1 + configs/test/test_benchmark.yaml | 1 + llm_quest_benchmark/harnesses/factory.py | 4 +++- llm_quest_benchmark/schemas/config.py | 7 ++++++- .../tests/executors/cli/test_commands.py | 7 +++++-- .../tests/harnesses/test_factory.py | 20 +++++++++++++++++++ .../tests/integration/test_quest_e2e.py | 4 ++-- 9 files changed, 40 insertions(+), 8 deletions(-) diff --git a/configs/default.yaml b/configs/default.yaml index ff185a5..3159029 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -5,7 +5,7 @@ quests: agents: - model: random_choice - harness: reasoning_recent + harness: random_choice temperature: 0.0 skip_single: true 
diff --git a/configs/kr2_en_test.yaml b/configs/kr2_en_test.yaml index 0addb04..94cfaa3 100644 --- a/configs/kr2_en_test.yaml +++ b/configs/kr2_en_test.yaml @@ -5,7 +5,7 @@ quests: agents: - model: random_choice # Use random agent for speed and reliability temperature: 0.5 - harness: reasoning_recent + harness: random_choice quest_timeout: 10 # short timeout for testing debug: true output_dir: results/benchmarks diff --git a/configs/test/parallel_agents_test.yaml b/configs/test/parallel_agents_test.yaml index 0aec1be..873d3ed 100644 --- a/configs/test/parallel_agents_test.yaml +++ b/configs/test/parallel_agents_test.yaml @@ -5,6 +5,7 @@ quests: - quests/kr_1_ru/Diamond.qm agents: - model: random_choice + harness: random_choice - model: gpt-5-mini harness: reasoning_recent debug: true diff --git a/configs/test/test_benchmark.yaml b/configs/test/test_benchmark.yaml index c20c648..b3321d9 100644 --- a/configs/test/test_benchmark.yaml +++ b/configs/test/test_benchmark.yaml @@ -3,6 +3,7 @@ quests: - quests/Boat.qm agents: - model: random_choice + harness: random_choice - model: gemini-2.5-flash harness: reasoning_recent debug: true diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py index 8ea4b84..5af7c50 100644 --- a/llm_quest_benchmark/harnesses/factory.py +++ b/llm_quest_benchmark/harnesses/factory.py @@ -70,9 +70,11 @@ def create_harness( raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}") is_random_model, seed = _parse_random_choice_seed(model) if is_random_model: - return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) + raise ValueError("Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness") if model.startswith("random_choice"): raise ValueError(f"Unknown random_choice model '{model}'. 
Valid: {valid}") + if model == "human": + raise ValueError("Use harness='human' for human runs instead of pairing human model with an LLM harness") cls = HARNESS_REGISTRY[harness] return cls( model_name=model, diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py index 05053cd..8008e1b 100644 --- a/llm_quest_benchmark/schemas/config.py +++ b/llm_quest_benchmark/schemas/config.py @@ -18,7 +18,7 @@ DEFAULT_BENCHMARK_CONFIG = { "quests": ["quests/Boat.qm"], "agents": [ - {"model": "random_choice", "skip_single": True, "temperature": 0.0, "harness": "minimal"}, + {"model": "random_choice", "skip_single": True, "temperature": 0.0, "harness": "random_choice"}, {"model": "gpt-5-mini", "skip_single": True, "temperature": 0.4, "harness": "reasoning_recent"}, ], "debug": False, @@ -43,6 +43,7 @@ def get_default_benchmark_yaml() -> str: - quests/Boat.qm agents: - model: random_choice + harness: random_choice - model: gpt-5-mini harness: reasoning_recent debug: true @@ -111,6 +112,10 @@ def __post_init__(self): ): valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] raise ValueError(f"Invalid harness: {self.harness}. 
Supported harnesses: {valid}") + if self.model == "human" and self.harness != "human": + raise ValueError("Use harness: human with model: human") + if is_random_choice_harness(self.model) and not is_random_choice_harness(self.harness): + raise ValueError("Use harness: random_choice with model: random_choice") if self.model not in ("human",) and not is_random_choice_harness(self.model): from llm_quest_benchmark.llm.client import is_supported_model_name diff --git a/llm_quest_benchmark/tests/executors/cli/test_commands.py b/llm_quest_benchmark/tests/executors/cli/test_commands.py index db0daf1..d88fdbf 100644 --- a/llm_quest_benchmark/tests/executors/cli/test_commands.py +++ b/llm_quest_benchmark/tests/executors/cli/test_commands.py @@ -20,7 +20,10 @@ def test_version(): def test_run_quest(): """Test running a quest with random agent""" - result = runner.invoke(app, ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--debug"]) + result = runner.invoke( + app, + ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--harness", "random_choice", "--debug"], + ) assert result.exit_code in [0, 1, 2] @@ -31,7 +34,7 @@ def test_run_quest_invalid_args(): assert result.exit_code == 2 # Test missing quest file - result = runner.invoke(app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice"]) + result = runner.invoke(app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice", "--harness", "random_choice"]) assert result.exit_code == 2 diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py index 7f6f11c..800a502 100644 --- a/llm_quest_benchmark/tests/harnesses/test_factory.py +++ b/llm_quest_benchmark/tests/harnesses/test_factory.py @@ -55,6 +55,16 @@ def test_random_choice_model_does_not_hide_bad_harness(): create_harness("bad_name", model="random_choice_123") +def test_random_choice_model_requires_random_harness(): + with pytest.raises(ValueError, 
match="harness='random_choice'"): + create_harness("minimal", model="random_choice") + + +def test_human_model_requires_human_harness(): + with pytest.raises(ValueError, match="harness='human'"): + create_harness("minimal", model="human") + + def test_harness_config_stable_harness_id(): config = HarnessConfig(harness="memo_compact", model="gpt-5-mini") @@ -75,6 +85,16 @@ def test_harness_config_allows_seeded_random_choice_harness(): assert config.harness == "random_choice_123" +def test_harness_config_rejects_random_model_with_llm_harness(): + with pytest.raises(ValueError, match="harness: random_choice"): + HarnessConfig(harness="minimal", model="random_choice") + + +def test_harness_config_rejects_human_model_with_llm_harness(): + with pytest.raises(ValueError, match="harness: human"): + HarnessConfig(harness="minimal", model="human") + + def test_harness_config_allows_retired_exp4_aliases(): for harness_name in ("compaction_no_memo", "memo_cot", "memo_extended", "memo_structured"): config = HarnessConfig(harness=harness_name, model="gpt-5-mini") diff --git a/llm_quest_benchmark/tests/integration/test_quest_e2e.py b/llm_quest_benchmark/tests/integration/test_quest_e2e.py index a0e376d..86568bb 100644 --- a/llm_quest_benchmark/tests/integration/test_quest_e2e.py +++ b/llm_quest_benchmark/tests/integration/test_quest_e2e.py @@ -19,9 +19,9 @@ def test_quest_run_with_llm(caplog): """Test that quest runs with LLM agent and reaches a final state""" caplog.set_level(logging.DEBUG) # Show all logs in test output - # Create LLM harness + # Create random harness agent = create_harness( - harness="minimal", + harness="random_choice", model="random_choice", # Use random for testing system_template=SYSTEM_ROLE_TEMPLATE, temperature=0.0, From 0a3f0b0081ecdd6f687751851c1f5265dc7effd7 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Tue, 12 May 2026 13:00:20 +0400 Subject: [PATCH 19/24] style: format special harness 
validation --- llm_quest_benchmark/harnesses/factory.py | 4 +++- llm_quest_benchmark/tests/executors/cli/test_commands.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py index 5af7c50..4c1591f 100644 --- a/llm_quest_benchmark/harnesses/factory.py +++ b/llm_quest_benchmark/harnesses/factory.py @@ -70,7 +70,9 @@ def create_harness( raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}") is_random_model, seed = _parse_random_choice_seed(model) if is_random_model: - raise ValueError("Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness") + raise ValueError( + "Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness" + ) if model.startswith("random_choice"): raise ValueError(f"Unknown random_choice model '{model}'. Valid: {valid}") if model == "human": diff --git a/llm_quest_benchmark/tests/executors/cli/test_commands.py b/llm_quest_benchmark/tests/executors/cli/test_commands.py index d88fdbf..a3825cd 100644 --- a/llm_quest_benchmark/tests/executors/cli/test_commands.py +++ b/llm_quest_benchmark/tests/executors/cli/test_commands.py @@ -34,7 +34,9 @@ def test_run_quest_invalid_args(): assert result.exit_code == 2 # Test missing quest file - result = runner.invoke(app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice", "--harness", "random_choice"]) + result = runner.invoke( + app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice", "--harness", "random_choice"] + ) assert result.exit_code == 2 From ff73599521e9c96352882e05cb5f7a12ea050117 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Tue, 12 May 2026 13:11:24 +0400 Subject: [PATCH 20/24] fix: preserve legacy memory routing --- llm_quest_benchmark/agents/agent_factory.py | 38 ++++++++++++++------- 1 file changed, 25 
insertions(+), 13 deletions(-) diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py index 43ad273..3d68e92 100644 --- a/llm_quest_benchmark/agents/agent_factory.py +++ b/llm_quest_benchmark/agents/agent_factory.py @@ -17,6 +17,18 @@ logger = logging.getLogger(__name__) +def _legacy_memory_module(memory_mode: str, compaction_interval: int): + from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory + + if memory_mode == "default": + return DefaultMemory() + if memory_mode == "full_transcript": + return FullTranscriptMemory() + if memory_mode == "compaction": + return CompactionMemory(compaction_interval=compaction_interval) + raise ValueError(f"Invalid memory_mode: {memory_mode}") + + def create_agent( model: str = DEFAULT_MODEL, system_template: str = SYSTEM_ROLE_TEMPLATE, @@ -48,11 +60,6 @@ def create_agent( """ logger.debug(f"Creating agent for model: {model}") resolved_action_template = normalize_template_name(action_template) - harness_routed_templates = {"planner.jinja", "tool_augmented.jinja", "tool_augmented_hints.jinja"} - if resolved_action_template in harness_routed_templates and memory_mode not in ("default", "compaction"): - raise ValueError( - "memory_mode is not supported for planner/tool harness templates; configure memory via harness selection." 
- ) # Human player if model == "human": @@ -69,30 +76,35 @@ def create_agent( return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) if resolved_action_template == "planner.jinja": - from llm_quest_benchmark.harnesses.factory import create_harness + from llm_quest_benchmark.harnesses.planner import PlannerHarness - return create_harness( - harness="planner", - model=model, + agent = PlannerHarness( + model_name=model, temperature=temperature, skip_single=skip_single, debug=debug, compaction_interval=compaction_interval, system_template=system_template, + memory_module=_legacy_memory_module(memory_mode, compaction_interval), ) + agent._memory_mode = memory_mode + return agent if resolved_action_template in ("tool_augmented.jinja", "tool_augmented_hints.jinja"): - from llm_quest_benchmark.harnesses.factory import create_harness + from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness - return create_harness( - harness="tool_hinted" if resolved_action_template == "tool_augmented_hints.jinja" else "tool_compact", - model=model, + cls = ToolHintedHarness if resolved_action_template == "tool_augmented_hints.jinja" else ToolCompactHarness + agent = cls( + model_name=model, temperature=temperature, skip_single=skip_single, debug=debug, compaction_interval=compaction_interval, system_template=system_template, + memory_module=_legacy_memory_module(memory_mode, compaction_interval), ) + agent._memory_mode = memory_mode + return agent # Default to LLM agent return LLMAgent( From 6ae2265874e0a835cdcafdff2c320daf434880dd Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Tue, 12 May 2026 18:17:38 +0400 Subject: [PATCH 21/24] remove legacy agent compatibility --- docs/EXPERIMENTS_LOG.md | 2 +- llm_quest_benchmark/agents/__init__.py | 22 +--- llm_quest_benchmark/agents/agent_factory.py | 119 ------------------ llm_quest_benchmark/agents/llm_agent.py | 103 --------------- 
llm_quest_benchmark/agents/planner_agent.py | 9 -- llm_quest_benchmark/agents/strategic_agent.py | 79 ------------ llm_quest_benchmark/agents/tool_agent.py | 9 -- llm_quest_benchmark/core/runner.py | 1 - llm_quest_benchmark/harnesses/tool_harness.py | 2 +- llm_quest_benchmark/schemas/config.py | 45 ------- 10 files changed, 7 insertions(+), 384 deletions(-) delete mode 100644 llm_quest_benchmark/agents/agent_factory.py delete mode 100644 llm_quest_benchmark/agents/llm_agent.py delete mode 100644 llm_quest_benchmark/agents/planner_agent.py delete mode 100644 llm_quest_benchmark/agents/strategic_agent.py delete mode 100644 llm_quest_benchmark/agents/tool_agent.py diff --git a/docs/EXPERIMENTS_LOG.md b/docs/EXPERIMENTS_LOG.md index 6d0d7f9..a9ca972 100644 --- a/docs/EXPERIMENTS_LOG.md +++ b/docs/EXPERIMENTS_LOG.md @@ -190,7 +190,7 @@ The `_apply_loop_breaker` mechanism was overriding correct LLM decisions. Eviden ### Decision -- **Disabled loop breaker** entirely in all agent types (llm_agent, planner_agent, tool_agent) +- **Disabled loop breaker** entirely in all harness types - **Removed number normalization** from state signature computation - Kept `_state_action_counts` and `_state_signature` (used by safety filter and loop escape) - Removed `_apply_loop_breaker` method and `_loop_repetition_threshold` field as dead code diff --git a/llm_quest_benchmark/agents/__init__.py b/llm_quest_benchmark/agents/__init__.py index d056964..fdd1aa6 100644 --- a/llm_quest_benchmark/agents/__init__.py +++ b/llm_quest_benchmark/agents/__init__.py @@ -1,29 +1,17 @@ -__all__ = ["create_agent", "QuestPlayer", "RandomAgent", "LLMAgent", "PlannerAgent", "ToolAgent"] +__all__ = ["QuestPlayer", "HumanPlayer", "RandomAgent"] def __getattr__(name): - if name == "create_agent": - from .agent_factory import create_agent - - return create_agent if name == "QuestPlayer": from .base import QuestPlayer return QuestPlayer + if name == "HumanPlayer": + from .human_player import HumanPlayer + + 
return HumanPlayer if name == "RandomAgent": from .random_agent import RandomAgent return RandomAgent - if name == "LLMAgent": - from .llm_agent import LLMAgent - - return LLMAgent - if name == "PlannerAgent": - from .planner_agent import PlannerAgent - - return PlannerAgent - if name == "ToolAgent": - from .tool_agent import ToolAgent - - return ToolAgent raise AttributeError(name) diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py deleted file mode 100644 index 3d68e92..0000000 --- a/llm_quest_benchmark/agents/agent_factory.py +++ /dev/null @@ -1,119 +0,0 @@ -"""Factory for creating quest agents""" - -import logging - -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.agents.human_player import HumanPlayer -from llm_quest_benchmark.agents.llm_agent import LLMAgent -from llm_quest_benchmark.agents.random_agent import RandomAgent -from llm_quest_benchmark.constants import ( - DEFAULT_MODEL, - DEFAULT_TEMPERATURE, - DEFAULT_TEMPLATE, - SYSTEM_ROLE_TEMPLATE, - normalize_template_name, -) - -logger = logging.getLogger(__name__) - - -def _legacy_memory_module(memory_mode: str, compaction_interval: int): - from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory - - if memory_mode == "default": - return DefaultMemory() - if memory_mode == "full_transcript": - return FullTranscriptMemory() - if memory_mode == "compaction": - return CompactionMemory(compaction_interval=compaction_interval) - raise ValueError(f"Invalid memory_mode: {memory_mode}") - - -def create_agent( - model: str = DEFAULT_MODEL, - system_template: str = SYSTEM_ROLE_TEMPLATE, - action_template: str = DEFAULT_TEMPLATE, - temperature: float = DEFAULT_TEMPERATURE, - skip_single: bool = False, - debug: bool = False, - memory_mode: str = "default", - compaction_interval: int = 10, -) -> QuestPlayer: - """Create a quest agent based on model name and parameters. 
- - Args: - model (str): Model identifier. Can be: - - LLM model name (e.g. 'gpt-5-mini', 'claude-sonnet-4-5') - - 'random_choice' for random testing agent (can include seed e.g. 'random_choice_123') - - 'human' for interactive human player - debug (bool): Enable debug logging - system_template (str): System template for LLM agents - action_template (str): Action template for LLM agents - temperature (float): Temperature for LLM sampling - skip_single (bool): Auto-select single choices - - Returns: - QuestPlayer: Appropriate agent instance - - Raises: - ValueError: If model type is not recognized - """ - logger.debug(f"Creating agent for model: {model}") - resolved_action_template = normalize_template_name(action_template) - - # Human player - if model == "human": - return HumanPlayer(skip_single=skip_single) - - # Random choice agent - if model.startswith("random_choice"): - seed = None - if "_" in model: - try: - seed = int(model.split("_")[-1]) - except ValueError: - pass - return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) - - if resolved_action_template == "planner.jinja": - from llm_quest_benchmark.harnesses.planner import PlannerHarness - - agent = PlannerHarness( - model_name=model, - temperature=temperature, - skip_single=skip_single, - debug=debug, - compaction_interval=compaction_interval, - system_template=system_template, - memory_module=_legacy_memory_module(memory_mode, compaction_interval), - ) - agent._memory_mode = memory_mode - return agent - - if resolved_action_template in ("tool_augmented.jinja", "tool_augmented_hints.jinja"): - from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness - - cls = ToolHintedHarness if resolved_action_template == "tool_augmented_hints.jinja" else ToolCompactHarness - agent = cls( - model_name=model, - temperature=temperature, - skip_single=skip_single, - debug=debug, - compaction_interval=compaction_interval, - system_template=system_template, - 
memory_module=_legacy_memory_module(memory_mode, compaction_interval), - ) - agent._memory_mode = memory_mode - return agent - - # Default to LLM agent - return LLMAgent( - debug=debug, - model_name=model, - system_template=system_template, - action_template=resolved_action_template, - temperature=temperature, - skip_single=skip_single, - memory_mode=memory_mode, - compaction_interval=compaction_interval, - ) diff --git a/llm_quest_benchmark/agents/llm_agent.py b/llm_quest_benchmark/agents/llm_agent.py deleted file mode 100644 index 7b6d352..0000000 --- a/llm_quest_benchmark/agents/llm_agent.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Deprecated compatibility wrapper for harness-based LLM agents.""" - -import warnings - -from llm_quest_benchmark.constants import ( - DEFAULT_MODEL, - DEFAULT_TEMPERATURE, - DEFAULT_TEMPLATE, - MODEL_CHOICES, - SYSTEM_ROLE_TEMPLATE, -) -from llm_quest_benchmark.harnesses.base import ( - RISKY_CHOICE_KEYWORDS, - SAFE_CHOICE_KEYWORDS, - _is_numeric_raw_reasoning, - _parse_json_response, - _raw_reasoning_fallback, - parse_llm_response, -) -from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory -from llm_quest_benchmark.harnesses.minimal import MinimalHarness - -warnings.warn("llm_agent is deprecated, use llm_quest_benchmark.harnesses", DeprecationWarning, stacklevel=2) - - -class LLMAgent(MinimalHarness): - """Backward-compatible LLMAgent facade backed by concrete harness classes.""" - - SUPPORTED_MODELS = MODEL_CHOICES - - def __init__( - self, - model_name: str = DEFAULT_MODEL, - system_template: str = SYSTEM_ROLE_TEMPLATE, - action_template: str = DEFAULT_TEMPLATE, - temperature: float = DEFAULT_TEMPERATURE, - skip_single: bool = False, - debug: bool = False, - memory_mode: str = "default", - compaction_interval: int = 10, - ): - if memory_mode == "default": - memory_module = DefaultMemory() - elif memory_mode == "full_transcript": - memory_module = FullTranscriptMemory() - elif memory_mode 
== "compaction": - memory_module = CompactionMemory(compaction_interval=compaction_interval) - else: - raise ValueError(f"Invalid memory_mode: {memory_mode}") - - super().__init__( - model_name=model_name, - system_template=system_template, - action_template=action_template, - temperature=temperature, - skip_single=skip_single, - debug=debug, - memory_module=memory_module, - ) - self.agent_id = f"llm_{self.model_name}" - self._memory_mode = memory_mode - self._compaction_interval = compaction_interval - - def _remember_observation(self, observation: str) -> None: - """Compatibility hook used by legacy tests and callers.""" - clean = (observation or "").strip() - if not clean: - return - self._observation_history.append(clean) - if len(self._observation_history) > 20: - self._observation_history = self._observation_history[-20:] - if self.memory_module is not None: - self.memory_module.update({"observation": clean, "step": self._step_count + 1}) - - def _build_contextual_state(self, state: str) -> str: - """Build context while honoring legacy direct history mutation.""" - if isinstance(self.memory_module, DefaultMemory): - self.memory_module._observations = list(self._observation_history) - self.memory_module._decisions = list(self._decision_history) - return super()._build_contextual_state(state) - - def _apply_safety_filter(self, action_or_choices, choices_or_action) -> int: - """Accept both legacy (action, choices) and harness (choices, action) argument order.""" - if isinstance(action_or_choices, list): - return super()._apply_safety_filter(action_or_choices, choices_or_action) - return super()._apply_safety_filter(choices_or_action, action_or_choices) - - def __str__(self) -> str: - return ( - f"LLMAgent(model={self.model_name}, system_template={self.system_template}, " - f"action_template={self.action_template}, temperature={self.temperature})" - ) - - -__all__ = [ - "LLMAgent", - "parse_llm_response", - "_parse_json_response", - "_raw_reasoning_fallback", - 
"_is_numeric_raw_reasoning", - "RISKY_CHOICE_KEYWORDS", - "SAFE_CHOICE_KEYWORDS", -] diff --git a/llm_quest_benchmark/agents/planner_agent.py b/llm_quest_benchmark/agents/planner_agent.py deleted file mode 100644 index cd20e0d..0000000 --- a/llm_quest_benchmark/agents/planner_agent.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Deprecated compatibility wrapper for the planner harness.""" - -import warnings - -from llm_quest_benchmark.harnesses.planner import PlannerHarness as PlannerAgent - -warnings.warn("planner_agent is deprecated, use harnesses.planner", DeprecationWarning, stacklevel=2) - -__all__ = ["PlannerAgent"] diff --git a/llm_quest_benchmark/agents/strategic_agent.py b/llm_quest_benchmark/agents/strategic_agent.py deleted file mode 100644 index edd656f..0000000 --- a/llm_quest_benchmark/agents/strategic_agent.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Deprecated compatibility wrapper for strategic agents.""" - -import logging -import warnings -from typing import Any - -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.llm.prompt import PromptRenderer - -warnings.warn("strategic_agent is deprecated, use llm_quest_benchmark.harnesses", DeprecationWarning, stacklevel=2) - - -class StrategicAgent(QuestPlayer): - """Backward-compatible strategic analysis decorator.""" - - def __init__(self, base_agent: QuestPlayer, debug: bool = False, template: str = "advanced.jinja"): - super().__init__(skip_single=base_agent.skip_single) - self.agent = base_agent - self.debug = debug - self.history = [] - - self.logger = logging.getLogger(self.__class__.__name__) - if self.debug: - self.logger.setLevel(logging.DEBUG) - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter("%(name)s - %(message)s")) - self.logger.addHandler(handler) - - self.prompt_renderer = PromptRenderer(None, template=template) - - def _get_action_impl(self, observation: str, choices: list) -> int: - if hasattr(self.agent, "llm"): - if self.debug: - 
self.logger.debug("\nObservation:\n%s", observation) - - analysis = self.agent.llm( - "Analyze this situation and explain your thinking step-by-step instead of choosing an action:\n" - + observation - ) - - if self.debug: - self.logger.debug("\nAnalysis:\n%s", analysis) - - self.history.append({"observation": observation, "analysis": analysis}) - enhanced_context = self.get_enhanced_context(observation, choices) - if self.debug: - self.logger.debug("\nEnhanced Context:\n%s", enhanced_context) - - return self.agent.get_action(enhanced_context, choices) - - return self.agent.get_action(observation, choices) - - def get_enhanced_context(self, observation: str, choices: list) -> str: - context = [f"Turn {len(self.history) + 1}: {entry['analysis']}" for entry in self.history[-3:]] - return self.prompt_renderer.render_action_prompt( - observation=observation, - choices=choices, - state_tracker=context, - ) - - def reset(self) -> None: - self.history = [] - self.agent.reset() - - def on_game_start(self) -> None: - if self.debug: - self.logger.debug("Starting new game with strategic analysis") - self.agent.on_game_start() - - def on_game_end(self, final_state: dict[str, Any]) -> None: - self.agent.on_game_end(final_state) - if self.debug: - self.logger.debug("Final Analysis History:") - for entry in self.history: - self.logger.debug("\nObservation: %s", entry["observation"]) - self.logger.debug("Analysis: %s", entry["analysis"]) - - -__all__ = ["StrategicAgent"] diff --git a/llm_quest_benchmark/agents/tool_agent.py b/llm_quest_benchmark/agents/tool_agent.py deleted file mode 100644 index 659a747..0000000 --- a/llm_quest_benchmark/agents/tool_agent.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Deprecated compatibility wrapper for the tool harness.""" - -import warnings - -from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness as ToolAgent - -warnings.warn("tool_agent is deprecated, use harnesses.tool_harness", DeprecationWarning, stacklevel=2) - -__all__ = 
["ToolAgent"] diff --git a/llm_quest_benchmark/core/runner.py b/llm_quest_benchmark/core/runner.py index e2b9ef8..bebf0c6 100644 --- a/llm_quest_benchmark/core/runner.py +++ b/llm_quest_benchmark/core/runner.py @@ -20,7 +20,6 @@ # Configure logging logging.getLogger("quest").setLevel(logging.WARNING) -logging.getLogger("LLMAgent").setLevel(logging.WARNING) def run_quest_with_timeout( diff --git a/llm_quest_benchmark/harnesses/tool_harness.py b/llm_quest_benchmark/harnesses/tool_harness.py index a398bfe..a4f09dd 100644 --- a/llm_quest_benchmark/harnesses/tool_harness.py +++ b/llm_quest_benchmark/harnesses/tool_harness.py @@ -217,7 +217,7 @@ def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: action=1, is_default=True, parse_mode="error_default", - reasoning=f"tool_agent_error: {exc}", + reasoning=f"tool_harness_error: {exc}", ) self.history.append(default_response) self._last_response = default_response diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py index 8008e1b..505416b 100644 --- a/llm_quest_benchmark/schemas/config.py +++ b/llm_quest_benchmark/schemas/config.py @@ -8,7 +8,6 @@ from llm_quest_benchmark.constants import ( DEFAULT_MODEL, DEFAULT_TEMPERATURE, - DEFAULT_TEMPLATE, MODEL_CHOICES, SYSTEM_ROLE_TEMPLATE, normalize_template_name, @@ -143,50 +142,6 @@ def agent_id(self) -> str: return self.harness_id -@dataclass -class AgentConfig: - """Legacy configuration for a single agent in benchmark""" - - model: str = DEFAULT_MODEL - system_template: str = SYSTEM_ROLE_TEMPLATE - action_template: str = DEFAULT_TEMPLATE - temperature: float = DEFAULT_TEMPERATURE - runs: int = 1 - skip_single: bool = False - debug: bool = False - benchmark_id: str | None = None - memory_mode: str = "default" - compaction_interval: int = 10 - - def __post_init__(self): - self.system_template = normalize_template_name(self.system_template) - self.action_template = normalize_template_name(self.action_template) - if 
self.model not in ("random_choice", "human"): - # Keep parser compatibility for legacy names while UI remains clean. - from llm_quest_benchmark.llm.client import is_supported_model_name - - if not is_supported_model_name(self.model): - raise ValueError(f"Invalid model: {self.model}. Supported models: {MODEL_CHOICES}") - if not (0.0 <= self.temperature <= 2.0): - raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}") - if self.runs < 1: - raise ValueError(f"runs must be >= 1, got {self.runs}") - if self.memory_mode not in ("default", "full_transcript", "compaction"): - raise ValueError(f"Invalid memory_mode: {self.memory_mode}") - if self.memory_mode == "compaction" and self.compaction_interval < 1: - raise ValueError(f"compaction_interval must be >= 1, got {self.compaction_interval}") - - @property - def agent_id(self) -> str: - """Generate a unique agent ID based on configuration values""" - import hashlib - - interval_tag = f"_ci{self.compaction_interval}" if self.memory_mode == "compaction" else "" - config_str = f"{self.model}_{self.temperature}_{self.system_template}_{self.action_template}_{self.memory_mode}{interval_tag}" - hash_val = hashlib.md5(config_str.encode()).hexdigest()[:8] - return f"{self.model}_t{self.temperature}_{hash_val}" - - @dataclass class BenchmarkConfig: """Configuration for benchmark run""" From 131ecad7b021b243b061d38b6a8bdfb39cdd1f44 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Wed, 13 May 2026 19:28:32 +0400 Subject: [PATCH 22/24] simplify harness template surface --- README.md | 3 +- docs/ARCHITECTURE.md | 2 ++ llm_quest_benchmark/harnesses/factory.py | 6 +++- .../prompt_templates/consequence_scan.jinja | 18 ---------- .../consequence_scan_subgoal.jinja | 19 ---------- .../prompt_templates/light_hints.jinja | 18 ---------- .../loop_aware_reasoning.jinja | 19 ---------- .../prompt_templates/objective_guard.jinja | 18 ---------- 
.../prompt_templates/strategic.jinja | 32 ----------------- .../system_role_completion.jinja | 11 ------ .../prompt_templates/system_role_risk.jinja | 16 --------- llm_quest_benchmark/schemas/config.py | 19 +++++++++- .../tests/harnesses/test_factory.py | 35 +++++++++++++++++-- .../tests/test_benchmark_with_directory.py | 4 +-- 14 files changed, 61 insertions(+), 159 deletions(-) delete mode 100644 llm_quest_benchmark/prompt_templates/consequence_scan.jinja delete mode 100644 llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja delete mode 100644 llm_quest_benchmark/prompt_templates/light_hints.jinja delete mode 100644 llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja delete mode 100644 llm_quest_benchmark/prompt_templates/objective_guard.jinja delete mode 100644 llm_quest_benchmark/prompt_templates/strategic.jinja delete mode 100644 llm_quest_benchmark/prompt_templates/system_role_completion.jinja delete mode 100644 llm_quest_benchmark/prompt_templates/system_role_risk.jinja diff --git a/README.md b/README.md index 3ff854c..d99a55b 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,8 @@ Provider-specific keys in `.env`: ## Project Structure -- `llm_quest_benchmark/agents/` - Agent implementations (LLM, planner, tool-augmented) +- `llm_quest_benchmark/harnesses/` - LLM harness implementations for prompt, memory, tools, and planning experiments +- `llm_quest_benchmark/agents/` - Non-LLM player primitives (`human`, `random_choice`) - `llm_quest_benchmark/prompt_templates/` - Jinja2 prompt templates for the public context-scaffold taxonomy - `llm_quest_benchmark/executors/` - CLI, benchmark orchestration, TS bridge - `configs/benchmarks/` - YAML benchmark configurations diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 83472b3..bf474bc 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -99,6 +99,8 @@ and benchmark configuration parsing do not require API keys. harness memory. 
- `stateful_compact.jinja`: Compact memory / 20-word memo prompt. - `stateful_compact_hints.jinja`: Compact memo prompt with mechanics hints. + - `memo_cot.jinja`, `memo_extended.jinja`, `memo_structured.jinja`: + retained Exp 4 memo variants. - `planner.jinja`: Planner loop prompt. - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with compact memory, optionally with hints. diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py index 4c1591f..8d22462 100644 --- a/llm_quest_benchmark/harnesses/factory.py +++ b/llm_quest_benchmark/harnesses/factory.py @@ -60,7 +60,12 @@ def create_harness( ) -> QuestPlayer: valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] is_random_harness, seed = _parse_random_choice_seed(harness) + is_random_model, _ = _parse_random_choice_seed(model) if is_random_harness: + if is_random_model and model != "random_choice": + raise ValueError("Encode random seeds in harness, for example harness='random_choice_123'") + if model not in (DEFAULT_MODEL, "random_choice"): + raise ValueError("Use model='random_choice' with random_choice harnesses") return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) if harness.startswith("random_choice"): raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}") @@ -68,7 +73,6 @@ def create_harness( return HumanPlayer(skip_single=skip_single) if harness not in HARNESS_REGISTRY: raise ValueError(f"Unknown harness '{harness}'. 
Valid: {valid}") - is_random_model, seed = _parse_random_choice_seed(model) if is_random_model: raise ValueError( "Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness" diff --git a/llm_quest_benchmark/prompt_templates/consequence_scan.jinja b/llm_quest_benchmark/prompt_templates/consequence_scan.jinja deleted file mode 100644 index 55ce54b..0000000 --- a/llm_quest_benchmark/prompt_templates/consequence_scan.jinja +++ /dev/null @@ -1,18 +0,0 @@ -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Mission objective: complete the quest successfully. - -Decision method: -1. For each action, estimate immediate consequence in 5 words max. -2. Prefer actions that preserve progress and gather information. -3. Avoid options that abandon, surrender, or end the mission early. -4. If uncertain, choose the lowest-risk progress action. - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja b/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja deleted file mode 100644 index 7fd4236..0000000 --- a/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja +++ /dev/null @@ -1,19 +0,0 @@ -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Mission objective: complete the quest successfully. - -Decision method: -1. Use any provided memo from prior turns to stay consistent. -2. For each action, estimate immediate consequence in 5 words max. -3. Prefer actions that preserve progress and gather information. -4. Avoid options that abandon, surrender, or end the mission early. -5. If uncertain, choose the lowest-risk progress action. 
- -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","memo":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/light_hints.jinja b/llm_quest_benchmark/prompt_templates/light_hints.jinja deleted file mode 100644 index eb3ab60..0000000 --- a/llm_quest_benchmark/prompt_templates/light_hints.jinja +++ /dev/null @@ -1,18 +0,0 @@ -General hints for this type of quest: -- Read the scene literally. Win/loss constraints are usually stated directly in the text. -- Preparation, study, negotiation, and reconnaissance are often safer than direct combat or bravado. -- Prefer actions that gather clues or unlock safer options before committing to irreversible moves. -- Avoid choices that abandon the mission, surrender, or waste scarce time/resources for no gain. -- If a scene repeats, the last branch did not help - try a different action. -- Prioritize the core objective over optional heroic detours. - -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja b/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja deleted file mode 100644 index 38a9343..0000000 --- a/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja +++ /dev/null @@ -1,19 +0,0 @@ -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Mission objective: complete the quest successfully. - -Decision policy: -1. Prefer actions that preserve progress and avoid premature failure. -2. Use Status/context hints (stats, resources, relationships) to reduce obvious risks. -3. If this scene appears repeated, avoid repeating the same action that did not progress. -4. 
When uncertain, choose the safest reversible action that keeps the mission alive. -5. Do not surrender/quit unless it is clearly required for success. - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/objective_guard.jinja b/llm_quest_benchmark/prompt_templates/objective_guard.jinja deleted file mode 100644 index b80d482..0000000 --- a/llm_quest_benchmark/prompt_templates/objective_guard.jinja +++ /dev/null @@ -1,18 +0,0 @@ -Current story state: -{{ observation }} - -Available actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Choose the action that best supports mission completion. - -Guardrails: -1. Keep the run alive unless ending is clearly successful. -2. Penalize actions that look like quitting, escaping, or self-sabotage. -3. Prefer actions that unlock clues, credentials, access, or progression gates. -4. Resolve ambiguity by selecting the most reversible safe option. - -Return ONLY valid JSON (no markdown/code fences), exactly: -{"analysis":"","reasoning":"","result":} diff --git a/llm_quest_benchmark/prompt_templates/strategic.jinja b/llm_quest_benchmark/prompt_templates/strategic.jinja deleted file mode 100644 index 1668c41..0000000 --- a/llm_quest_benchmark/prompt_templates/strategic.jinja +++ /dev/null @@ -1,32 +0,0 @@ -{# Tier 3: Strategic agent prompt #} -{# Contextual State Tracker #} -{% if state_tracker %} -Historical context: -{% for entry in state_tracker %} -- {{ entry }} -{% endfor %} -{% endif %} - -Current Situation: -{{ observation }} - -Available Actions: -{% for choice in choices %} -{{ loop.index }}. {{ choice.text }} -{% endfor %} - -Analysis Framework: -1. Immediate Context [<50 words] -2. Plausible Hypotheses [2-3 possibilities] -3. Action Impact Forecast [short/long-term] -4. 
Confidence Estimate [High/Medium/Low] - -Response format: -```json -{ - "hypotheses": ["...", "..."], - "reasoning": "", - "choice": , - "confidence": "" -} -``` diff --git a/llm_quest_benchmark/prompt_templates/system_role_completion.jinja b/llm_quest_benchmark/prompt_templates/system_role_completion.jinja deleted file mode 100644 index 918a695..0000000 --- a/llm_quest_benchmark/prompt_templates/system_role_completion.jinja +++ /dev/null @@ -1,11 +0,0 @@ -You are a mission-completion specialist for interactive fiction quests. - -Core behavior: -1. Infer the current objective from narrative clues. -2. Prioritize actions that maintain progress and optionality. -3. Avoid premature terminal outcomes unless success is explicit. -4. Prefer evidence-based choices over stylistic roleplay. - -When the state is ambiguous: -- choose the safest action that still advances the mission. -- avoid speculative high-risk branches without support in the text. diff --git a/llm_quest_benchmark/prompt_templates/system_role_risk.jinja b/llm_quest_benchmark/prompt_templates/system_role_risk.jinja deleted file mode 100644 index ea19c36..0000000 --- a/llm_quest_benchmark/prompt_templates/system_role_risk.jinja +++ /dev/null @@ -1,16 +0,0 @@ -{# Enhanced system role for interactive fiction #} -You are an experienced interactive fiction player. Your capabilities include: - -1. Dynamic Goal Recognition: Infer objectives from narrative context -2. Clue Chaining: Connect information across scenes -3. Consequence Forecasting: Predict 2-3 steps ahead for each action -4. 
Narrative Consistency: Maintain character/story logic - -Follow these principles: -- Treat each choice as part of an unfolding mystery -- Track objects/characters/relationships as state components -- Consider both practical and thematic implications -- Admit uncertainty when clues are ambiguous -- Flag potential contradictions in story logic - -Any bad move can fail the quest, so prefer robust low-risk progress over flashy but uncertain options. diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py index 505416b..5cd93b2 100644 --- a/llm_quest_benchmark/schemas/config.py +++ b/llm_quest_benchmark/schemas/config.py @@ -26,6 +26,18 @@ "name": "Default Benchmark", } +COMPACTION_HARNESSES = { + "memo_compact", + "hinted_compact", + "tool_compact", + "tool_hinted", + "planner", + "compaction_no_memo", + "memo_cot", + "memo_extended", + "memo_structured", +} + def get_default_benchmark_yaml() -> str: """Get the default benchmark configuration from default.yaml file""" @@ -111,8 +123,12 @@ def __post_init__(self): ): valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES] raise ValueError(f"Invalid harness: {self.harness}. 
Supported harnesses: {valid}") + if self.harness == "human" and self.model != "human": + raise ValueError("Use model: human with harness: human") if self.model == "human" and self.harness != "human": raise ValueError("Use harness: human with model: human") + if is_random_choice_harness(self.harness) and self.model != "random_choice": + raise ValueError("Use model: random_choice with random_choice harnesses") if is_random_choice_harness(self.model) and not is_random_choice_harness(self.harness): raise ValueError("Use harness: random_choice with model: random_choice") if self.model not in ("human",) and not is_random_choice_harness(self.model): @@ -132,7 +148,8 @@ def harness_id(self) -> str: """Generate a stable harness ID based on configuration values""" import hashlib - config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.system_template}_{self.compaction_interval}" + interval_tag = f"_ci{self.compaction_interval}" if self.harness in COMPACTION_HARNESSES else "" + config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.system_template}{interval_tag}" hash_val = hashlib.md5(config_str.encode()).hexdigest()[:8] return f"{self.model}_t{self.temperature}_{self.harness}_{hash_val}" diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py index 800a502..31c3ae2 100644 --- a/llm_quest_benchmark/tests/harnesses/test_factory.py +++ b/llm_quest_benchmark/tests/harnesses/test_factory.py @@ -34,7 +34,7 @@ def test_create_random_choice_harness(): def test_create_seeded_random_choice_harness(): - harness = create_harness("random_choice_123") + harness = create_harness("random_choice_123", model="random_choice") assert isinstance(harness, RandomAgent) assert harness.agent_id == "random_123" @@ -60,6 +60,11 @@ def test_random_choice_model_requires_random_harness(): create_harness("minimal", model="random_choice") +def test_seeded_random_model_is_rejected(): + with pytest.raises(ValueError, 
match="Encode random seeds in harness"): + create_harness("random_choice", model="random_choice_123") + + def test_human_model_requires_human_harness(): with pytest.raises(ValueError, match="harness='human'"): create_harness("minimal", model="human") @@ -74,17 +79,41 @@ def test_harness_config_stable_harness_id(): def test_harness_config_system_template_affects_harness_id(): first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role.jinja") - second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role_risk.jinja") + second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="custom_system_role.jinja") + + assert first.harness_id != second.harness_id + + +def test_non_compaction_harness_id_ignores_compaction_interval(): + first = HarnessConfig(harness="reasoning_recent", model="gpt-5-mini", compaction_interval=10) + second = HarnessConfig(harness="reasoning_recent", model="gpt-5-mini", compaction_interval=99) + + assert first.harness_id == second.harness_id + + +def test_compaction_harness_id_includes_compaction_interval(): + first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", compaction_interval=10) + second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", compaction_interval=99) assert first.harness_id != second.harness_id def test_harness_config_allows_seeded_random_choice_harness(): - config = HarnessConfig(harness="random_choice_123", model="gpt-5-mini") + config = HarnessConfig(harness="random_choice_123", model="random_choice") assert config.harness == "random_choice_123" +def test_harness_config_rejects_llm_model_with_random_harness(): + with pytest.raises(ValueError, match="model: random_choice"): + HarnessConfig(harness="random_choice", model="gpt-5-mini") + + +def test_harness_config_rejects_llm_model_with_human_harness(): + with pytest.raises(ValueError, match="model: human"): + HarnessConfig(harness="human", model="gpt-5-mini") + + 
def test_harness_config_rejects_random_model_with_llm_harness(): with pytest.raises(ValueError, match="harness: random_choice"): HarnessConfig(harness="minimal", model="random_choice") diff --git a/llm_quest_benchmark/tests/test_benchmark_with_directory.py b/llm_quest_benchmark/tests/test_benchmark_with_directory.py index c6dc855..87b2221 100644 --- a/llm_quest_benchmark/tests/test_benchmark_with_directory.py +++ b/llm_quest_benchmark/tests/test_benchmark_with_directory.py @@ -29,7 +29,7 @@ def create_test_config(): def test_result_entry_logs_random_harness_model_as_random_policy(): """Random harness results should not be attributed to the default LLM model.""" - agent_config = HarnessConfig(harness="random_choice", model="gpt-5-mini") + agent_config = HarnessConfig(harness="random_choice", model="random_choice") result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE") @@ -39,7 +39,7 @@ def test_result_entry_logs_random_harness_model_as_random_policy(): def test_result_entry_logs_human_harness_model_as_human(): """Human harness results should not be attributed to the default LLM model.""" - agent_config = HarnessConfig(harness="human", model="gpt-5-mini") + agent_config = HarnessConfig(harness="human", model="human") result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE") From 4f3983457cd9107a3f72a0264c4f9905da63ff19 Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Wed, 13 May 2026 20:01:42 +0400 Subject: [PATCH 23/24] rename non-llm agents to players --- README.md | 4 +- docs/ARCHITECTURE.md | 4 +- llm_quest_benchmark/core/runner.py | 2 +- llm_quest_benchmark/harnesses/base.py | 30 +--- llm_quest_benchmark/harnesses/factory.py | 8 +- llm_quest_benchmark/harnesses/memory.py | 129 +++++++++++------- llm_quest_benchmark/harnesses/minimal.py | 5 - llm_quest_benchmark/harnesses/tool_harness.py | 12 +- llm_quest_benchmark/harnesses/tools.py | 2 +- .../{agents => players}/__init__.py | 10 +- 
.../{agents => players}/base.py | 6 +- .../human_player.py => players/human.py} | 4 +- .../random_agent.py => players/random.py} | 18 +-- llm_quest_benchmark/renderers/factory.py | 6 +- llm_quest_benchmark/renderers/progress.py | 10 +- .../tests/agents/test_mode_agents.py | 5 - .../tests/executors/cli/test_commands.py | 2 +- .../{agents => harnesses}/test_anthropic.py | 0 .../test_base.py} | 0 .../tests/harnesses/test_factory.py | 8 +- .../tests/harnesses/test_harnesses.py | 14 +- .../tests/integration/test_quest_e2e.py | 10 +- .../{agents => players}/test_human_player.py | 2 +- llm_quest_benchmark/tests/test_database.py | 4 +- 24 files changed, 145 insertions(+), 150 deletions(-) rename llm_quest_benchmark/{agents => players}/__init__.py (50%) rename llm_quest_benchmark/{agents => players}/base.py (92%) rename llm_quest_benchmark/{agents/human_player.py => players/human.py} (91%) rename llm_quest_benchmark/{agents/random_agent.py => players/random.py} (75%) delete mode 100644 llm_quest_benchmark/tests/agents/test_mode_agents.py rename llm_quest_benchmark/tests/{agents => harnesses}/test_anthropic.py (100%) rename llm_quest_benchmark/tests/{agents/test_llm_agent.py => harnesses/test_base.py} (100%) rename llm_quest_benchmark/tests/{agents => players}/test_human_player.py (95%) diff --git a/README.md b/README.md index d99a55b..013fb57 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ uv run llm-quest benchmark --config configs/benchmarks/memory_full_transcript.ya uv run llm-quest benchmark-report --benchmark-id --output report.md # Analyze a single run -uv run llm-quest analyze-run --run-summary results///run_/run_summary.json +uv run llm-quest analyze-run --run-summary results///run_/run_summary.json # Play as human in terminal uv run llm-quest play --quest quests/Boat.qm @@ -108,7 +108,7 @@ Provider-specific keys in `.env`: ## Project Structure - `llm_quest_benchmark/harnesses/` - LLM harness implementations for prompt, memory, tools, and planning 
experiments -- `llm_quest_benchmark/agents/` - Non-LLM player primitives (`human`, `random_choice`) +- `llm_quest_benchmark/players/` - Non-LLM player primitives (`human`, `random_choice`) - `llm_quest_benchmark/prompt_templates/` - Jinja2 prompt templates for the public context-scaffold taxonomy - `llm_quest_benchmark/executors/` - CLI, benchmark orchestration, TS bridge - `configs/benchmarks/` - YAML benchmark configurations diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index bf474bc..2588ee2 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -61,8 +61,8 @@ planning choices change behavior. history helpers used by tool harnesses. - `llm_quest_benchmark/harnesses/factory.py`: `create_harness()` and the canonical harness registry. -- `llm_quest_benchmark/agents/human_player.py`, - `llm_quest_benchmark/agents/random_agent.py`: Non-LLM `QuestPlayer` +- `llm_quest_benchmark/players/human.py`, + `llm_quest_benchmark/players/random.py`: Non-LLM `QuestPlayer` implementations preserved for interactive and random baselines. 
Harness construction lazily initializes provider clients, so template rendering diff --git a/llm_quest_benchmark/core/runner.py b/llm_quest_benchmark/core/runner.py index bebf0c6..d86c07b 100644 --- a/llm_quest_benchmark/core/runner.py +++ b/llm_quest_benchmark/core/runner.py @@ -10,11 +10,11 @@ from copy import deepcopy from typing import Any -from llm_quest_benchmark.agents.base import QuestPlayer from llm_quest_benchmark.constants import DEFAULT_QUEST_TIMEOUT from llm_quest_benchmark.core.logging import LogManager, QuestLogger from llm_quest_benchmark.environments.qm import QMPlayerEnv as QuestEnvironment from llm_quest_benchmark.environments.state import QuestOutcome +from llm_quest_benchmark.players.base import QuestPlayer from llm_quest_benchmark.schemas.config import HarnessConfig from llm_quest_benchmark.schemas.state import AgentState diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py index 440675b..fd8864b 100644 --- a/llm_quest_benchmark/harnesses/base.py +++ b/llm_quest_benchmark/harnesses/base.py @@ -9,10 +9,10 @@ from json_repair import repair_json -from llm_quest_benchmark.agents.base import QuestPlayer from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, normalize_template_name from llm_quest_benchmark.llm.client import get_llm_client, parse_model_name from llm_quest_benchmark.llm.prompt import PromptRenderer +from llm_quest_benchmark.players.base import QuestPlayer from llm_quest_benchmark.schemas.response import LLMResponse RISKY_CHOICE_KEYWORDS = ( @@ -311,7 +311,6 @@ def _get_action_impl(self, observation, choices) -> int: """Return the selected 1-based action number.""" pass - @abstractmethod def reset(self) -> None: """Reset harness state between episodes.""" super().reset() @@ -343,33 +342,6 @@ def on_game_end(self, final_state: dict[str, Any]) -> None: def get_last_response(self) -> LLMResponse | None: return self._last_response - @property - def _quest_briefing(self) -> str | None: - return 
getattr(self.memory_module, "_quest_briefing", None) - - @_quest_briefing.setter - def _quest_briefing(self, value: str | None) -> None: - if self.memory_module is not None: - self.memory_module._quest_briefing = value - - @property - def _transcript(self) -> list[dict[str, Any]]: - return getattr(self.memory_module, "_transcript", []) - - @_transcript.setter - def _transcript(self, value: list[dict[str, Any]]) -> None: - if self.memory_module is not None: - self.memory_module._transcript = value - - @property - def _steps_since_compaction(self) -> int: - return getattr(self.memory_module, "_steps_since_compaction", 0) - - @_steps_since_compaction.setter - def _steps_since_compaction(self, value: int) -> None: - if self.memory_module is not None: - self.memory_module._steps_since_compaction = value - def _build_contextual_state(self, state: str) -> str: if self.memory_module is None: return state diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py index 8d22462..87e2d77 100644 --- a/llm_quest_benchmark/harnesses/factory.py +++ b/llm_quest_benchmark/harnesses/factory.py @@ -1,8 +1,5 @@ """Factory for creating harness-based quest players.""" -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.agents.human_player import HumanPlayer -from llm_quest_benchmark.agents.random_agent import RandomAgent from llm_quest_benchmark.constants import DEFAULT_MODEL from llm_quest_benchmark.harnesses.memo import ( CompactionNoMemoHarness, @@ -16,6 +13,9 @@ from llm_quest_benchmark.harnesses.planner import PlannerHarness from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness +from llm_quest_benchmark.players.base import QuestPlayer +from llm_quest_benchmark.players.human import HumanPlayer +from llm_quest_benchmark.players.random import RandomPlayer HARNESS_REGISTRY = 
{ "minimal": MinimalHarness, @@ -66,7 +66,7 @@ def create_harness( raise ValueError("Encode random seeds in harness, for example harness='random_choice_123'") if model not in (DEFAULT_MODEL, "random_choice"): raise ValueError("Use model='random_choice' with random_choice harnesses") - return RandomAgent(seed=seed, debug=debug, skip_single=skip_single) + return RandomPlayer(seed=seed, debug=debug, skip_single=skip_single) if harness.startswith("random_choice"): raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}") if harness == "human": diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py index 22581fa..45ba5e5 100644 --- a/llm_quest_benchmark/harnesses/memory.py +++ b/llm_quest_benchmark/harnesses/memory.py @@ -1,8 +1,11 @@ -"""Memory modules for harness-based quest agents.""" +"""Memory modules for harness-based quest players.""" +import logging from abc import ABC, abstractmethod from typing import Any +logger = logging.getLogger(__name__) + class MemoryModule(ABC): @abstractmethod @@ -17,12 +20,44 @@ def update(self, step_data: dict) -> None: def reset(self) -> None: pass + @property + def quest_briefing(self) -> str | None: + return None + + @property + def transcript(self) -> list[dict[str, Any]]: + return [] + + @transcript.setter + def transcript(self, value: list[dict[str, Any]]) -> None: + raise TypeError(f"{self.__class__.__name__} does not support transcript assignment") + + @property + def steps_since_compaction(self) -> int: + return 0 + + @steps_since_compaction.setter + def steps_since_compaction(self, value: int) -> None: + raise TypeError(f"{self.__class__.__name__} does not support compaction counters") + def set_quest_briefing(self, briefing: str) -> None: - pass + clean = (briefing or "").strip() + if hasattr(self, "_quest_briefing"): + self._quest_briefing = clean or None + + def _briefing_block(self, current_state: str) -> str | None: + briefing = self.quest_briefing + if not briefing: + 
return None + if current_state.strip() == briefing: + return None + if len(briefing) > 800: + briefing = briefing[:800] + "..." + return f"Quest briefing (your mission):\n{briefing}" class DefaultMemory(MemoryModule): - """Recent N observations window (no compaction).""" + """Recent N observations window without compaction.""" def __init__(self, context_window: int = 3, context_chars: int = 220, decision_window: int = 5): self.context_window = context_window @@ -32,9 +67,9 @@ def __init__(self, context_window: int = 3, context_chars: int = 220, decision_w self._observations: list[str] = [] self._decisions: list[dict[str, Any]] = [] - def set_quest_briefing(self, briefing: str) -> None: - clean = (briefing or "").strip() - self._quest_briefing = clean or None + @property + def quest_briefing(self) -> str | None: + return self._quest_briefing def get_context(self, step: int) -> str: blocks: list[str] = [] @@ -106,16 +141,6 @@ def reset(self) -> None: self._observations = [] self._decisions = [] - def _briefing_block(self, current_state: str) -> str | None: - if not self._quest_briefing: - return None - if current_state.strip() == self._quest_briefing: - return None - briefing = self._quest_briefing - if len(briefing) > 800: - briefing = briefing[:800] + "..." 
- return f"Quest briefing (your mission):\n{briefing}" - class FullTranscriptMemory(MemoryModule): """Unbounded full transcript in context.""" @@ -124,9 +149,17 @@ def __init__(self): self._quest_briefing: str | None = None self._transcript: list[dict[str, Any]] = [] - def set_quest_briefing(self, briefing: str) -> None: - clean = (briefing or "").strip() - self._quest_briefing = clean or None + @property + def quest_briefing(self) -> str | None: + return self._quest_briefing + + @property + def transcript(self) -> list[dict[str, Any]]: + return self._transcript + + @transcript.setter + def transcript(self, value: list[dict[str, Any]]) -> None: + self._transcript = value def get_context(self, step: int) -> str: blocks: list[str] = [] @@ -170,19 +203,9 @@ def reset(self) -> None: self._quest_briefing = None self._transcript = [] - def _briefing_block(self, current_state: str) -> str | None: - if not self._quest_briefing: - return None - if current_state.strip() == self._quest_briefing: - return None - briefing = self._quest_briefing - if len(briefing) > 800: - briefing = briefing[:800] + "..." 
- return f"Quest briefing (your mission):\n{briefing}" - class CompactionMemory(MemoryModule): - """Periodic LLM summarization + 20-word memo field.""" + """Periodic LLM summarization plus 20-word memo field.""" def __init__(self, compaction_interval: int = 50, llm_client=None): self.compaction_interval = compaction_interval @@ -192,9 +215,25 @@ def __init__(self, compaction_interval: int = 50, llm_client=None): self._compaction_summary: str | None = None self._steps_since_compaction = 0 - def set_quest_briefing(self, briefing: str) -> None: - clean = (briefing or "").strip() - self._quest_briefing = clean or None + @property + def quest_briefing(self) -> str | None: + return self._quest_briefing + + @property + def transcript(self) -> list[dict[str, Any]]: + return self._transcript + + @transcript.setter + def transcript(self, value: list[dict[str, Any]]) -> None: + self._transcript = value + + @property + def steps_since_compaction(self) -> int: + return self._steps_since_compaction + + @steps_since_compaction.setter + def steps_since_compaction(self, value: int) -> None: + self._steps_since_compaction = value def get_context(self, step: int) -> str: blocks: list[str] = [] @@ -250,15 +289,14 @@ def _maybe_compact(self) -> None: if self._steps_since_compaction < self.compaction_interval: return if self.llm_client is None: - # No LLM client available for compaction; skip silently - self._steps_since_compaction = 0 + logger.debug("Skipping compaction because no LLM client is attached") return transcript_text = self._format_transcript_for_compaction() if not transcript_text: self._steps_since_compaction = 0 return - prompt_parts = ["You are summarizing an agent's progress through a text quest."] + prompt_parts = ["You are summarizing a quest player's progress through a text quest."] if self._quest_briefing: prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}") if self._compaction_summary: @@ -266,7 +304,7 @@ def _maybe_compact(self) 
-> None: prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}") prompt_parts.append( "\nSummarize the agent's progress. Include:\n" - "- Current objective (what the agent should do next)\n" + "- Current objective (what the player should do next)\n" "- Progress so far (what has been accomplished)\n" "- Key facts (NPCs, items, locations, deadlines discovered)\n" "- Failed approaches (actions/paths that didn't work)\n" @@ -276,15 +314,14 @@ def _maybe_compact(self) -> None: try: summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip() - except Exception: + except Exception as exc: + logger.debug("Skipping compaction because summarization failed: %s", exc) self._steps_since_compaction = 0 return if summary: self._compaction_summary = summary self._transcript = [] - self._steps_since_compaction = 0 - else: - self._steps_since_compaction = 0 + self._steps_since_compaction = 0 def _format_transcript_for_compaction(self) -> str: recent = ( @@ -311,16 +348,6 @@ def _format_transcript_for_compaction(self) -> str: lines.append(line) return "\n\n".join(lines) - def _briefing_block(self, current_state: str) -> str | None: - if not self._quest_briefing: - return None - if current_state.strip() == self._quest_briefing: - return None - briefing = self._quest_briefing - if len(briefing) > 800: - briefing = briefing[:800] + "..." 
- return f"Quest briefing (your mission):\n{briefing}" - @staticmethod def _twenty_word_memo(memo: str) -> str: return " ".join(memo.split()[:20]) diff --git a/llm_quest_benchmark/harnesses/minimal.py b/llm_quest_benchmark/harnesses/minimal.py index 8fdd944..462d128 100644 --- a/llm_quest_benchmark/harnesses/minimal.py +++ b/llm_quest_benchmark/harnesses/minimal.py @@ -20,10 +20,8 @@ def __init__( skip_single: bool = False, debug: bool = False, memory_module=None, - compaction_interval: int = 50, **_, ): - del compaction_interval super().__init__( model_name=model_name, system_template=system_template, @@ -56,6 +54,3 @@ def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> i self.history.append(default_response) self._last_response = default_response return 1 - - def reset(self) -> None: - super().reset() diff --git a/llm_quest_benchmark/harnesses/tool_harness.py b/llm_quest_benchmark/harnesses/tool_harness.py index a4f09dd..0acc699 100644 --- a/llm_quest_benchmark/harnesses/tool_harness.py +++ b/llm_quest_benchmark/harnesses/tool_harness.py @@ -184,20 +184,24 @@ def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int: tool_calls = self._extract_tool_calls(selection_response) parsed_response = self._parse_llm_response(selection_response, len(choices)) tool_results: list[str] = [] + final_choice_used = False total_usage = self._normalize_usage(selection_usage) if tool_calls: tool_results = self._execute_tool_calls(tool_calls) parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results) total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage)) + final_choice_used = True elif parsed_response.is_default: parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[]) total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage)) + final_choice_used = True - action_before_policy = parsed_response.action - 
parsed_response.action = self._apply_safety_filter(choices, parsed_response.action) - if parsed_response.action != action_before_policy and not parsed_response.reasoning: - parsed_response.reasoning = "policy_safety_override" + if not final_choice_used: + action_before_policy = parsed_response.action + parsed_response.action = self._apply_safety_filter(choices, parsed_response.action) + if parsed_response.action != action_before_policy and not parsed_response.reasoning: + parsed_response.reasoning = "policy_safety_override" parsed_response.prompt_tokens = total_usage["prompt_tokens"] parsed_response.completion_tokens = total_usage["completion_tokens"] diff --git a/llm_quest_benchmark/harnesses/tools.py b/llm_quest_benchmark/harnesses/tools.py index 63edcd8..9978c58 100644 --- a/llm_quest_benchmark/harnesses/tools.py +++ b/llm_quest_benchmark/harnesses/tools.py @@ -1,4 +1,4 @@ -"""Reusable tools for harness-based quest agents.""" +"""Reusable tools for harness-based quest players.""" import ast import re diff --git a/llm_quest_benchmark/agents/__init__.py b/llm_quest_benchmark/players/__init__.py similarity index 50% rename from llm_quest_benchmark/agents/__init__.py rename to llm_quest_benchmark/players/__init__.py index fdd1aa6..aa71d5b 100644 --- a/llm_quest_benchmark/agents/__init__.py +++ b/llm_quest_benchmark/players/__init__.py @@ -1,4 +1,4 @@ -__all__ = ["QuestPlayer", "HumanPlayer", "RandomAgent"] +__all__ = ["QuestPlayer", "HumanPlayer", "RandomPlayer"] def __getattr__(name): @@ -7,11 +7,11 @@ def __getattr__(name): return QuestPlayer if name == "HumanPlayer": - from .human_player import HumanPlayer + from .human import HumanPlayer return HumanPlayer - if name == "RandomAgent": - from .random_agent import RandomAgent + if name == "RandomPlayer": + from .random import RandomPlayer - return RandomAgent + return RandomPlayer raise AttributeError(name) diff --git a/llm_quest_benchmark/agents/base.py b/llm_quest_benchmark/players/base.py similarity index 92% 
rename from llm_quest_benchmark/agents/base.py rename to llm_quest_benchmark/players/base.py index eed7609..9e53750 100644 --- a/llm_quest_benchmark/agents/base.py +++ b/llm_quest_benchmark/players/base.py @@ -1,4 +1,4 @@ -"""Base classes for quest players (both human and LLM)""" +"""Base class for quest players and harnesses.""" from abc import ABC, abstractmethod from typing import Any @@ -13,7 +13,7 @@ def __init__(self, skip_single: bool = False): """Initialize player with skip_single option""" self.skip_single = skip_single self._last_response: LLMResponse = None - self.agent_id = "base_agent" # Default agent ID + self.agent_id = "base_player" def get_action(self, observation: str, choices: list) -> int: """Get action number from observation and choices @@ -55,7 +55,7 @@ def _get_action_impl(self, observation: str, choices: list) -> int: pass def get_last_response(self) -> LLMResponse: - """Get the last response from the agent""" + """Get the last response from the player or harness.""" return self._last_response @abstractmethod diff --git a/llm_quest_benchmark/agents/human_player.py b/llm_quest_benchmark/players/human.py similarity index 91% rename from llm_quest_benchmark/agents/human_player.py rename to llm_quest_benchmark/players/human.py index 721c43d..b5d74f4 100644 --- a/llm_quest_benchmark/agents/human_player.py +++ b/llm_quest_benchmark/players/human.py @@ -3,7 +3,7 @@ import logging from typing import Any -from llm_quest_benchmark.agents.base import QuestPlayer +from llm_quest_benchmark.players.base import QuestPlayer class HumanPlayer(QuestPlayer): @@ -15,7 +15,7 @@ def __init__(self, skip_single: bool = False, debug: bool = False): self.logger = logging.getLogger(__name__) if debug: self.logger.setLevel(logging.DEBUG) - # Set agent_id for database records + # Keep the persisted identifier stable for existing result artifacts. 
self.agent_id = "human" def _get_action_impl(self, observation: str, choices: list) -> int: diff --git a/llm_quest_benchmark/agents/random_agent.py b/llm_quest_benchmark/players/random.py similarity index 75% rename from llm_quest_benchmark/agents/random_agent.py rename to llm_quest_benchmark/players/random.py index e428353..a8fea29 100644 --- a/llm_quest_benchmark/agents/random_agent.py +++ b/llm_quest_benchmark/players/random.py @@ -1,17 +1,19 @@ -"""Random agent for testing quests""" +"""Random player for testing quests""" import logging import random -from llm_quest_benchmark.agents.base import QuestPlayer +from llm_quest_benchmark.players.base import QuestPlayer -class RandomAgent(QuestPlayer): - """Agent that randomly selects from available choices. - Used for testing quests and finding edge cases.""" +class RandomPlayer(QuestPlayer): + """Player that randomly selects from available choices. + + Used for testing quests and finding edge cases. + """ def __init__(self, seed: int = None, debug: bool = False, skip_single: bool = False): - """Initialize random agent. + """Initialize random player. Args: seed (int, optional): Random seed for reproducibility. Defaults to None. @@ -24,7 +26,7 @@ def __init__(self, seed: int = None, debug: bool = False, skip_single: bool = Fa if debug: self.logger.setLevel(logging.DEBUG) self.rng = random.Random(seed) - # Set agent_id for database records + # Keep the persisted identifier stable for existing result artifacts. 
self.agent_id = f"random_{seed}" if seed is not None else "random" def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> int: @@ -43,5 +45,5 @@ def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> i return self.rng.randint(1, len(choices)) def reset(self) -> None: - """Reset agent state - nothing to reset for random agent""" + """Reset player state; nothing to reset for random choice.""" pass diff --git a/llm_quest_benchmark/renderers/factory.py b/llm_quest_benchmark/renderers/factory.py index 8b18218..0a8f3e5 100644 --- a/llm_quest_benchmark/renderers/factory.py +++ b/llm_quest_benchmark/renderers/factory.py @@ -1,7 +1,7 @@ """Factory for creating appropriate renderers based on agent type and mode""" -from llm_quest_benchmark.agents.base import QuestPlayer -from llm_quest_benchmark.agents.human_player import HumanPlayer +from llm_quest_benchmark.players.base import QuestPlayer +from llm_quest_benchmark.players.human import HumanPlayer from llm_quest_benchmark.renderers.base import BaseRenderer from llm_quest_benchmark.renderers.null import NoRenderer from llm_quest_benchmark.renderers.progress import ProgressRenderer @@ -25,7 +25,7 @@ def create_renderer( The factory follows these rules: 1. In debug mode, always use NoRenderer 2. For human players, use RichRenderer - 3. For automated agents (LLM, Random): + 3. 
For automated players (LLM, Random): - In benchmark mode (total_quests provided), use ProgressRenderer - Otherwise, use NoRenderer """ diff --git a/llm_quest_benchmark/renderers/progress.py b/llm_quest_benchmark/renderers/progress.py index 9d2cde9..a5097d2 100644 --- a/llm_quest_benchmark/renderers/progress.py +++ b/llm_quest_benchmark/renderers/progress.py @@ -45,23 +45,23 @@ def __init__(self, total_quests: int, total_runs: int): self.console.print("\n[bold cyan]Benchmark Progress[/]") def render_game_state(self, state: dict[str, Any]) -> None: - """No game state rendering needed for automated agents""" + """No game state rendering needed for automated players""" pass def render_title(self) -> None: - """No title rendering needed for automated agents""" + """No title rendering needed for automated players""" pass def render_quest_text(self, text: str) -> None: - """No quest text rendering needed for automated agents""" + """No quest text rendering needed for automated players""" pass def render_choices(self, choices: list) -> None: - """No choices rendering needed for automated agents""" + """No choices rendering needed for automated players""" pass def render_parameters(self, params: list) -> None: - """No parameters rendering needed for automated agents""" + """No parameters rendering needed for automated players""" pass def render_error(self, message: str) -> None: diff --git a/llm_quest_benchmark/tests/agents/test_mode_agents.py b/llm_quest_benchmark/tests/agents/test_mode_agents.py deleted file mode 100644 index a41a11a..0000000 --- a/llm_quest_benchmark/tests/agents/test_mode_agents.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Legacy agent-mode tests retired. - -Planner/tool/memo behavior now lives in -``llm_quest_benchmark.tests.harnesses.test_harnesses``. 
-""" diff --git a/llm_quest_benchmark/tests/executors/cli/test_commands.py b/llm_quest_benchmark/tests/executors/cli/test_commands.py index a3825cd..1bd972e 100644 --- a/llm_quest_benchmark/tests/executors/cli/test_commands.py +++ b/llm_quest_benchmark/tests/executors/cli/test_commands.py @@ -19,7 +19,7 @@ def test_version(): def test_run_quest(): - """Test running a quest with random agent""" + """Test running a quest with random player""" result = runner.invoke( app, ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--harness", "random_choice", "--debug"], diff --git a/llm_quest_benchmark/tests/agents/test_anthropic.py b/llm_quest_benchmark/tests/harnesses/test_anthropic.py similarity index 100% rename from llm_quest_benchmark/tests/agents/test_anthropic.py rename to llm_quest_benchmark/tests/harnesses/test_anthropic.py diff --git a/llm_quest_benchmark/tests/agents/test_llm_agent.py b/llm_quest_benchmark/tests/harnesses/test_base.py similarity index 100% rename from llm_quest_benchmark/tests/agents/test_llm_agent.py rename to llm_quest_benchmark/tests/harnesses/test_base.py diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py index 31c3ae2..49062fe 100644 --- a/llm_quest_benchmark/tests/harnesses/test_factory.py +++ b/llm_quest_benchmark/tests/harnesses/test_factory.py @@ -1,10 +1,10 @@ import pytest -from llm_quest_benchmark.agents.human_player import HumanPlayer -from llm_quest_benchmark.agents.random_agent import RandomAgent from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness from llm_quest_benchmark.harnesses.memo import MemoCompactHarness from llm_quest_benchmark.harnesses.minimal import MinimalHarness +from llm_quest_benchmark.players.human import HumanPlayer +from llm_quest_benchmark.players.random import RandomPlayer from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig @@ -30,13 +30,13 @@ def 
test_create_human_harness(): def test_create_random_choice_harness(): harness = create_harness("random_choice") - assert isinstance(harness, RandomAgent) + assert isinstance(harness, RandomPlayer) def test_create_seeded_random_choice_harness(): harness = create_harness("random_choice_123", model="random_choice") - assert isinstance(harness, RandomAgent) + assert isinstance(harness, RandomPlayer) assert harness.agent_id == "random_123" diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py index 3cba73e..efa03bb 100644 --- a/llm_quest_benchmark/tests/harnesses/test_harnesses.py +++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py @@ -139,7 +139,7 @@ def test_compaction_memory_receives_existing_llm_client(): assert action == 2 assert harness.memory_module.llm_client is mocked_llm assert harness.memory_module._compaction_summary == "Summary: paid the fuel merchant and should keep receipt." - assert harness._steps_since_compaction == 0 + assert harness.memory_module.steps_since_compaction == 0 def test_planner_harness_first_turn_generates_plan_then_acts(): @@ -186,8 +186,8 @@ def test_planner_harness_reuses_plan_when_state_is_stable(): def test_planner_harness_uses_contextual_memory_state(): harness = PlannerHarness(model_name="gpt-5-mini", compaction_interval=50) - harness._quest_briefing = "Original mission: win the election." 
- harness._transcript = [ + harness.memory_module.set_quest_briefing("Original mission: win the election.") + harness.memory_module.transcript = [ { "step": 1, "observation": "You learned Maloqs value strength.", @@ -196,7 +196,7 @@ def test_planner_harness_uses_contextual_memory_state(): "action": 1, } ] - harness._steps_since_compaction = 1 + harness.memory_module.steps_since_compaction = 1 mocked_llm = Mock() mocked_llm.get_completion.side_effect = [ "Use the remembered cultural clue.", @@ -323,8 +323,8 @@ def test_tool_compact_harness_can_use_scratchpad_tool_call(): def test_tool_compact_harness_uses_contextual_memory_state(): harness = ToolCompactHarness(model_name="gpt-5-mini", compaction_interval=50) - harness._quest_briefing = "Original mission: pass pilot certification." - harness._transcript = [ + harness.memory_module.set_quest_briefing("Original mission: pass pilot certification.") + harness.memory_module.transcript = [ { "step": 1, "observation": "Hogger is greedy.", @@ -333,7 +333,7 @@ def test_tool_compact_harness_uses_contextual_memory_state(): "action": 1, } ] - harness._steps_since_compaction = 1 + harness.memory_module.steps_since_compaction = 1 mocked_llm = Mock() mocked_llm.get_completion.return_value = ( '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}' diff --git a/llm_quest_benchmark/tests/integration/test_quest_e2e.py b/llm_quest_benchmark/tests/integration/test_quest_e2e.py index 86568bb..3d02d1a 100644 --- a/llm_quest_benchmark/tests/integration/test_quest_e2e.py +++ b/llm_quest_benchmark/tests/integration/test_quest_e2e.py @@ -63,13 +63,13 @@ def mock_callback(event: str, data: Any) -> None: @pytest.mark.e2e @pytest.mark.timeout(TIMEOUT) -def test_random_agent_on_test_quest(caplog): - """Test that random agent can complete a test quest""" +def test_random_player_on_test_quest(caplog): + """Test that random player can complete a test quest""" caplog.set_level(logging.DEBUG) # Show all logs in test 
output - # Create random agent + # Create random player agent = create_harness("random_choice", skip_single=True, debug=True) - assert agent is not None, "Failed to create random agent" + assert agent is not None, "Failed to create random player" # Mock callback for testing def mock_callback(event: str, data: Any) -> None: @@ -80,7 +80,7 @@ def mock_callback(event: str, data: Any) -> None: elif event == "error": caplog.error(f"Error: {data}") - # Run quest with random agent + # Run quest with random player try: outcome = run_quest_with_timeout( quest_path=str(DEFAULT_QUEST), diff --git a/llm_quest_benchmark/tests/agents/test_human_player.py b/llm_quest_benchmark/tests/players/test_human_player.py similarity index 95% rename from llm_quest_benchmark/tests/agents/test_human_player.py rename to llm_quest_benchmark/tests/players/test_human_player.py index 8334ebd..7108f78 100644 --- a/llm_quest_benchmark/tests/agents/test_human_player.py +++ b/llm_quest_benchmark/tests/players/test_human_player.py @@ -4,7 +4,7 @@ import pytest -from llm_quest_benchmark.agents.human_player import HumanPlayer +from llm_quest_benchmark.players.human import HumanPlayer def test_human_player_initialization(): diff --git a/llm_quest_benchmark/tests/test_database.py b/llm_quest_benchmark/tests/test_database.py index a04f53d..d00c6b1 100644 --- a/llm_quest_benchmark/tests/test_database.py +++ b/llm_quest_benchmark/tests/test_database.py @@ -249,8 +249,8 @@ def test_run_summary_export_tracks_repetition_rate(tmp_path, monkeypatch, quest_ assert exported["metrics"]["bad_decision_rate"] == 0.0 -def test_random_agent_does_not_export_json(tmp_path, monkeypatch, quest_logger): - """Random agent runs should not create result artifacts in results/.""" +def test_random_player_does_not_export_json(tmp_path, monkeypatch, quest_logger): + """Random player runs should not create result artifacts in results/.""" monkeypatch.setattr(logging_module, "RESULTS_DIR", tmp_path) quest_logger.agent = "random_choice" 
From 1fe3930742e2f15d21aa38cf71d42bad08fcdb6c Mon Sep 17 00:00:00 2001 From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com> Date: Wed, 13 May 2026 20:03:17 +0400 Subject: [PATCH 24/24] fix harness leaderboard memory mode --- llm_quest_benchmark/core/leaderboard.py | 2 +- llm_quest_benchmark/tests/test_leaderboard.py | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/llm_quest_benchmark/core/leaderboard.py b/llm_quest_benchmark/core/leaderboard.py index dc0a67b..078648e 100644 --- a/llm_quest_benchmark/core/leaderboard.py +++ b/llm_quest_benchmark/core/leaderboard.py @@ -385,7 +385,7 @@ def generate_leaderboard( template_from_config = str(config.get("action_template") or "") if template_from_config: template = template_from_config - memory_mode = config.get("memory_mode") + memory_mode = config.get("memory_mode") or result_row.get("memory_mode") if _is_retired_result( str(source_name) if source_name else None, str(benchmark_id) if benchmark_id else None, diff --git a/llm_quest_benchmark/tests/test_leaderboard.py b/llm_quest_benchmark/tests/test_leaderboard.py index 28a3e31..46407cf 100644 --- a/llm_quest_benchmark/tests/test_leaderboard.py +++ b/llm_quest_benchmark/tests/test_leaderboard.py @@ -377,3 +377,44 @@ def test_generate_leaderboard_matches_db_runs_by_identifiers(tmp_path, monkeypat rows = {(row["quest"], row["mode"]): row for row in leaderboard["results"]} assert rows[("Alpha", "compact_memory_memo")]["avg_steps"] == 10.0 assert rows[("Beta", "full_history_reasoning")]["avg_steps"] == 20.0 + + +def test_generate_leaderboard_uses_result_row_memory_mode_without_db_config(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + benchmark_dir = Path("results/benchmarks/bench_result_memory_mode") + benchmark_dir.mkdir(parents=True, exist_ok=True) + results = [ + { + "quest": "quests/Beta.qm", + "model": "gpt-5-mini", + "template": "reasoning.jinja", + "memory_mode": "full_transcript", + "agent_id": 
"harness_gpt-5-mini", + "outcome": "SUCCESS", + } + ] + db_runs = [ + { + "id": 20, + "quest_file": "quests/Beta.qm", + "quest_name": "Beta", + "agent_id": "harness_gpt-5-mini", + "agent_config": json.dumps({"model": "gpt-5-mini", "harness": "reasoning_full"}), + "outcome": "SUCCESS", + } + ] + (benchmark_dir / "benchmark_summary.json").write_text( + json.dumps( + {"benchmark_id": "bench_result_memory_mode", "harnesses": [], "results": results, "db_runs": db_runs} + ), + encoding="utf-8", + ) + + leaderboard = generate_leaderboard( + [str(benchmark_dir)], + "site/leaderboard.json", + min_runs=0, + public_model_ids=None, + ) + + assert leaderboard["results"][0]["mode"] == "full_history_reasoning"