From a95908fb4fbf7b728fb4e43eb782f37c0291403d Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 13:51:09 +0400
Subject: [PATCH 01/24] harnesses: add package skeleton with base, memory,
 tools

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 llm_quest_benchmark/harnesses/__init__.py |   3 +
 llm_quest_benchmark/harnesses/base.py     | 269 ++++++++++++++++++
 llm_quest_benchmark/harnesses/memory.py   | 317 ++++++++++++++++++++++
 llm_quest_benchmark/harnesses/tools.py    | 171 ++++++++++++
 4 files changed, 760 insertions(+)
 create mode 100644 llm_quest_benchmark/harnesses/__init__.py
 create mode 100644 llm_quest_benchmark/harnesses/base.py
 create mode 100644 llm_quest_benchmark/harnesses/memory.py
 create mode 100644 llm_quest_benchmark/harnesses/tools.py

diff --git a/llm_quest_benchmark/harnesses/__init__.py b/llm_quest_benchmark/harnesses/__init__.py
new file mode 100644
index 0000000..75cef22
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/__init__.py
@@ -0,0 +1,3 @@
+from llm_quest_benchmark.harnesses.base import BaseHarness
+
+__all__ = ["BaseHarness"]
diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py
new file mode 100644
index 0000000..2ae3e16
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/base.py
@@ -0,0 +1,269 @@
+"""Base harness class for quest benchmark experiments."""
+
+import logging
+from abc import abstractmethod
+from typing import Any
+
+from llm_quest_benchmark.agents.base import QuestPlayer
+from llm_quest_benchmark.agents.llm_agent import (
+    RISKY_CHOICE_KEYWORDS,
+    SAFE_CHOICE_KEYWORDS,
+    _is_numeric_raw_reasoning,
+    _raw_reasoning_fallback,
+    parse_llm_response,
+)
+from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, normalize_template_name
+from llm_quest_benchmark.llm.client import get_llm_client, parse_model_name
+from llm_quest_benchmark.llm.prompt import PromptRenderer
+from llm_quest_benchmark.schemas.response import LLMResponse
+
+
+class BaseHarness(QuestPlayer):
+    """Abstract LLM harness base class."""
+
+    def __init__(
+        self,
+        model_name,
+        system_template,
+        temperature,
+        skip_single,
+        debug,
+        memory_module=None,
+        tools=None,
+    ):
+        super().__init__(skip_single=skip_single)
+        self.debug = debug
+        self.model_name = model_name.lower()
+        self.system_template = normalize_template_name(system_template)
+        self.action_template = DEFAULT_TEMPLATE
+        self.temperature = temperature
+        self.harness_name = ""
+        self.agent_id = f"harness_{self.model_name}"
+        self.memory_module = memory_module
+        self.tools = tools or []
+        self.model_spec = parse_model_name(self.model_name)
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if self.debug:
+            self.logger.setLevel(logging.DEBUG)
+            self.logger.propagate = False
+            if not any(getattr(h, "_llm_quest_handler", False) for h in self.logger.handlers):
+                handler = logging.StreamHandler()
+                handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
+                handler._llm_quest_handler = True
+                self.logger.addHandler(handler)
+
+        self.prompt_renderer = PromptRenderer(
+            None,
+            system_template=self.system_template,
+            action_template=self.action_template,
+        )
+        self.llm = None
+        self.history: list[LLMResponse] = []
+        self._use_safety_filter = True
+        self._last_response = LLMResponse(action=1, is_default=True)
+
+    def _ensure_llm(self) -> None:
+        """Lazily create the provider client only when inference is needed."""
+        if self.llm is None:
+            self.llm = get_llm_client(
+                self.model_name,
+                system_prompt=self.prompt_renderer.render_system_prompt(),
+                temperature=self.temperature,
+            )
+
+    @abstractmethod
+    def _get_action_impl(self, observation, choices) -> int:
+        """Return the selected 1-based action number."""
+        pass
+
+    @abstractmethod
+    def reset(self) -> None:
+        """Reset harness state between episodes."""
+        super().reset()
+        self.history = []
+        self._last_response = LLMResponse(action=1, is_default=True)
+        if self.memory_module is not None:
+            self.memory_module.reset()
+
+    def _format_prompt(self, observation, choices, memo=None, context=None) -> str:
+        """Render system and action Jinja templates for the current decision."""
+        system_prompt = self.prompt_renderer.render_system_prompt(
+            observation=observation,
+            choices=choices,
+            memo=memo,
+            context=context,
+        ).strip()
+        action_prompt = self.prompt_renderer.action_template.render(
+            observation=observation,
+            choices=[{"text": c.get("text", "")} for c in choices],
+            memo=memo,
+            context=context,
+        ).strip()
+        if system_prompt:
+            return f"{system_prompt}\n\n{action_prompt}".strip()
+        return action_prompt
+
+    def _parse_llm_response(self, response, num_choices) -> LLMResponse:
+        """Parse an LLM response into a structured response object."""
+        return parse_llm_response(response, num_choices, self.debug, self.logger)
+
+    def _call_llm(self, prompt, system_prompt=None) -> str:
+        """Call the LLM client with lightweight retry handling."""
+        self._ensure_llm()
+        last_error: Exception | None = None
+        for attempt in range(3):
+            try:
+                if system_prompt is not None:
+                    return self.llm.get_completion(prompt, system_prompt=system_prompt)
+                return self.llm.get_completion(prompt)
+            except TypeError:
+                if system_prompt is not None:
+                    return self.llm.get_completion(prompt)
+                raise
+            except Exception as exc:
+                last_error = exc
+                if self.debug:
+                    self.logger.warning("LLM call failed on attempt %d: %s", attempt + 1, exc)
+        raise last_error or RuntimeError("LLM call failed")
+
+    def _choice_risk_score(self, choice_text: str) -> int:
+        text = (choice_text or "").lower()
+        score = 0
+        for keyword in RISKY_CHOICE_KEYWORDS:
+            if keyword in text:
+                score += 2
+        for keyword in SAFE_CHOICE_KEYWORDS:
+            if keyword in text:
+                score -= 1
+        return score
+
+    def _apply_safety_filter(self, choices, preferred_action) -> int:
+        """Replace obviously risky actions when a clearly safer alternative exists."""
+        if not self._use_safety_filter or len(choices) < 2:
+            return preferred_action
+
+        current_idx = preferred_action - 1
+        if current_idx < 0 or current_idx >= len(choices):
+            return preferred_action
+
+        scored = [(idx + 1, self._choice_risk_score(c.get("text", ""))) for idx, c in enumerate(choices)]
+        scored.sort(key=lambda item: item[1])
+
+        best_action, best_score = scored[0]
+        current_score = self._choice_risk_score(choices[current_idx].get("text", ""))
+        if current_score - best_score >= 2:
+            if self.debug:
+                self.logger.debug(
+                    "Safety filter override: %s -> %s (risk %s -> %s)",
+                    preferred_action,
+                    best_action,
+                    current_score,
+                    best_score,
+                )
+            return best_action
+        return preferred_action
+
+    @staticmethod
+    def _normalize_usage(usage: dict[str, Any] | None) -> dict[str, Any]:
+        usage = usage or {}
+        prompt_tokens = int(usage.get("prompt_tokens") or 0)
+        completion_tokens = int(usage.get("completion_tokens") or 0)
+        total_tokens = int(usage.get("total_tokens") or (prompt_tokens + completion_tokens))
+        estimated_cost_usd = usage.get("estimated_cost_usd")
+        if estimated_cost_usd is not None:
+            estimated_cost_usd = float(estimated_cost_usd)
+        return {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens,
+            "estimated_cost_usd": estimated_cost_usd,
+        }
+
+    @classmethod
+    def _merge_usage(cls, first: dict[str, Any] | None, second: dict[str, Any] | None) -> dict[str, Any]:
+        a = cls._normalize_usage(first)
+        b = cls._normalize_usage(second)
+        merged_cost = None
+        if a["estimated_cost_usd"] is not None or b["estimated_cost_usd"] is not None:
+            merged_cost = (a["estimated_cost_usd"] or 0.0) + (b["estimated_cost_usd"] or 0.0)
+        return {
+            "prompt_tokens": a["prompt_tokens"] + b["prompt_tokens"],
+            "completion_tokens": a["completion_tokens"] + b["completion_tokens"],
+            "total_tokens": a["total_tokens"] + b["total_tokens"],
+            "estimated_cost_usd": merged_cost,
+        }
+
+    def _format_retry_prompt(self, state: str, choices: list[dict[str, str]]) -> str:
+        clipped_state = (state or "").strip()
+        if len(clipped_state) > 500:
+            clipped_state = clipped_state[:500] + "..."
+        choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:160]}" for i, c in enumerate(choices)])
+        return f"""Choose the best action.
+State: {clipped_state}
+Actions:
+{choices_text}
+
+Return valid JSON only:
+{{
+  "analysis": "<max 25 words>",
+  "reasoning": "<max 25 words>",
+  "result": <integer from 1 to {len(choices)}>
+}}"""
+
+    def _format_force_numeric_retry_prompt(self, choices: list[dict[str, str]]) -> str:
+        choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:110]}" for i, c in enumerate(choices)])
+        return f"""Pick one action number.
+{choices_text}
+Reply with one integer only: 1 to {len(choices)}."""
+
+    def _needs_force_numeric_retry(self) -> bool:
+        return self.model_spec.provider == "openai" and (
+            self.model_spec.model_id.startswith("gpt-5") or self.model_spec.model_id.startswith("o")
+        )
+
+    def _parse_with_retries(self, prompt: str, observation: str, choices: list[dict[str, str]]) -> LLMResponse:
+        """Call the model, parse, and retry once on invalid/default output."""
+        llm_response = self._call_llm(prompt)
+        llm_usage = self.llm.get_last_usage()
+        first_response = self._parse_llm_response(llm_response, len(choices))
+        parsed_response = first_response
+
+        if parsed_response.is_default:
+            retry_response = self._call_llm(self._format_retry_prompt(observation, choices))
+            retry_usage = self.llm.get_last_usage()
+            llm_usage = self._merge_usage(llm_usage, retry_usage)
+            retry_parsed = self._parse_llm_response(retry_response, len(choices))
+            if not retry_parsed.is_default:
+                retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
+                parsed_response = retry_parsed
+            elif self._needs_force_numeric_retry():
+                force_retry_response = self._call_llm(self._format_force_numeric_retry_prompt(choices))
+                force_retry_usage = self.llm.get_last_usage()
+                llm_usage = self._merge_usage(llm_usage, force_retry_usage)
+                force_retry_parsed = self._parse_llm_response(force_retry_response, len(choices))
+                if not force_retry_parsed.is_default:
+                    force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}"
+                    parsed_response = force_retry_parsed
+
+        if parsed_response is not first_response:
+            if parsed_response.analysis is None and first_response.analysis is not None:
+                parsed_response.analysis = first_response.analysis
+            if _is_numeric_raw_reasoning(parsed_response.reasoning):
+                if first_response.reasoning and not _is_numeric_raw_reasoning(first_response.reasoning):
+                    parsed_response.reasoning = first_response.reasoning
+                else:
+                    first_raw_reasoning = _raw_reasoning_fallback(llm_response)
+                    if first_raw_reasoning and not _is_numeric_raw_reasoning(first_raw_reasoning):
+                        parsed_response.reasoning = first_raw_reasoning
+
+        action_before_policy = parsed_response.action
+        parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
+        if parsed_response.action != action_before_policy and not parsed_response.reasoning:
+            parsed_response.reasoning = "policy_safety_override"
+
+        usage_payload = self._normalize_usage(llm_usage)
+        parsed_response.prompt_tokens = usage_payload["prompt_tokens"]
+        parsed_response.completion_tokens = usage_payload["completion_tokens"]
+        parsed_response.total_tokens = usage_payload["total_tokens"]
+        parsed_response.estimated_cost_usd = usage_payload["estimated_cost_usd"]
+        return parsed_response
diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py
new file mode 100644
index 0000000..ff54ff9
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/memory.py
@@ -0,0 +1,317 @@
+"""Memory modules for harness-based quest agents."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class MemoryModule(ABC):
+    @abstractmethod
+    def get_context(self, step: int) -> str:
+        pass
+
+    @abstractmethod
+    def update(self, step_data: dict) -> None:
+        pass
+
+    @abstractmethod
+    def reset(self) -> None:
+        pass
+
+    def set_quest_briefing(self, briefing: str) -> None:
+        pass
+
+
+class DefaultMemory(MemoryModule):
+    """Recent N observations window (no compaction)."""
+
+    def __init__(self, context_window: int = 3, context_chars: int = 220, decision_window: int = 5):
+        self.context_window = context_window
+        self.context_chars = context_chars
+        self.decision_window = decision_window
+        self._quest_briefing: str | None = None
+        self._observations: list[str] = []
+        self._decisions: list[dict[str, Any]] = []
+
+    def set_quest_briefing(self, briefing: str) -> None:
+        clean = (briefing or "").strip()
+        self._quest_briefing = clean or None
+
+    def get_context(self, step: int) -> str:
+        blocks: list[str] = []
+        current = self._observations[-1] if self._observations else ""
+
+        briefing = self._briefing_block(current)
+        if briefing:
+            blocks.append(briefing)
+
+        if len(self._observations) > 1:
+            previous = self._observations[:-1][-self.context_window :]
+            if previous:
+                snippets = []
+                for idx, text in enumerate(previous, start=1):
+                    clipped = text if len(text) <= self.context_chars else text[: self.context_chars] + "..."
+                    snippets.append(f"[Previous {idx}] {clipped}")
+                blocks.append("Recent context from previous steps:\n" + "\n\n".join(snippets))
+
+        if self._decisions:
+            recent_memos = []
+            for item in self._decisions[-self.decision_window :]:
+                memo = (item.get("memo") or "").strip()
+                if not memo:
+                    continue
+                if recent_memos and recent_memos[-1] == memo:
+                    continue
+                recent_memos.append(memo)
+            if recent_memos:
+                lines = [f"[Memo {idx}] {memo}" for idx, memo in enumerate(recent_memos, start=1)]
+                blocks.append("State memo (recent):\n" + "\n".join(lines))
+
+            decision_lines = []
+            for idx, item in enumerate(self._decisions[-self.decision_window :], start=1):
+                choice = item.get("choice") or item.get("choice_text", "")
+                parse_mode = item.get("parse_mode", "unknown")
+                memo_val = item.get("memo")
+                memo_suffix = f" | memo: {memo_val}" if memo_val else ""
+                decision_lines.append(
+                    f"[Decision {idx}] action {item.get('action')}: {choice} (parse={parse_mode}){memo_suffix}"
+                )
+            blocks.append("Recent selected actions:\n" + "\n".join(decision_lines))
+
+        return "\n\n".join(blocks)
+
+    def update(self, step_data: dict) -> None:
+        observation = (step_data.get("observation") or step_data.get("state") or "").strip()
+        if observation:
+            if self._quest_briefing is None:
+                self._quest_briefing = observation
+            self._observations.append(observation)
+            if len(self._observations) > 20:
+                self._observations = self._observations[-20:]
+
+        if any(key in step_data for key in ("action", "choice", "choice_text", "memo")):
+            memo = (step_data.get("memo") or "").strip()[:350] or None
+            self._decisions.append(
+                {
+                    "action": step_data.get("action"),
+                    "choice": step_data.get("choice") or step_data.get("choice_text", ""),
+                    "parse_mode": step_data.get("parse_mode", "unknown"),
+                    "memo": memo,
+                }
+            )
+            if len(self._decisions) > 40:
+                self._decisions = self._decisions[-40:]
+
+    def reset(self) -> None:
+        self._quest_briefing = None
+        self._observations = []
+        self._decisions = []
+
+    def _briefing_block(self, current_state: str) -> str | None:
+        if not self._quest_briefing:
+            return None
+        if current_state.strip() == self._quest_briefing:
+            return None
+        briefing = self._quest_briefing
+        if len(briefing) > 800:
+            briefing = briefing[:800] + "..."
+        return f"Quest briefing (your mission):\n{briefing}"
+
+
+class FullTranscriptMemory(MemoryModule):
+    """Unbounded full transcript in context."""
+
+    def __init__(self):
+        self._quest_briefing: str | None = None
+        self._transcript: list[dict[str, Any]] = []
+
+    def set_quest_briefing(self, briefing: str) -> None:
+        clean = (briefing or "").strip()
+        self._quest_briefing = clean or None
+
+    def get_context(self, step: int) -> str:
+        blocks: list[str] = []
+        current_state = self._transcript[-1].get("observation", "") if self._transcript else ""
+        briefing = self._briefing_block(current_state)
+        if briefing:
+            blocks.append(briefing)
+
+        if self._transcript:
+            lines = []
+            for entry in self._transcript:
+                step_value = entry.get("step", "?")
+                obs = entry.get("observation", "")
+                if len(obs) > 400:
+                    obs = obs[:400] + "..."
+                chosen = entry.get("choice_text") or entry.get("choice", "")
+                reasoning = entry.get("reasoning", "")
+                line = f"Step {step_value}: {obs}"
+                if chosen:
+                    line += f"\n  You chose: {chosen}"
+                if reasoning:
+                    line += f"\n  Reasoning: {reasoning[:800]}"
+                state_notes = entry.get("memo", "")
+                if state_notes:
+                    line += f"\n  State: {state_notes[:350]}"
+                lines.append(line)
+            blocks.append("=== QUEST TRANSCRIPT ===\n" + "\n\n".join(lines))
+
+        return "\n\n".join(blocks)
+
+    def update(self, step_data: dict) -> None:
+        observation = (step_data.get("observation") or step_data.get("state") or "").strip()
+        if observation and self._quest_briefing is None:
+            self._quest_briefing = observation
+        entry = dict(step_data)
+        entry["observation"] = observation
+        entry["step"] = entry.get("step") or len(self._transcript) + 1
+        self._transcript.append(entry)
+
+    def reset(self) -> None:
+        self._quest_briefing = None
+        self._transcript = []
+
+    def _briefing_block(self, current_state: str) -> str | None:
+        if not self._quest_briefing:
+            return None
+        if current_state.strip() == self._quest_briefing:
+            return None
+        briefing = self._quest_briefing
+        if len(briefing) > 800:
+            briefing = briefing[:800] + "..."
+        return f"Quest briefing (your mission):\n{briefing}"
+
+
+class CompactionMemory(MemoryModule):
+    """Periodic LLM summarization + 20-word memo field."""
+
+    def __init__(self, compaction_interval: int = 50, llm_client=None):
+        self.compaction_interval = compaction_interval
+        self.llm_client = llm_client
+        self._quest_briefing: str | None = None
+        self._transcript: list[dict[str, Any]] = []
+        self._compaction_summary: str | None = None
+        self._steps_since_compaction = 0
+
+    def set_quest_briefing(self, briefing: str) -> None:
+        clean = (briefing or "").strip()
+        self._quest_briefing = clean or None
+
+    def get_context(self, step: int) -> str:
+        blocks: list[str] = []
+        current_state = self._transcript[-1].get("observation", "") if self._transcript else ""
+        briefing = self._briefing_block(current_state)
+        if briefing:
+            blocks.append(briefing)
+
+        if self._compaction_summary:
+            compacted_at = max(0, step - self._steps_since_compaction)
+            blocks.append(f"=== QUEST MEMORY (compacted at step {compacted_at}) ===\n{self._compaction_summary}")
+
+        recent = self._transcript[-self._steps_since_compaction :] if self._steps_since_compaction > 0 else []
+        if recent:
+            lines = []
+            for entry in recent:
+                step_value = entry.get("step", "?")
+                obs = entry.get("observation", "")
+                if len(obs) > 400:
+                    obs = obs[:400] + "..."
+                chosen = entry.get("choice_text") or entry.get("choice", "")
+                line = f"Step {step_value}: {obs}"
+                if chosen:
+                    line += f"\n  You chose: {chosen}"
+                state_notes = entry.get("memo", "")
+                if state_notes:
+                    line += f"\n  State: {state_notes[:350]}"
+                lines.append(line)
+            blocks.append("=== RECENT STEPS ===\n" + "\n\n".join(lines))
+
+        return "\n\n".join(blocks)
+
+    def update(self, step_data: dict) -> None:
+        observation = (step_data.get("observation") or step_data.get("state") or "").strip()
+        if observation and self._quest_briefing is None:
+            self._quest_briefing = observation
+        entry = dict(step_data)
+        entry["observation"] = observation[:400]
+        entry["step"] = entry.get("step") or len(self._transcript) + 1
+        if entry.get("memo"):
+            entry["memo"] = self._twenty_word_memo(str(entry["memo"]))
+        self._transcript.append(entry)
+        self._steps_since_compaction += 1
+        self._maybe_compact()
+
+    def reset(self) -> None:
+        self._quest_briefing = None
+        self._transcript = []
+        self._compaction_summary = None
+        self._steps_since_compaction = 0
+
+    def _maybe_compact(self) -> None:
+        if self._steps_since_compaction < self.compaction_interval:
+            return
+        transcript_text = self._format_transcript_for_compaction()
+        if not transcript_text:
+            return
+
+        prompt_parts = ["You are summarizing an agent's progress through a text quest."]
+        if self._quest_briefing:
+            prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}")
+        if self._compaction_summary:
+            prompt_parts.append(f"\nPREVIOUS SUMMARY:\n{self._compaction_summary}")
+        prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}")
+        prompt_parts.append(
+            "\nSummarize the agent's progress. Include:\n"
+            "- Current objective (what the agent should do next)\n"
+            "- Progress so far (what has been accomplished)\n"
+            "- Key facts (NPCs, items, locations, deadlines discovered)\n"
+            "- Failed approaches (actions/paths that didn't work)\n"
+            "- Map knowledge (locations visited and connections)\n\n"
+            "Write a concise summary in plain text, max 300 words."
+        )
+
+        summary = ""
+        if self.llm_client is not None:
+            summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip()
+        if summary:
+            self._compaction_summary = summary
+            self._transcript = []
+            self._steps_since_compaction = 0
+
+    def _format_transcript_for_compaction(self) -> str:
+        recent = (
+            self._transcript[-self._steps_since_compaction :]
+            if self._steps_since_compaction > 0
+            else self._transcript[-self.compaction_interval :]
+        )
+        lines = []
+        for entry in recent:
+            step = entry.get("step", "?")
+            obs = entry.get("observation", "")
+            if len(obs) > 400:
+                obs = obs[:400] + "..."
+            chosen = entry.get("choice_text") or entry.get("choice", "")
+            reasoning = entry.get("reasoning", "")
+            state_notes = entry.get("memo", "")
+            line = f"Step {step}: {obs}"
+            if chosen:
+                line += f"\n  Chose: {chosen}"
+            if state_notes:
+                line += f"\n  State: {state_notes[:350]}"
+            if reasoning:
+                line += f"\n  Reasoning: {reasoning[:800]}"
+            lines.append(line)
+        return "\n\n".join(lines)
+
+    def _briefing_block(self, current_state: str) -> str | None:
+        if not self._quest_briefing:
+            return None
+        if current_state.strip() == self._quest_briefing:
+            return None
+        briefing = self._quest_briefing
+        if len(briefing) > 800:
+            briefing = briefing[:800] + "..."
+        return f"Quest briefing (your mission):\n{briefing}"
+
+    @staticmethod
+    def _twenty_word_memo(memo: str) -> str:
+        return " ".join(memo.split()[:20])
diff --git a/llm_quest_benchmark/harnesses/tools.py b/llm_quest_benchmark/harnesses/tools.py
new file mode 100644
index 0000000..5386d6d
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/tools.py
@@ -0,0 +1,171 @@
+"""Reusable tools for harness-based quest agents."""
+
+import ast
+import re
+
+MAX_SCRATCHPAD_CHARS = 1200
+
+
+def calculator(expression: str) -> str:
+    """Evaluate a restricted arithmetic/comparison expression."""
+    expr = (expression or "").strip()
+    if not expr:
+        return "error: empty expression"
+    if len(expr) > 240:
+        return "error: expression too long"
+    if not re.fullmatch(r"[0-9a-zA-Z\s+\-*/().,<>=!%]+", expr):
+        return "error: unsupported characters"
+
+    allowed_nodes = (
+        ast.Expression,
+        ast.Constant,
+        ast.UnaryOp,
+        ast.UAdd,
+        ast.USub,
+        ast.BinOp,
+        ast.Add,
+        ast.Sub,
+        ast.Mult,
+        ast.Div,
+        ast.FloorDiv,
+        ast.Mod,
+        ast.Pow,
+        ast.Compare,
+        ast.Eq,
+        ast.NotEq,
+        ast.Lt,
+        ast.LtE,
+        ast.Gt,
+        ast.GtE,
+        ast.BoolOp,
+        ast.And,
+        ast.Or,
+    )
+    try:
+        tree = ast.parse(expr, mode="eval")
+        for node in ast.walk(tree):
+            if not isinstance(node, allowed_nodes):
+                return f"error: unsupported expression element {node.__class__.__name__}"
+            if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, bool)):
+                return "error: constants must be numeric or boolean"
+        result = _eval_calculator_node(tree.body)
+    except Exception as exc:
+        return f"error: {exc}"
+    return f"{expr} = {result}"
+
+
+def _eval_calculator_node(node: ast.AST) -> int | float | bool:
+    """Recursively evaluate a validated calculator AST node; ValueError on unsupported or oversized input."""
+    if isinstance(node, ast.Constant) and isinstance(node.value, (int, float, bool)):
+        return node.value
+    if isinstance(node, ast.UnaryOp):
+        value = _eval_calculator_node(node.operand)
+        if isinstance(node.op, ast.UAdd):
+            return +value
+        if isinstance(node.op, ast.USub):
+            return -value
+    if isinstance(node, ast.BinOp):
+        left = _eval_calculator_node(node.left)
+        right = _eval_calculator_node(node.right)
+        if isinstance(node.op, ast.Add):
+            return left + right
+        if isinstance(node.op, ast.Sub):
+            return left - right
+        if isinstance(node.op, ast.Mult):
+            return left * right
+        if isinstance(node.op, ast.Div):
+            return left / right
+        if isinstance(node.op, ast.FloorDiv):
+            return left // right
+        if isinstance(node.op, ast.Mod):
+            return left % right
+        if isinstance(node.op, ast.Pow):
+            if abs(right) > 8:
+                raise ValueError("exponent too large")
+            if abs(left) > 10**9:  # nested powers such as ((9**8)**8)**8 would otherwise grow unbounded
+                raise ValueError("base too large")
+            return left**right
+    if isinstance(node, ast.BoolOp):
+        values = [bool(_eval_calculator_node(value)) for value in node.values]
+        return all(values) if isinstance(node.op, ast.And) else any(values)  # BoolOp.op is only And/Or
+    if isinstance(node, ast.Compare):
+        left = _eval_calculator_node(node.left)
+        for op, comparator in zip(node.ops, node.comparators, strict=True):
+            right = _eval_calculator_node(comparator)
+            if isinstance(op, ast.Eq):
+                ok = left == right
+            elif isinstance(op, ast.NotEq):
+                ok = left != right
+            elif isinstance(op, ast.Lt):
+                ok = left < right
+            elif isinstance(op, ast.LtE):
+                ok = left <= right
+            elif isinstance(op, ast.Gt):
+                ok = left > right
+            elif isinstance(op, ast.GtE):
+                ok = left >= right
+            else:
+                raise ValueError("unsupported comparison")
+            if not ok:
+                return False
+            left = right
+        return True
+    raise ValueError("unsupported expression")
+
+
+class Scratchpad:
+    """Persistent free-form note blob with read and replace operations."""
+
+    def __init__(self, max_chars: int = MAX_SCRATCHPAD_CHARS):
+        self.max_chars = max_chars
+        self._content = ""
+
+    def read(self) -> str:
+        return self._content or "(empty)"
+
+    def write_replace(self, content: str = "") -> str:
+        note = " ".join((content or "").strip().split())
+        self._content = note[: self.max_chars]
+        return f"updated: {self._content or '(empty)'}"
+
+    def reset(self) -> None:
+        self._content = ""
+
+
+class QuestHistoryTool:
+    """Keyword search over a run-local quest step log (falls back to the most recent steps)."""
+
+    def __init__(self, step_log: list[dict] | None = None, history_window: int = 10):
+        self.step_log = step_log if step_log is not None else []  # caller's list is kept by reference
+        self.history_window = history_window
+
+    def search(self, query: str) -> str:
+        """Return up to ``history_window`` logged steps relevant to ``query``, one formatted line each."""
+        if not self.step_log:
+            return "No prior quest steps recorded yet."
+
+        tokens = set(re.findall(r"[a-zA-Z\u0400-\u04ff0-9_]{3,}", (query or "").lower()))
+        scored = []
+        for entry in self.step_log:
+            haystack = " ".join(
+                [
+                    entry.get("observation", ""),
+                    " ".join(entry.get("choices", [])),
+                    entry.get("selected_choice", ""),
+                ]
+            ).lower()
+            score = sum(1 for token in tokens if token in haystack)
+            scored.append((score, entry))
+
+        # Best keyword matches first; ties are broken in favour of the later step.
+        scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True)
+        best = [entry for score, entry in scored if score > 0][: self.history_window]
+        if not best:
+            # No hit: the descending sort leaves the OLDEST steps at the tail, so slice the log itself.
+            best = self.step_log[-self.history_window :]
+        lines = [
+            f"Step {entry.get('step', '?')}: obs={entry.get('observation', '')} | "
+            f"choices={'; '.join(entry.get('choices', []))} | picked={entry.get('selected_choice', 'n/a')}"
+            for entry in best
+        ]
+        return "\n".join(lines)

From 1c63c3c317187b40e7488252327685c464ac50b4 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 14:00:31 +0400
Subject: [PATCH 02/24] harnesses: implement 8 concrete harness classes

---
 llm_quest_benchmark/agents/__init__.py        |  44 +-
 llm_quest_benchmark/agents/agent_factory.py   |  28 +-
 llm_quest_benchmark/agents/planner_agent.py   | 236 +----------
 llm_quest_benchmark/agents/strategic_agent.py |  95 +----
 llm_quest_benchmark/agents/tool_agent.py      | 385 +-----------------
 llm_quest_benchmark/harnesses/base.py         | 127 +++++-
 llm_quest_benchmark/harnesses/factory.py      |  53 +++
 llm_quest_benchmark/harnesses/memo.py         |  62 +++
 llm_quest_benchmark/harnesses/minimal.py      |  61 +++
 llm_quest_benchmark/harnesses/planner.py      | 198 +++++++++
 llm_quest_benchmark/harnesses/reasoning.py    |  57 +++
 llm_quest_benchmark/harnesses/tool_harness.py | 238 +++++++++++
 12 files changed, 849 insertions(+), 735 deletions(-)
 create mode 100644 llm_quest_benchmark/harnesses/factory.py
 create mode 100644 llm_quest_benchmark/harnesses/memo.py
 create mode 100644 llm_quest_benchmark/harnesses/minimal.py
 create mode 100644 llm_quest_benchmark/harnesses/planner.py
 create mode 100644 llm_quest_benchmark/harnesses/reasoning.py
 create mode 100644 llm_quest_benchmark/harnesses/tool_harness.py

diff --git a/llm_quest_benchmark/agents/__init__.py b/llm_quest_benchmark/agents/__init__.py
index 852fb91..d056964 100644
--- a/llm_quest_benchmark/agents/__init__.py
+++ b/llm_quest_benchmark/agents/__init__.py
@@ -1,15 +1,29 @@
-from .agent_factory import create_agent
-from .base import QuestPlayer
-from .llm_agent import LLMAgent
-from .planner_agent import PlannerAgent
-from .random_agent import RandomAgent
-from .tool_agent import ToolAgent
-
-__all__ = [
-    "create_agent",
-    "QuestPlayer",
-    "RandomAgent",
-    "LLMAgent",
-    "PlannerAgent",
-    "ToolAgent",
-]
+__all__ = ["create_agent", "QuestPlayer", "RandomAgent", "LLMAgent", "PlannerAgent", "ToolAgent"]
+
+
+def __getattr__(name):
+    """Lazily import and return a public name on first access (PEP 562).
+
+    Each submodule is imported only when one of its names is requested, so
+    importing the package itself does not import any agent module.
+
+    Raises:
+        AttributeError: if ``name`` is not a lazily exported name.
+    """
+    # Public name -> submodule that defines it; keep in sync with __all__.
+    lazy_exports = {
+        "create_agent": "agent_factory",
+        "QuestPlayer": "base",
+        "RandomAgent": "random_agent",
+        "LLMAgent": "llm_agent",
+        "PlannerAgent": "planner_agent",
+        "ToolAgent": "tool_agent",
+    }
+    if name not in lazy_exports:
+        # Match the interpreter's standard wording for missing attributes.
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+    from importlib import import_module
+
+    submodule = import_module(f".{lazy_exports[name]}", __name__)
+    return getattr(submodule, name)
diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py
index d7b889b..6d2ff42 100644
--- a/llm_quest_benchmark/agents/agent_factory.py
+++ b/llm_quest_benchmark/agents/agent_factory.py
@@ -5,9 +5,7 @@
 from llm_quest_benchmark.agents.base import QuestPlayer
 from llm_quest_benchmark.agents.human_player import HumanPlayer
 from llm_quest_benchmark.agents.llm_agent import LLMAgent
-from llm_quest_benchmark.agents.planner_agent import PlannerAgent
 from llm_quest_benchmark.agents.random_agent import RandomAgent
-from llm_quest_benchmark.agents.tool_agent import ToolAgent
 from llm_quest_benchmark.constants import (
     DEFAULT_MODEL,
     DEFAULT_TEMPERATURE,
@@ -66,27 +64,29 @@ def create_agent(
         return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
 
     if resolved_action_template == "planner.jinja":
-        return PlannerAgent(
-            debug=debug,
-            model_name=model,
-            system_template=system_template,
-            action_template=resolved_action_template,
+        from llm_quest_benchmark.harnesses.factory import create_harness
+
+        return create_harness(
+            harness="planner",
+            model=model,
             temperature=temperature,
             skip_single=skip_single,
-            memory_mode=memory_mode,
+            debug=debug,
             compaction_interval=compaction_interval,
+            system_template=system_template,
         )
 
     if resolved_action_template in ("tool_augmented.jinja", "tool_augmented_hints.jinja"):
-        return ToolAgent(
-            debug=debug,
-            model_name=model,
-            system_template=system_template,
-            action_template=resolved_action_template,
+        from llm_quest_benchmark.harnesses.factory import create_harness
+
+        return create_harness(
+            harness="tool_hinted" if resolved_action_template == "tool_augmented_hints.jinja" else "tool_compact",
+            model=model,
             temperature=temperature,
             skip_single=skip_single,
-            memory_mode=memory_mode,
+            debug=debug,
             compaction_interval=compaction_interval,
+            system_template=system_template,
         )
 
     # Default to LLM agent
diff --git a/llm_quest_benchmark/agents/planner_agent.py b/llm_quest_benchmark/agents/planner_agent.py
index 1999afd..cd20e0d 100644
--- a/llm_quest_benchmark/agents/planner_agent.py
+++ b/llm_quest_benchmark/agents/planner_agent.py
@@ -1,235 +1,9 @@
-"""Planner agent with a lightweight plan-maintain-act loop."""
+"""Deprecated compatibility wrapper for the planner harness."""
 
-import logging
-import re
-from typing import Any
+import warnings
 
-from llm_quest_benchmark.agents.llm_agent import LLMAgent, LLMResponse, parse_llm_response
+from llm_quest_benchmark.harnesses.planner import PlannerHarness as PlannerAgent
 
+warnings.warn("planner_agent is deprecated, use harnesses.planner", DeprecationWarning, stacklevel=2)
 
-class PlannerAgent(LLMAgent):
-    """LLM agent that maintains a short plan and re-plans on notable changes."""
-
-    def __init__(
-        self,
-        *args,
-        action_template: str = "planner.jinja",
-        **kwargs,
-    ):
-        super().__init__(*args, action_template=action_template, **kwargs)
-        self.agent_id = f"planner_{self.model_name}"
-        self.current_plan: str | None = None
-        self._plan_history: list[str] = []
-
-    def _recent_actions(self) -> list[str]:
-        entries = []
-        for item in self._decision_history[-3:]:
-            choice = (item.get("choice") or "").strip()
-            if not choice:
-                continue
-            entries.append(f"{item.get('action')}. {choice}")
-        return entries
-
-    @staticmethod
-    def _normalize_plan(raw_plan: str) -> str:
-        compact = " ".join((raw_plan or "").strip().split())
-        if not compact:
-            return ""
-
-        sentences = re.split(r"(?<=[.!?])\s+", compact)
-        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
-        if len(sentences) >= 5:
-            return " ".join(sentences[:5])
-        return compact
-
-    def _build_planner_prompt(
-        self,
-        observation: str,
-        choices: list[dict[str, str]],
-        prompt_kind: str,
-        replan_reason: str | None = None,
-    ) -> str:
-        template = self.prompt_renderer.get_template(self.action_template)
-        return template.render(
-            prompt_kind=prompt_kind,
-            observation=observation,
-            choices=[{"text": choice.get("text", "")} for choice in choices],
-            current_plan=self.current_plan,
-            replan_reason=replan_reason,
-            recent_actions=self._recent_actions(),
-        ).strip()
-
-    def _observation_changed_significantly(self, observation: str) -> bool:
-        """Check if the observation differs enough from the previous one to warrant re-planning.
-
-        Uses token-level overlap ratio: if less than 50% of tokens are shared,
-        the scene has changed significantly.
-        """
-        if len(self._observation_history) < 2:
-            return False
-
-        prev_tokens = set(self._observation_history[-2].lower().split())
-        curr_tokens = set((observation or "").lower().split())
-        if not prev_tokens or not curr_tokens:
-            return True
-        overlap = len(prev_tokens & curr_tokens) / max(len(prev_tokens), len(curr_tokens))
-        return overlap < 0.5
-
-    def _should_replan(self, observation: str, state_signature: str) -> tuple[bool, str | None]:
-        if not self.current_plan:
-            return True, "No plan exists yet."
-
-        if any(self._state_action_counts.get(state_signature, {}).values()):
-            return True, "This state has repeated, so a previous action already failed to progress."
-
-        if self._observation_changed_significantly(observation):
-            return True, "The scene changed significantly from the previous observation."
-
-        return False, None
-
-    def _update_plan(
-        self,
-        observation: str,
-        choices: list[dict[str, str]],
-        replan_reason: str | None,
-    ) -> dict[str, Any]:
-        self._ensure_llm()
-        prompt = self._build_planner_prompt(
-            observation,
-            choices,
-            prompt_kind="plan",
-            replan_reason=replan_reason,
-        )
-        plan_response = self.llm.get_completion(prompt)
-        usage = self.llm.get_last_usage()
-        plan = self._normalize_plan(plan_response)
-        if not plan:
-            if self.current_plan:
-                plan = self.current_plan
-            else:
-                plan = (
-                    "Gather clues, protect resources, and avoid obvious traps while "
-                    "advancing toward the main objective."
-                )
-        self.current_plan = plan
-        self._plan_history.append(plan)
-        if len(self._plan_history) > 10:
-            self._plan_history = self._plan_history[-10:]
-        return usage
-
-    def _choose_action_with_plan(
-        self,
-        observation: str,
-        choices: list[dict[str, str]],
-        replan_reason: str | None,
-    ) -> tuple[LLMResponse, dict[str, Any]]:
-        prompt = self._build_planner_prompt(
-            observation,
-            choices,
-            prompt_kind="act",
-            replan_reason=replan_reason,
-        )
-        llm_response = self.llm.get_completion(prompt)
-        llm_usage = self.llm.get_last_usage()
-        parsed_response = parse_llm_response(llm_response, len(choices), self.debug, self.logger)
-
-        if parsed_response.is_default:
-            retry_response = self.llm.get_completion(self._format_retry_prompt(observation, choices))
-            retry_usage = self.llm.get_last_usage()
-            llm_usage = self._merge_usage(llm_usage, retry_usage)
-            retry_parsed = parse_llm_response(
-                retry_response,
-                len(choices),
-                self.debug,
-                self.logger,
-            )
-            if not retry_parsed.is_default:
-                retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
-                parsed_response = retry_parsed
-            elif self._needs_force_numeric_retry():
-                force_retry_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices))
-                force_retry_usage = self.llm.get_last_usage()
-                llm_usage = self._merge_usage(llm_usage, force_retry_usage)
-                force_retry_parsed = parse_llm_response(
-                    force_retry_response,
-                    len(choices),
-                    self.debug,
-                    self.logger,
-                )
-                if not force_retry_parsed.is_default:
-                    force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}"
-                    parsed_response = force_retry_parsed
-
-        return parsed_response, llm_usage
-
-    def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
-        if self.debug:
-            self.logger.debug("PlannerAgent evaluating state with %s choices", len(choices))
-        try:
-            state_signature = self._state_signature(state, choices)
-            contextual_state = self._build_contextual_state(state)
-            should_replan, replan_reason = self._should_replan(state, state_signature)
-            plan_usage = None
-            if should_replan:
-                plan_usage = self._update_plan(contextual_state, choices, replan_reason)
-
-            parsed_response, action_usage = self._choose_action_with_plan(
-                contextual_state,
-                choices,
-                replan_reason if should_replan else None,
-            )
-            action_before_policy = parsed_response.action
-            parsed_response.action = self._apply_safety_filter(parsed_response.action, choices)
-            if parsed_response.action != action_before_policy and not parsed_response.reasoning:
-                parsed_response.reasoning = "policy_safety_override"
-
-            total_usage = (
-                self._merge_usage(plan_usage, action_usage) if plan_usage else self._normalize_usage(action_usage)
-            )
-            if plan_usage:
-                total_usage = self._normalize_usage(total_usage)
-
-            parsed_response.prompt_tokens = total_usage["prompt_tokens"]
-            parsed_response.completion_tokens = total_usage["completion_tokens"]
-            parsed_response.total_tokens = total_usage["total_tokens"]
-            parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
-
-            self.history.append(parsed_response)
-            self._last_response = parsed_response
-            self._remember_decision(state, choices, state_signature, parsed_response)
-
-            if parsed_response.action < 1 or parsed_response.action > len(choices):
-                self.logger.error(
-                    "INVALID ACTION DETECTED: %s not in range 1-%s",
-                    parsed_response.action,
-                    len(choices),
-                )
-                parsed_response.action = 1
-
-            return parsed_response.action
-        except Exception as exc:
-            self.logger.error("Planner agent error during LLM call: %s", exc)
-            default_response = LLMResponse(
-                action=1,
-                is_default=True,
-                parse_mode="error_default",
-                reasoning=f"planner_error: {exc}",
-            )
-            self.history.append(default_response)
-            self._last_response = default_response
-            return 1
-
-    def reset(self) -> None:
-        super().reset()
-        self.current_plan = None
-        self._plan_history = []
-
-    def on_game_start(self) -> None:
-        super().on_game_start()
-        self.current_plan = None
-        self._plan_history = []
-
-    def on_game_end(self, final_state: dict[str, Any]) -> None:
-        if self.debug:
-            logging.getLogger(self.__class__.__name__).debug("Planner finished with plan: %s", self.current_plan)
-        super().on_game_end(final_state)
+__all__ = ["PlannerAgent"]
diff --git a/llm_quest_benchmark/agents/strategic_agent.py b/llm_quest_benchmark/agents/strategic_agent.py
index 387c650..a4cc4e7 100644
--- a/llm_quest_benchmark/agents/strategic_agent.py
+++ b/llm_quest_benchmark/agents/strategic_agent.py
@@ -1,94 +1,3 @@
-"""Strategic agent decorator that adds analysis capabilities"""
+"""Deprecated strategic agent module."""
 
-import logging
-from typing import Any
-
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.llm.prompt import PromptRenderer
-
-
-class StrategicAgent(QuestPlayer):
-    """Decorator that adds strategic thinking to any quest player"""
-
-    def __init__(self, base_agent: QuestPlayer, debug: bool = False, template: str = "advanced.jinja"):
-        """Initialize strategic agent wrapper
-
-        Args:
-            base_agent: Base agent to wrap (usually LLMAgent)
-            debug: Enable debug logging
-            template: Template to use for enhanced prompts
-        """
-        super().__init__(skip_single=base_agent.skip_single)
-        self.agent = base_agent
-        self.debug = debug
-        self.history = []
-
-        # Setup logging
-        self.logger = logging.getLogger(self.__class__.__name__)
-        if self.debug:
-            self.logger.setLevel(logging.DEBUG)
-            handler = logging.StreamHandler()
-            handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
-            self.logger.addHandler(handler)
-
-        # Initialize prompt renderer
-        self.prompt_renderer = PromptRenderer(None, template=template)
-
-    def _get_action_impl(self, observation: str, choices: list) -> str:
-        """Implementation of action selection logic with strategic analysis"""
-        if hasattr(self.agent, "llm"):
-            # First, get situation analysis
-            if self.debug:
-                self.logger.debug(f"\nObservation:\n{observation}")
-
-            analysis = self.agent.llm(
-                "Analyze this situation and explain your thinking step-by-step instead of choosing an action:\n"
-                + observation
-            )
-
-            if self.debug:
-                self.logger.debug(f"\nAnalysis:\n{analysis}")
-
-            # Store analysis in history
-            self.history.append({"observation": observation, "analysis": analysis})
-
-            # Get enhanced context with history
-            enhanced_context = self.get_enhanced_context(observation, choices)
-            if self.debug:
-                self.logger.debug(f"\nEnhanced Context:\n{enhanced_context}")
-
-            # Then make the actual choice with analysis context
-            return self.agent.get_action(enhanced_context, choices)
-        else:
-            # If agent doesn't have LLM capability, just pass through
-            return self.agent.get_action(observation, choices)
-
-    def get_enhanced_context(self, observation: str, choices: list) -> str:
-        """Build context for advanced prompt with historical analysis"""
-        context = [
-            f"Turn {len(self.history) + 1}: {entry['analysis']}"
-            for entry in self.history[-3:]  # Last 3 analyses
-        ]
-        return self.prompt_renderer.render_action_prompt(
-            observation=observation, choices=choices, state_tracker=context
-        )
-
-    def reset(self) -> None:
-        """Reset both strategic and base agent state"""
-        self.history = []
-        self.agent.reset()
-
-    def on_game_start(self) -> None:
-        """Pass through to base agent"""
-        if self.debug:
-            self.logger.debug("Starting new game with strategic analysis")
-        self.agent.on_game_start()
-
-    def on_game_end(self, final_state: dict[str, Any]) -> None:
-        """Pass through to base agent and log analysis history"""
-        self.agent.on_game_end(final_state)
-        if self.debug:
-            self.logger.debug("Final Analysis History:")
-            for entry in self.history:
-                self.logger.debug(f"\nObservation: {entry['observation']}")
-                self.logger.debug(f"Analysis: {entry['analysis']}")
+raise ImportError("strategic_agent is deprecated; use llm_quest_benchmark.harnesses instead")
diff --git a/llm_quest_benchmark/agents/tool_agent.py b/llm_quest_benchmark/agents/tool_agent.py
index 694d1ac..659a747 100644
--- a/llm_quest_benchmark/agents/tool_agent.py
+++ b/llm_quest_benchmark/agents/tool_agent.py
@@ -1,384 +1,9 @@
-"""Tool-augmented agent with lightweight structured prompting."""
+"""Deprecated compatibility wrapper for the tool harness."""
 
-import ast
-import re
-from typing import Any
+import warnings
 
-from llm_quest_benchmark.agents.llm_agent import (
-    LLMAgent,
-    LLMResponse,
-    _parse_json_response,
-    parse_llm_response,
-)
+from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness as ToolAgent
 
+warnings.warn("tool_agent is deprecated, use harnesses.tool_harness", DeprecationWarning, stacklevel=2)
 
-class ToolAgent(LLMAgent):
-    """LLM agent with generic run-local tools for history, math, and state notes."""
-
-    DEFAULT_HISTORY_WINDOW = 10
-    MAX_SCRATCHPAD_CHARS = 1200
-    MAX_TOOL_INPUT_CHARS = 500
-
-    def __init__(
-        self,
-        *args,
-        action_template: str = "tool_augmented.jinja",
-        history_window: int | None = None,
-        **kwargs,
-    ):
-        super().__init__(*args, action_template=action_template, **kwargs)
-        self.agent_id = f"tool_{self.model_name}"
-        self._step_log: list[dict[str, Any]] = []
-        self._history_window = history_window or self.DEFAULT_HISTORY_WINDOW
-        self._scratchpad = ""
-
-    def _recent_steps(self) -> list[str]:
-        snippets = []
-        for entry in self._step_log[-self._history_window :]:
-            snippets.append(f"Step {entry['step']}: {entry['observation']} -> {entry.get('selected_choice', 'n/a')}")
-        return snippets
-
-    def _tool_descriptions(self) -> list[str]:
-        return [
-            "quest_history(query): search earlier observations and chosen actions in this quest.",
-            "calculator(expression): evaluate arithmetic and simple comparisons.",
-            "scratchpad(operation, content): read or replace one persistent note. operation is read or write_replace.",
-        ]
-
-    def quest_history(self, query: str) -> str:
-        """Return relevant previous steps from this quest run via keyword match."""
-        if not self._step_log:
-            return "No prior quest steps recorded yet."
-
-        tokens = set(re.findall(r"[a-zA-Z\u0400-\u04ff0-9_]{3,}", (query or "").lower()))
-        scored = []
-        for entry in self._step_log:
-            haystack = " ".join(
-                [
-                    entry.get("observation", ""),
-                    " ".join(entry.get("choices", [])),
-                    entry.get("selected_choice", ""),
-                ]
-            ).lower()
-            score = sum(1 for token in tokens if token in haystack)
-            scored.append((score, entry))
-
-        scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True)
-        best = [entry for s, entry in scored if s > 0][: self._history_window]
-        if not best:
-            best = [entry for _, entry in scored[-self._history_window :]]
-
-        lines = []
-        for entry in best:
-            lines.append(
-                f"Step {entry['step']}: obs={entry['observation']} | "
-                f"choices={'; '.join(entry['choices'])} | picked={entry.get('selected_choice', 'n/a')}"
-            )
-        return "\n".join(lines)
-
-    @staticmethod
-    def calculator(expression: str) -> str:
-        """Evaluate a restricted arithmetic/comparison expression."""
-        expr = (expression or "").strip()
-        if not expr:
-            return "error: empty expression"
-        if len(expr) > 240:
-            return "error: expression too long"
-        if not re.fullmatch(r"[0-9a-zA-Z\s+\-*/().,<>=!%]+", expr):
-            return "error: unsupported characters"
-
-        allowed_nodes = (
-            ast.Expression,
-            ast.Constant,
-            ast.UnaryOp,
-            ast.UAdd,
-            ast.USub,
-            ast.BinOp,
-            ast.Add,
-            ast.Sub,
-            ast.Mult,
-            ast.Div,
-            ast.FloorDiv,
-            ast.Mod,
-            ast.Pow,
-            ast.Compare,
-            ast.Eq,
-            ast.NotEq,
-            ast.Lt,
-            ast.LtE,
-            ast.Gt,
-            ast.GtE,
-            ast.BoolOp,
-            ast.And,
-            ast.Or,
-        )
-        try:
-            tree = ast.parse(expr, mode="eval")
-            for node in ast.walk(tree):
-                if not isinstance(node, allowed_nodes):
-                    return f"error: unsupported expression element {node.__class__.__name__}"
-                if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, bool)):
-                    return "error: constants must be numeric or boolean"
-            result = ToolAgent._eval_calculator_node(tree.body)
-        except Exception as exc:
-            return f"error: {exc}"
-        return f"{expr} = {result}"
-
-    @staticmethod
-    def _eval_calculator_node(node: ast.AST) -> int | float | bool:
-        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float, bool)):
-            return node.value
-        if isinstance(node, ast.UnaryOp):
-            value = ToolAgent._eval_calculator_node(node.operand)
-            if isinstance(node.op, ast.UAdd):
-                return +value
-            if isinstance(node.op, ast.USub):
-                return -value
-        if isinstance(node, ast.BinOp):
-            left = ToolAgent._eval_calculator_node(node.left)
-            right = ToolAgent._eval_calculator_node(node.right)
-            if isinstance(node.op, ast.Add):
-                return left + right
-            if isinstance(node.op, ast.Sub):
-                return left - right
-            if isinstance(node.op, ast.Mult):
-                return left * right
-            if isinstance(node.op, ast.Div):
-                return left / right
-            if isinstance(node.op, ast.FloorDiv):
-                return left // right
-            if isinstance(node.op, ast.Mod):
-                return left % right
-            if isinstance(node.op, ast.Pow):
-                if abs(right) > 8:
-                    raise ValueError("exponent too large")
-                return left**right
-        if isinstance(node, ast.BoolOp):
-            values = [bool(ToolAgent._eval_calculator_node(value)) for value in node.values]
-            if isinstance(node.op, ast.And):
-                return all(values)
-            if isinstance(node.op, ast.Or):
-                return any(values)
-        if isinstance(node, ast.Compare):
-            left = ToolAgent._eval_calculator_node(node.left)
-            for op, comparator in zip(node.ops, node.comparators, strict=True):
-                right = ToolAgent._eval_calculator_node(comparator)
-                if isinstance(op, ast.Eq):
-                    ok = left == right
-                elif isinstance(op, ast.NotEq):
-                    ok = left != right
-                elif isinstance(op, ast.Lt):
-                    ok = left < right
-                elif isinstance(op, ast.LtE):
-                    ok = left <= right
-                elif isinstance(op, ast.Gt):
-                    ok = left > right
-                elif isinstance(op, ast.GtE):
-                    ok = left >= right
-                else:
-                    raise ValueError("unsupported comparison")
-                if not ok:
-                    return False
-                left = right
-            return True
-        raise ValueError("unsupported expression")
-
-    def scratchpad(self, operation: str, content: str = "") -> str:
-        """Read or replace one persistent free-form note blob."""
-        op = (operation or "").strip().lower()
-        if op == "read":
-            return self._scratchpad or "(empty)"
-        if op == "write_replace":
-            note = " ".join((content or "").strip().split())
-            self._scratchpad = note[: self.MAX_SCRATCHPAD_CHARS]
-            return f"updated: {self._scratchpad or '(empty)'}"
-        return "error: operation must be read or write_replace"
-
-    def _build_tool_prompt(
-        self,
-        observation: str,
-        choices: list[dict[str, str]],
-        prompt_kind: str,
-        tool_results: list[str] | None = None,
-    ) -> str:
-        template = self.prompt_renderer.get_template(self.action_template)
-        return template.render(
-            prompt_kind=prompt_kind,
-            observation=observation,
-            choices=[{"text": choice.get("text", "")} for choice in choices],
-            tool_descriptions=self._tool_descriptions(),
-            tool_results=tool_results or [],
-            recent_steps=self._recent_steps(),
-            scratchpad_note=self._scratchpad,
-        ).strip()
-
-    @staticmethod
-    def _extract_tool_calls(response: str) -> list[dict[str, Any]]:
-        payload, _ = _parse_json_response(response)
-        if not isinstance(payload, dict):
-            return []
-
-        tool_calls = payload.get("tool_calls")
-        if not isinstance(tool_calls, list):
-            return []
-
-        normalized = []
-        for item in tool_calls[:1]:
-            if not isinstance(item, dict):
-                continue
-            tool_name = str(item.get("tool") or "").strip()
-            tool_input = item.get("input")
-            operation = str(item.get("operation") or "").strip()
-            content = str(item.get("content") or "").strip()
-            if isinstance(tool_input, dict):
-                operation = operation or str(tool_input.get("operation") or "").strip()
-                content = content or str(tool_input.get("content") or "").strip()
-                tool_input = tool_input.get("expression") or tool_input.get("query") or tool_input.get("content") or ""
-            tool_input = str(tool_input or "").strip()
-            if len(tool_input) > ToolAgent.MAX_TOOL_INPUT_CHARS:
-                tool_input = tool_input[: ToolAgent.MAX_TOOL_INPUT_CHARS]
-            if len(content) > ToolAgent.MAX_TOOL_INPUT_CHARS:
-                content = content[: ToolAgent.MAX_TOOL_INPUT_CHARS]
-            if tool_name:
-                normalized.append(
-                    {
-                        "tool": tool_name,
-                        "input": tool_input,
-                        "operation": operation,
-                        "content": content,
-                    }
-                )
-        return normalized
-
-    def _execute_tool_calls(self, tool_calls: list[dict[str, Any]]) -> list[str]:
-        results = []
-        for tc in tool_calls:
-            name, inp = tc["tool"], tc.get("input", "")
-            if name == "quest_history":
-                result = self.quest_history(inp)
-            elif name == "calculator":
-                result = self.calculator(inp)
-            elif name == "scratchpad":
-                operation = tc.get("operation") or inp
-                result = self.scratchpad(str(operation), str(tc.get("content") or ""))
-            else:
-                result = f"unknown tool: {name}"
-            call_repr = inp
-            if name == "scratchpad":
-                call_repr = f"{tc.get('operation') or inp}, {tc.get('content') or ''}".strip(", ")
-            results.append(f"{name}({call_repr}) => {result}")
-        return results
-
-    def _final_choice(
-        self,
-        observation: str,
-        choices: list[dict[str, str]],
-        tool_results: list[str] | None = None,
-    ) -> tuple[LLMResponse, dict[str, Any]]:
-        prompt = self._build_tool_prompt(
-            observation,
-            choices,
-            prompt_kind="final",
-            tool_results=tool_results,
-        )
-        llm_response = self.llm.get_completion(prompt)
-        llm_usage = self.llm.get_last_usage()
-        parsed_response = parse_llm_response(llm_response, len(choices), self.debug, self.logger)
-
-        if parsed_response.is_default:
-            retry_response = self.llm.get_completion(self._format_retry_prompt(observation, choices))
-            retry_usage = self.llm.get_last_usage()
-            llm_usage = self._merge_usage(llm_usage, retry_usage)
-            retry_parsed = parse_llm_response(retry_response, len(choices), self.debug, self.logger)
-            if not retry_parsed.is_default:
-                retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
-                parsed_response = retry_parsed
-            elif self._needs_force_numeric_retry():
-                force_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices))
-                force_usage = self.llm.get_last_usage()
-                llm_usage = self._merge_usage(llm_usage, force_usage)
-                force_parsed = parse_llm_response(force_response, len(choices), self.debug, self.logger)
-                if not force_parsed.is_default:
-                    force_parsed.parse_mode = f"force_retry_{force_parsed.parse_mode or 'parsed'}"
-                    parsed_response = force_parsed
-
-        return parsed_response, llm_usage
-
-    def _log_step(self, observation: str, choices: list[dict[str, str]], response: LLMResponse) -> None:
-        selected = ""
-        if 1 <= response.action <= len(choices):
-            selected = choices[response.action - 1].get("text", "")
-
-        clipped = " ".join((observation or "").strip().split())
-        if len(clipped) > 180:
-            clipped = clipped[:180] + "..."
-
-        self._step_log.append(
-            {
-                "step": len(self._step_log) + 1,
-                "observation": clipped,
-                "choices": [c.get("text", "") for c in choices],
-                "selected_choice": selected,
-            }
-        )
-
-    def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
-        try:
-            state_signature = self._state_signature(state, choices)
-            contextual_state = self._build_contextual_state(state)
-            self._ensure_llm()
-
-            selection_prompt = self._build_tool_prompt(contextual_state, choices, prompt_kind="select")
-            selection_response = self.llm.get_completion(selection_prompt)
-            selection_usage = self.llm.get_last_usage()
-            tool_calls = self._extract_tool_calls(selection_response)
-            parsed_response = parse_llm_response(selection_response, len(choices), self.debug, self.logger)
-            tool_results: list[str] = []
-
-            total_usage = self._normalize_usage(selection_usage)
-            if tool_calls:
-                tool_results = self._execute_tool_calls(tool_calls)
-                parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results)
-                total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
-            elif parsed_response.is_default:
-                parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[])
-                total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
-
-            action_before_policy = parsed_response.action
-            parsed_response.action = self._apply_safety_filter(parsed_response.action, choices)
-            if parsed_response.action != action_before_policy and not parsed_response.reasoning:
-                parsed_response.reasoning = "policy_safety_override"
-
-            parsed_response.prompt_tokens = total_usage["prompt_tokens"]
-            parsed_response.completion_tokens = total_usage["completion_tokens"]
-            parsed_response.total_tokens = total_usage["total_tokens"]
-            parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
-            parsed_response.tool_calls = tool_calls or None
-            parsed_response.tool_results = tool_results or None
-
-            self.history.append(parsed_response)
-            self._last_response = parsed_response
-            self._remember_decision(state, choices, state_signature, parsed_response)
-            self._log_step(state, choices, parsed_response)
-            return parsed_response.action
-        except Exception as exc:
-            self.logger.error("Tool agent error during LLM call: %s", exc)
-            default_response = LLMResponse(
-                action=1,
-                is_default=True,
-                parse_mode="error_default",
-                reasoning=f"tool_agent_error: {exc}",
-            )
-            self.history.append(default_response)
-            self._last_response = default_response
-            return 1
-
-    def reset(self) -> None:
-        super().reset()
-        self._step_log = []
-        self._scratchpad = ""
-
-    def on_game_start(self) -> None:
-        super().on_game_start()
-        self._step_log = []
-        self._scratchpad = ""
+__all__ = ["ToolAgent"]
diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py
index 2ae3e16..ec70b55 100644
--- a/llm_quest_benchmark/harnesses/base.py
+++ b/llm_quest_benchmark/harnesses/base.py
@@ -1,6 +1,8 @@
 """Base harness class for quest benchmark experiments."""
 
+import hashlib
 import logging
+import re
 from abc import abstractmethod
 from typing import Any
 
@@ -30,14 +32,15 @@ def __init__(
         debug,
         memory_module=None,
         tools=None,
+        action_template=DEFAULT_TEMPLATE,
     ):
         super().__init__(skip_single=skip_single)
         self.debug = debug
         self.model_name = model_name.lower()
         self.system_template = normalize_template_name(system_template)
-        self.action_template = DEFAULT_TEMPLATE
+        self.action_template = normalize_template_name(action_template)
         self.temperature = temperature
-        self.harness_name = ""
+        self.harness_name = getattr(self.__class__, "harness_name", "")
         self.agent_id = f"harness_{self.model_name}"
         self.memory_module = memory_module
         self.tools = tools or []
@@ -61,6 +64,10 @@ def __init__(
         self.history: list[LLMResponse] = []
         self._use_safety_filter = True
         self._last_response = LLMResponse(action=1, is_default=True)
+        self._observation_history: list[str] = []
+        self._decision_history: list[dict[str, Any]] = []
+        self._state_action_counts: dict[str, dict[int, int]] = {}
+        self._step_count = 0
 
     def _ensure_llm(self) -> None:
         """Lazily create the provider client only when inference is needed."""
@@ -82,9 +89,125 @@ def reset(self) -> None:
         super().reset()
         self.history = []
         self._last_response = LLMResponse(action=1, is_default=True)
+        self._observation_history = []
+        self._decision_history = []
+        self._state_action_counts = {}
+        self._step_count = 0
         if self.memory_module is not None:
             self.memory_module.reset()
 
+    def get_action(self, observation: str, choices: list[dict[str, str]]) -> int:
+        clean = (observation or "").strip()  # None-safe; blank observations are not recorded
+        if clean:
+            self._observation_history.append(clean)
+            if len(self._observation_history) > 20:
+                self._observation_history = self._observation_history[-20:]  # cap at the 20 newest entries
+            if self.memory_module is not None:  # step is 1-based: _step_count advances in _remember_decision
+                self.memory_module.update({"observation": clean, "step": self._step_count + 1})
+        return super().get_action(observation, choices)  # the untrimmed observation is what gets forwarded
+
+    def on_game_start(self) -> None:
+        super().on_game_start()
+        self.reset()
+
+    def on_game_end(self, final_state: dict[str, Any]) -> None:
+        if self.debug:
+            self.logger.debug("Game ended with state: %s", final_state)
+
+    def get_last_response(self) -> LLMResponse | None:
+        return self._last_response
+
+    @property
+    def _quest_briefing(self) -> str | None:
+        return getattr(self.memory_module, "_quest_briefing", None)
+
+    @_quest_briefing.setter
+    def _quest_briefing(self, value: str | None) -> None:
+        if self.memory_module is not None:
+            self.memory_module._quest_briefing = value
+
+    @property
+    def _transcript(self) -> list[dict[str, Any]]:
+        return getattr(self.memory_module, "_transcript", [])
+
+    @_transcript.setter
+    def _transcript(self, value: list[dict[str, Any]]) -> None:
+        if self.memory_module is not None:
+            self.memory_module._transcript = value
+
+    @property
+    def _steps_since_compaction(self) -> int:
+        return getattr(self.memory_module, "_steps_since_compaction", 0)
+
+    @_steps_since_compaction.setter
+    def _steps_since_compaction(self, value: int) -> None:
+        if self.memory_module is not None:
+            self.memory_module._steps_since_compaction = value
+
+    def _build_contextual_state(self, state: str) -> str:
+        if self.memory_module is None:
+            return state
+        context = self.memory_module.get_context(self._step_count + 1)
+        if not context:
+            return state
+        return f"{context}\n\nCurrent story state:\n{state}"
+
+    @staticmethod
+    def _normalize_for_signature(value: str, max_len: int = 320) -> str:
+        text = (value or "").lower()
+        text = re.sub(r"\s+", " ", text).strip()
+        return text[:max_len] if len(text) > max_len else text
+
+    def _state_signature(self, state: str, choices: list[dict[str, str]]) -> str:
+        normalized_state = self._normalize_for_signature(state, max_len=420)  # lowercased, whitespace-collapsed
+        normalized_choices = "|".join(
+            self._normalize_for_signature(choice.get("text", ""), max_len=110) for choice in choices
+        )
+        raw_signature = f"{normalized_state}||{normalized_choices}"  # fingerprint input, not a secret
+        return hashlib.sha1(raw_signature.encode("utf-8", errors="ignore"), usedforsecurity=False).hexdigest()[:20]
+
+    def _remember_decision(
+        self,
+        state: str,
+        choices: list[dict[str, str]],
+        state_signature: str,
+        response: LLMResponse,
+    ) -> None:
+        action = int(response.action)
+        counts = self._state_action_counts.setdefault(state_signature, {})
+        counts[action] = counts.get(action, 0) + 1
+
+        selected_text = ""
+        if 1 <= action <= len(choices):
+            selected_text = choices[action - 1].get("text", "")
+        state_snippet = (state or "").strip()
+        if len(state_snippet) > 220:
+            state_snippet = state_snippet[:220] + "..."
+
+        decision = {
+            "state": state_snippet,
+            "action": action,
+            "choice": selected_text,
+            "choice_text": selected_text,
+            "parse_mode": response.parse_mode or "unknown",
+            "memo": (response.memo or "").strip()[:350] or None,
+            "reasoning": (response.reasoning or "")[:800],
+        }
+        self._decision_history.append(decision)
+        if len(self._decision_history) > 40:
+            self._decision_history = self._decision_history[-40:]
+
+        self._step_count += 1
+        if self.memory_module is not None:
+            self.memory_module.update(
+                {
+                    "step": self._step_count,
+                    "observation": state,
+                    "choices": [c.get("text", "") for c in choices],
+                    **decision,
+                }
+            )
+
     def _format_prompt(self, observation, choices, memo=None, context=None) -> str:
         """Render system and action Jinja templates for the current decision."""
         system_prompt = self.prompt_renderer.render_system_prompt(
diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
new file mode 100644
index 0000000..58d2546
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -0,0 +1,53 @@
+"""Factory for creating harness-based quest players."""
+
+from llm_quest_benchmark.agents.base import QuestPlayer
+from llm_quest_benchmark.agents.human_player import HumanPlayer
+from llm_quest_benchmark.agents.random_agent import RandomAgent
+from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+from llm_quest_benchmark.harnesses.planner import PlannerHarness
+from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness
+from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness
+
+HARNESS_REGISTRY = {
+    "minimal": MinimalHarness,
+    "reasoning_recent": ReasoningRecentHarness,
+    "reasoning_full": ReasoningFullTranscriptHarness,
+    "memo_compact": MemoCompactHarness,
+    "hinted_compact": HintedCompactHarness,
+    "tool_compact": ToolCompactHarness,
+    "tool_hinted": ToolHintedHarness,
+    "planner": PlannerHarness,
+}
+
+
+def create_harness(
+    harness: str,
+    model: str,
+    temperature: float = 0.4,
+    skip_single: bool = False,
+    debug: bool = False,
+    compaction_interval: int = 50,
+    system_template: str = "system_role.jinja",
+) -> QuestPlayer:  # harness is "human", "random_choice[_<seed>]", or a HARNESS_REGISTRY key
+    if harness == "human":  # model, temperature and templates are not used for the human player
+        return HumanPlayer(skip_single=skip_single)
+    if harness.startswith("random_choice"):
+        seed = None
+        if "_" in harness[13:]:  # 13 == len("random_choice"); look for a "_<seed>" suffix
+            try:
+                seed = int(harness.split("_")[-1])  # text after the last underscore
+            except ValueError:
+                pass  # non-integer suffix: keep seed as None
+        return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
+    if harness not in HARNESS_REGISTRY:
+        raise ValueError(f"Unknown harness '{harness}'. Valid: {sorted(HARNESS_REGISTRY)}")
+    cls = HARNESS_REGISTRY[harness]
+    return cls(  # every registered harness class is constructed with this same keyword set
+        model_name=model,
+        temperature=temperature,
+        skip_single=skip_single,
+        debug=debug,
+        compaction_interval=compaction_interval,
+        system_template=system_template,
+    )
diff --git a/llm_quest_benchmark/harnesses/memo.py b/llm_quest_benchmark/harnesses/memo.py
new file mode 100644
index 0000000..764f206
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/memo.py
@@ -0,0 +1,62 @@
+"""Compacted-memory harness variants."""
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.memory import CompactionMemory
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+
+
+class MemoCompactHarness(MinimalHarness):
+    harness_name = "memo_compact"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "stateful_compact.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        compaction_interval: int = 50,
+        memory_module=None,
+        **kwargs,
+    ):
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval),
+            **kwargs,
+        )
+        self._memory_mode = "compaction"
+        self._compaction_interval = compaction_interval
+
+
+class HintedCompactHarness(MemoCompactHarness):
+    harness_name = "hinted_compact"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "stateful_compact_hints.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        compaction_interval: int = 50,
+        memory_module=None,
+        **kwargs,
+    ):
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            compaction_interval=compaction_interval,
+            memory_module=memory_module,
+            **kwargs,
+        )
diff --git a/llm_quest_benchmark/harnesses/minimal.py b/llm_quest_benchmark/harnesses/minimal.py
new file mode 100644
index 0000000..8fa8ba0
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/minimal.py
@@ -0,0 +1,61 @@
+"""Minimal harness implementation."""
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.base import BaseHarness
+from llm_quest_benchmark.harnesses.memory import DefaultMemory
+from llm_quest_benchmark.schemas.response import LLMResponse
+
+
+class MinimalHarness(BaseHarness):
+    """Simple prompt-call-parse action loop with recent-memory context."""
+
+    harness_name = "minimal"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "stub.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        memory_module=None,
+        compaction_interval: int = 50,
+        **_,
+    ):
+        del compaction_interval
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or DefaultMemory(),
+        )
+
+    def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> int:
+        try:
+            state_signature = self._state_signature(observation, choices)  # fingerprint of the raw observation
+            prompt = self._format_prompt(self._build_contextual_state(observation), choices)
+            parsed_response = self._parse_with_retries(prompt, observation, choices)
+            if parsed_response.action < 1 or parsed_response.action > len(choices):
+                parsed_response.action = 1  # clamp before recording so memory matches the action returned
+            self.history.append(parsed_response)
+            self._last_response = parsed_response
+            self._remember_decision(observation, choices, state_signature, parsed_response)
+            return parsed_response.action
+        except Exception as exc:  # best-effort: any failure falls back to choice 1 instead of aborting
+            self.logger.error("Harness error during LLM call: %s", exc)
+            default_response = LLMResponse(
+                action=1,
+                is_default=True,
+                parse_mode="error_default",
+                reasoning=f"llm_call_error: {exc}",
+            )
+            self.history.append(default_response)
+            self._last_response = default_response
+            return 1
+
+    def reset(self) -> None:
+        super().reset()
diff --git a/llm_quest_benchmark/harnesses/planner.py b/llm_quest_benchmark/harnesses/planner.py
new file mode 100644
index 0000000..efb77a9
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/planner.py
@@ -0,0 +1,198 @@
+"""Planner harness implementation."""
+
+import logging
+import re
+from typing import Any
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.base import BaseHarness
+from llm_quest_benchmark.harnesses.memory import CompactionMemory
+from llm_quest_benchmark.schemas.response import LLMResponse
+
+
+class PlannerHarness(BaseHarness):
+    """Compacted-memory harness with a lightweight plan-maintain-act loop."""
+
+    harness_name = "planner"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "planner.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        compaction_interval: int = 50,
+        memory_module=None,
+        **_,
+    ):
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval),
+        )
+        self.agent_id = f"planner_{self.model_name}"
+        self.current_plan: str | None = None
+        self._plan_history: list[str] = []
+        self._memory_mode = "compaction"
+        self._compaction_interval = compaction_interval
+
+    def _recent_actions(self) -> list[str]:
+        entries = []
+        for item in self._decision_history[-3:]:
+            choice = (item.get("choice") or "").strip()
+            if choice:
+                entries.append(f"{item.get('action')}. {choice}")
+        return entries
+
+    @staticmethod
+    def _normalize_plan(raw_plan: str) -> str:
+        compact = " ".join((raw_plan or "").strip().split())  # collapse every whitespace run to one space
+        if not compact:
+            return ""  # empty or None plan text
+        sentences = re.split(r"(?<=[.!?])\s+", compact)  # split on whitespace that follows ., ! or ?
+        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
+        if len(sentences) >= 5:
+            return " ".join(sentences[:5])  # keep at most the first five sentences
+        return compact
+
+    def _build_planner_prompt(
+        self,
+        observation: str,
+        choices: list[dict[str, str]],
+        prompt_kind: str,
+        replan_reason: str | None = None,
+    ) -> str:
+        template = self.prompt_renderer.get_template(self.action_template)
+        return template.render(
+            prompt_kind=prompt_kind,
+            observation=observation,
+            choices=[{"text": choice.get("text", "")} for choice in choices],
+            current_plan=self.current_plan,
+            replan_reason=replan_reason,
+            recent_actions=self._recent_actions(),
+        ).strip()
+
+    def _observation_changed_significantly(self, observation: str) -> bool:
+        if len(self._observation_history) < 2:
+            return False  # fewer than two recorded observations: nothing to compare
+        prev_tokens = set(self._observation_history[-2].lower().split())  # [-1] is presumably the current scene
+        curr_tokens = set((observation or "").lower().split())
+        if not prev_tokens or not curr_tokens:
+            return True  # one side has no tokens: treated as a change
+        overlap = len(prev_tokens & curr_tokens) / max(len(prev_tokens), len(curr_tokens))  # shared / larger set
+        return overlap < 0.5  # under half of the larger token set is shared
+
+    def _should_replan(self, observation: str, state_signature: str) -> tuple[bool, str | None]:
+        if not self.current_plan:
+            return True, "No plan exists yet."
+        if any(self._state_action_counts.get(state_signature, {}).values()):
+            return True, "This state has repeated, so a previous action already failed to progress."
+        if self._observation_changed_significantly(observation):
+            return True, "The scene changed significantly from the previous observation."
+        return False, None
+
+    def _update_plan(
+        self,
+        observation: str,
+        choices: list[dict[str, str]],
+        replan_reason: str | None,
+    ) -> dict[str, Any]:
+        self._ensure_llm()
+        prompt = self._build_planner_prompt(observation, choices, prompt_kind="plan", replan_reason=replan_reason)
+        plan_response = self._call_llm(prompt)
+        usage = self.llm.get_last_usage()
+        plan = self._normalize_plan(plan_response)
+        if not plan:
+            plan = self.current_plan or (
+                "Gather clues, protect resources, and avoid obvious traps while advancing toward the main objective."
+            )
+        self.current_plan = plan
+        self._plan_history.append(plan)
+        if len(self._plan_history) > 10:
+            self._plan_history = self._plan_history[-10:]
+        return usage
+
+    def _choose_action_with_plan(
+        self,
+        observation: str,
+        choices: list[dict[str, str]],
+        replan_reason: str | None,
+    ) -> tuple[LLMResponse, dict[str, Any]]:
+        prompt = self._build_planner_prompt(observation, choices, prompt_kind="act", replan_reason=replan_reason)
+        parsed_response = self._parse_with_retries(prompt, observation, choices)
+        return parsed_response, {
+            "prompt_tokens": parsed_response.prompt_tokens,
+            "completion_tokens": parsed_response.completion_tokens,
+            "total_tokens": parsed_response.total_tokens,
+            "estimated_cost_usd": parsed_response.estimated_cost_usd,
+        }
+
+    def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
+        if self.debug:
+            self.logger.debug("PlannerHarness evaluating state with %s choices", len(choices))
+        try:
+            state_signature = self._state_signature(state, choices)
+            contextual_state = self._build_contextual_state(state)
+            should_replan, replan_reason = self._should_replan(state, state_signature)
+            plan_usage = None
+            if should_replan:  # replanning issues an additional LLM call before the action call
+                plan_usage = self._update_plan(contextual_state, choices, replan_reason)
+
+            parsed_response, action_usage = self._choose_action_with_plan(
+                contextual_state,
+                choices,
+                replan_reason if should_replan else None,
+            )
+
+            action_before_policy = parsed_response.action
+            parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
+            if parsed_response.action != action_before_policy and not parsed_response.reasoning:
+                parsed_response.reasoning = "policy_safety_override"
+
+            total_usage = (
+                self._merge_usage(plan_usage, action_usage) if plan_usage else self._normalize_usage(action_usage)
+            )
+            total_usage = self._normalize_usage(total_usage)
+            parsed_response.prompt_tokens = total_usage["prompt_tokens"]
+            parsed_response.completion_tokens = total_usage["completion_tokens"]
+            parsed_response.total_tokens = total_usage["total_tokens"]
+            parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
+
+            if parsed_response.action < 1 or parsed_response.action > len(choices):
+                parsed_response.action = 1  # clamp before recording so memory matches the action returned
+            self.history.append(parsed_response)
+            self._last_response = parsed_response
+            self._remember_decision(state, choices, state_signature, parsed_response)
+            return parsed_response.action
+        except Exception as exc:  # best-effort: any failure falls back to choice 1 instead of aborting
+            self.logger.error("Planner harness error during LLM call: %s", exc)
+            default_response = LLMResponse(
+                action=1,
+                is_default=True,
+                parse_mode="error_default",
+                reasoning=f"planner_error: {exc}",
+            )
+            self.history.append(default_response)
+            self._last_response = default_response
+            return 1
+
+    def reset(self) -> None:
+        super().reset()
+        self.current_plan = None
+        self._plan_history = []
+
+    def on_game_start(self) -> None:
+        super().on_game_start()
+        self.current_plan = None
+        self._plan_history = []
+
+    def on_game_end(self, final_state: dict[str, Any]) -> None:
+        if self.debug:
+            logging.getLogger(self.__class__.__name__).debug("Planner finished with plan: %s", self.current_plan)
+        super().on_game_end(final_state)
diff --git a/llm_quest_benchmark/harnesses/reasoning.py b/llm_quest_benchmark/harnesses/reasoning.py
new file mode 100644
index 0000000..79564d5
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/reasoning.py
@@ -0,0 +1,57 @@
+"""Reasoning harness variants."""
+
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.memory import DefaultMemory, FullTranscriptMemory
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+
+
+class ReasoningRecentHarness(MinimalHarness):
+    harness_name = "reasoning_recent"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "reasoning.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        memory_module=None,
+        **kwargs,
+    ):
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or DefaultMemory(),
+            **kwargs,
+        )
+
+
+class ReasoningFullTranscriptHarness(MinimalHarness):
+    harness_name = "reasoning_full"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "reasoning.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        memory_module=None,
+        **kwargs,
+    ):
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or FullTranscriptMemory(),
+            **kwargs,
+        )
diff --git a/llm_quest_benchmark/harnesses/tool_harness.py b/llm_quest_benchmark/harnesses/tool_harness.py
new file mode 100644
index 0000000..e89e7c1
--- /dev/null
+++ b/llm_quest_benchmark/harnesses/tool_harness.py
@@ -0,0 +1,238 @@
+"""Tool-augmented harness implementations."""
+
+from typing import Any
+
+from llm_quest_benchmark.agents.llm_agent import _parse_json_response
+from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.harnesses.base import BaseHarness
+from llm_quest_benchmark.harnesses.memory import CompactionMemory
+from llm_quest_benchmark.harnesses.tools import QuestHistoryTool, Scratchpad, calculator
+from llm_quest_benchmark.schemas.response import LLMResponse
+
+
+class ToolCompactHarness(BaseHarness):
+    """Compacted-memory harness with a two-phase tool selection/action loop."""
+
+    harness_name = "tool_compact"
+    DEFAULT_HISTORY_WINDOW = 10
+    MAX_TOOL_INPUT_CHARS = 500
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        action_template: str = "tool_augmented.jinja",
+        temperature: float = DEFAULT_TEMPERATURE,
+        skip_single: bool = False,
+        debug: bool = False,
+        compaction_interval: int = 50,
+        memory_module=None,
+        history_window: int | None = None,
+        **_,
+    ):
+        self._step_log: list[dict[str, Any]] = []
+        self._history_window = history_window or self.DEFAULT_HISTORY_WINDOW
+        self._scratchpad_tool = Scratchpad()
+        self._history_tool = QuestHistoryTool(self._step_log, self._history_window)
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval),
+            tools=[calculator, self._scratchpad_tool, self._history_tool],
+        )
+        self._memory_mode = "compaction"
+        self._compaction_interval = compaction_interval
+
+    def _recent_steps(self) -> list[str]:
+        return [
+            f"Step {entry['step']}: {entry['observation']} -> {entry.get('selected_choice', 'n/a')}"
+            for entry in self._step_log[-self._history_window :]
+        ]
+
+    def _tool_descriptions(self) -> list[str]:
+        return [
+            "quest_history(query): search earlier observations and chosen actions in this quest.",
+            "calculator(expression): evaluate arithmetic and simple comparisons.",
+            "scratchpad(operation, content): read or replace one persistent note. operation is read or write_replace.",
+        ]
+
+    def quest_history(self, query: str) -> str:
+        return self._history_tool.search(query)
+
+    @staticmethod
+    def calculator(expression: str) -> str:
+        return calculator(expression)
+
+    def scratchpad(self, operation: str, content: str = "") -> str:
+        op = (operation or "").strip().lower()
+        if op == "read":
+            return self._scratchpad_tool.read()
+        if op == "write_replace":
+            return self._scratchpad_tool.write_replace(content)
+        return "error: operation must be read or write_replace"
+
+    def _build_tool_prompt(
+        self,
+        observation: str,
+        choices: list[dict[str, str]],
+        prompt_kind: str,
+        tool_results: list[str] | None = None,
+    ) -> str:
+        template = self.prompt_renderer.get_template(self.action_template)
+        return template.render(
+            prompt_kind=prompt_kind,
+            observation=observation,
+            choices=[{"text": choice.get("text", "")} for choice in choices],
+            tool_descriptions=self._tool_descriptions(),
+            tool_results=tool_results or [],
+            recent_steps=self._recent_steps(),
+            scratchpad_note=self._scratchpad_tool.read() if self._scratchpad_tool.read() != "(empty)" else "",
+        ).strip()
+
+    @staticmethod
+    def _extract_tool_calls(response: str) -> list[dict[str, Any]]:
+        payload, _ = _parse_json_response(response)
+        if not isinstance(payload, dict):
+            return []
+        tool_calls = payload.get("tool_calls")
+        if not isinstance(tool_calls, list):
+            return []
+
+        normalized = []
+        for item in tool_calls[:1]:
+            if not isinstance(item, dict):
+                continue
+            tool_name = str(item.get("tool") or "").strip()
+            tool_input = item.get("input")
+            operation = str(item.get("operation") or "").strip()
+            content = str(item.get("content") or "").strip()
+            if isinstance(tool_input, dict):
+                operation = operation or str(tool_input.get("operation") or "").strip()
+                content = content or str(tool_input.get("content") or "").strip()
+                tool_input = tool_input.get("expression") or tool_input.get("query") or tool_input.get("content") or ""
+            tool_input = str(tool_input or "").strip()
+            if len(tool_input) > ToolCompactHarness.MAX_TOOL_INPUT_CHARS:
+                tool_input = tool_input[: ToolCompactHarness.MAX_TOOL_INPUT_CHARS]
+            if len(content) > ToolCompactHarness.MAX_TOOL_INPUT_CHARS:
+                content = content[: ToolCompactHarness.MAX_TOOL_INPUT_CHARS]
+            if tool_name:
+                normalized.append({"tool": tool_name, "input": tool_input, "operation": operation, "content": content})
+        return normalized
+
+    def _execute_tool_calls(self, tool_calls: list[dict[str, Any]]) -> list[str]:
+        results = []
+        for tc in tool_calls:
+            name, inp = tc["tool"], tc.get("input", "")
+            if name == "quest_history":
+                result = self.quest_history(inp)
+            elif name == "calculator":
+                result = self.calculator(inp)
+            elif name == "scratchpad":
+                operation = tc.get("operation") or inp
+                result = self.scratchpad(str(operation), str(tc.get("content") or ""))
+            else:
+                result = f"unknown tool: {name}"
+            call_repr = inp
+            if name == "scratchpad":
+                call_repr = f"{tc.get('operation') or inp}, {tc.get('content') or ''}".strip(", ")
+            results.append(f"{name}({call_repr}) => {result}")
+        return results
+
+    def _final_choice(
+        self,
+        observation: str,
+        choices: list[dict[str, str]],
+        tool_results: list[str] | None = None,
+    ) -> tuple[LLMResponse, dict[str, Any]]:
+        prompt = self._build_tool_prompt(observation, choices, prompt_kind="final", tool_results=tool_results)
+        parsed_response = self._parse_with_retries(prompt, observation, choices)
+        return parsed_response, {
+            "prompt_tokens": parsed_response.prompt_tokens,
+            "completion_tokens": parsed_response.completion_tokens,
+            "total_tokens": parsed_response.total_tokens,
+            "estimated_cost_usd": parsed_response.estimated_cost_usd,
+        }
+
+    def _log_step(self, observation: str, choices: list[dict[str, str]], response: LLMResponse) -> None:
+        """Append a compact record of this step (clipped observation, choice texts, pick) to the step log."""
+        choice_texts = [c.get("text", "") for c in choices]
+        # Actions are 1-based; an out-of-range action is recorded with an empty selection.
+        picked = choice_texts[response.action - 1] if 1 <= response.action <= len(choices) else ""
+        summary = " ".join((observation or "").strip().split())  # collapse all whitespace runs
+        if len(summary) > 180:
+            summary = summary[:180] + "..."
+        entry = {
+            "step": len(self._step_log) + 1,
+            "observation": summary,
+            "choices": choice_texts,
+            "selected_choice": picked,
+        }
+        self._step_log.append(entry)
+
+    def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:  # returns the chosen action number
+        try:
+            state_signature = self._state_signature(state, choices)
+            contextual_state = self._build_contextual_state(state)
+            self._ensure_llm()
+
+            selection_prompt = self._build_tool_prompt(contextual_state, choices, prompt_kind="select")
+            selection_response = self._call_llm(selection_prompt)
+            selection_usage = self.llm.get_last_usage()
+            tool_calls = self._extract_tool_calls(selection_response)  # tool requests found in the first reply
+            parsed_response = self._parse_llm_response(selection_response, len(choices))
+            tool_results: list[str] = []
+
+            total_usage = self._normalize_usage(selection_usage)
+            if tool_calls:  # tools requested: run them, then ask again with their results
+                tool_results = self._execute_tool_calls(tool_calls)
+                parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results)
+                total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
+            elif parsed_response.is_default:  # no tools, default-flagged parse: ask again via the final prompt
+                parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[])
+                total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
+
+            action_before_policy = parsed_response.action
+            parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
+            if parsed_response.action != action_before_policy and not parsed_response.reasoning:
+                parsed_response.reasoning = "policy_safety_override"  # only fills in reasoning when it is empty
+
+            parsed_response.prompt_tokens = total_usage["prompt_tokens"]  # selection usage merged with any final usage
+            parsed_response.completion_tokens = total_usage["completion_tokens"]
+            parsed_response.total_tokens = total_usage["total_tokens"]
+            parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
+            parsed_response.tool_calls = tool_calls or None  # empty lists are stored as None
+            parsed_response.tool_results = tool_results or None
+
+            self.history.append(parsed_response)
+            self._last_response = parsed_response
+            self._remember_decision(state, choices, state_signature, parsed_response)
+            self._log_step(state, choices, parsed_response)
+            return parsed_response.action
+        except Exception as exc:  # any failure falls back to action 1 with an error-flagged response
+            self.logger.error("Tool harness error during LLM call: %s", exc)
+            default_response = LLMResponse(
+                action=1,
+                is_default=True,
+                parse_mode="error_default",
+                reasoning=f"tool_agent_error: {exc}",
+            )
+            self.history.append(default_response)
+            self._last_response = default_response
+            return 1
+
+    def reset(self) -> None:  # base reset plus this harness's step log and scratchpad tool
+        super().reset()
+        self._step_log = []
+        self._scratchpad_tool.reset()
+        self._history_tool.step_log = self._step_log  # re-point the history tool at the new list
+
+
+class ToolHintedHarness(ToolCompactHarness):  # ToolCompactHarness with a different default action template
+    harness_name = "tool_hinted"
+
+    def __init__(self, *args, action_template: str = "tool_augmented_hints.jinja", **kwargs):
+        super().__init__(*args, action_template=action_template, **kwargs)  # everything else is passed through

From 68cb27efa07ec8e813d7dc132041f622d08c8f6a Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 14:04:22 +0400
Subject: [PATCH 03/24] harnesses: HarnessConfig schema + test_factory

---
 llm_quest_benchmark/harnesses/factory.py      |  3 +-
 llm_quest_benchmark/schemas/config.py         | 59 +++++++++++--
 .../tests/harnesses/__init__.py               |  0
 .../tests/harnesses/test_factory.py           | 87 +++++++++++++++++++
 4 files changed, 140 insertions(+), 9 deletions(-)
 create mode 100644 llm_quest_benchmark/tests/harnesses/__init__.py
 create mode 100644 llm_quest_benchmark/tests/harnesses/test_factory.py

diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
index 58d2546..e423783 100644
--- a/llm_quest_benchmark/harnesses/factory.py
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -3,6 +3,7 @@
 from llm_quest_benchmark.agents.base import QuestPlayer
 from llm_quest_benchmark.agents.human_player import HumanPlayer
 from llm_quest_benchmark.agents.random_agent import RandomAgent
+from llm_quest_benchmark.constants import DEFAULT_MODEL
 from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness
 from llm_quest_benchmark.harnesses.minimal import MinimalHarness
 from llm_quest_benchmark.harnesses.planner import PlannerHarness
@@ -23,7 +24,7 @@
 
 def create_harness(
     harness: str,
-    model: str,
+    model: str = DEFAULT_MODEL,
     temperature: float = 0.4,
     skip_single: bool = False,
     debug: bool = False,
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index 6a030b2..74799bd 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -18,8 +18,8 @@
 DEFAULT_BENCHMARK_CONFIG = {
     "quests": ["quests/Boat.qm"],
     "agents": [
-        {"model": "random_choice", "skip_single": True, "temperature": 0.0, "template": "reasoning.jinja"},
-        {"model": "gpt-5-mini", "skip_single": True, "temperature": 0.4, "template": "reasoning.jinja"},
+        {"model": "random_choice", "skip_single": True, "temperature": 0.0, "harness": "minimal"},
+        {"model": "gpt-5-mini", "skip_single": True, "temperature": 0.4, "harness": "reasoning_recent"},
     ],
     "debug": False,
     "quest_timeout": 30,
@@ -44,7 +44,7 @@ def get_default_benchmark_yaml() -> str:
 agents:
   - model: random_choice
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
 debug: true
 # One worker per agent will be used automatically
 output_dir: results/benchmarks"""
@@ -54,9 +54,51 @@ def get_default_benchmark_yaml() -> str:
         return f.read()
 
 
+@dataclass
+class HarnessConfig:
+    """Configuration for a single harness entry in a benchmark run."""
+
+    model: str = DEFAULT_MODEL
+    system_template: str = SYSTEM_ROLE_TEMPLATE  # normalized in __post_init__
+    harness: str = "reasoning_recent"  # must be a key of HARNESS_REGISTRY
+    temperature: float = DEFAULT_TEMPERATURE  # validated to lie in [0.0, 2.0]
+    runs: int = 1  # validated to be >= 1
+    skip_single: bool = False
+    debug: bool = False
+    benchmark_id: str | None = None
+    compaction_interval: int = 50  # validated to be >= 1
+
+    def __post_init__(self):
+        """Normalize the template name and validate fields, raising ValueError on bad values."""
+        self.system_template = normalize_template_name(self.system_template)
+        from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY  # deferred; presumably avoids a cycle
+        if self.harness not in HARNESS_REGISTRY:
+            raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {sorted(HARNESS_REGISTRY)}")
+        if not (0.0 <= self.temperature <= 2.0):
+            raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}")
+        if self.runs < 1:
+            raise ValueError(f"runs must be >= 1, got {self.runs}")
+        if self.compaction_interval < 1:
+            raise ValueError(f"compaction_interval must be >= 1, got {self.compaction_interval}")
+
+    @property
+    def harness_id(self) -> str:
+        """Return a stable ID built from model, temperature, harness and an 8-hex-digit config digest."""
+        import hashlib
+        config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.compaction_interval}"
+        # MD5 is only a fingerprint here; usedforsecurity=False keeps it usable on FIPS-restricted builds.
+        hash_val = hashlib.md5(config_str.encode(), usedforsecurity=False).hexdigest()[:8]
+        return f"{self.model}_t{self.temperature}_{self.harness}_{hash_val}"
+
+    @property
+    def agent_id(self) -> str:
+        """Return harness_id under the name used by DB-facing code."""
+        return self.harness_id
+
+
 @dataclass
 class AgentConfig:
-    """Configuration for a single agent in benchmark"""
+    """Legacy configuration for a single agent in benchmark"""
 
     model: str = DEFAULT_MODEL
     system_template: str = SYSTEM_ROLE_TEMPLATE
@@ -103,7 +145,7 @@ class BenchmarkConfig:
     """Configuration for benchmark run"""
 
     quests: list[str]  # List of quest files or directories
-    agents: list[AgentConfig]  # List of agent configurations to test
+    agents: list[HarnessConfig]  # List of harness configurations to test
     debug: bool = False
     quest_timeout: int = 60  # Timeout per quest
     benchmark_timeout: int | None = None  # Total timeout for all quests, defaults to quest_timeout * num_quests
@@ -137,10 +179,11 @@ def from_yaml(cls, yaml_path: str) -> "BenchmarkConfig":
         if "agents" in data:
             agents = []
             for agent in data["agents"]:
-                # Handle 'template' key which maps to action_template in AgentConfig
                 if "template" in agent:
-                    agent["action_template"] = agent.pop("template")
-                agents.append(AgentConfig(**agent))
+                    raise ValueError("Use 'harness:' instead of 'template:'")
+                if "memory_mode" in agent:
+                    raise ValueError("Use 'harness:' instead of 'memory_mode:'")
+                agents.append(HarnessConfig(**agent))
             data["agents"] = agents
 
         return cls(**data)
diff --git a/llm_quest_benchmark/tests/harnesses/__init__.py b/llm_quest_benchmark/tests/harnesses/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py
new file mode 100644
index 0000000..5b7bfa0
--- /dev/null
+++ b/llm_quest_benchmark/tests/harnesses/test_factory.py
@@ -0,0 +1,87 @@
+import pytest
+
+from llm_quest_benchmark.agents.human_player import HumanPlayer
+from llm_quest_benchmark.agents.random_agent import RandomAgent
+from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
+from llm_quest_benchmark.harnesses.memo import MemoCompactHarness
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
+
+
+def test_create_minimal_harness():
+    """The "minimal" name builds a MinimalHarness."""
+    built = create_harness("minimal", model="gpt-5-mini")
+    assert isinstance(built, MinimalHarness)
+
+
+def test_all_harness_names_instantiate():
+    """Every registered name builds an instance of the class it is registered with."""
+    for name, expected_cls in HARNESS_REGISTRY.items():
+        built = create_harness(name, model="gpt-5-mini")
+        assert isinstance(built, expected_cls)
+
+
+def test_create_human_harness():
+    """The "human" name builds a HumanPlayer without an explicit model."""
+    built = create_harness("human")
+    assert isinstance(built, HumanPlayer)
+
+
+def test_create_random_choice_harness():
+    """The "random_choice" name builds a RandomAgent without an explicit model."""
+    built = create_harness("random_choice")
+    assert isinstance(built, RandomAgent)
+
+
+def test_create_bad_harness_name_raises():  # "bad_name" is expected to be rejected with ValueError
+    with pytest.raises(ValueError):
+        create_harness("bad_name", model="gpt-5-mini")
+
+
+def test_harness_config_stable_harness_id():
+    """Two configs built from the same arguments report the same string harness_id."""
+    first = HarnessConfig(harness="memo_compact", model="gpt-5-mini")
+    assert isinstance(first.harness_id, str)
+    assert first.harness_id == HarnessConfig(harness="memo_compact", model="gpt-5-mini").harness_id
+
+
+def test_benchmark_config_from_yaml_parses_harness(tmp_path):
+    """A YAML agent entry with ``harness:`` loads as a HarnessConfig that builds the named harness."""
+    quest_path = tmp_path / "quest.qm"
+    quest_path.write_text("", encoding="utf-8")
+    yaml_file = tmp_path / "benchmark.yaml"
+    yaml_file.write_text(
+        f"""
+quests:
+  - {quest_path}
+agents:
+  - model: gpt-5-mini
+    harness: memo_compact
+""",
+        encoding="utf-8",
+    )
+    loaded = BenchmarkConfig.from_yaml(str(yaml_file))
+    assert len(loaded.agents) == 1
+    entry = loaded.agents[0]
+    assert isinstance(entry, HarnessConfig)
+    assert entry.harness == "memo_compact"
+    assert isinstance(create_harness(entry.harness, model=entry.model), MemoCompactHarness)
+
+
+def test_benchmark_config_from_yaml_rejects_template(tmp_path):
+    """A YAML agent entry that still uses ``template:`` is rejected with a pointer to ``harness:``."""
+    quest_path = tmp_path / "quest.qm"
+    quest_path.write_text("", encoding="utf-8")
+    yaml_file = tmp_path / "benchmark.yaml"
+    yaml_file.write_text(
+        f"""
+quests:
+  - {quest_path}
+agents:
+  - model: gpt-5-mini
+    template: reasoning.jinja
+""",
+        encoding="utf-8",
+    )
+    with pytest.raises(ValueError, match="Use 'harness:' instead of 'template:'"):
+        BenchmarkConfig.from_yaml(str(yaml_file))

From 1bb312158af2df23acd49039203b70a597699de7 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 14:15:44 +0400
Subject: [PATCH 04/24] harnesses: implement 8 concrete harness classes, retire
 agents/

---
 llm_quest_benchmark/agents/llm_agent.py       | 981 ++----------------
 llm_quest_benchmark/harnesses/base.py         | 240 ++++-
 llm_quest_benchmark/harnesses/tool_harness.py |   3 +-
 3 files changed, 292 insertions(+), 932 deletions(-)

diff --git a/llm_quest_benchmark/agents/llm_agent.py b/llm_quest_benchmark/agents/llm_agent.py
index 64ff0cc..7b6d352 100644
--- a/llm_quest_benchmark/agents/llm_agent.py
+++ b/llm_quest_benchmark/agents/llm_agent.py
@@ -1,272 +1,30 @@
-"""LLM agent for Space Rangers quests"""
+"""Deprecated compatibility wrapper for harness-based LLM agents."""
 
-import hashlib
-import json
-import logging
-import re
-from typing import Any
+import warnings
 
-from json_repair import repair_json
-
-from llm_quest_benchmark.agents.base import QuestPlayer
 from llm_quest_benchmark.constants import (
     DEFAULT_MODEL,
     DEFAULT_TEMPERATURE,
     DEFAULT_TEMPLATE,
     MODEL_CHOICES,
     SYSTEM_ROLE_TEMPLATE,
-    normalize_template_name,
-)
-from llm_quest_benchmark.llm.client import (
-    get_llm_client,
-    is_supported_model_name,
-    parse_model_name,
-)
-from llm_quest_benchmark.llm.prompt import PromptRenderer
-from llm_quest_benchmark.schemas.response import LLMResponse
-
-RISKY_CHOICE_KEYWORDS = (
-    "улететь",
-    "сдаться",
-    "отказ",
-    "провал",
-    "убежать",
-    "surrender",
-    "give up",
 )
-
-SAFE_CHOICE_KEYWORDS = (
-    "пройти мимо",
-    "избежать",
-    "подготов",
-    "библиотек",
-    "изуч",
-    "wait",
-    "avoid",
-    "study",
+from llm_quest_benchmark.harnesses.base import (
+    RISKY_CHOICE_KEYWORDS,
+    SAFE_CHOICE_KEYWORDS,
+    _is_numeric_raw_reasoning,
+    _parse_json_response,
+    _raw_reasoning_fallback,
+    parse_llm_response,
 )
+from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
 
+warnings.warn("llm_agent is deprecated, use llm_quest_benchmark.harnesses", DeprecationWarning, stacklevel=2)
 
-def _parse_json_response(
-    response: str,
-    debug: bool = False,
-    logger: logging.Logger | None = None,
-) -> tuple[dict[str, Any] | None, str | None]:
-    """Try to parse response as JSON, with repair attempt if needed."""
-    cleaned_response = (response or "").strip()
-    if not cleaned_response:
-        return None, None
-
-    try:
-        # Extract JSON from response if there are backticks
-        if "```json" in cleaned_response:
-            # Find the start and end of the JSON block
-            start = cleaned_response.find("```json") + 7
-            end = cleaned_response.find("```", start)
-            if end > start:
-                json_str = cleaned_response[start:end].strip()
-                if debug and logger:
-                    logger.debug(f"Extracted JSON: {json_str}")
-                result = json.loads(json_str)
-                if debug and logger:
-                    logger.debug(f"Parsed JSON: {result}")
-                return result, "json_fenced"
-
-        # Extract a probable JSON object from free-form text.
-        embedded_json = re.search(r"\{[\s\S]*\}", cleaned_response)
-        if embedded_json:
-            candidate = embedded_json.group(0).strip()
-            if candidate and candidate != cleaned_response:
-                try:
-                    result = json.loads(candidate)
-                    if debug and logger:
-                        logger.debug(f"Parsed embedded JSON: {result}")
-                    return result, "json_embedded"
-                except json.JSONDecodeError:
-                    pass
-
-        # Try to parse directly
-        result = json.loads(cleaned_response)
-        if debug and logger:
-            logger.debug(f"Direct JSON parse successful: {result}")
-        return result, "json_direct"
-    except json.JSONDecodeError:
-        if debug and logger:
-            logger.debug("Initial JSON parse failed, attempting repair")
-        try:
-            repaired = repair_json(cleaned_response)
-            if debug and logger:
-                logger.debug(f"Repaired JSON: {repaired}")
-            result = json.loads(repaired)
-            if debug and logger:
-                logger.debug(f"Parse of repaired JSON successful: {result}")
-            return result, "json_repaired"
-        except Exception as e:
-            if debug and logger:
-                logger.error(f"JSON repair failed: {e}")
-            return None, None
-
-
-def _validate_action_number(
-    action: int, num_choices: int, debug: bool = False, logger: logging.Logger | None = None
-) -> bool:
-    """Validate that action number is within valid range"""
-    if 1 <= action <= num_choices:
-        return True
-    if debug and logger:
-        logger.error(f"Action number {action} out of range [1, {num_choices}]")
-    return False
-
-
-def _extract_action_from_text(response: str, num_choices: int) -> int | None:
-    """Extract a candidate action from free-form text."""
-    for match in re.finditer(r"\b(\d+)\b", response):
-        action = int(match.group(1))
-        if 1 <= action <= num_choices:
-            return action
-    return None
-
-
-def _extract_field_from_text(response: str, field: str) -> str | None:
-    """Best-effort extraction of analysis/reasoning from loosely formatted output."""
-    if not response:
-        return None
-
-    # JSON-like field forms: "analysis": "...", 'analysis': '...'
-    json_pattern = re.compile(
-        rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>.*?)['"]""",
-        re.IGNORECASE | re.DOTALL,
-    )
-    match = json_pattern.search(response)
-    if match:
-        value = " ".join(match.group("value").strip().split())
-        if value:
-            return value
-
-    # Partial JSON field forms without a closing quote in truncated outputs.
-    partial_json_pattern = re.compile(
-        rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>[^"\n\r]+)""",
-        re.IGNORECASE,
-    )
-    match = partial_json_pattern.search(response)
-    if match:
-        value = " ".join(match.group("value").strip().split())
-        if value:
-            return value
-
-    # Label forms: Analysis: ..., Reasoning - ...
-    label_pattern = re.compile(
-        rf"""(?im)^\s*{re.escape(field)}\s*[:\-]\s*(?P<value>.+?)\s*$""",
-    )
-    match = label_pattern.search(response)
-    if match:
-        value = " ".join(match.group("value").strip().split())
-        if value:
-            return value
-
-    return None
-
-
-def _raw_reasoning_fallback(response: str) -> str | None:
-    compact = " ".join((response or "").strip().split())
-    if not compact:
-        return None
-    if len(compact) > 240:
-        compact = compact[:237] + "..."
-    return f"raw_response: {compact}"
-
-
-def _is_numeric_raw_reasoning(reasoning: str | None) -> bool:
-    if not reasoning:
-        return False
-    if not reasoning.startswith("raw_response:"):
-        return False
-    payload = reasoning.split(":", 1)[1].strip()
-    return payload.isdigit()
-
-
-def parse_llm_response(
-    response: str, num_choices: int, debug: bool = False, logger: logging.Logger | None = None
-) -> LLMResponse:
-    """Parse LLM response and return structured response object."""
-    if debug and logger:
-        logger.debug(f"Raw LLM response: {response}")
-
-    extracted_analysis = _extract_field_from_text(response, "analysis")
-    extracted_reasoning = _extract_field_from_text(response, "reasoning")
-    raw_reasoning = _raw_reasoning_fallback(response)
-
-    # Try parsing as JSON first
-    response_json, json_parse_mode = _parse_json_response(response, debug, logger)
-    if response_json and isinstance(response_json, dict):
-        analysis = response_json.get("analysis") or extracted_analysis
-        reasoning = response_json.get("reasoning") or response_json.get("thinking") or extracted_reasoning
-        if not reasoning and analysis:
-            reasoning = analysis
-        if not analysis and not reasoning:
-            reasoning = raw_reasoning
-
-        memo_raw = response_json.get("memo")
-        memo = str(memo_raw) if memo_raw is not None else None
-
-        # Check for either 'action' or 'result' field
-        action_value = response_json.get("action") or response_json.get("result") or response_json.get("choice")
-        if action_value is not None:
-            try:
-                action = int(action_value)
-                if _validate_action_number(action, num_choices, debug, logger):
-                    return LLMResponse(
-                        action=action,
-                        reasoning=reasoning,
-                        analysis=analysis,
-                        memo=memo,
-                        is_default=False,
-                        parse_mode=json_parse_mode or "json",
-                    )
-            except (ValueError, TypeError):
-                if debug and logger:
-                    logger.error(f"Invalid action value in JSON: {action_value}")
-
-    # Try parsing as plain number
-    try:
-        action = int(response.strip())
-        if _validate_action_number(action, num_choices, debug, logger):
-            return LLMResponse(
-                action=action,
-                reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
-                analysis=extracted_analysis,
-                is_default=False,
-                parse_mode="number_only",
-            )
-    except ValueError:
-        if debug and logger:
-            logger.error(f"Could not parse response as number: {response}")
-
-    # Fallback: extract first valid integer from text.
-    extracted_action = _extract_action_from_text(response, num_choices)
-    if extracted_action is not None:
-        return LLMResponse(
-            action=extracted_action,
-            reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
-            analysis=extracted_analysis,
-            is_default=False,
-            parse_mode="number_extracted",
-        )
-
-    # Default to first choice if all parsing attempts fail
-    if debug and logger:
-        logger.error(f"Error during response parsing, defaulting to first choice. Response: {response[:100]}...")
-    return LLMResponse(
-        action=1,
-        reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
-        analysis=extracted_analysis,
-        is_default=True,
-        parse_mode="default_first",
-    )
 
-
-class LLMAgent(QuestPlayer):
-    """LLM-powered agent for Space Rangers quests"""
+class LLMAgent(MinimalHarness):
+    """Backward-compatible LLMAgent facade backed by concrete harness classes."""
 
     SUPPORTED_MODELS = MODEL_CHOICES
 
@@ -281,688 +39,65 @@ def __init__(
         memory_mode: str = "default",
         compaction_interval: int = 10,
     ):
-        super().__init__(skip_single=skip_single)
-        self.debug = debug
-        self.model_name = model_name.lower()
-        self.system_template = normalize_template_name(system_template)
-        self.action_template = normalize_template_name(action_template)
-        self.temperature = temperature
-        # Set agent_id for database records
-        self.agent_id = f"llm_{self.model_name}"
-
-        if not is_supported_model_name(self.model_name):
-            raise ValueError(f"Unsupported model: {model_name}. Supported models are: {self.SUPPORTED_MODELS}")
-
-        self.model_spec = parse_model_name(self.model_name)
-        self.logger = logging.getLogger(self.__class__.__name__)
-        if self.debug:
-            self.logger.setLevel(logging.DEBUG)
-            self.logger.propagate = False
-            if not any(getattr(h, "_llm_quest_handler", False) for h in self.logger.handlers):
-                handler = logging.StreamHandler()
-                handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
-                handler._llm_quest_handler = True
-                self.logger.addHandler(handler)
+        if memory_mode == "default":
+            memory_module = DefaultMemory()
+        elif memory_mode == "full_transcript":
+            memory_module = FullTranscriptMemory()
+        elif memory_mode == "compaction":
+            memory_module = CompactionMemory(compaction_interval=compaction_interval)
+        else:
+            raise ValueError(f"Invalid memory_mode: {memory_mode}")
 
-        # Initialize prompt renderer
-        self.prompt_renderer = PromptRenderer(
-            None, system_template=self.system_template, action_template=self.action_template
+        super().__init__(
+            model_name=model_name,
+            system_template=system_template,
+            action_template=action_template,
+            temperature=temperature,
+            skip_single=skip_single,
+            debug=debug,
+            memory_module=memory_module,
         )
-
-        # Delay API client creation so template-only flows and tests do not require API keys.
-        self.llm = None
-        self.history: list[LLMResponse] = []
-        self._observation_history: list[str] = []
-        self._decision_history: list[dict[str, Any]] = []
-        self._state_action_counts: dict[str, dict[int, int]] = {}
-        self._context_window = 3
-        self._context_chars = 220
-        self._decision_window = 5
-        self._max_state_signatures = 200
-        self._use_safety_filter = True
-        self._last_response = LLMResponse(action=1, is_default=True)
-
-        # Quest briefing: pinned first observation (mission goal)
-        self._quest_briefing: str | None = None
-
-        # Memory mode: "default", "full_transcript", "compaction"
-        if memory_mode not in ("default", "full_transcript", "compaction"):
-            raise ValueError(f"Invalid memory_mode: {memory_mode}")
+        self.agent_id = f"llm_{self.model_name}"
         self._memory_mode = memory_mode
-        self._transcript: list[dict[str, Any]] = []
         self._compaction_interval = compaction_interval
-        self._compaction_summary: str | None = None
-        self._steps_since_compaction = 0
-        self._step_count = 0
-
-    def _ensure_llm(self):
-        """Lazily create the provider client only when inference is needed."""
-        if self.llm is None:
-            self.llm = get_llm_client(
-                self.model_name,
-                system_prompt=self.prompt_renderer.render_system_prompt(),
-                temperature=self.temperature,
-            )
-
-    def get_last_response(self) -> LLMResponse | None:
-        """Get the last LLM response from history"""
-        return self._last_response
-
-    def get_action(self, observation: str, choices: list[dict[str, str]]) -> int:
-        """Track observation history for context, then delegate base action flow."""
-        self._remember_observation(observation)
-        return super().get_action(observation, choices)
 
     def _remember_observation(self, observation: str) -> None:
+        """Compatibility hook used by legacy tests and callers."""
         clean = (observation or "").strip()
         if not clean:
             return
-        if self._quest_briefing is None:
-            self._quest_briefing = clean
         self._observation_history.append(clean)
         if len(self._observation_history) > 20:
             self._observation_history = self._observation_history[-20:]
+        if self.memory_module is not None:
+            self.memory_module.update({"observation": clean, "step": self._step_count + 1})
 
     def _build_contextual_state(self, state: str) -> str:
-        """Build context-augmented state based on memory mode."""
-        if self._memory_mode == "full_transcript":
-            return self._build_full_transcript_state(state)
-        if self._memory_mode == "compaction":
-            return self._build_compaction_state(state)
-        return self._build_default_state(state)
-
-    def _briefing_block(self, state: str) -> str | None:
-        """Return quest briefing block if available and not redundant with current state."""
-        if not self._quest_briefing:
-            return None
-        if state.strip() == self._quest_briefing:
-            return None
-        briefing = self._quest_briefing
-        if len(briefing) > 800:
-            briefing = briefing[:800] + "..."
-        return f"Quest briefing (your mission):\n{briefing}"
-
-    def _build_default_state(self, state: str) -> str:
-        """Original sliding-window context, now with pinned briefing."""
-        blocks: list[str] = []
-
-        briefing = self._briefing_block(state)
-        if briefing:
-            blocks.append(briefing)
-
-        if len(self._observation_history) > 1:
-            previous = self._observation_history[:-1][-self._context_window :]
-            if previous:
-                snippets = []
-                for idx, text in enumerate(previous, start=1):
-                    clipped = text if len(text) <= self._context_chars else text[: self._context_chars] + "..."
-                    snippets.append(f"[Previous {idx}] {clipped}")
-                blocks.append("Recent context from previous steps:\n" + "\n\n".join(snippets))
-
-        if self._decision_history:
-            recent_memos = []
-            for item in self._decision_history[-self._decision_window :]:
-                m = (item.get("memo") or "").strip()
-                if not m:
-                    continue
-                if recent_memos and recent_memos[-1] == m:
-                    continue
-                recent_memos.append(m)
-            if recent_memos:
-                lines = [f"[Memo {idx}] {m}" for idx, m in enumerate(recent_memos, start=1)]
-                blocks.append("State memo (recent):\n" + "\n".join(lines))
-
-            recent_decisions = self._decision_history[-self._decision_window :]
-            decision_lines = []
-            for idx, item in enumerate(recent_decisions, start=1):
-                choice = item.get("choice", "")
-                parse_mode = item.get("parse_mode", "unknown")
-                memo_val = item.get("memo")
-                memo_suffix = f" | memo: {memo_val}" if memo_val else ""
-                decision_lines.append(
-                    f"[Decision {idx}] action {item.get('action')}: {choice} (parse={parse_mode}){memo_suffix}"
-                )
-            blocks.append("Recent selected actions:\n" + "\n".join(decision_lines))
-
-        if not blocks:
-            return state
-
-        sep = "\n\n"
-        return f"{sep.join(blocks)}\n\nCurrent story state:\n{state}"
-
-    def _build_full_transcript_state(self, state: str) -> str:
-        """Full decision transcript with pinned briefing."""
-        blocks: list[str] = []
-
-        briefing = self._briefing_block(state)
-        if briefing:
-            blocks.append(briefing)
-
-        if self._transcript:
-            lines = []
-            entries = self._transcript
-            # Budget: keep first 3 + last N that fit under ~40 entries total
-            if len(entries) > 40:
-                entries = entries[:3] + [{"_gap": len(entries) - 40}] + entries[-(40 - 3) :]
-            for entry in entries:
-                if "_gap" in entry:
-                    lines.append(f"  ... ({entry['_gap']} steps omitted) ...")
-                    continue
-                step = entry.get("step", "?")
-                obs = entry.get("observation", "")
-                if len(obs) > 400:
-                    obs = obs[:400] + "..."
-                chosen = entry.get("choice_text", "")
-                reasoning = entry.get("reasoning", "")
-                line = f"Step {step}: {obs}"
-                if chosen:
-                    line += f"\n  You chose: {chosen}"
-                if reasoning:
-                    line += f"\n  Reasoning: {reasoning[:800]}"
-                state_notes = entry.get("memo", "")
-                if state_notes:
-                    line += f"\n  State: {state_notes[:350]}"
-                lines.append(line)
-            blocks.append("=== QUEST TRANSCRIPT ===\n" + "\n\n".join(lines))
-
-        blocks.append(f"Step {self._step_count} (CURRENT):\n{state}")
-        return "\n\n".join(blocks)
-
-    def _build_compaction_state(self, state: str) -> str:
-        """Compacted memory summary + recent steps since last compaction."""
-        blocks: list[str] = []
-
-        briefing = self._briefing_block(state)
-        if briefing:
-            blocks.append(briefing)
-
-        if self._compaction_summary:
-            blocks.append(
-                f"=== QUEST MEMORY (compacted at step {self._step_count - self._steps_since_compaction}) ===\n{self._compaction_summary}"
-            )
-
-        if self._transcript:
-            recent = self._transcript[-self._steps_since_compaction :] if self._steps_since_compaction > 0 else []
-            if recent:
-                lines = []
-                for entry in recent:
-                    step = entry.get("step", "?")
-                    obs = entry.get("observation", "")
-                    if len(obs) > 400:
-                        obs = obs[:400] + "..."
-                    chosen = entry.get("choice_text", "")
-                    line = f"Step {step}: {obs}"
-                    if chosen:
-                        line += f"\n  You chose: {chosen}"
-                    state_notes = entry.get("memo", "")
-                    if state_notes:
-                        line += f"\n  State: {state_notes[:350]}"
-                    lines.append(line)
-                blocks.append("=== RECENT STEPS ===\n" + "\n\n".join(lines))
-
-        blocks.append(f"Step {self._step_count} (CURRENT):\n{state}")
-        return "\n\n".join(blocks)
-
-    def _maybe_compact(self) -> None:
-        """Run compaction if interval reached. Called after recording a decision."""
-        if self._memory_mode != "compaction":
-            return
-        if self._steps_since_compaction < self._compaction_interval:
-            return
-
-        transcript_text = self._format_transcript_for_compaction()
-        if not transcript_text:
-            return
-
-        prompt_parts = []
-        prompt_parts.append("You are summarizing an agent's progress through a text quest.")
-        if self._quest_briefing:
-            prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}")
-        if self._compaction_summary:
-            prompt_parts.append(f"\nPREVIOUS SUMMARY:\n{self._compaction_summary}")
-        prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}")
-        prompt_parts.append(
-            "\nSummarize the agent's progress. Include:\n"
-            "- Current objective (what the agent should do next)\n"
-            "- Progress so far (what has been accomplished)\n"
-            "- Key facts (NPCs, items, locations, deadlines discovered)\n"
-            "- Failed approaches (actions/paths that didn't work)\n"
-            "- Map knowledge (locations visited and connections)\n\n"
-            "Write a concise summary in plain text, max 300 words."
-        )
-
-        compaction_prompt = "\n".join(prompt_parts)
-        try:
-            self._ensure_llm()
-            summary = self.llm.get_completion(compaction_prompt)
-            compaction_usage = self.llm.get_last_usage() or {}
-            if compaction_usage:
-                pt = int(
-                    compaction_usage.get("prompt_tokens", 0)
-                    if isinstance(compaction_usage, dict)
-                    else getattr(compaction_usage, "prompt_tokens", 0)
-                )
-                ct = int(
-                    compaction_usage.get("completion_tokens", 0)
-                    if isinstance(compaction_usage, dict)
-                    else getattr(compaction_usage, "completion_tokens", 0)
-                )
-                self._record_compaction_usage(pt, ct)
-            stripped = (summary or "").strip()
-            if not stripped:
-                if self.debug:
-                    self.logger.warning("Compaction returned empty summary at step %d", self._step_count)
-                self._steps_since_compaction = max(0, self._compaction_interval // 2)
-                return
-            self._compaction_summary = stripped
-            self._transcript = []
-            self._steps_since_compaction = 0
-            if self.debug:
-                self.logger.debug(
-                    "Compaction completed at step %d: %s", self._step_count, self._compaction_summary[:200]
-                )
-        except Exception as e:
-            if self.debug:
-                self.logger.warning("Compaction failed at step %d: %s", self._step_count, e)
-            self._steps_since_compaction = max(0, self._compaction_interval // 2)
-
-    def _record_compaction_usage(self, prompt_tokens: int, completion_tokens: int) -> None:
-        """Record token usage from compaction calls into agent history."""
-        compaction_response = LLMResponse(
-            action=0,
-            is_default=True,
-            parse_mode="compaction",
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
-        self.history.append(compaction_response)
-
-    def _format_transcript_for_compaction(self) -> str:
-        """Format recent transcript entries for the compaction prompt."""
-        recent = (
-            self._transcript[-self._steps_since_compaction :]
-            if self._steps_since_compaction > 0
-            else self._transcript[-self._compaction_interval :]
-        )
-        lines = []
-        for entry in recent:
-            step = entry.get("step", "?")
-            obs = entry.get("observation", "")
-            if len(obs) > 400:
-                obs = obs[:400] + "..."
-            chosen = entry.get("choice_text", "")
-            reasoning = entry.get("reasoning", "")
-            state_notes = entry.get("memo", "")
-            line = f"Step {step}: {obs}"
-            if chosen:
-                line += f"\n  Chose: {chosen}"
-            if state_notes:
-                line += f"\n  State: {state_notes[:350]}"
-            if reasoning:
-                line += f"\n  Reasoning: {reasoning[:800]}"
-            lines.append(line)
-        return "\n\n".join(lines)
-
-    @staticmethod
-    def _normalize_for_signature(value: str, max_len: int = 320) -> str:
-        text = (value or "").lower()
-        text = re.sub(r"\s+", " ", text).strip()
-        if len(text) > max_len:
-            return text[:max_len]
-        return text
-
-    def _state_signature(self, state: str, choices: list[dict[str, str]]) -> str:
-        normalized_state = self._normalize_for_signature(state, max_len=420)
-        normalized_choices = "|".join(
-            self._normalize_for_signature(choice.get("text", ""), max_len=110) for choice in choices
-        )
-        raw_signature = f"{normalized_state}||{normalized_choices}"
-        return hashlib.sha1(raw_signature.encode("utf-8", errors="ignore")).hexdigest()[:20]
-
-    def _remember_decision(
-        self,
-        state: str,
-        choices: list[dict[str, str]],
-        state_signature: str,
-        response: LLMResponse,
-    ) -> None:
-        action = int(response.action)
-        counts = self._state_action_counts.setdefault(state_signature, {})
-        counts[action] = counts.get(action, 0) + 1
-
-        if len(self._state_action_counts) > self._max_state_signatures:
-            oldest_key = next(iter(self._state_action_counts.keys()))
-            if oldest_key != state_signature:
-                self._state_action_counts.pop(oldest_key, None)
-
-        selected_text = ""
-        if 1 <= action <= len(choices):
-            selected_text = choices[action - 1].get("text", "")
-        state_snippet = state.strip()
-        if len(state_snippet) > self._context_chars:
-            state_snippet = state_snippet[: self._context_chars] + "..."
-
-        self._decision_history.append(
-            {
-                "state": state_snippet,
-                "action": action,
-                "choice": selected_text,
-                "parse_mode": response.parse_mode or "unknown",
-                "memo": (response.memo or "").strip()[:350] or None,
-            }
-        )
-        if len(self._decision_history) > 40:
-            self._decision_history = self._decision_history[-40:]
-
-        # Transcript for full_transcript and compaction modes
-        if self._memory_mode in ("full_transcript", "compaction"):
-            self._step_count += 1
-            self._steps_since_compaction += 1
-            self._transcript.append(
-                {
-                    "step": self._step_count,
-                    "observation": state_snippet if self._memory_mode == "compaction" else state.strip()[:400],
-                    "choice_text": selected_text,
-                    "reasoning": (response.reasoning or "")[:800],
-                    "memo": (response.memo or "").strip()[:350] or None,
-                    "action": action,
-                }
-            )
-            self._maybe_compact()
-
-    def _choice_risk_score(self, choice_text: str) -> int:
-        text = (choice_text or "").lower()
-        score = 0
-        for keyword in RISKY_CHOICE_KEYWORDS:
-            if keyword in text:
-                score += 2
-        for keyword in SAFE_CHOICE_KEYWORDS:
-            if keyword in text:
-                score -= 1
-        return score
-
-    def _apply_safety_filter(self, action: int, choices: list[dict[str, str]]) -> int:
-        """Replace obviously risky actions when a clearly safer alternative exists."""
-        if not self._use_safety_filter or len(choices) < 2:
-            return action
-
-        current_idx = action - 1
-        if current_idx < 0 or current_idx >= len(choices):
-            return action
-
-        scored = [(idx + 1, self._choice_risk_score(c.get("text", ""))) for idx, c in enumerate(choices)]
-        scored.sort(key=lambda item: item[1])
-
-        best_action, best_score = scored[0]
-        current_score = self._choice_risk_score(choices[current_idx].get("text", ""))
-
-        # Only override when the chosen action is materially riskier than the best option.
-        if current_score - best_score >= 2:
-            if self.debug:
-                self.logger.debug(
-                    "Safety filter override: %s -> %s (risk %s -> %s)",
-                    action,
-                    best_action,
-                    current_score,
-                    best_score,
-                )
-            return best_action
-        return action
-
-    @staticmethod
-    def _state_fingerprint(state: str) -> str:
-        """Create a stable fingerprint for loop detection."""
-        compact = " ".join((state or "").lower().split())
-        if len(compact) > 500:
-            compact = compact[:500]
-        return compact
-
-    def _apply_loop_escape(
-        self,
-        state_key: str,
-        action: int,
-        choices: list[dict[str, str]],
-    ) -> tuple[int, bool]:
-        """Diversify action when the same state repeats with no apparent progress."""
-        if len(choices) <= 1:
-            return action, False
-
-        counts = self._state_action_counts.get(state_key, {})
-        total_visits = sum(counts.values())
-        if total_visits < 3:
-            return action, False
-
-        current_count = counts.get(action, 0)
-        if current_count < 2:
-            return action, False
-        all_actions = list(range(1, len(choices) + 1))
-        ranked = sorted(
-            all_actions,
-            key=lambda a: (
-                counts.get(a, 0),
-                self._choice_risk_score(choices[a - 1].get("text", "")),
-            ),
-        )
-        best_action = ranked[0]
-
-        if best_action != action and counts.get(best_action, 0) < current_count:
-            return best_action, True
-        if total_visits >= 5 and current_count >= 3 and best_action != action:
-            return best_action, True
-        return action, False
-
-    @staticmethod
-    def _normalize_usage(usage: dict[str, Any] | None) -> dict[str, Any]:
-        usage = usage or {}
-        prompt_tokens = int(usage.get("prompt_tokens") or 0)
-        completion_tokens = int(usage.get("completion_tokens") or 0)
-        total_tokens = int(usage.get("total_tokens") or (prompt_tokens + completion_tokens))
-        estimated_cost_usd = usage.get("estimated_cost_usd")
-        if estimated_cost_usd is not None:
-            estimated_cost_usd = float(estimated_cost_usd)
-        return {
-            "prompt_tokens": prompt_tokens,
-            "completion_tokens": completion_tokens,
-            "total_tokens": total_tokens,
-            "estimated_cost_usd": estimated_cost_usd,
-        }
-
-    @classmethod
-    def _merge_usage(cls, first: dict[str, Any] | None, second: dict[str, Any] | None) -> dict[str, Any]:
-        a = cls._normalize_usage(first)
-        b = cls._normalize_usage(second)
-        merged_cost = None
-        if a["estimated_cost_usd"] is not None or b["estimated_cost_usd"] is not None:
-            merged_cost = (a["estimated_cost_usd"] or 0.0) + (b["estimated_cost_usd"] or 0.0)
-        return {
-            "prompt_tokens": a["prompt_tokens"] + b["prompt_tokens"],
-            "completion_tokens": a["completion_tokens"] + b["completion_tokens"],
-            "total_tokens": a["total_tokens"] + b["total_tokens"],
-            "estimated_cost_usd": merged_cost,
-        }
-
-    def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
-        """Implementation of action selection logic.
-
-        Args:
-            state (str): Current game state text
-            choices (List[Dict[str, str]]): List of available choices
-
-        Returns:
-            int: Selected action number (1-based)
-        """
-        if self.debug:
-            self.logger.debug(f"Getting action for state with {len(choices)} choices available")
-            for i, choice in enumerate(choices):
-                self.logger.debug(f"Choice {i + 1}: {choice.get('text', 'NO TEXT')}")
-        try:
-            state_signature = self._state_signature(state, choices)
-            # Format prompt
-            prompt = self._format_prompt(self._build_contextual_state(state), choices)
-            if self.debug:
-                self.logger.debug(f"\nPrompt:\n{prompt}")
-
-            # Get LLM response
-            self._ensure_llm()
-            llm_response = self.llm.get_completion(prompt)
-            llm_usage = self.llm.get_last_usage()
-            if self.debug:
-                self.logger.debug(f"LLM response: {llm_response}")
-                choices_debug = []
-                for i, c in enumerate(choices):
-                    choices_debug.append(f"{i + 1}: {c['text']}")
-                self.logger.debug(f"Available choices: {choices_debug}")
-
-            # Parse response
-            first_response = parse_llm_response(
-                llm_response,
-                len(choices),
-                self.debug,
-                self.logger,
-            )
-            parsed_response = first_response
-
-            if parsed_response.is_default:
-                retry_response = self.llm.get_completion(self._format_retry_prompt(state, choices))
-                retry_usage = self.llm.get_last_usage()
-                llm_usage = self._merge_usage(llm_usage, retry_usage)
-                retry_parsed = parse_llm_response(retry_response, len(choices), self.debug, self.logger)
-                if not retry_parsed.is_default:
-                    retry_parsed.parse_mode = f"retry_{retry_parsed.parse_mode or 'parsed'}"
-                    parsed_response = retry_parsed
-                elif self._needs_force_numeric_retry():
-                    # GPT-5/o models occasionally return empty visible text on long prompts.
-                    # Use a tiny final retry that asks for number-only output.
-                    force_retry_response = self.llm.get_completion(self._format_force_numeric_retry_prompt(choices))
-                    force_retry_usage = self.llm.get_last_usage()
-                    llm_usage = self._merge_usage(llm_usage, force_retry_usage)
-                    force_retry_parsed = parse_llm_response(
-                        force_retry_response,
-                        len(choices),
-                        self.debug,
-                        self.logger,
-                    )
-                    if not force_retry_parsed.is_default:
-                        force_retry_parsed.parse_mode = f"force_retry_{force_retry_parsed.parse_mode or 'parsed'}"
-                        parsed_response = force_retry_parsed
-
-            action_before_policy = parsed_response.action
-            if parsed_response is not first_response:
-                if parsed_response.analysis is None and first_response.analysis is not None:
-                    parsed_response.analysis = first_response.analysis
-                if _is_numeric_raw_reasoning(parsed_response.reasoning):
-                    if first_response.reasoning and not _is_numeric_raw_reasoning(first_response.reasoning):
-                        parsed_response.reasoning = first_response.reasoning
-                    else:
-                        first_raw_reasoning = _raw_reasoning_fallback(llm_response)
-                        if first_raw_reasoning and not _is_numeric_raw_reasoning(first_raw_reasoning):
-                            parsed_response.reasoning = first_raw_reasoning
-
-            parsed_response.action = self._apply_safety_filter(parsed_response.action, choices)
-            if parsed_response.action != action_before_policy and not parsed_response.reasoning:
-                parsed_response.reasoning = "policy_safety_override"
-            usage_payload = self._normalize_usage(llm_usage)
-            parsed_response.prompt_tokens = usage_payload["prompt_tokens"]
-            parsed_response.completion_tokens = usage_payload["completion_tokens"]
-            parsed_response.total_tokens = usage_payload["total_tokens"]
-            parsed_response.estimated_cost_usd = usage_payload["estimated_cost_usd"]
-
-            if self.debug:
-                self.logger.debug(f"Parsed LLM response: {parsed_response}")
-                self.logger.debug(f"Final action to be returned: {parsed_response.action}")
-
-            # Store response in history
-            self.history.append(parsed_response)
-            self._last_response = parsed_response
-            self._remember_decision(state, choices, state_signature, parsed_response)
-
-            # Check that action is within valid range before returning
-            if parsed_response.action < 1 or parsed_response.action > len(choices):
-                self.logger.error(f"INVALID ACTION DETECTED: {parsed_response.action} not in range 1-{len(choices)}")
-                # Use default first action instead
-                parsed_response.action = 1
-                self.logger.warning("Defaulting to action 1 instead")
-
-            return parsed_response.action
-
-        except Exception as e:
-            self.logger.error(f"Error during LLM call: {e}")
-            default_response = LLMResponse(
-                action=1,
-                is_default=True,
-                parse_mode="error_default",
-                reasoning=_raw_reasoning_fallback(f"llm_call_error: {e}"),
-            )
-            self.history.append(default_response)
-            self._last_response = default_response
-            return 1  # Default to first choice on error
-
-    def reset(self) -> None:
-        """Reset agent state"""
-        self.history = []
-        self._observation_history = []
-        self._decision_history = []
-        self._state_action_counts = {}
-        self._last_response = LLMResponse(action=1, is_default=True)
-        self._quest_briefing = None
-        self._transcript = []
-        self._compaction_summary = None
-        self._steps_since_compaction = 0
-        self._step_count = 0
-
-    def on_game_start(self) -> None:
-        """Called when game starts"""
-        super().on_game_start()
-        self._observation_history = []
-        self._decision_history = []
-        self._state_action_counts = {}
-        self._last_response = LLMResponse(action=1, is_default=True)
-        self._quest_briefing = None
-        self._transcript = []
-        self._compaction_summary = None
-        self._steps_since_compaction = 0
-        self._step_count = 0
-
-    def on_game_end(self, final_state: dict[str, Any]) -> None:
-        """Log final state for analysis"""
-        if self.debug:
-            self.logger.debug(f"Game ended with state: {final_state}")
+        """Build context while honoring legacy direct history mutation."""
+        if isinstance(self.memory_module, DefaultMemory):
+            self.memory_module._observations = list(self._observation_history)
+            self.memory_module._decisions = list(self._decision_history)
+        return super()._build_contextual_state(state)
+
+    def _apply_safety_filter(self, action_or_choices, choices_or_action) -> int:
+        """Accept both legacy (action, choices) and harness (choices, action) argument order."""
+        if isinstance(action_or_choices, list):
+            return super()._apply_safety_filter(action_or_choices, choices_or_action)
+        return super()._apply_safety_filter(choices_or_action, action_or_choices)
 
     def __str__(self) -> str:
-        """String representation of the agent"""
-        return f"LLMAgent(model={self.model_name}, system_template={self.system_template}, action_template={self.action_template}, temperature={self.temperature})"
-
-    def _format_prompt(self, state: str, choices: list[dict[str, str]]) -> str:
-        """Format the prompt for the LLM"""
-        return self.prompt_renderer.render_action_prompt(state, choices).strip()
-
-    def _format_retry_prompt(self, state: str, choices: list[dict[str, str]]) -> str:
-        """Fallback prompt that still preserves reasoning for log analysis."""
-        clipped_state = (state or "").strip()
-        if len(clipped_state) > 500:
-            clipped_state = clipped_state[:500] + "..."
-        choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:160]}" for i, c in enumerate(choices)])
-        return f"""Choose the best action.
-State: {clipped_state}
-Actions:
-{choices_text}
-
-Return valid JSON only:
-{{
-  "analysis": "<max 25 words>",
-  "reasoning": "<max 25 words>",
-  "result": <integer from 1 to {len(choices)}>
-}}"""
+        return (
+            f"LLMAgent(model={self.model_name}, system_template={self.system_template}, "
+            f"action_template={self.action_template}, temperature={self.temperature})"
+        )
 
-    def _format_force_numeric_retry_prompt(self, choices: list[dict[str, str]]) -> str:
-        """Very short retry prompt used for models that return empty visible output."""
-        choices_text = "\n".join([f"{i + 1}. {(c.get('text', '') or '')[:110]}" for i, c in enumerate(choices)])
-        return f"""Pick one action number.
-{choices_text}
-Reply with one integer only: 1 to {len(choices)}."""
 
-    def _needs_force_numeric_retry(self) -> bool:
-        return self.model_spec.provider == "openai" and (
-            self.model_spec.model_id.startswith("gpt-5") or self.model_spec.model_id.startswith("o")
-        )
+__all__ = [
+    "LLMAgent",
+    "parse_llm_response",
+    "_parse_json_response",
+    "_raw_reasoning_fallback",
+    "_is_numeric_raw_reasoning",
+    "RISKY_CHOICE_KEYWORDS",
+    "SAFE_CHOICE_KEYWORDS",
+]
diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py
index ec70b55..6fa8afd 100644
--- a/llm_quest_benchmark/harnesses/base.py
+++ b/llm_quest_benchmark/harnesses/base.py
@@ -1,24 +1,250 @@
 """Base harness class for quest benchmark experiments."""
 
 import hashlib
+import json
 import logging
 import re
 from abc import abstractmethod
 from typing import Any
 
+from json_repair import repair_json
+
 from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.agents.llm_agent import (
-    RISKY_CHOICE_KEYWORDS,
-    SAFE_CHOICE_KEYWORDS,
-    _is_numeric_raw_reasoning,
-    _raw_reasoning_fallback,
-    parse_llm_response,
-)
 from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, normalize_template_name
 from llm_quest_benchmark.llm.client import get_llm_client, parse_model_name
 from llm_quest_benchmark.llm.prompt import PromptRenderer
 from llm_quest_benchmark.schemas.response import LLMResponse
 
+RISKY_CHOICE_KEYWORDS = (
+    "улететь",
+    "сдаться",
+    "отказ",
+    "провал",
+    "убежать",
+    "surrender",
+    "give up",
+)
+
+SAFE_CHOICE_KEYWORDS = (
+    "пройти мимо",
+    "избежать",
+    "подготов",
+    "библиотек",
+    "изуч",
+    "wait",
+    "avoid",
+    "study",
+)
+
+
+def _parse_json_response(
+    response: str,
+    debug: bool = False,
+    logger: logging.Logger | None = None,
+) -> tuple[dict[str, Any] | None, str | None]:
+    """Try to parse response as JSON, with repair attempt if needed."""
+    cleaned_response = (response or "").strip()
+    if not cleaned_response:
+        return None, None
+
+    try:
+        if "```json" in cleaned_response:
+            start = cleaned_response.find("```json") + 7
+            end = cleaned_response.find("```", start)
+            if end > start:
+                json_str = cleaned_response[start:end].strip()
+                if debug and logger:
+                    logger.debug("Extracted JSON: %s", json_str)
+                result = json.loads(json_str)
+                if debug and logger:
+                    logger.debug("Parsed JSON: %s", result)
+                return result, "json_fenced"
+
+        embedded_json = re.search(r"\{[\s\S]*\}", cleaned_response)
+        if embedded_json:
+            candidate = embedded_json.group(0).strip()
+            if candidate and candidate != cleaned_response:
+                try:
+                    result = json.loads(candidate)
+                    if debug and logger:
+                        logger.debug("Parsed embedded JSON: %s", result)
+                    return result, "json_embedded"
+                except json.JSONDecodeError:
+                    pass
+
+        result = json.loads(cleaned_response)
+        if debug and logger:
+            logger.debug("Direct JSON parse successful: %s", result)
+        return result, "json_direct"
+    except json.JSONDecodeError:
+        if debug and logger:
+            logger.debug("Initial JSON parse failed, attempting repair")
+        try:
+            repaired = repair_json(cleaned_response)
+            if debug and logger:
+                logger.debug("Repaired JSON: %s", repaired)
+            result = json.loads(repaired)
+            if debug and logger:
+                logger.debug("Parse of repaired JSON successful: %s", result)
+            return result, "json_repaired"
+        except Exception as exc:
+            if debug and logger:
+                logger.error("JSON repair failed: %s", exc)
+            return None, None
+
+
+def _validate_action_number(
+    action: int,
+    num_choices: int,
+    debug: bool = False,
+    logger: logging.Logger | None = None,
+) -> bool:
+    """Validate that action number is within valid range."""
+    if 1 <= action <= num_choices:
+        return True
+    if debug and logger:
+        logger.error("Action number %s out of range [1, %s]", action, num_choices)
+    return False
+
+
+def _extract_action_from_text(response: str, num_choices: int) -> int | None:
+    """Extract a candidate action from free-form text."""
+    for match in re.finditer(r"\b(\d+)\b", response):
+        action = int(match.group(1))
+        if 1 <= action <= num_choices:
+            return action
+    return None
+
+
+def _extract_field_from_text(response: str, field: str) -> str | None:
+    """Best-effort extraction of analysis/reasoning from loosely formatted output."""
+    if not response:
+        return None
+
+    json_pattern = re.compile(
+        rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>.*?)['"]""",
+        re.IGNORECASE | re.DOTALL,
+    )
+    match = json_pattern.search(response)
+    if match:
+        value = " ".join(match.group("value").strip().split())
+        if value:
+            return value
+
+    partial_json_pattern = re.compile(
+        rf"""['"]{re.escape(field)}['"]\s*:\s*['"](?P<value>[^"\n\r]+)""",
+        re.IGNORECASE,
+    )
+    match = partial_json_pattern.search(response)
+    if match:
+        value = " ".join(match.group("value").strip().split())
+        if value:
+            return value
+
+    label_pattern = re.compile(
+        rf"""(?im)^\s*{re.escape(field)}\s*[:\-]\s*(?P<value>.+?)\s*$""",
+    )
+    match = label_pattern.search(response)
+    if match:
+        value = " ".join(match.group("value").strip().split())
+        if value:
+            return value
+
+    return None
+
+
+def _raw_reasoning_fallback(response: str) -> str | None:
+    compact = " ".join((response or "").strip().split())
+    if not compact:
+        return None
+    if len(compact) > 240:
+        compact = compact[:237] + "..."
+    return f"raw_response: {compact}"
+
+
+def _is_numeric_raw_reasoning(reasoning: str | None) -> bool:
+    if not reasoning or not reasoning.startswith("raw_response:"):
+        return False
+    payload = reasoning.split(":", 1)[1].strip()
+    return payload.isdigit()
+
+
+def parse_llm_response(
+    response: str,
+    num_choices: int,
+    debug: bool = False,
+    logger: logging.Logger | None = None,
+) -> LLMResponse:
+    """Parse an LLM response and return a structured response object."""
+    if debug and logger:
+        logger.debug("Raw LLM response: %s", response)
+
+    extracted_analysis = _extract_field_from_text(response, "analysis")
+    extracted_reasoning = _extract_field_from_text(response, "reasoning")
+    raw_reasoning = _raw_reasoning_fallback(response)
+
+    response_json, json_parse_mode = _parse_json_response(response, debug, logger)
+    if response_json and isinstance(response_json, dict):
+        analysis = response_json.get("analysis") or extracted_analysis
+        reasoning = response_json.get("reasoning") or response_json.get("thinking") or extracted_reasoning
+        if not reasoning and analysis:
+            reasoning = analysis
+        if not analysis and not reasoning:
+            reasoning = raw_reasoning
+
+        memo_raw = response_json.get("memo")
+        memo = str(memo_raw) if memo_raw is not None else None
+        action_value = response_json.get("action") or response_json.get("result") or response_json.get("choice")
+        if action_value is not None:
+            try:
+                action = int(action_value)
+                if _validate_action_number(action, num_choices, debug, logger):
+                    return LLMResponse(
+                        action=action,
+                        reasoning=reasoning,
+                        analysis=analysis,
+                        memo=memo,
+                        is_default=False,
+                        parse_mode=json_parse_mode or "json",
+                    )
+            except (ValueError, TypeError):
+                if debug and logger:
+                    logger.error("Invalid action value in JSON: %s", action_value)
+
+    try:
+        action = int(response.strip())
+        if _validate_action_number(action, num_choices, debug, logger):
+            return LLMResponse(
+                action=action,
+                reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
+                analysis=extracted_analysis,
+                is_default=False,
+                parse_mode="number_only",
+            )
+    except ValueError:
+        if debug and logger:
+            logger.error("Could not parse response as number: %s", response)
+
+    extracted_action = _extract_action_from_text(response, num_choices)
+    if extracted_action is not None:
+        return LLMResponse(
+            action=extracted_action,
+            reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
+            analysis=extracted_analysis,
+            is_default=False,
+            parse_mode="number_extracted",
+        )
+
+    if debug and logger:
+        logger.error("Error during response parsing, defaulting to first choice. Response: %s...", response[:100])
+    return LLMResponse(
+        action=1,
+        reasoning=extracted_reasoning or extracted_analysis or raw_reasoning,
+        analysis=extracted_analysis,
+        is_default=True,
+        parse_mode="default_first",
+    )
+
 
 class BaseHarness(QuestPlayer):
     """Abstract LLM harness base class."""
diff --git a/llm_quest_benchmark/harnesses/tool_harness.py b/llm_quest_benchmark/harnesses/tool_harness.py
index e89e7c1..a398bfe 100644
--- a/llm_quest_benchmark/harnesses/tool_harness.py
+++ b/llm_quest_benchmark/harnesses/tool_harness.py
@@ -2,9 +2,8 @@
 
 from typing import Any
 
-from llm_quest_benchmark.agents.llm_agent import _parse_json_response
 from llm_quest_benchmark.constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, SYSTEM_ROLE_TEMPLATE
-from llm_quest_benchmark.harnesses.base import BaseHarness
+from llm_quest_benchmark.harnesses.base import BaseHarness, _parse_json_response
 from llm_quest_benchmark.harnesses.memory import CompactionMemory
 from llm_quest_benchmark.harnesses.tools import QuestHistoryTool, Scratchpad, calculator
 from llm_quest_benchmark.schemas.response import LLMResponse

From 78babe8558fab338463a0971aafadbadc0c4163a Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 14:38:19 +0400
Subject: [PATCH 05/24] harnesses: HarnessConfig schema + factory

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 llm_quest_benchmark/harnesses/factory.py      | 37 +++++++++---
 llm_quest_benchmark/schemas/__init__.py       | 12 +++-
 llm_quest_benchmark/schemas/config.py         | 43 ++++++++++++--
 .../tests/harnesses/test_factory.py           | 57 ++++++++++++++++++-
 4 files changed, 131 insertions(+), 18 deletions(-)

diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
index e423783..b46f5dc 100644
--- a/llm_quest_benchmark/harnesses/factory.py
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -21,6 +21,22 @@
     "planner": PlannerHarness,
 }
 
+SPECIAL_HARNESSES = ("human", "random_choice", "random_choice_<seed>")  # "<seed>" is a placeholder, not a literal name
+
+
+def _parse_random_choice_seed(identifier: str) -> tuple[bool, int | None]:
+    """Return ``(matched, seed)`` for ``random_choice`` / ``random_choice_<int>``; otherwise ``(False, None)``."""
+    if identifier == "random_choice":
+        return True, None
+    suffix = identifier.removeprefix("random_choice_")
+    # isdecimal(), not isdigit(): isdigit() also accepts e.g. superscript digits that int() rejects.
+    return (True, int(suffix)) if suffix != identifier and suffix.isdecimal() else (False, None)
+
+
+def is_random_choice_harness(identifier: str) -> bool:
+    """Report whether ``_parse_random_choice_seed`` recognises *identifier*; the seed is discarded."""
+    return _parse_random_choice_seed(identifier)[0]
+
 
 def create_harness(
     harness: str,
@@ -31,18 +47,21 @@ def create_harness(
     compaction_interval: int = 50,
     system_template: str = "system_role.jinja",
 ) -> QuestPlayer:
+    valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
+    is_random_harness, seed = _parse_random_choice_seed(harness)
+    if is_random_harness:
+        return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
+    if harness.startswith("random_choice"):
+        raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}")
     if harness == "human":
         return HumanPlayer(skip_single=skip_single)
-    if harness.startswith("random_choice"):
-        seed = None
-        if "_" in harness[13:]:
-            try:
-                seed = int(harness.split("_")[-1])
-            except ValueError:
-                pass
-        return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
     if harness not in HARNESS_REGISTRY:
-        raise ValueError(f"Unknown harness '{harness}'. Valid: {sorted(HARNESS_REGISTRY)}")
+        raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}")
+    is_random_model, seed = _parse_random_choice_seed(model)
+    if is_random_model:
+        return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
+    if model.startswith("random_choice"):
+        raise ValueError(f"Unknown random_choice model '{model}'. Valid: {valid}")
     cls = HARNESS_REGISTRY[harness]
     return cls(
         model_name=model,
diff --git a/llm_quest_benchmark/schemas/__init__.py b/llm_quest_benchmark/schemas/__init__.py
index 34fee08..0cb4242 100644
--- a/llm_quest_benchmark/schemas/__init__.py
+++ b/llm_quest_benchmark/schemas/__init__.py
@@ -1,9 +1,17 @@
 """Schema exports for LLM Quest Benchmark"""
 
-__all__ = ["QMState", "AgentState", "LLMResponse", "QMBridgeState", "BenchmarkConfig", "AgentConfig"]
+__all__ = [
+    "QMState",
+    "AgentState",
+    "LLMResponse",
+    "QMBridgeState",
+    "BenchmarkConfig",
+    "HarnessConfig",
+    "AgentConfig",
+]
 
 # Import directly from the schema modules using relative imports
 from .bridge import QMBridgeState
-from .config import AgentConfig, BenchmarkConfig
+from .config import AgentConfig, BenchmarkConfig, HarnessConfig
 from .response import LLMResponse
 from .state import AgentState, QMState
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index 74799bd..c658729 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -68,12 +68,45 @@ class HarnessConfig:
     benchmark_id: str | None = None
     compaction_interval: int = 50
 
+    def __init__(
+        self,
+        model: str = DEFAULT_MODEL,
+        system_template: str = SYSTEM_ROLE_TEMPLATE,
+        harness: str = "reasoning_recent",
+        temperature: float = DEFAULT_TEMPERATURE,
+        runs: int = 1,
+        skip_single: bool = False,
+        debug: bool = False,
+        benchmark_id: str | None = None,
+        compaction_interval: int = 50,
+        **legacy_keys,
+    ):
+        """Store the given settings, then run ``__post_init__`` to normalise and validate them.
+
+        Raises ValueError for the retired ``template``/``action_template``/``memory_mode``
+        keywords and TypeError for any other unrecognised keyword.
+        """
+        if legacy_keys.keys() & {"template", "action_template"}:
+            raise ValueError("Use harness: key instead of template:")
+        if "memory_mode" in legacy_keys:
+            raise ValueError("Use harness: key instead of memory_mode:")
+        if legacy_keys:
+            unexpected = ", ".join(sorted(legacy_keys))
+            raise TypeError(f"Unexpected HarnessConfig key(s): {unexpected}")
+        # Targets are assigned left to right, i.e. in the same order as the signature.
+        self.model, self.system_template, self.harness = model, system_template, harness
+        self.temperature, self.runs = temperature, runs
+        self.skip_single, self.debug = skip_single, debug
+        self.benchmark_id, self.compaction_interval = benchmark_id, compaction_interval
+        self.__post_init__()
+
     def __post_init__(self):
         self.system_template = normalize_template_name(self.system_template)
-        from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY
+        from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, SPECIAL_HARNESSES, is_random_choice_harness
 
-        if self.harness not in HARNESS_REGISTRY:
-            raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {sorted(HARNESS_REGISTRY)}")
+        if self.harness not in HARNESS_REGISTRY and self.harness != "human" and not is_random_choice_harness(self.harness):
+            valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
+            raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {valid}")
         if not (0.0 <= self.temperature <= 2.0):
             raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}")
         if self.runs < 1:
@@ -180,9 +213,9 @@ def from_yaml(cls, yaml_path: str) -> "BenchmarkConfig":
             agents = []
             for agent in data["agents"]:
                 if "template" in agent:
-                    raise ValueError("Use 'harness:' instead of 'template:'")
+                    raise ValueError("Use harness: key instead of template:")
                 if "memory_mode" in agent:
-                    raise ValueError("Use 'harness:' instead of 'memory_mode:'")
+                    raise ValueError("Use harness: key instead of memory_mode:")
                 agents.append(HarnessConfig(**agent))
             data["agents"] = agents
 
diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py
index 5b7bfa0..187f4d3 100644
--- a/llm_quest_benchmark/tests/harnesses/test_factory.py
+++ b/llm_quest_benchmark/tests/harnesses/test_factory.py
@@ -33,11 +33,28 @@ def test_create_random_choice_harness():
     assert isinstance(harness, RandomAgent)
 
 
+def test_create_seeded_random_choice_harness():
+    """``random_choice_<digits>`` builds a RandomAgent whose ``agent_id`` embeds the seed."""
+    seeded_agent = create_harness("random_choice_123")
+    assert isinstance(seeded_agent, RandomAgent)
+    assert seeded_agent.agent_id == "random_123"
+
+
 def test_create_bad_harness_name_raises():
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match="minimal"):
         create_harness("bad_name", model="gpt-5-mini")
 
 
+def test_create_bad_random_choice_seed_raises():
+    with pytest.raises(ValueError, match="random_choice_<seed>"):  # text from the "Valid:" list in the error
+        create_harness("random_choice_bad")
+
+
+def test_random_choice_model_does_not_hide_bad_harness():
+    with pytest.raises(ValueError, match="bad_name"):  # harness name is validated before the model is inspected
+        create_harness("bad_name", model="random_choice_123")
+
+
 def test_harness_config_stable_harness_id():
     config = HarnessConfig(harness="memo_compact", model="gpt-5-mini")
 
@@ -45,6 +62,22 @@ def test_harness_config_stable_harness_id():
     assert config.harness_id == HarnessConfig(harness="memo_compact", model="gpt-5-mini").harness_id
 
 
+def test_harness_config_allows_seeded_random_choice_harness():
+    """HarnessConfig accepts a seeded ``random_choice_<digits>`` harness name and stores it unchanged."""
+    seeded_config = HarnessConfig(harness="random_choice_123", model="gpt-5-mini")
+    assert seeded_config.harness == "random_choice_123"
+
+
+def test_harness_config_rejects_old_template_key():
+    with pytest.raises(ValueError, match="Use harness: key instead of template:"):
+        HarnessConfig(model="gpt-5-mini", template="reasoning.jinja")  # retired key, caught via **legacy_keys
+
+
+def test_harness_config_rejects_old_memory_mode_key():
+    with pytest.raises(ValueError, match="Use harness: key instead of memory_mode:"):  # retired key, via **legacy_keys
+        HarnessConfig(model="gpt-5-mini", harness="memo_compact", memory_mode="compaction")
+
+
 def test_benchmark_config_from_yaml_parses_harness(tmp_path):
     quest_path = tmp_path / "quest.qm"
     quest_path.write_text("", encoding="utf-8")
@@ -83,5 +116,25 @@ def test_benchmark_config_from_yaml_rejects_template(tmp_path):
         encoding="utf-8",
     )
 
-    with pytest.raises(ValueError, match="Use 'harness:' instead of 'template:'"):
+    with pytest.raises(ValueError, match="Use harness: key instead of template:"):
+        BenchmarkConfig.from_yaml(str(config_path))
+
+
+def test_benchmark_config_from_yaml_rejects_memory_mode(tmp_path):
+    quest_path = tmp_path / "quest.qm"
+    quest_path.write_text("", encoding="utf-8")
+    config_path = tmp_path / "benchmark.yaml"
+    config_path.write_text(
+        f"""
+quests:
+  - {quest_path}
+agents:
+  - model: gpt-5-mini
+    harness: memo_compact
+    memory_mode: compaction
+""",
+        encoding="utf-8",
+    )
+
+    with pytest.raises(ValueError, match="Use harness: key instead of memory_mode:"):
         BenchmarkConfig.from_yaml(str(config_path))

From eeac9f355398267adf727649816100f351434d37 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 14:42:43 +0400
Subject: [PATCH 06/24] harnesses: wire CLI, benchmark, YAML configs

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 configs/benchmarks/exp3_no_loop_breaker.yaml  |  3 +-
 configs/benchmarks/exp3_stateful_compact.yaml |  3 +-
 .../benchmarks/exp4_compaction_no_memo.yaml   |  3 +-
 configs/benchmarks/exp4_memo_cot.yaml         |  3 +-
 configs/benchmarks/exp4_memo_extended.yaml    |  3 +-
 configs/benchmarks/exp4_memo_structured.yaml  |  3 +-
 .../exp5_stateful_compact_variance.yaml       |  3 +-
 configs/benchmarks/exp6_prompt_hints.yaml     |  3 +-
 configs/benchmarks/exp6_tools.yaml            |  3 +-
 configs/benchmarks/exp6_tools_hints.yaml      |  3 +-
 .../benchmarks/exp6_unified_tools_screen.yaml |  3 +-
 configs/benchmarks/exp7_deepseek.yaml         |  3 +-
 configs/benchmarks/exp7_haiku.yaml            |  3 +-
 configs/benchmarks/exp7_llama.yaml            |  3 +-
 configs/benchmarks/exp7_mistral.yaml          |  3 +-
 configs/benchmarks/exp7_qwen.yaml             |  3 +-
 configs/benchmarks/exp7b_model_upgrades.yaml  |  9 ++---
 configs/benchmarks/memory_compaction.yaml     | 18 +++------
 .../benchmarks/memory_full_transcript.yaml    |  9 ++---
 configs/benchmarks/memory_modes_pilot.yaml    | 12 ++----
 configs/benchmarks/openrouter_smoke_test.yaml | 10 ++---
 llm_quest_benchmark/executors/benchmark.py    | 39 ++++++++++++++-----
 llm_quest_benchmark/executors/cli/commands.py | 31 +++++++++------
 23 files changed, 87 insertions(+), 89 deletions(-)

diff --git a/configs/benchmarks/exp3_no_loop_breaker.yaml b/configs/benchmarks/exp3_no_loop_breaker.yaml
index 64240fe..57e7124 100644
--- a/configs/benchmarks/exp3_no_loop_breaker.yaml
+++ b/configs/benchmarks/exp3_no_loop_breaker.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 2
-    memory_mode: full_transcript
 debug: false
 quest_timeout: 600
 max_workers: 2
diff --git a/configs/benchmarks/exp3_stateful_compact.yaml b/configs/benchmarks/exp3_stateful_compact.yaml
index b43fc6b..bb9973c 100644
--- a/configs/benchmarks/exp3_stateful_compact.yaml
+++ b/configs/benchmarks/exp3_stateful_compact.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp4_compaction_no_memo.yaml b/configs/benchmarks/exp4_compaction_no_memo.yaml
index 5ef4130..896dd60 100644
--- a/configs/benchmarks/exp4_compaction_no_memo.yaml
+++ b/configs/benchmarks/exp4_compaction_no_memo.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp4_memo_cot.yaml b/configs/benchmarks/exp4_memo_cot.yaml
index fe97bca..9bfe382 100644
--- a/configs/benchmarks/exp4_memo_cot.yaml
+++ b/configs/benchmarks/exp4_memo_cot.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_cot
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp4_memo_extended.yaml b/configs/benchmarks/exp4_memo_extended.yaml
index 66d1bf4..25e5620 100644
--- a/configs/benchmarks/exp4_memo_extended.yaml
+++ b/configs/benchmarks/exp4_memo_extended.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_extended
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp4_memo_structured.yaml b/configs/benchmarks/exp4_memo_structured.yaml
index 83502c7..96e5daf 100644
--- a/configs/benchmarks/exp4_memo_structured.yaml
+++ b/configs/benchmarks/exp4_memo_structured.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: memo_structured
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp5_stateful_compact_variance.yaml b/configs/benchmarks/exp5_stateful_compact_variance.yaml
index 6f99f29..89cc80b 100644
--- a/configs/benchmarks/exp5_stateful_compact_variance.yaml
+++ b/configs/benchmarks/exp5_stateful_compact_variance.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 5
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp6_prompt_hints.yaml b/configs/benchmarks/exp6_prompt_hints.yaml
index 098b1db..4c70e61 100644
--- a/configs/benchmarks/exp6_prompt_hints.yaml
+++ b/configs/benchmarks/exp6_prompt_hints.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: stateful_compact_hints
+    harness: hinted_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp6_tools.yaml b/configs/benchmarks/exp6_tools.yaml
index 8630bb0..b254005 100644
--- a/configs/benchmarks/exp6_tools.yaml
+++ b/configs/benchmarks/exp6_tools.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp6_tools_hints.yaml b/configs/benchmarks/exp6_tools_hints.yaml
index b7949fc..0c0c3b6 100644
--- a/configs/benchmarks/exp6_tools_hints.yaml
+++ b/configs/benchmarks/exp6_tools_hints.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented_hints
+    harness: tool_hinted
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp6_unified_tools_screen.yaml b/configs/benchmarks/exp6_unified_tools_screen.yaml
index 0c43290..b80f8c0 100644
--- a/configs/benchmarks/exp6_unified_tools_screen.yaml
+++ b/configs/benchmarks/exp6_unified_tools_screen.yaml
@@ -24,10 +24,9 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_deepseek.yaml b/configs/benchmarks/exp7_deepseek.yaml
index 1b82664..6971569 100644
--- a/configs/benchmarks/exp7_deepseek.yaml
+++ b/configs/benchmarks/exp7_deepseek.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:deepseek/deepseek-chat-v3-0324"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_haiku.yaml b/configs/benchmarks/exp7_haiku.yaml
index 72cd6c2..8546c80 100644
--- a/configs/benchmarks/exp7_haiku.yaml
+++ b/configs/benchmarks/exp7_haiku.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "anthropic:claude-3-5-haiku-latest"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_llama.yaml b/configs/benchmarks/exp7_llama.yaml
index 27eda5a..61e156c 100644
--- a/configs/benchmarks/exp7_llama.yaml
+++ b/configs/benchmarks/exp7_llama.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:meta-llama/llama-4-scout"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_mistral.yaml b/configs/benchmarks/exp7_mistral.yaml
index 76f1a40..f570882 100644
--- a/configs/benchmarks/exp7_mistral.yaml
+++ b/configs/benchmarks/exp7_mistral.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:mistralai/mistral-small-2603"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7_qwen.yaml b/configs/benchmarks/exp7_qwen.yaml
index 572d7a6..27496cc 100644
--- a/configs/benchmarks/exp7_qwen.yaml
+++ b/configs/benchmarks/exp7_qwen.yaml
@@ -7,10 +7,9 @@ quests:
   - quests/sr_2_1_2121_eng/Robots_eng.qm
 agents:
   - model: "openrouter:qwen/qwen3-30b-a3b"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/exp7b_model_upgrades.yaml b/configs/benchmarks/exp7b_model_upgrades.yaml
index 4c35c8b..22da91b 100644
--- a/configs/benchmarks/exp7b_model_upgrades.yaml
+++ b/configs/benchmarks/exp7b_model_upgrades.yaml
@@ -20,22 +20,19 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:deepseek/deepseek-v4-flash"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
   - model: "openrouter:qwen/qwen3.6-flash"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
   - model: "claude:claude-haiku-4-5-20251001"
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 2
-    memory_mode: compaction
     compaction_interval: 50
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/memory_compaction.yaml b/configs/benchmarks/memory_compaction.yaml
index 1bb10a8..c403665 100644
--- a/configs/benchmarks/memory_compaction.yaml
+++ b/configs/benchmarks/memory_compaction.yaml
@@ -18,45 +18,39 @@ quests:
 agents:
   # Gemini 3 Flash - compaction interval 10
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # Gemini 3 Flash - compaction interval 20
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
   # GPT-5.4 Mini - compaction interval 10
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # GPT-5.4 Mini - compaction interval 20
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
   # DeepSeek V3.2 - compaction interval 10
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 10
   # DeepSeek V3.2 - compaction interval 20
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 20
 debug: false
 quest_timeout: 600
diff --git a/configs/benchmarks/memory_full_transcript.yaml b/configs/benchmarks/memory_full_transcript.yaml
index 04ad152..9fc82a4 100644
--- a/configs/benchmarks/memory_full_transcript.yaml
+++ b/configs/benchmarks/memory_full_transcript.yaml
@@ -18,22 +18,19 @@ quests:
 agents:
   # Gemini 3 Flash - full transcript
   - model: "openrouter:google/gemini-3-flash-preview"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # GPT-5.4 Mini - full transcript
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # DeepSeek V3.2 - full transcript
   - model: "openrouter:deepseek/deepseek-v3.2"
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
 debug: false
 quest_timeout: 600
 max_workers: 2
diff --git a/configs/benchmarks/memory_modes_pilot.yaml b/configs/benchmarks/memory_modes_pilot.yaml
index 2e4d862..db6aa23 100644
--- a/configs/benchmarks/memory_modes_pilot.yaml
+++ b/configs/benchmarks/memory_modes_pilot.yaml
@@ -5,31 +5,27 @@ quests:
 agents:
 # Short-context reasoning - default memory (3 obs, 5 decisions)
 - model: openrouter:google/gemini-3-flash-preview
-  template: reasoning
+  harness: reasoning_recent
   temperature: 0.4
   runs: 3
-  memory_mode: default
 
 # Short-context reasoning - loop-aware template
 - model: openrouter:google/gemini-3-flash-preview
-  template: loop_aware_reasoning
+  harness: reasoning_recent
   temperature: 0.4
   runs: 3
-  memory_mode: default
 
 # Full-history reasoning
 - model: openrouter:google/gemini-3-flash-preview
-  template: reasoning
+  harness: reasoning_full
   temperature: 0.4
   runs: 3
-  memory_mode: full_transcript
 
 # Compact memory / memo (compact every 10 steps)
 - model: openrouter:google/gemini-3-flash-preview
-  template: reasoning
+  harness: memo_compact
   temperature: 0.4
   runs: 3
-  memory_mode: compaction
   compaction_interval: 10
 
 debug: false
diff --git a/configs/benchmarks/openrouter_smoke_test.yaml b/configs/benchmarks/openrouter_smoke_test.yaml
index 6194df3..2fb50be 100644
--- a/configs/benchmarks/openrouter_smoke_test.yaml
+++ b/configs/benchmarks/openrouter_smoke_test.yaml
@@ -3,23 +3,23 @@ quests:
   - quests/Boat.qm
 agents:
   - model: "openrouter:anthropic/claude-sonnet-4-6"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
   - model: "openrouter:openai/gpt-5.4-mini"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
   - model: "openrouter:google/gemini-2.5-flash"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
   - model: "openrouter:deepseek/deepseek-chat"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
   - model: "openrouter:qwen/qwen3-235b-a22b"
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 1
 debug: false
diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py
index 0b78062..69e7d22 100644
--- a/llm_quest_benchmark/executors/benchmark.py
+++ b/llm_quest_benchmark/executors/benchmark.py
@@ -12,10 +12,10 @@
 from pathlib import Path
 from typing import Any
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
 from llm_quest_benchmark.core.logging import DEFAULT_DB_PATH
 from llm_quest_benchmark.core.runner import run_quest_with_timeout
 from llm_quest_benchmark.environments.state import QuestOutcome
+from llm_quest_benchmark.harnesses.factory import create_harness
 from llm_quest_benchmark.llm import tracing
 from llm_quest_benchmark.schemas.config import BenchmarkConfig
 
@@ -34,6 +34,28 @@
 logger = logging.getLogger(__name__)
 
 
+def _agent_harness(agent_config) -> str:
+    """Resolve the harness name, translating legacy template/memory_mode pairs when no ``harness`` is set."""
+    if hasattr(agent_config, "harness"):
+        return agent_config.harness
+    # Legacy AgentConfig path: the pair (template without ".jinja", memory mode) selects the harness.
+    stem = getattr(agent_config, "action_template", "reasoning.jinja").removesuffix(".jinja")
+    legacy_key = (stem, getattr(agent_config, "memory_mode", "default"))
+    harness_by_legacy_key = {
+        ("stub", "default"): "minimal",
+        ("reasoning", "default"): "reasoning_recent",
+        ("reasoning", "full_transcript"): "reasoning_full",
+        ("reasoning", "compaction"): "memo_compact",
+        ("stateful_compact", "compaction"): "memo_compact",
+        ("stateful_compact_hints", "compaction"): "hinted_compact",
+        ("tool_augmented", "compaction"): "tool_compact",
+        ("tool_augmented_hints", "compaction"): "tool_hinted",
+        ("planner", "compaction"): "planner",
+    }
+    # Any combination not listed above resolves to "reasoning_recent".
+    return harness_by_legacy_key.get(legacy_key, "reasoning_recent")
+
+
 def _result_entry(
     quest: str,
     agent_config,
@@ -46,8 +68,8 @@ def _result_entry(
         "quest": quest,
         "model": agent_config.model,
         "temperature": agent_config.temperature,
-        "template": agent_config.action_template,
-        "agent_id": agent_config.agent_id,
+        "harness": _agent_harness(agent_config),
+        "agent_id": agent_config.harness_id if hasattr(agent_config, "harness_id") else agent_config.agent_id,
         "attempt": attempt,
         "outcome": outcome,
         "reward": reward,
@@ -132,15 +154,14 @@ def callback(event: str, data: Any = None) -> None:
             )
 
     try:
-        agent = create_agent(
+        agent = create_harness(
+            harness=_agent_harness(agent_config),
             model=agent_config.model,
             temperature=agent_config.temperature,
-            system_template=agent_config.system_template,
-            action_template=agent_config.action_template,
             skip_single=agent_config.skip_single,
             debug=agent_config.debug,
-            memory_mode=agent_config.memory_mode,
             compaction_interval=agent_config.compaction_interval,
+            system_template=agent_config.system_template,
         )
         outcome = run_quest_with_timeout(
             quest,
@@ -254,7 +275,7 @@ def _write_benchmark_artifacts(config: BenchmarkConfig, results: list[dict[str,
                 "temperature": agent.temperature,
                 "runs": agent.runs,
                 "system_template": agent.system_template,
-                "action_template": agent.action_template,
+                "harness": _agent_harness(agent),
             }
             for agent in config.agents
         ],
@@ -281,7 +302,7 @@ def _write_benchmark_artifacts(config: BenchmarkConfig, results: list[dict[str,
             {
                 "model": agent.model,
                 "system_template": agent.system_template,
-                "action_template": agent.action_template,
+                "harness": _agent_harness(agent),
                 "temperature": agent.temperature,
                 "runs": agent.runs,
                 "skip_single": agent.skip_single,
diff --git a/llm_quest_benchmark/executors/cli/commands.py b/llm_quest_benchmark/executors/cli/commands.py
index e3cecbd..4b029bd 100644
--- a/llm_quest_benchmark/executors/cli/commands.py
+++ b/llm_quest_benchmark/executors/cli/commands.py
@@ -8,6 +8,7 @@
 from pathlib import Path
 from typing import Any
 
+import click
 from dotenv import load_dotenv
 
 # Initialize quest registry early
@@ -18,13 +19,10 @@
 
 import typer
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
-from llm_quest_benchmark.agents.human_player import HumanPlayer
 from llm_quest_benchmark.constants import (
     DEFAULT_MODEL,
     DEFAULT_QUEST,
     DEFAULT_TEMPERATURE,
-    DEFAULT_TEMPLATE,
     INFINITE_TIMEOUT,
     MODEL_CHOICES,
     SYSTEM_ROLE_TEMPLATE,
@@ -40,9 +38,10 @@
     print_summary,
     run_benchmark,
 )
+from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
 from llm_quest_benchmark.llm import tracing
 from llm_quest_benchmark.renderers.terminal import RichRenderer
-from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
 
 # Initialize logging
 log_manager = LogManager()
@@ -53,6 +52,8 @@
     rich_markup_mode="rich",
 )
 
+HARNESS_CHOICES = list(HARNESS_REGISTRY.keys())
+
 
 def version_callback(value: bool):
     if value:
@@ -348,7 +349,13 @@ def run(
     model: str = typer.Option(DEFAULT_MODEL, help=f"Model for the LLM agent (choices: {', '.join(MODEL_CHOICES)})."),
     temperature: float = typer.Option(DEFAULT_TEMPERATURE, help="Temperature for LLM sampling"),
     system_template: str = typer.Option(SYSTEM_ROLE_TEMPLATE, help="Template to use for system instructions."),
-    action_template: str = typer.Option(DEFAULT_TEMPLATE, help="Template to use for action prompts."),
+    harness: str = typer.Option(
+        "reasoning_recent",
+        "--harness",
+        help="Harness to use for quest decisions.",
+        click_type=click.Choice(HARNESS_CHOICES),
+    ),
+    compaction_interval: int = typer.Option(50, help="Advanced override for compaction interval."),
     timeout: int = typer.Option(60, help="Timeout in seconds for run (0 for no timeout)."),
     skip: bool = typer.Option(True, help="Auto-select single choices without asking agent."),
     debug: bool = typer.Option(False, help="Enable debug logging and output, remove terminal UI."),
@@ -365,23 +372,25 @@ def run(
         log_manager.setup(debug)
 
         # Create agent config
-        agent_config = AgentConfig(
+        agent_config = HarnessConfig(
             model=model,
             system_template=system_template,
-            action_template=action_template,
+            harness=harness,
             temperature=temperature,
             skip_single=skip,
             debug=debug,
+            compaction_interval=compaction_interval,
         )
 
         # Create agent
-        agent = create_agent(
+        agent = create_harness(
+            harness=harness,
             model=model,
             system_template=system_template,
-            action_template=action_template,
             temperature=temperature,
             skip_single=skip,
             debug=debug,
+            compaction_interval=compaction_interval,
         )
 
         log.warning(f"Starting quest run with agent {str(agent)}")
@@ -458,7 +467,7 @@ def play(
         log.debug(f"Quest file: {quest}")
 
         # Create interactive player
-        player = HumanPlayer(skip_single=skip, debug=debug)
+        player = create_harness(harness="human", skip_single=skip, debug=debug)
 
         # Run quest in interactive mode
         result = run_quest_with_timeout(quest_path=str(quest), agent=player, timeout=INFINITE_TIMEOUT, debug=debug)
@@ -952,7 +961,7 @@ def benchmark(
 
     This command runs benchmark evaluation using a YAML configuration file that specifies:
     - quests: list of quest files or directories to test
-    - agents: list of agents with their model, template, and temperature settings
+    - agents: list of harnesses with their model, harness, and temperature settings
     - other settings: debug, timeout, workers, etc.
 
     Example:

From 664452d3ca54d82ba4a146d8ae566280c8ee70ae Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 14:52:33 +0400
Subject: [PATCH 07/24] tests: migrate to harness API

---
 .../tests/agents/test_anthropic.py            |  18 +-
 .../tests/agents/test_llm_agent.py            | 104 +++---
 .../tests/agents/test_mode_agents.py          | 260 +-------------
 .../tests/harnesses/test_harnesses.py         | 335 ++++++++++++++++++
 .../tests/integration/test_mode_agents_e2e.py |  22 +-
 .../tests/integration/test_quest_e2e.py       |  12 +-
 6 files changed, 417 insertions(+), 334 deletions(-)
 create mode 100644 llm_quest_benchmark/tests/harnesses/test_harnesses.py

diff --git a/llm_quest_benchmark/tests/agents/test_anthropic.py b/llm_quest_benchmark/tests/agents/test_anthropic.py
index 5dd1f95..ba60f97 100644
--- a/llm_quest_benchmark/tests/agents/test_anthropic.py
+++ b/llm_quest_benchmark/tests/agents/test_anthropic.py
@@ -1,15 +1,15 @@
-"""Deterministic tests for Anthropic-backed agent behavior."""
+"""Deterministic tests for Anthropic-backed harness behavior."""
 
 from unittest.mock import Mock, patch
 
 import pytest
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
+from llm_quest_benchmark.harnesses.factory import create_harness
 
 
 @patch("llm_quest_benchmark.llm.client.anthropic.Anthropic")
-def test_anthropic_agent_mocked_completion(mock_anthropic_cls):
-    """Agent should parse a mocked Anthropic completion without network calls."""
+def test_anthropic_harness_mocked_completion(mock_anthropic_cls):
+    """Harness should parse a mocked Anthropic completion without network calls."""
     mock_client = Mock()
     mock_response = Mock()
     mock_block = Mock()
@@ -18,15 +18,15 @@ def test_anthropic_agent_mocked_completion(mock_anthropic_cls):
     mock_client.messages.create.return_value = mock_response
     mock_anthropic_cls.return_value = mock_client
 
-    agent = create_agent("claude-sonnet-4-5")
-    action = agent.get_action("Test prompt", [{"text": "A"}, {"text": "B"}])
+    harness = create_harness("minimal", model="claude-sonnet-4-5")
+    action = harness.get_action("Test prompt", [{"text": "A"}, {"text": "B"}])
 
     assert action == 2
     assert mock_client.messages.create.call_count == 1
 
 
-def test_anthropic_agent_empty_choices_raises():
+def test_anthropic_harness_empty_choices_raises():
     """Base player contract should reject empty choices."""
-    agent = create_agent("claude-sonnet-4-5")
+    harness = create_harness("minimal", model="claude-sonnet-4-5")
     with pytest.raises(ValueError, match="No choices provided"):
-        agent.get_action("Test prompt", [])
+        harness.get_action("Test prompt", [])
diff --git a/llm_quest_benchmark/tests/agents/test_llm_agent.py b/llm_quest_benchmark/tests/agents/test_llm_agent.py
index 06ff32f..1f3b99c 100644
--- a/llm_quest_benchmark/tests/agents/test_llm_agent.py
+++ b/llm_quest_benchmark/tests/agents/test_llm_agent.py
@@ -1,10 +1,11 @@
-"""Tests for LLM agent"""
+"""Tests for the base LLM harness behavior."""
 
 from unittest.mock import Mock, patch
 
 import pytest
 
-from llm_quest_benchmark.agents.llm_agent import LLMAgent, parse_llm_response
+from llm_quest_benchmark.harnesses.base import parse_llm_response
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
 from llm_quest_benchmark.schemas.response import LLMResponse
 
 
@@ -20,8 +21,8 @@ def example_choices():
 
 @pytest.mark.timeout(5)  # Quick unit test
 @patch("llm_quest_benchmark.llm.client.OpenAI")
-def test_agent_basic_flow(mock_openai, monkeypatch):
-    """Test basic agent functionality with mocked LLM"""
+def test_harness_basic_flow(mock_openai, monkeypatch):
+    """Test basic harness functionality with mocked LLM"""
     monkeypatch.setenv("OPENAI_API_KEY", "test-key")
     # Setup mock
     mock_chat = Mock()
@@ -41,14 +42,14 @@ def test_agent_basic_flow(mock_openai, monkeypatch):
     observation = "You are at a trading station."
     choices = [{"id": "1", "text": "Talk to merchant"}, {"id": "2", "text": "Leave station"}]
 
-    # Create agent and test
-    agent = LLMAgent(model_name="gpt-5-mini")
-    result = agent.get_action(observation, choices)
+    # Create harness and test
+    harness = MinimalHarness(model_name="gpt-5-mini")
+    result = harness.get_action(observation, choices)
 
     # Verify results
     assert result == 1  # Expect an integer
     assert mock_chat.completions.create.call_count == 1
-    last_response = agent.get_last_response()
+    last_response = harness.get_last_response()
     assert last_response.prompt_tokens == 9
     assert last_response.completion_tokens == 2
     assert last_response.total_tokens == 11
@@ -56,47 +57,47 @@ def test_agent_basic_flow(mock_openai, monkeypatch):
 
 def test_template_rendering():
     """Test that templates are rendered correctly"""
-    agent = LLMAgent()
+    harness = MinimalHarness()
     observation = "Test observation"
     choices = [{"text": "Option 1"}, {"text": "Option 2"}]
 
     # Test that prompt is rendered correctly
-    prompt = agent.prompt_renderer.render_action_prompt(observation, choices)
+    prompt = harness.prompt_renderer.render_action_prompt(observation, choices)
     assert "Test observation" in prompt
     assert "Option 1" in prompt
     assert "Option 2" in prompt
 
 
-def test_agent_initialization_without_api_key(monkeypatch):
-    """Agent construction should not require provider API keys before inference."""
+def test_harness_initialization_without_api_key(monkeypatch):
+    """Harness construction should not require provider API keys before inference."""
     monkeypatch.delenv("OPENAI_API_KEY", raising=False)
     monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
-    agent = LLMAgent(model_name="gpt-5-mini")
-    assert agent.llm is None
+    harness = MinimalHarness(model_name="gpt-5-mini")
+    assert harness.llm is None
 
 
 def test_gemini_prompt_uses_selected_template():
-    agent = LLMAgent(model_name="gemini-2.5-flash")
-    prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}])
+    harness = MinimalHarness(model_name="gemini-2.5-flash", action_template="reasoning.jinja")
+    prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
     assert "Return ONLY valid JSON" in prompt
     assert "A" in prompt
     assert "B" in prompt
 
 
 def test_non_gemini_prompt_uses_selected_template():
-    agent = LLMAgent(model_name="gpt-5-mini", action_template="stub.jinja")
-    prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}])
+    harness = MinimalHarness(model_name="gpt-5-mini", action_template="stub.jinja")
+    prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
     assert "IMPORTANT: Please respond with ONLY a single number" in prompt
 
 
 def test_template_alias_without_suffix_is_supported():
-    agent = LLMAgent(model_name="gpt-5-mini", action_template="reasoning")
-    prompt = agent._format_prompt("state", [{"text": "A"}, {"text": "B"}])
+    harness = MinimalHarness(model_name="gpt-5-mini", action_template="reasoning")
+    prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
     assert '"result"' in prompt
 
 
 def test_gpt5_force_numeric_retry_path():
-    agent = LLMAgent(model_name="gpt-5-mini")
+    harness = MinimalHarness(model_name="gpt-5-mini")
     mocked_llm = Mock()
     mocked_llm.get_completion.side_effect = ["```json\n{", "```json\n{", "2"]
     mocked_llm.get_last_usage.side_effect = [
@@ -104,58 +105,57 @@ def test_gpt5_force_numeric_retry_path():
         {"prompt_tokens": 6, "completion_tokens": 1, "total_tokens": 7, "estimated_cost_usd": 0.0005},
         {"prompt_tokens": 4, "completion_tokens": 1, "total_tokens": 5, "estimated_cost_usd": 0.0003},
     ]
-    agent.llm = mocked_llm
+    harness.llm = mocked_llm
 
-    action = agent.get_action("state", [{"text": "A"}, {"text": "B"}])
+    action = harness.get_action("state", [{"text": "A"}, {"text": "B"}])
 
     assert action == 2
     assert mocked_llm.get_completion.call_count == 3
-    last = agent.get_last_response()
+    last = harness.get_last_response()
     assert last.total_tokens == 24
     assert last.estimated_cost_usd == pytest.approx(0.0018)
     assert last.parse_mode == "force_retry_number_only"
 
 
 def test_contextual_state_includes_previous_observations():
-    agent = LLMAgent(model_name="gpt-5-mini")
-    agent._remember_observation("Previous hint")
-    agent._remember_observation("Current state")
-    contextual = agent._build_contextual_state("Current state")
+    harness = MinimalHarness(model_name="gpt-5-mini")
+    harness.memory_module.update({"observation": "Previous hint"})
+    harness.memory_module.update({"observation": "Current state"})
+    contextual = harness._build_contextual_state("Current state")
     assert "Recent context from previous steps" in contextual
     assert "Previous hint" in contextual
 
 
 def test_contextual_state_includes_recent_decisions():
-    agent = LLMAgent(model_name="gpt-5-mini")
-    agent._decision_history = [
-        {"action": 2, "choice": "Inspect the terminal", "parse_mode": "json_direct"},
-        {"action": 1, "choice": "Ask for access", "parse_mode": "retry_json_repaired"},
-    ]
-    contextual = agent._build_contextual_state("Current state")
+    harness = MinimalHarness(model_name="gpt-5-mini")
+    harness.memory_module.update({"observation": "Previous state"})
+    harness.memory_module.update({"action": 2, "choice": "Inspect the terminal", "parse_mode": "json_direct"})
+    harness.memory_module.update({"action": 1, "choice": "Ask for access", "parse_mode": "retry_json_repaired"})
+    contextual = harness._build_contextual_state("Current state")
     assert "Recent selected actions" in contextual
     assert "Inspect the terminal" in contextual
     assert "parse=json_direct" in contextual
 
 
 def test_safety_filter_prefers_lower_risk_choice():
-    agent = LLMAgent(model_name="gpt-5-mini")
+    harness = MinimalHarness(model_name="gpt-5-mini")
     choices = [
         {"text": "Пойти в космопорт и улететь, чтобы завтра не позориться"},
         {"text": "Постараться пройти мимо"},
     ]
-    assert agent._apply_safety_filter(1, choices) == 2
+    assert harness._apply_safety_filter(choices, 1) == 2
 
 
 def test_get_last_response_uses_skip_single_result():
-    agent = LLMAgent(model_name="gpt-5-mini", skip_single=True)
-    agent.history.append(LLMResponse(action=2, is_default=False))
-    agent._last_response = LLMResponse(action=2, is_default=False)
+    harness = MinimalHarness(model_name="gpt-5-mini", skip_single=True)
+    harness.history.append(LLMResponse(action=2, is_default=False))
+    harness._last_response = LLMResponse(action=2, is_default=False)
 
-    action = agent.get_action("state", [{"id": "1", "text": "Only option"}])
+    action = harness.get_action("state", [{"id": "1", "text": "Only option"}])
 
     assert action == 1
-    assert agent.get_last_response().action == 1
-    assert agent.get_last_response().reasoning == "auto_single_choice"
+    assert harness.get_last_response().action == 1
+    assert harness.get_last_response().reasoning == "auto_single_choice"
 
 
 def test_parse_llm_response_number_only_tracks_parse_mode():
@@ -194,7 +194,7 @@ def test_parse_llm_response_uses_analysis_as_reasoning_when_truncated():
 
 
 def test_llm_error_default_response_keeps_reasoning_marker():
-    agent = LLMAgent(model_name="gemini-2.5-flash")
+    harness = MinimalHarness(model_name="gemini-2.5-flash")
     mocked_llm = Mock()
     mocked_llm.get_completion.side_effect = RuntimeError("provider returned empty message")
     mocked_llm.get_last_usage.return_value = {
@@ -203,20 +203,20 @@ def test_llm_error_default_response_keeps_reasoning_marker():
         "total_tokens": 0,
         "estimated_cost_usd": None,
     }
-    agent.llm = mocked_llm
+    harness.llm = mocked_llm
 
-    action = agent.get_action("state", [{"text": "A"}, {"text": "B"}])
+    action = harness.get_action("state", [{"text": "A"}, {"text": "B"}])
 
     assert action == 1
-    last = agent.get_last_response()
+    last = harness.get_last_response()
     assert last.is_default is True
     assert last.reasoning is not None
     assert "llm_call_error" in last.reasoning
 
 
 def test_retry_prompt_requests_json_payload():
-    agent = LLMAgent(model_name="gemini-2.5-flash")
-    prompt = agent._format_retry_prompt("state", [{"text": "A"}, {"text": "B"}])
+    harness = MinimalHarness(model_name="gemini-2.5-flash")
+    prompt = harness._format_retry_prompt("state", [{"text": "A"}, {"text": "B"}])
     assert "Return valid JSON only" in prompt
     assert '"analysis"' in prompt
     assert '"reasoning"' in prompt
@@ -224,7 +224,7 @@ def test_retry_prompt_requests_json_payload():
 
 
 def test_retry_preserves_reasoning_from_first_attempt():
-    agent = LLMAgent(model_name="gemini-2.5-flash")
+    harness = MinimalHarness(model_name="gemini-2.5-flash")
     mocked_llm = Mock()
     mocked_llm.get_completion.side_effect = [
         "Analysis: low oxygen\nReasoning: safer move first\n```json\n{",
@@ -244,12 +244,12 @@ def test_retry_preserves_reasoning_from_first_attempt():
             "estimated_cost_usd": 0.0002,
         },
     ]
-    agent.llm = mocked_llm
+    harness.llm = mocked_llm
 
-    action = agent.get_action("state", [{"text": "A"}, {"text": "B"}])
+    action = harness.get_action("state", [{"text": "A"}, {"text": "B"}])
 
     assert action == 2
-    last = agent.get_last_response()
+    last = harness.get_last_response()
     assert last.analysis is not None
     assert "low oxygen" in last.analysis
     assert last.reasoning is not None
diff --git a/llm_quest_benchmark/tests/agents/test_mode_agents.py b/llm_quest_benchmark/tests/agents/test_mode_agents.py
index c650127..a41a11a 100644
--- a/llm_quest_benchmark/tests/agents/test_mode_agents.py
+++ b/llm_quest_benchmark/tests/agents/test_mode_agents.py
@@ -1,257 +1,5 @@
-"""Tests for planner and tool-augmented agent modes."""
+"""Legacy agent-mode tests retired.
 
-from unittest.mock import Mock
-
-from llm_quest_benchmark.agents.agent_factory import create_agent
-from llm_quest_benchmark.agents.llm_agent import LLMAgent
-from llm_quest_benchmark.agents.planner_agent import PlannerAgent
-from llm_quest_benchmark.agents.tool_agent import ToolAgent
-
-
-def test_create_agent_uses_planner_template_alias():
-    agent = create_agent(model="gpt-5-mini", action_template="planner")
-    assert isinstance(agent, PlannerAgent)
-
-
-def test_create_agent_uses_tool_template_alias():
-    agent = create_agent(model="gpt-5-mini", action_template="tool_augmented")
-    assert isinstance(agent, ToolAgent)
-
-
-def test_create_agent_propagates_memory_mode_to_planner_and_tool_agents():
-    planner = create_agent(
-        model="gpt-5-mini",
-        action_template="planner",
-        memory_mode="compaction",
-        compaction_interval=50,
-    )
-    tool = create_agent(
-        model="gpt-5-mini",
-        action_template="tool_augmented",
-        memory_mode="compaction",
-        compaction_interval=50,
-    )
-
-    assert isinstance(planner, PlannerAgent)
-    assert isinstance(tool, ToolAgent)
-    assert planner._memory_mode == "compaction"
-    assert planner._compaction_interval == 50
-    assert tool._memory_mode == "compaction"
-    assert tool._compaction_interval == 50
-
-
-def test_create_agent_uses_light_hints_template_with_standard_llm_agent():
-    agent = create_agent(model="gpt-5-mini", action_template="light_hints")
-    assert isinstance(agent, LLMAgent)
-    assert not isinstance(agent, (PlannerAgent, ToolAgent))
-
-
-def test_light_hints_template_injects_general_mechanics():
-    agent = LLMAgent(model_name="gpt-5-mini", action_template="light_hints")
-
-    prompt = agent._format_prompt("A sealed vault blocks the route.", [{"text": "Study the vault"}])
-
-    assert "General hints for this type of quest" in prompt
-    assert "Preparation, study, negotiation" in prompt
-
-
-def test_planner_agent_first_turn_generates_plan_then_acts():
-    agent = PlannerAgent(model_name="gpt-5-mini")
-    mocked_llm = Mock()
-    mocked_llm.get_completion.side_effect = [
-        "Gather clues first. Avoid direct fights. Preserve resources.",
-        '{"analysis":"plan says scout","reasoning":"safer branch","result":2}',
-    ]
-    mocked_llm.get_last_usage.side_effect = [
-        {"prompt_tokens": 30, "completion_tokens": 12, "total_tokens": 42, "estimated_cost_usd": 0.001},
-        {"prompt_tokens": 20, "completion_tokens": 8, "total_tokens": 28, "estimated_cost_usd": 0.0007},
-    ]
-    agent.llm = mocked_llm
-
-    action = agent.get_action("You enter a pirate station.", [{"text": "Scout ahead"}, {"text": "Attack now"}])
-
-    assert action == 2
-    assert agent.current_plan is not None
-    assert "Avoid direct fights" in agent.current_plan
-    assert mocked_llm.get_completion.call_count == 2
-    assert agent.get_last_response().total_tokens == 70
-
-
-def test_planner_agent_reuses_plan_when_state_is_stable():
-    agent = PlannerAgent(model_name="gpt-5-mini")
-    agent.current_plan = "Keep moving carefully and avoid a direct fight."
-    agent._observation_history = ["Quiet corridor."]
-    mocked_llm = Mock()
-    mocked_llm.get_completion.return_value = '{"analysis":"plan still fits","reasoning":"careful progress","result":1}'
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 18,
-        "completion_tokens": 7,
-        "total_tokens": 25,
-        "estimated_cost_usd": 0.0005,
-    }
-    agent.llm = mocked_llm
-
-    action = agent.get_action("Quiet corridor.", [{"text": "Open the door"}, {"text": "Run"}])
-
-    assert action == 1
-    assert mocked_llm.get_completion.call_count == 1
-
-
-def test_planner_agent_uses_contextual_memory_state():
-    agent = PlannerAgent(model_name="gpt-5-mini", memory_mode="compaction", compaction_interval=50)
-    agent._quest_briefing = "Original mission: win the election."
-    agent._transcript = [
-        {
-            "step": 1,
-            "observation": "You learned Maloqs value strength.",
-            "choice_text": "Ask about Maloqs",
-            "memo": "Maloqs value strength",
-            "action": 1,
-        }
-    ]
-    agent._steps_since_compaction = 1
-    mocked_llm = Mock()
-    mocked_llm.get_completion.side_effect = [
-        "Use the remembered cultural clue.",
-        '{"analysis":"use clue","reasoning":"fits plan","result":1}',
-    ]
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 1,
-        "completion_tokens": 1,
-        "total_tokens": 2,
-        "estimated_cost_usd": 0.0,
-    }
-    agent.llm = mocked_llm
-
-    agent.get_action("Current banquet scene.", [{"text": "Greet like a warrior"}])
-
-    first_prompt = mocked_llm.get_completion.call_args_list[0].args[0]
-    assert "Quest briefing" in first_prompt
-    assert "RECENT STEPS" in first_prompt
-    assert "Maloqs value strength" in first_prompt
-
-
-def test_tool_agent_can_use_quest_history():
-    agent = ToolAgent(model_name="gpt-5-mini")
-    agent._step_log = [
-        {
-            "step": 1,
-            "observation": "Merchant mentioned low fuel.",
-            "choices": ["Buy fuel", "Keep flying"],
-            "selected_choice": "Buy fuel",
-        }
-    ]
-    mocked_llm = Mock()
-    mocked_llm.get_completion.side_effect = [
-        '{"analysis":"need history","tool_calls":[{"tool":"quest_history","input":"fuel merchant"}],"result":null}',
-        '{"analysis":"fuel clue matters","reasoning":"play safe","result":1}',
-    ]
-    mocked_llm.get_last_usage.side_effect = [
-        {"prompt_tokens": 24, "completion_tokens": 10, "total_tokens": 34, "estimated_cost_usd": 0.0008},
-        {"prompt_tokens": 22, "completion_tokens": 9, "total_tokens": 31, "estimated_cost_usd": 0.0007},
-    ]
-    agent.llm = mocked_llm
-
-    action = agent.get_action("Your fuel gauge is blinking.", [{"text": "Refuel"}, {"text": "Attack pirates"}])
-
-    assert action == 1
-    assert mocked_llm.get_completion.call_count == 2
-    assert agent.get_last_response().total_tokens == 65
-    assert len(agent._step_log) == 2
-
-
-def test_tool_agent_calculator_supports_arithmetic_and_comparisons():
-    assert ToolAgent.calculator("55 + 12 - 5") == "55 + 12 - 5 = 62"
-    assert ToolAgent.calculator("60 >= 55 and 62 >= 80") == "60 >= 55 and 62 >= 80 = False"
-    assert ToolAgent.calculator("__import__('os')").startswith("error:")
-
-
-def test_tool_agent_scratchpad_read_write_and_reset():
-    agent = ToolAgent(model_name="gpt-5-mini")
-
-    assert agent.scratchpad("read") == "(empty)"
-    assert (
-        agent.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") == "updated: Board: W B _ ; failed door 2"
-    )
-    assert agent.scratchpad("read") == "Board: W B _ ; failed door 2"
-
-    agent.reset()
-
-    assert agent.scratchpad("read") == "(empty)"
-
-
-def test_tool_agent_can_use_calculator_and_records_tool_metadata():
-    agent = ToolAgent(model_name="gpt-5-mini")
-    mocked_llm = Mock()
-    mocked_llm.get_completion.side_effect = [
-        '{"memo":"Need mix math","analysis":"calculate target","tool_calls":[{"tool":"calculator","input":"50 + 3 >= 55"}],"result":null}',
-        '{"memo":"Need more strength","analysis":"math failed","reasoning":"choose strength","result":2}',
-    ]
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 10,
-        "completion_tokens": 5,
-        "total_tokens": 15,
-        "estimated_cost_usd": 0.0,
-    }
-    agent.llm = mocked_llm
-
-    action = agent.get_action("Strength is 50. Need at least 55.", [{"text": "Add water"}, {"text": "Add repusator"}])
-
-    response = agent.get_last_response()
-    assert action == 2
-    assert response.tool_calls == [{"tool": "calculator", "input": "50 + 3 >= 55", "operation": "", "content": ""}]
-    assert response.tool_results == ["calculator(50 + 3 >= 55) => 50 + 3 >= 55 = False"]
-    assert response.memo == "Need more strength"
-
-
-def test_tool_agent_uses_contextual_memory_state():
-    agent = ToolAgent(model_name="gpt-5-mini", memory_mode="compaction", compaction_interval=50)
-    agent._quest_briefing = "Original mission: pass pilot certification."
-    agent._transcript = [
-        {
-            "step": 1,
-            "observation": "Hogger is greedy.",
-            "choice_text": "Bribe Hogger",
-            "memo": "Hogger is greedy",
-            "action": 1,
-        }
-    ]
-    agent._steps_since_compaction = 1
-    mocked_llm = Mock()
-    mocked_llm.get_completion.return_value = (
-        '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}'
-    )
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 10,
-        "completion_tokens": 5,
-        "total_tokens": 15,
-        "estimated_cost_usd": 0.0,
-    }
-    agent.llm = mocked_llm
-
-    agent.get_action("Current exam room.", [{"text": "Offer a bribe"}])
-
-    prompt = mocked_llm.get_completion.call_args.args[0]
-    assert "Quest briefing" in prompt
-    assert "RECENT STEPS" in prompt
-    assert "Hogger is greedy" in prompt
-
-
-def test_tool_agent_can_finish_without_tools_in_one_call():
-    agent = ToolAgent(model_name="gpt-5-mini")
-    mocked_llm = Mock()
-    mocked_llm.get_completion.return_value = (
-        '{"analysis":"no tools needed","tool_calls":[],"reasoning":"direct clue","result":2}'
-    )
-    mocked_llm.get_last_usage.return_value = {
-        "prompt_tokens": 15,
-        "completion_tokens": 6,
-        "total_tokens": 21,
-        "estimated_cost_usd": 0.0004,
-    }
-    agent.llm = mocked_llm
-
-    action = agent.get_action("A guard points at the safe exit.", [{"text": "Fight"}, {"text": "Leave"}])
-
-    assert action == 2
-    assert mocked_llm.get_completion.call_count == 1
+Planner/tool/memo behavior now lives in
+``llm_quest_benchmark.tests.harnesses.test_harnesses``.
+"""
diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
new file mode 100644
index 0000000..030648b
--- /dev/null
+++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
@@ -0,0 +1,335 @@
+"""Comprehensive tests for concrete harness behavior."""
+
+from unittest.mock import Mock
+
+from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
+from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
+from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness
+from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+from llm_quest_benchmark.harnesses.planner import PlannerHarness
+from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness
+from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness
+
+
+HARNESS_SPECS = {
+    "minimal": (MinimalHarness, "stub.jinja", DefaultMemory),
+    "reasoning_recent": (ReasoningRecentHarness, "reasoning.jinja", DefaultMemory),
+    "reasoning_full": (ReasoningFullTranscriptHarness, "reasoning.jinja", FullTranscriptMemory),
+    "memo_compact": (MemoCompactHarness, "stateful_compact.jinja", CompactionMemory),
+    "hinted_compact": (HintedCompactHarness, "stateful_compact_hints.jinja", CompactionMemory),
+    "tool_compact": (ToolCompactHarness, "tool_augmented.jinja", CompactionMemory),
+    "tool_hinted": (ToolHintedHarness, "tool_augmented_hints.jinja", CompactionMemory),
+    "planner": (PlannerHarness, "planner.jinja", CompactionMemory),
+}
+
+
+def assert_harness_configuration(harness_name: str) -> None:
+    expected_class, expected_template, expected_memory_class = HARNESS_SPECS[harness_name]
+
+    harness = create_harness(harness_name, model="gpt-5-mini")
+
+    assert isinstance(harness, expected_class)
+    assert harness.harness_name == harness_name
+    assert harness.action_template == expected_template
+    assert isinstance(harness.memory_module, expected_memory_class)
+
+
+def test_minimal_harness_configuration():
+    assert_harness_configuration("minimal")
+
+
+def test_reasoning_recent_harness_configuration():
+    assert_harness_configuration("reasoning_recent")
+
+
+def test_reasoning_full_harness_configuration():
+    assert_harness_configuration("reasoning_full")
+
+
+def test_memo_compact_harness_configuration():
+    assert_harness_configuration("memo_compact")
+
+
+def test_hinted_compact_harness_configuration():
+    assert_harness_configuration("hinted_compact")
+
+
+def test_tool_compact_harness_configuration():
+    assert_harness_configuration("tool_compact")
+
+
+def test_tool_hinted_harness_configuration():
+    assert_harness_configuration("tool_hinted")
+
+
+def test_planner_harness_configuration():
+    assert_harness_configuration("planner")
+
+
+def test_all_registry_harnesses_have_configuration_specs():
+    assert set(HARNESS_REGISTRY) == set(HARNESS_SPECS)
+
+
+def test_all_registry_harnesses_instantiate_with_expected_names():
+    for harness_name in HARNESS_REGISTRY:
+        harness = create_harness(harness_name, model="gpt-5-mini")
+
+        assert harness.harness_name == harness_name
+
+
+def test_memo_compact_mocked_llm_returns_action_and_reuses_memo_context():
+    harness = MemoCompactHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        '{"memo":"Merchant needs fuel payment","analysis":"pay first","reasoning":"quest clue","result":2}',
+        '{"memo":"Paid fuel merchant","analysis":"memo says paid","reasoning":"continue","result":1}',
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    first_action = harness.get_action("A merchant offers fuel for a fee.", [{"text": "Leave"}, {"text": "Pay"}])
+    second_action = harness.get_action("The fuel gauge still blinks.", [{"text": "Check receipt"}, {"text": "Leave"}])
+
+    assert first_action == 2
+    assert second_action == 1
+    assert harness.get_last_response().memo == "Paid fuel merchant"
+    second_prompt = mocked_llm.get_completion.call_args_list[1].args[0]
+    assert "Merchant needs fuel payment" in second_prompt
+
+
+def test_planner_harness_first_turn_generates_plan_then_acts():
+    harness = PlannerHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        "Gather clues first. Avoid direct fights. Preserve resources.",
+        '{"analysis":"plan says scout","reasoning":"safer branch","result":2}',
+    ]
+    mocked_llm.get_last_usage.side_effect = [
+        {"prompt_tokens": 30, "completion_tokens": 12, "total_tokens": 42, "estimated_cost_usd": 0.001},
+        {"prompt_tokens": 20, "completion_tokens": 8, "total_tokens": 28, "estimated_cost_usd": 0.0007},
+    ]
+    harness.llm = mocked_llm
+
+    action = harness.get_action("You enter a pirate station.", [{"text": "Scout ahead"}, {"text": "Attack now"}])
+
+    assert action == 2
+    assert harness.current_plan is not None
+    assert "Avoid direct fights" in harness.current_plan
+    assert mocked_llm.get_completion.call_count == 2
+    assert harness.get_last_response().total_tokens == 70
+
+
+def test_planner_harness_reuses_plan_when_state_is_stable():
+    harness = PlannerHarness(model_name="gpt-5-mini")
+    harness.current_plan = "Keep moving carefully and avoid a direct fight."
+    harness._observation_history = ["Quiet corridor."]
+    mocked_llm = Mock()
+    mocked_llm.get_completion.return_value = '{"analysis":"plan still fits","reasoning":"careful progress","result":1}'
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 18,
+        "completion_tokens": 7,
+        "total_tokens": 25,
+        "estimated_cost_usd": 0.0005,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("Quiet corridor.", [{"text": "Open the door"}, {"text": "Run"}])
+
+    assert action == 1
+    assert mocked_llm.get_completion.call_count == 1
+
+
+def test_planner_harness_uses_contextual_memory_state():
+    harness = PlannerHarness(model_name="gpt-5-mini", compaction_interval=50)
+    harness._quest_briefing = "Original mission: win the election."
+    harness._transcript = [
+        {
+            "step": 1,
+            "observation": "You learned Maloqs value strength.",
+            "choice_text": "Ask about Maloqs",
+            "memo": "Maloqs value strength",
+            "action": 1,
+        }
+    ]
+    harness._steps_since_compaction = 1
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        "Use the remembered cultural clue.",
+        '{"analysis":"use clue","reasoning":"fits plan","result":1}',
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 1,
+        "completion_tokens": 1,
+        "total_tokens": 2,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    harness.get_action("Current banquet scene.", [{"text": "Greet like a warrior"}])
+
+    first_prompt = mocked_llm.get_completion.call_args_list[0].args[0]
+    assert "Quest briefing" in first_prompt
+    assert "RECENT STEPS" in first_prompt
+    assert "Maloqs value strength" in first_prompt
+
+
+def test_tool_compact_harness_can_use_quest_history():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+    harness._step_log = [
+        {
+            "step": 1,
+            "observation": "Merchant mentioned low fuel.",
+            "choices": ["Buy fuel", "Keep flying"],
+            "selected_choice": "Buy fuel",
+        }
+    ]
+    harness._history_tool.step_log = harness._step_log
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        '{"analysis":"need history","tool_calls":[{"tool":"quest_history","input":"fuel merchant"}],"result":null}',
+        '{"analysis":"fuel clue matters","reasoning":"play safe","result":1}',
+    ]
+    mocked_llm.get_last_usage.side_effect = [
+        {"prompt_tokens": 24, "completion_tokens": 10, "total_tokens": 34, "estimated_cost_usd": 0.0008},
+        {"prompt_tokens": 22, "completion_tokens": 9, "total_tokens": 31, "estimated_cost_usd": 0.0007},
+    ]
+    harness.llm = mocked_llm
+
+    action = harness.get_action("Your fuel gauge is blinking.", [{"text": "Refuel"}, {"text": "Attack pirates"}])
+
+    assert action == 1
+    assert mocked_llm.get_completion.call_count == 2
+    assert harness.get_last_response().total_tokens == 65
+    assert len(harness._step_log) == 2
+    assert harness.get_last_response().tool_results
+    assert "Merchant mentioned low fuel" in harness.get_last_response().tool_results[0]
+
+
+def test_tool_compact_calculator_supports_arithmetic_and_comparisons():
+    assert ToolCompactHarness.calculator("55 + 12 - 5") == "55 + 12 - 5 = 62"
+    assert ToolCompactHarness.calculator("60 >= 55 and 62 >= 80") == "60 >= 55 and 62 >= 80 = False"
+    assert ToolCompactHarness.calculator("__import__('os')").startswith("error:")
+
+
+def test_tool_compact_scratchpad_read_write_and_reset():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+
+    assert harness.scratchpad("read") == "(empty)"
+    assert (
+        harness.scratchpad("write_replace", " Board: W B _ ; failed door 2 ")
+        == "updated: Board: W B _ ; failed door 2"
+    )
+    assert harness.scratchpad("read") == "Board: W B _ ; failed door 2"
+
+    harness.reset()
+
+    assert harness.scratchpad("read") == "(empty)"
+
+
+def test_tool_compact_harness_can_use_calculator_and_records_tool_metadata():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        '{"memo":"Need mix math","analysis":"calculate target","tool_calls":[{"tool":"calculator","input":"50 + 3 >= 55"}],"result":null}',
+        '{"memo":"Need more strength","analysis":"math failed","reasoning":"choose strength","result":2}',
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("Strength is 50. Need at least 55.", [{"text": "Add water"}, {"text": "Add repusator"}])
+
+    response = harness.get_last_response()
+    assert action == 2
+    assert response.tool_calls == [{"tool": "calculator", "input": "50 + 3 >= 55", "operation": "", "content": ""}]
+    assert response.tool_results == ["calculator(50 + 3 >= 55) => 50 + 3 >= 55 = False"]
+    assert response.memo == "Need more strength"
+
+
+def test_tool_compact_harness_can_use_scratchpad_tool_call():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        (
+            '{"analysis":"save board","tool_calls":[{"tool":"scratchpad",'
+            '"operation":"write_replace","content":"Board: red blue blank"}],"result":null}'
+        ),
+        '{"analysis":"note saved","reasoning":"use saved board","result":1}',
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("A colored board blocks the hall.", [{"text": "Use red-blue order"}])
+
+    assert action == 1
+    assert harness.scratchpad("read") == "Board: red blue blank"
+    assert harness.get_last_response().tool_results == [
+        "scratchpad(write_replace, Board: red blue blank) => updated: Board: red blue blank"
+    ]
+
+
+def test_tool_compact_harness_uses_contextual_memory_state():
+    harness = ToolCompactHarness(model_name="gpt-5-mini", compaction_interval=50)
+    harness._quest_briefing = "Original mission: pass pilot certification."
+    harness._transcript = [
+        {
+            "step": 1,
+            "observation": "Hogger is greedy.",
+            "choice_text": "Bribe Hogger",
+            "memo": "Hogger is greedy",
+            "action": 1,
+        }
+    ]
+    harness._steps_since_compaction = 1
+    mocked_llm = Mock()
+    mocked_llm.get_completion.return_value = (
+        '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}'
+    )
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    harness.get_action("Current exam room.", [{"text": "Offer a bribe"}])
+
+    prompt = mocked_llm.get_completion.call_args.args[0]
+    assert "Quest briefing" in prompt
+    assert "RECENT STEPS" in prompt
+    assert "Hogger is greedy" in prompt
+
+
+def test_tool_compact_harness_can_finish_without_tools_in_one_call():
+    harness = ToolCompactHarness(model_name="gpt-5-mini")
+    mocked_llm = Mock()
+    mocked_llm.get_completion.return_value = (
+        '{"analysis":"no tools needed","tool_calls":[],"reasoning":"direct clue","result":2}'
+    )
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 15,
+        "completion_tokens": 6,
+        "total_tokens": 21,
+        "estimated_cost_usd": 0.0004,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("A guard points at the safe exit.", [{"text": "Fight"}, {"text": "Leave"}])
+
+    assert action == 2
+    assert mocked_llm.get_completion.call_count == 1
diff --git a/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py b/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py
index 5563ca2..2ceeaca 100644
--- a/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py
+++ b/llm_quest_benchmark/tests/integration/test_mode_agents_e2e.py
@@ -1,12 +1,12 @@
-"""Integration tests for planner/tool modes on real quest execution loops."""
+"""Integration tests for planner/tool harness modes on real quest execution loops."""
 
 from pathlib import Path
 
 import pytest
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
 from llm_quest_benchmark.core.runner import run_quest_with_timeout
 from llm_quest_benchmark.environments.state import QuestOutcome
+from llm_quest_benchmark.harnesses.factory import create_harness
 
 QUEST_PATHS = [
     "quests/Boat.qm",
@@ -38,18 +38,18 @@ def get_last_usage(self):
 
 @pytest.mark.timeout(15)
 @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded")
-def test_planner_agent_runs_three_quests_across_openai_and_anthropic_models(monkeypatch):
+def test_planner_harness_runs_three_quests_across_openai_and_anthropic_models(monkeypatch):
     requested_models = []
 
     def fake_get_llm_client(model_name, **kwargs):
         requested_models.append(model_name)
         return FakeLLM("planner")
 
-    monkeypatch.setattr("llm_quest_benchmark.agents.llm_agent.get_llm_client", fake_get_llm_client)
+    monkeypatch.setattr("llm_quest_benchmark.harnesses.base.get_llm_client", fake_get_llm_client)
 
     for model_name in ["gpt-5-mini", "claude-sonnet-4-5"]:
         for quest_path in QUEST_PATHS:
-            agent = create_agent(model=model_name, action_template="planner", skip_single=True)
+            agent = create_harness("planner", model=model_name, skip_single=True)
             outcome = run_quest_with_timeout(quest_path, agent, timeout=10)
             assert outcome in {QuestOutcome.SUCCESS, QuestOutcome.FAILURE, QuestOutcome.TIMEOUT}
             assert outcome != QuestOutcome.ERROR
@@ -60,14 +60,14 @@ def fake_get_llm_client(model_name, **kwargs):
 
 @pytest.mark.timeout(15)
 @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded")
-def test_tool_agent_runs_three_quests(monkeypatch):
+def test_tool_harness_runs_three_quests(monkeypatch):
     monkeypatch.setattr(
-        "llm_quest_benchmark.agents.llm_agent.get_llm_client",
+        "llm_quest_benchmark.harnesses.base.get_llm_client",
         lambda model_name, **kwargs: FakeLLM("tool"),
     )
 
     for quest_path in QUEST_PATHS:
-        agent = create_agent(model="gpt-5-mini", action_template="tool_augmented", skip_single=True)
+        agent = create_harness("tool_compact", model="gpt-5-mini", skip_single=True)
         outcome = run_quest_with_timeout(quest_path, agent, timeout=10)
         assert outcome in {QuestOutcome.SUCCESS, QuestOutcome.FAILURE, QuestOutcome.TIMEOUT}
         assert outcome != QuestOutcome.ERROR
@@ -75,9 +75,9 @@ def test_tool_agent_runs_three_quests(monkeypatch):
 
 @pytest.mark.timeout(15)
 @pytest.mark.skipif(not Path(QUEST_PATHS[1]).exists(), reason="Quest files not downloaded")
-def test_reused_mode_agents_reset_between_quest_runs():
+def test_reused_mode_harnesses_reset_between_quest_runs():
     quest_path = "quests/sr_2_1_2121_eng/Borzukhan_eng.qm"
-    planner_agent = create_agent(model="gpt-5-mini", action_template="planner", skip_single=True)
+    planner_agent = create_harness("planner", model="gpt-5-mini", skip_single=True)
     planner_agent.llm = FakeLLM("planner")
 
     first_outcome = run_quest_with_timeout(quest_path, planner_agent, timeout=10)
@@ -92,7 +92,7 @@ def test_reused_mode_agents_reset_between_quest_runs():
     assert "stale plan from previous run" not in planner_agent._plan_history
     assert "stale observation" not in planner_agent._observation_history
 
-    tool_agent = create_agent(model="gpt-5-mini", action_template="tool_augmented", skip_single=True)
+    tool_agent = create_harness("tool_compact", model="gpt-5-mini", skip_single=True)
     tool_agent.llm = FakeLLM("tool")
 
     first_outcome = run_quest_with_timeout(quest_path, tool_agent, timeout=10)
diff --git a/llm_quest_benchmark/tests/integration/test_quest_e2e.py b/llm_quest_benchmark/tests/integration/test_quest_e2e.py
index 8ebfb91..a0e376d 100644
--- a/llm_quest_benchmark/tests/integration/test_quest_e2e.py
+++ b/llm_quest_benchmark/tests/integration/test_quest_e2e.py
@@ -5,10 +5,10 @@
 
 import pytest
 
-from llm_quest_benchmark.agents.agent_factory import create_agent
-from llm_quest_benchmark.constants import DEFAULT_QUEST, DEFAULT_TEMPLATE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.constants import DEFAULT_QUEST, SYSTEM_ROLE_TEMPLATE
 from llm_quest_benchmark.core.runner import run_quest_with_timeout
 from llm_quest_benchmark.environments.state import QuestOutcome
+from llm_quest_benchmark.harnesses.factory import create_harness
 
 TIMEOUT = 20  # 20s should be enough for test quests to complete
 
@@ -19,11 +19,11 @@ def test_quest_run_with_llm(caplog):
     """Test that quest runs with LLM agent and reaches a final state"""
     caplog.set_level(logging.DEBUG)  # Show all logs in test output
 
-    # Create LLM agent
-    agent = create_agent(
+    # Create LLM harness
+    agent = create_harness(
+        harness="minimal",
         model="random_choice",  # Use random for testing
         system_template=SYSTEM_ROLE_TEMPLATE,
-        action_template=DEFAULT_TEMPLATE,
         temperature=0.0,
         skip_single=False,
         debug=True,
@@ -68,7 +68,7 @@ def test_random_agent_on_test_quest(caplog):
     caplog.set_level(logging.DEBUG)  # Show all logs in test output
 
     # Create random agent
-    agent = create_agent("random_choice", skip_single=True, debug=True)
+    agent = create_harness("random_choice", skip_single=True, debug=True)
     assert agent is not None, "Failed to create random agent"
 
     # Mock callback for testing

From 1ac851ebb3c9ba5bcf87d46af1c78374233c7ee5 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 14:57:20 +0400
Subject: [PATCH 08/24] docs: reframe as agent harness benchmark

---
 docs/ARCHITECTURE.md        | 114 +++++++++++++++++++++++-------------
 docs/EXPERIMENTS_LOG.md     |  14 +++++
 docs/HARNESS_ENGINEERING.md |  64 ++++++++++++++++++++
 docs/SPEC.md                |  37 ++++++------
 4 files changed, 171 insertions(+), 58 deletions(-)
 create mode 100644 docs/HARNESS_ENGINEERING.md

diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index c7f556d..998241a 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -1,38 +1,55 @@
 # Architecture
 
 ## Overview
-LLM Quest Benchmark evaluates how different agent architectures complete interactive fiction quests (Space Rangers `.qm` format).
+
+LLM Quest Benchmark evaluates how **agent harnesses** complete interactive
+fiction quests in the Space Rangers `.qm` format. The benchmark holds the quest
+environment and result logging constant while varying the harness around the
+model: prompt template, memory strategy, tools, and action loop.
+
 The runtime loop is:
+
 1. Parse or step quest state via the TypeScript engine bridge.
-2. Build an action prompt from current state and available choices.
-3. Get agent choice (human/random/LLM with varying agent modes).
-4. Apply choice, log step, and detect outcome.
+2. Build harness context from current state, available choices, and memory.
+3. Get a choice from a human, random policy, or LLM-backed harness.
+4. Apply the choice, log the step, and detect the terminal outcome.
 5. Persist run metrics and run summaries.
 
 ## Main Runtime Layers
 
 ### 1. Quest Engine Layer
-- `space-rangers-quest/`:
-  TypeScript quest parser/player submodule.
-- `llm_quest_benchmark/executors/ts_bridge/consoleplayer.ts`:
-  Node entrypoint for parse/step execution.
-- `llm_quest_benchmark/executors/ts_bridge/bridge.py`:
-  Python subprocess bridge with startup preflight and actionable errors.
+
+- `space-rangers-quest/`: TypeScript quest parser/player submodule.
+- `llm_quest_benchmark/executors/ts_bridge/consoleplayer.ts`: Node entrypoint
+  for parse/step execution.
+- `llm_quest_benchmark/executors/ts_bridge/bridge.py`: Python subprocess
+  bridge with startup preflight and actionable errors.
 
 ### 2. Environment Layer
-- `llm_quest_benchmark/environments/qm.py`:
-  Wraps bridge into Python environment semantics (`reset`, `step`, terminal detection).
 
-### 3. Agent Layer
-- `llm_quest_benchmark/agents/llm_agent.py`: Base LLM agent with template-driven prompts, retry logic, loop-breaking, and safety filters.
-- `llm_quest_benchmark/agents/planner_agent.py`: Planner loop with observation-diff heuristic for re-planning.
-- `llm_quest_benchmark/agents/tool_agent.py`: Tool-using scaffold with quest history tool.
-- `llm_quest_benchmark/agents/agent_factory.py`: Factory that maps Prompt Template choices to agent classes.
-- `llm_quest_benchmark/agents/human_player.py`, `random_agent.py`: Non-LLM agents.
+- `llm_quest_benchmark/environments/qm.py`: Wraps the bridge into Python
+  environment semantics (`reset`, `step`, terminal detection).
+
+### 3. Harness Layer
+
+- `llm_quest_benchmark/harnesses/base.py`: `BaseHarness`, the shared
+  LLM-backed `QuestPlayer` implementation for prompt rendering, response
+  parsing, retries, contextual state, and safety filtering.
+- `llm_quest_benchmark/harnesses/memory.py`: `DefaultMemory`,
+  `FullTranscriptMemory`, and `CompactionMemory`.
+- `llm_quest_benchmark/harnesses/tools.py`: Calculator, scratchpad, and quest
+  history helpers used by tool harnesses.
+- `llm_quest_benchmark/harnesses/factory.py`: `create_harness()` and the
+  canonical harness registry.
+- `llm_quest_benchmark/agents/human_player.py`,
+  `llm_quest_benchmark/agents/random_agent.py`: Non-LLM `QuestPlayer`
+  implementations preserved for interactive and random baselines.
 
-`LLMAgent` lazily initializes provider clients, so template rendering and agent construction do not require API keys.
+Harnesses initialize provider clients lazily, so template rendering
+and benchmark configuration parsing do not require API keys.
 
 ### 4. LLM Provider Layer
+
 - `llm_quest_benchmark/llm/client.py`:
   - provider/model normalization (`provider:model` + aliases)
   - adapters: OpenAI, Anthropic, Google Gemini, DeepSeek
@@ -40,38 +57,53 @@ The runtime loop is:
   - token/cost usage tracking per completion call
 
 ### 5. Execution and Analysis Layer
+
 - `llm_quest_benchmark/core/runner.py`: Core quest run loop.
-- `llm_quest_benchmark/core/analyzer.py`: Post-run analysis and benchmark summaries.
+- `llm_quest_benchmark/core/analyzer.py`: Post-run analysis and benchmark
+  summaries.
 - `llm_quest_benchmark/core/benchmark_report.py`: Markdown report generator.
-- `llm_quest_benchmark/core/logging.py`: Quest logger with per-run metrics (repetition_rate, bad_decision_rate).
-- `llm_quest_benchmark/executors/benchmark.py`: Benchmark orchestration with parallel workers.
-- `llm_quest_benchmark/executors/cli/commands.py`: CLI commands (`run`, `play`, `analyze`, `analyze-run`, `benchmark`, `benchmark-report`, `download-quests`, `cleanup`).
+- `llm_quest_benchmark/core/logging.py`: Quest logger with per-run metrics
+  (`repetition_rate`, `bad_decision_rate`).
+- `llm_quest_benchmark/executors/benchmark.py`: Benchmark orchestration with
+  parallel workers.
+- `llm_quest_benchmark/executors/cli/commands.py`: CLI commands (`run`, `play`,
+  `analyze`, `analyze-run`, `benchmark`, `benchmark-report`,
+  `download-quests`, `cleanup`).
 
 ### 6. Prompt Templates
-- `llm_quest_benchmark/prompt_templates/`: Jinja2 templates for each agent mode.
+
+- `llm_quest_benchmark/prompt_templates/`: Jinja2 templates referenced by
+  harnesses.
   - `stub.jinja`: Minimal prompt.
-  - `reasoning.jinja`, `strategic.jinja`, etc.: Short-context or full-history reasoning depending on memory mode.
-  - `stateful_compact.jinja`, `memo_*.jinja`: Compact memory / memo prompts.
-  - `light_hints.jinja`, `stateful_compact_hints.jinja`: Prompt hints.
-  - `planner.jinja`: Planner loop prompts.
-  - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with compact memory, optionally with hints.
+  - `reasoning.jinja`: Short-context or full-history reasoning depending on
+    harness memory.
+  - `stateful_compact.jinja`: Compact memory / 20-word memo prompt.
+  - `stateful_compact_hints.jinja`: Compact memo prompt with mechanics hints.
+  - `planner.jinja`: Planner loop prompt.
+  - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with
+    compact memory, optionally with hints.
 
 ## Persistence
+
 - `metrics.db`: Benchmark/run metrics for CLI workflows.
-- `results/<agent>/<quest>/run_<id>/run_summary.json`: Step trace + per-step decisions + aggregated token/cost usage.
+- `results/<harness>/<quest>/run_<id>/run_summary.json`: Step trace,
+  per-step decisions, and aggregated token/cost usage.
 
 ## Configuration
+
 - `.env` (copied from `.env.template`): Provider API keys.
-- `configs/benchmarks/`: Benchmark YAML configs defining model x template x quest matrix.
+- `configs/benchmarks/`: Benchmark YAML configs defining model × harness ×
+  quest matrices.
 
 ## Public Taxonomy (Benchmark Dimension)
-| Label | Template / memory source | Agent Class | Description |
-|------|----------|-------------|-------------|
-| Minimal prompt | stub | LLMAgent | Smallest action-selection prompt |
-| Short-context reasoning | reasoning/strategic + default memory | LLMAgent | Local prompted analysis |
-| Full-history reasoning | reasoning + full transcript memory | LLMAgent | Whole transcript retained in context |
-| Compact memory / memo | reasoning/stateful/memo templates + compaction | LLMAgent | Summarized state instead of unbounded transcript |
-| Prompt hints | light_hints/stateful_compact_hints | LLMAgent | Mechanics hints injected into prompt |
-| Tools + compact memory | tool_augmented | ToolAgent | Quest history/scratchpad tools with compact context |
-| Tools + hints + compact memory | tool_augmented_hints | ToolAgent | Tool scaffold plus prompt hints |
-| Planner loop | planner | PlannerAgent | Plan-maintain-act loop |
+
+| Public label | Harness name | Template | Memory | Tools | Loop |
+|---|---|---|---|---|---|
+| Minimal prompt | `minimal` | `stub.jinja` | `DefaultMemory` | none | react |
+| Short-context reasoning | `reasoning_recent` | `reasoning.jinja` | `DefaultMemory` | none | react |
+| Full-history reasoning | `reasoning_full` | `reasoning.jinja` | `FullTranscriptMemory` | none | react |
+| Compact memory / memo | `memo_compact` | `stateful_compact.jinja` | `CompactionMemory` | none | react |
+| Prompt hints | `hinted_compact` | `stateful_compact_hints.jinja` | `CompactionMemory` | none | react |
+| Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act |
+| Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act |
+| Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | none | plan-maintain-act |
diff --git a/docs/EXPERIMENTS_LOG.md b/docs/EXPERIMENTS_LOG.md
index dadef6e..0d9ce49 100644
--- a/docs/EXPERIMENTS_LOG.md
+++ b/docs/EXPERIMENTS_LOG.md
@@ -1,5 +1,19 @@
 # Experiments Log
 
+## Harness Name Mapping
+
+| Experiment arm | Old label | New harness name |
+|---|---|---|
+| Minimal prompt arms | `stub` | `minimal` |
+| Short-context reasoning arms | `reasoning` + `default` memory | `reasoning_recent` |
+| Full-history reasoning arms | `reasoning` + `full_transcript` memory | `reasoning_full` |
+| Stateful compact memo arms | `stateful_compact` + compaction | `memo_compact` |
+| Hinted compact memo arms | `stateful_compact_hints` + compaction | `hinted_compact` |
+| Tool-augmented compact arms | `tool_augmented` + compaction | `tool_compact` |
+| Tool-augmented hinted arms | `tool_augmented_hints` + compaction | `tool_hinted` |
+| Planner arms | `planner` | `planner` |
+| Memo variation arms | `memo_extended`, `memo_structured`, `memo_cot` | retired experiment variants, not canonical harnesses |
+
 > Historical / non-authoritative notes. This log preserves experiment history
 > and branch-era shorthand. For the current public taxonomy and public
 > comparison slice, use `site/about.html`, `site/leaderboard.json`,
diff --git a/docs/HARNESS_ENGINEERING.md b/docs/HARNESS_ENGINEERING.md
new file mode 100644
index 0000000..5666ebc
--- /dev/null
+++ b/docs/HARNESS_ENGINEERING.md
@@ -0,0 +1,64 @@
+# Harness Engineering
+
+LLM Quest Benchmark treats the **agent harness** as the primary experimental
+object. An agent harness is the wrapper around a model that controls what the
+model sees, what state is carried forward, what external tools are available,
+and how a raw model completion is converted into a quest action. In this
+project, harnesses are not incidental plumbing: they are the independent
+variable.
+
+This framing follows the harness engineering question raised by "How Much Heavy
+Lifting Can an Agent Harness Do?" (arXiv:2604.07236): how much performance comes
+from the surrounding scaffold rather than the base model alone? Space Rangers
+text quests are a useful testbed because they are long enough to stress memory,
+planning, and state tracking, but concrete enough to score with terminal
+success/failure outcomes.
+
+## The Eight Canonical Harnesses
+
+| Harness name | What varies |
+|---|---|
+| `minimal` | Uses the smallest action-selection prompt with recent context only. This is the low-scaffold baseline. |
+| `reasoning_recent` | Adds an explicit reasoning prompt while keeping recent-window memory. |
+| `reasoning_full` | Keeps the reasoning prompt but exposes the full transcript instead of a short recent window. |
+| `memo_compact` | Uses compacted memory plus a constrained 20-word memo to preserve salient state. |
+| `hinted_compact` | Adds mechanics hints to the compact memo harness, without tools. |
+| `tool_compact` | Adds calculator, scratchpad, and quest-history tools to compact memory. |
+| `tool_hinted` | Combines compact memory, tools, and mechanics hints. |
+| `planner` | Uses a plan-maintain-act loop with compact memory instead of a pure react loop. |
+
+The harness names are canonical snake_case identifiers used in YAML configs,
+the CLI, and documentation. Public labels can be friendlier, but experimental
+records should preserve these names so runs remain comparable.
+
+## Difference From TextQuests and TALE-Suite
+
+TextQuests (arXiv:2507.23701) and TALE-Suite are closest in spirit because they
+also evaluate language models on interactive text-game tasks. Their main
+comparison axis is model capability under a mostly fixed evaluation scaffold:
+the harness is treated as test infrastructure, and the model is varied.
+
+LLM Quest Benchmark flips that emphasis. We can hold a model fixed and vary the
+harness to ask which context, memory, tool, and planning choices change
+behavior. That makes the benchmark useful for harness engineering: it can
+separate "the model cannot do the task" from "this wrapper failed to show the
+model the right state, preserve the right facts, or expose the right operation."
+
+## Findings So Far
+
+The strongest pattern so far is that bigger scaffolds are not automatically
+better. A concise 20-word memo hit a sweet spot: it improved over the no-memo
+and full-transcript baselines, while longer or more structured memo variants
+regressed. The likely mechanism is selective pressure: the short memo forces
+the harness to preserve only state that matters for future decisions.
+
+Tools and hints show a synergy effect. Prompt hints alone hurt, and tools alone
+had only a modest effect, but tools plus hints improved outcomes: the hints
+pointed the model toward quantities and morally grey quest mechanics, while the
+calculator, scratchpad, and history search gave it ways to act on those
+signals.
+
+Verbosity hurts in this environment. Some newer or larger models timed out more
+often because they spent too much of the quest budget generating long step
+responses. For sequential decision tasks, a harness that elicits concise,
+actionable state updates can outperform one that invites broad reasoning.
diff --git a/docs/SPEC.md b/docs/SPEC.md
index 99289fb..cadbb99 100644
--- a/docs/SPEC.md
+++ b/docs/SPEC.md
@@ -7,8 +7,11 @@ For the public narrative and interpretation of results, use the project
 ## Purpose
 
 LLM Quest Benchmark evaluates how LLMs make sequential choices in Space
-Rangers text quests. The benchmark varies the context scaffold around a model
-while holding the quest environment and result logging consistent.
+Rangers text quests. The benchmark varies the agent harness around a model
+while holding the quest environment and result logging consistent. A harness is
+the wrapper that decides what context the model sees and how its response is
+converted into an action; it comprises the prompt template, memory strategy,
+tools, and loop shape.
 
 The core question is practical: which kinds of context help, hurt, or expose
 state-tracking failures during 10-50 turn interactive fiction tasks?
@@ -35,18 +38,18 @@ analysis, but the public slice is the authoritative comparison surface.
 
 ## Current Taxonomy
 
-Use these labels for current public descriptions of benchmark modes:
+Use these labels for current public descriptions of benchmark harnesses:
 
-| Label | Implementation source | Agent class |
-|---|---|---|
-| Minimal prompt | `stub.jinja` | `LLMAgent` |
-| Short-context reasoning | `reasoning.jinja`, `strategic.jinja` with default/recent context | `LLMAgent` |
-| Full-history reasoning | reasoning templates with `full_transcript` memory | `LLMAgent` |
-| Compact memory / memo | `stateful_compact.jinja`, memo templates, compaction memory | `LLMAgent` |
-| Prompt hints | `light_hints.jinja`, `stateful_compact_hints.jinja` | `LLMAgent` |
-| Tools + compact memory | `tool_augmented.jinja` | `ToolAgent` |
-| Tools + hints + compact memory | `tool_augmented_hints.jinja` | `ToolAgent` |
-| Planner loop | `planner.jinja` | `PlannerAgent` |
+| Label | Harness name | Template | Memory | Tools / loop |
+|---|---|---|---|---|
+| Minimal prompt | `minimal` | `stub.jinja` | `DefaultMemory` | no tools, react loop |
+| Short-context reasoning | `reasoning_recent` | `reasoning.jinja` | `DefaultMemory` | no tools, react loop |
+| Full-history reasoning | `reasoning_full` | `reasoning.jinja` | `FullTranscriptMemory` | no tools, react loop |
+| Compact memory / memo | `memo_compact` | `stateful_compact.jinja` | `CompactionMemory` | no tools, react loop |
+| Prompt hints | `hinted_compact` | `stateful_compact_hints.jinja` | `CompactionMemory` | no tools, react loop |
+| Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history |
+| Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history |
+| Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | plan-maintain-act loop |
 
 Older internal experiment labels are historical and should not be presented as
 the current public taxonomy.
@@ -56,8 +59,8 @@ the current public taxonomy.
 - Quest execution uses the TypeScript `space-rangers-quest` submodule through
   the Python bridge in `llm_quest_benchmark/executors/ts_bridge/`.
 - Environment state is exposed through `llm_quest_benchmark/environments/qm.py`.
-- Agents live under `llm_quest_benchmark/agents/` and are selected by template
-  aliases and agent factory wiring.
+- Agent harnesses live under `llm_quest_benchmark/harnesses/` and are selected
+  by canonical snake_case harness names.
 - Provider calls are normalized in `llm_quest_benchmark/llm/client.py` with
   OpenAI-compatible, Anthropic, Google, and DeepSeek adapters.
 - Benchmark execution is CLI + YAML driven through `uv run llm-quest ...`.
@@ -107,7 +110,7 @@ Provider API keys are required for real LLM runs. Tests and static validation
 should run without external credentials in a prepared checkout.
 
 Reproducible benchmark rows depend on recording the quest file, model/provider
-ID, prompt templates, memory mode, run ID, outcome, and run summaries with
-usage/metrics. Agent responses are parsed into a chosen action plus optional
+ID, harness name, run ID, outcome, and run summaries with usage/metrics.
+Harness responses are parsed into a chosen action plus optional
 analysis/reasoning so action validity, terminal outcome, steps, tokens/cost,
 and repetition diagnostics can be regenerated from stored artifacts.

From 7cc2a21bc963204b03f7a383313a111d336f3502 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 15:02:07 +0400
Subject: [PATCH 09/24] docs: add experiment audit

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 docs/EXPERIMENT_AUDIT.md | 193 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 docs/EXPERIMENT_AUDIT.md

diff --git a/docs/EXPERIMENT_AUDIT.md b/docs/EXPERIMENT_AUDIT.md
new file mode 100644
index 0000000..3298d0e
--- /dev/null
+++ b/docs/EXPERIMENT_AUDIT.md
@@ -0,0 +1,193 @@
+# Experiment Audit
+
+Generated: 2026-05-11
+
+Sources reviewed:
+
+- `docs/EXPERIMENTS_LOG.md`
+- `docs/ARCHITECTURE.md`
+- `configs/benchmarks/*.yaml`
+- `site/leaderboard.json`
+
+This audit uses the post-refactor harness taxonomy: `minimal`,
+`reasoning_recent`, `reasoning_full`, `memo_compact`, `hinted_compact`,
+`tool_compact`, `tool_hinted`, and `planner`.
+
+## Experiment Inventory
+
+| Experiment | Config / source | Harness mapping | Quest scope | Completed runs recorded in log | Audit disposition |
+|---|---|---|---|---:|---|
+| Exp 2: Memory Modes | `memory_full_transcript.yaml`, `memory_compaction.yaml` | `reasoning_full`, `memo_compact` | 14 historical quests including `Prison` | 126 | Unreliable for canonical comparison: loop-breaker bug era. |
+| Exp 3 Arm 1: No Loop Breaker | `exp3_no_loop_breaker.yaml` | `reasoning_full` | 18 quests, excluding `Boat`/`Prison` | 36 | Use only rerun after timeout fix; pre-fix attempt is noisy/incomplete. |
+| Exp 3 Arm 2: Stateful Compact | `exp3_stateful_compact.yaml` | `memo_compact` | 18 quests, excluding `Boat`/`Prison` | 36 | Canonical memo baseline, but only 2 runs/quest. |
+| Exp 4: Compaction No Memo | `exp4_compaction_no_memo.yaml` | retired ablation, not canonical | 18 quests | 36 | Do not aggregate into `memo_compact`. |
+| Exp 4: Memo Extended | `exp4_memo_extended.yaml` | retired `memo_extended` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 4: Memo Structured | `exp4_memo_structured.yaml` | retired `memo_structured` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 4: Memo CoT | `exp4_memo_cot.yaml` | retired `memo_cot` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 5: Baseline Variance | `exp5_stateful_compact_variance.yaml` | `memo_compact` | 18 quests | 90 | Canonical memo baseline variance study. |
+| Exp 6: Prompt Hints | `exp6_prompt_hints.yaml` | `hinted_compact` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 6: Tools | `exp6_tools.yaml` | `tool_compact` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 6: Tools + Hints | `exp6_tools_hints.yaml` | `tool_hinted` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 7: Multi-Model Comparison | `exp7_*.yaml` | `memo_compact` | 5 winnable quests | 75 | Canonical model sweep for memo harness. |
+| Exp 7b: Model Upgrades | `exp7b_model_upgrades.yaml` | `memo_compact` | 18 quests | 108 | Noisy model-upgrade sweep; high timeout rates for Qwen 3.6 and Haiku 4.5. |
+
+## 1. Harness Coverage Matrix
+
+The table below is computed from `site/leaderboard.json` and counts recorded
+leaderboard runs by harness and quest. `Boat` and `Prison` are retained in this
+matrix because they still appear in the published leaderboard data, but they
+are retired from the canonical experiment set.
+
+| Harness | Badday | Banket | Boat | Codebox | Depth | Driver | Edelweiss | Election | Foncers | Leonardo | Ministry | Pizza | Prison | Robots | Ski | Total |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| `minimal` | 22 | 22 | 23 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 331 |
+| `reasoning_recent` | 22 | 22 | 28 | 22 | 22 | 24 | 25 | 30 | 25 | 25 | 26 | 22 | 28 | 31 | 31 | 383 |
+| `reasoning_full` | 17 | 17 | 9 | 17 | 17 | 15 | 17 | 17 | 17 | 17 | 16 | 17 | 6 | 14 | 14 | 227 |
+| `memo_compact` | 37 | 39 | 18 | 39 | 39 | 39 | 39 | 37 | 39 | 37 | 39 | 37 | 15 | 39 | 34 | 527 |
+| `hinted_compact` | 4 | 4 | 1 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 54 |
+| `tool_compact` | 3 | 3 | **0** | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | **0** | 3 | 3 | 39 |
+| `tool_hinted` | 3 | 3 | **0** | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | **0** | 3 | 3 | 39 |
+| `planner` | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 15 |
+
+Leaderboard scope note: the current public JSON includes 15 quest columns and
+omits several quests from the 18-quest experiment log, such as `Pilot`,
+`Disk`, `Player`, `Shashki`, and `Sortirovka1`. Those quests appear in the
+benchmark configs and experiment log, so a future leaderboard refresh should
+either add them or explicitly document why the public slice excludes them.
+
+## 2. Gap Analysis
+
+### Zero-run harness × quest cells
+
+All zero-run cells in the published leaderboard matrix are retired quest cells:
+
+- `tool_compact` × `Boat`: 0 runs.
+- `tool_compact` × `Prison`: 0 runs.
+- `tool_hinted` × `Boat`: 0 runs.
+- `tool_hinted` × `Prison`: 0 runs.
+
+Because `Boat` and `Prison` are retired, these do not require new canonical
+runs. They do indicate that the public leaderboard mixes active and retired
+quest scopes.
+
+### Fewer than 3 runs
+
+- `hinted_compact` × `Boat`: 1 run; retired quest.
+- `hinted_compact` × `Prison`: 1 run; retired quest.
+- `planner`: 1 run on every published quest (`Badday`, `Banket`, `Boat`,
+  `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`,
+  `Leonardo`, `Ministry`, `Pizza`, `Prison`, `Robots`, `Ski`).
+
+Canonical action item: the planner harness has insufficient variance coverage.
+For active quests, it needs at least two additional runs per quest to reach the
+minimum 3-run threshold.
+
+### Only 1 model tested
+
+The following harnesses have leaderboard cells where the run count may be at
+least 3, but all runs come from a single model. These cells cannot
+separate harness effects from model-specific behavior:
+
+- `tool_compact`: one model on all non-retired published quests
+  (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`,
+  `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`).
+- `tool_hinted`: one model on all non-retired published quests
+  (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`,
+  `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`).
+- `planner`: one model on every published quest and only one run per quest.
+- `hinted_compact` on `Boat` and `Prison`: one model, but both quests are
+  retired.
+
+The stronger public comparison cells are `minimal`, `reasoning_recent`,
+`reasoning_full`, and `memo_compact`, which have multi-model coverage in the
+leaderboard data. However, `reasoning_full` and `memo_compact` still require
+provenance filtering because early memory-mode runs overlap with the
+loop-breaker bug era.
+
+## 3. Noise / Anomaly List
+
+### Loop-breaker bug era
+
+- Exp 2 memory-mode runs are unreliable. The experiment log documents a
+  number-normalization bug in `_normalize_for_signature` and aggressive
+  loop-breaker overrides that changed correct model decisions.
+- Exp 3 Arm 1 has a pre-fix/incomplete attempt affected by SDK timeout issues.
+  Only the rerun after the timeout fix should be considered.
+- Any leaderboard entry whose provenance traces to Exp 2 or the Exp 3 pre-fix
+  attempt should be marked non-canonical until regenerated or excluded.
+
+### High timeout runs
+
+- Exp 7b `Qwen 3.6 Flash`: 17/36 timeouts (47%).
+- Exp 7b `Claude Haiku 4.5`: 19/36 timeouts (53%).
+- Exp 7b `DeepSeek V4 Flash`: 5/36 timeouts (14%), below the >30% threshold
+  but still noisy because success was 0/36.
+
+The Qwen 3.6 and Haiku 4.5 rows should be interpreted primarily as timeout /
+verbosity failures, not clean harness-quality signals.
+
+### Retired quests
+
+- `Boat`: trivial / smoke-test-like quest; removed from canonical experiment
+  configs.
+- `Prison`: loops endlessly; removed from canonical experiment configs.
+
+Both still appear in `site/leaderboard.json`, so public summaries should label
+them as retired or remove them from canonical aggregates.
+
+### Retired harness variants
+
+The following Exp 4 arms are not part of the final taxonomy and should not be
+merged into canonical `memo_compact` results:
+
+- `memo_extended`
+- `memo_structured`
+- `memo_cot`
+- `compaction_no_memo` ablation
+
+Current YAML files have been migrated to the `harness:` key, so historical
+variant identity must be recovered from `docs/EXPERIMENTS_LOG.md` and config
+file names rather than inferred only from the post-refactor `harness` field.
+
+## 4. Budget Estimate
+
+Top-priority new runs to close actionable gaps while avoiding retired quests:
+
+| Priority | Harness | Quest(s) | New runs needed | Reason |
+|---:|---|---|---:|---|
+| 1 | `planner` | 13 active published quests (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`) | 26 | Bring 1-run planner cells up to the 3-run minimum on active leaderboard quests. |
+| 2 | `planner` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest so planner effects are not single-model artifacts. |
+| 3 | `tool_compact` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. |
+| 4 | `tool_hinted` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. |
+| 5 | Public leaderboard refresh | `Pilot`, `Disk`, `Player`, `Shashki`, `Sortirovka1` | Scope-dependent | These quests are present in canonical 18-quest configs/logs but absent from the current public leaderboard matrix. Backfill or explicitly exclude them. |
+
+Do not spend new budget on `Boat` or `Prison` unless the goal is only to
+reproduce historical/public rows; both are retired from canonical analysis.
+
+## 5. Leaderboard Integrity
+
+Findings from `site/leaderboard.json`:
+
+1. The leaderboard uses the eight canonical public modes and does not expose
+   retired harness variants as separate modes. This is good, but it creates a
+   provenance risk if Exp 4 retired variants were ever aggregated under
+   `memo_compact`.
+2. `Boat` and `Prison` remain in the published quest list despite being retired
+   from canonical experiment configs. They should be excluded from aggregate
+   claims or clearly labeled as retired.
+3. `planner` has only one run per quest and one model. It should not be used for
+   reliability claims yet.
+4. `tool_compact` and `tool_hinted` have three runs per active published quest,
+   but only one model. Their harness comparison is promising but not yet
+   model-robust.
+5. Published `reasoning_full` / `memo_compact` rows need run-level provenance
+   checks before canonical use because early memory-mode experiments overlap
+   with the Exp 2 loop-breaker bug era.
+6. Exp 7b model-upgrade entries for `Qwen 3.6 Flash` and `Claude Haiku 4.5`
+   should be annotated as high-timeout data if included in any leaderboard or
+   narrative comparison.
+
+Recommended integrity rule: canonical leaderboard aggregates should require
+non-retired quests, canonical harness names, no loop-breaker bug provenance, at
+least 3 runs per harness × quest cell, and at least two models for claims about
+harness effects rather than model effects.

From 27328d143b9208247558949850787adac73c737e Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 15:21:03 +0400
Subject: [PATCH 10/24] fix: double memory update, compaction guard, test
 config migration

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 configs/default.yaml                    | 10 +++++-----
 configs/kr1.yaml                        |  8 ++++----
 configs/kr1_micro.yaml                  |  4 ++--
 configs/kr1_test.yaml                   |  4 ++--
 configs/kr2_en_benchmark.yaml           |  8 ++++----
 configs/kr2_en_test.yaml                |  2 +-
 configs/test/parallel_agents_test.yaml  |  2 +-
 configs/test/temperature_test.yaml      | 12 ++++++------
 configs/test/test_benchmark.yaml        |  2 +-
 llm_quest_benchmark/harnesses/base.py   |  2 --
 llm_quest_benchmark/harnesses/memory.py |  7 ++++---
 11 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/configs/default.yaml b/configs/default.yaml
index d7dbe67..ff185a5 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -5,27 +5,27 @@ quests:
 
 agents:
   - model: random_choice
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.0
     skip_single: true
 
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
diff --git a/configs/kr1.yaml b/configs/kr1.yaml
index c7771e6..c31cc3b 100644
--- a/configs/kr1.yaml
+++ b/configs/kr1.yaml
@@ -5,22 +5,22 @@ quests:
 
 agents:
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
diff --git a/configs/kr1_micro.yaml b/configs/kr1_micro.yaml
index c19bd1a..ac3df96 100644
--- a/configs/kr1_micro.yaml
+++ b/configs/kr1_micro.yaml
@@ -8,12 +8,12 @@ quests:
 agents:
   # Just 2 agents to validate the process
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
     
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.6
     skip_single: true
 
diff --git a/configs/kr1_test.yaml b/configs/kr1_test.yaml
index fbe843c..bb8ed98 100644
--- a/configs/kr1_test.yaml
+++ b/configs/kr1_test.yaml
@@ -7,12 +7,12 @@ quests:
 agents:
   # Just 2 agents to validate the process
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
     
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.6
     skip_single: true
 
diff --git a/configs/kr2_en_benchmark.yaml b/configs/kr2_en_benchmark.yaml
index 76b6c21..88a0fe5 100644
--- a/configs/kr2_en_benchmark.yaml
+++ b/configs/kr2_en_benchmark.yaml
@@ -7,23 +7,23 @@ quests:
 agents:
   # OpenAI models
   - model: gpt-4o
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.5
     skip_single: true
 
   - model: gpt-4o-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
 
   # Anthropic models
   - model: claude-3-7-sonnet-latest
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.5
     skip_single: true
 
   - model: claude-3-5-sonnet-latest
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.6
     skip_single: true
 
diff --git a/configs/kr2_en_test.yaml b/configs/kr2_en_test.yaml
index 7dbe160..0addb04 100644
--- a/configs/kr2_en_test.yaml
+++ b/configs/kr2_en_test.yaml
@@ -5,7 +5,7 @@ quests:
 agents:
   - model: random_choice  # Use random agent for speed and reliability
     temperature: 0.5
-    template: reasoning.jinja
+    harness: reasoning_recent
 quest_timeout: 10  # short timeout for testing
 debug: true
 output_dir: results/benchmarks
diff --git a/configs/test/parallel_agents_test.yaml b/configs/test/parallel_agents_test.yaml
index 37bca75..0aec1be 100644
--- a/configs/test/parallel_agents_test.yaml
+++ b/configs/test/parallel_agents_test.yaml
@@ -6,7 +6,7 @@ quests:
 agents:
   - model: random_choice
   - model: gpt-5-mini
-    template: reasoning.jinja
+    harness: reasoning_recent
 debug: true
 # No max_workers setting - we'll use one worker per agent
 output_dir: results/benchmarks
diff --git a/configs/test/temperature_test.yaml b/configs/test/temperature_test.yaml
index 8f8e0cc..d79b705 100644
--- a/configs/test/temperature_test.yaml
+++ b/configs/test/temperature_test.yaml
@@ -7,32 +7,32 @@ quests:
 
 agents:
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.3
     skip_single: true
 
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: claude-sonnet-4-5
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.3
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.4
     skip_single: true
 
   - model: deepseek-3.2-chat
-    template: reasoning.jinja
+    harness: reasoning_recent
     temperature: 0.7
     skip_single: true
 
diff --git a/configs/test/test_benchmark.yaml b/configs/test/test_benchmark.yaml
index 3c89dab..c20c648 100644
--- a/configs/test/test_benchmark.yaml
+++ b/configs/test/test_benchmark.yaml
@@ -4,7 +4,7 @@ quests:
 agents:
   - model: random_choice
   - model: gemini-2.5-flash
-    template: reasoning.jinja
+    harness: reasoning_recent
 debug: true
 quest_timeout: 60
 max_workers: 2
diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py
index 6fa8afd..0501287 100644
--- a/llm_quest_benchmark/harnesses/base.py
+++ b/llm_quest_benchmark/harnesses/base.py
@@ -328,8 +328,6 @@ def get_action(self, observation: str, choices: list[dict[str, str]]) -> int:
             self._observation_history.append(clean)
             if len(self._observation_history) > 20:
                 self._observation_history = self._observation_history[-20:]
-            if self.memory_module is not None:
-                self.memory_module.update({"observation": clean, "step": self._step_count + 1})
         return super().get_action(observation, choices)
 
     def on_game_start(self) -> None:
diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py
index ff54ff9..ab4f72c 100644
--- a/llm_quest_benchmark/harnesses/memory.py
+++ b/llm_quest_benchmark/harnesses/memory.py
@@ -249,6 +249,9 @@ def reset(self) -> None:
     def _maybe_compact(self) -> None:
         if self._steps_since_compaction < self.compaction_interval:
             return
+        if self.llm_client is None:
+            # No LLM client available for compaction; skip silently
+            return
         transcript_text = self._format_transcript_for_compaction()
         if not transcript_text:
             return
@@ -269,9 +272,7 @@ def _maybe_compact(self) -> None:
             "Write a concise summary in plain text, max 300 words."
         )
 
-        summary = ""
-        if self.llm_client is not None:
-            summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip()
+        summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip()
         if summary:
             self._compaction_summary = summary
             self._transcript = []

From 5c2aa5cdb1f7c20e257ba0ca4cab6443e46546e5 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <korikov.kirill@gmail.com>
Date: Mon, 11 May 2026 17:36:59 +0400
Subject: [PATCH 11/24] fix: preserve benchmark result compatibility

---
 llm_quest_benchmark/executors/benchmark.py    | 36 +++++++++++++++++++
 llm_quest_benchmark/schemas/config.py         |  6 +++-
 .../tests/harnesses/test_harnesses.py         |  6 ++--
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py
index 69e7d22..62f5c92 100644
--- a/llm_quest_benchmark/executors/benchmark.py
+++ b/llm_quest_benchmark/executors/benchmark.py
@@ -56,6 +56,40 @@ def _agent_harness(agent_config) -> str:
     return legacy_mapping.get((template, memory_mode), "reasoning_recent")
 
 
+def _agent_template(agent_config) -> str:
+    """Return legacy template name for result artifacts."""
+    if hasattr(agent_config, "action_template"):
+        return agent_config.action_template
+
+    harness_templates = {
+        "minimal": "stub.jinja",
+        "reasoning_recent": "reasoning.jinja",
+        "reasoning_full": "reasoning.jinja",
+        "memo_compact": "stateful_compact.jinja",
+        "hinted_compact": "stateful_compact_hints.jinja",
+        "tool_compact": "tool_augmented.jinja",
+        "tool_hinted": "tool_augmented_hints.jinja",
+        "planner": "planner.jinja",
+    }
+    return harness_templates.get(_agent_harness(agent_config), "reasoning.jinja")
+
+
+def _agent_memory_mode(agent_config) -> str:
+    """Return legacy memory mode for result artifacts."""
+    if hasattr(agent_config, "memory_mode"):
+        return agent_config.memory_mode
+
+    harness_memory_modes = {
+        "reasoning_full": "full_transcript",
+        "memo_compact": "compaction",
+        "hinted_compact": "compaction",
+        "tool_compact": "compaction",
+        "tool_hinted": "compaction",
+        "planner": "compaction",
+    }
+    return harness_memory_modes.get(_agent_harness(agent_config), "default")
+
+
 def _result_entry(
     quest: str,
     agent_config,
@@ -69,6 +103,8 @@ def _result_entry(
         "model": agent_config.model,
         "temperature": agent_config.temperature,
         "harness": _agent_harness(agent_config),
+        "template": _agent_template(agent_config),
+        "memory_mode": _agent_memory_mode(agent_config),
         "agent_id": agent_config.harness_id if hasattr(agent_config, "harness_id") else agent_config.agent_id,
         "attempt": attempt,
         "outcome": outcome,
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index c658729..7d5c74c 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -104,7 +104,11 @@ def __post_init__(self):
         self.system_template = normalize_template_name(self.system_template)
         from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, SPECIAL_HARNESSES, is_random_choice_harness
 
-        if self.harness not in HARNESS_REGISTRY and self.harness != "human" and not is_random_choice_harness(self.harness):
+        if (
+            self.harness not in HARNESS_REGISTRY
+            and self.harness != "human"
+            and not is_random_choice_harness(self.harness)
+        ):
             valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
             raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {valid}")
         if not (0.0 <= self.temperature <= 2.0):
diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
index 030648b..9095a46 100644
--- a/llm_quest_benchmark/tests/harnesses/test_harnesses.py
+++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
@@ -3,14 +3,13 @@
 from unittest.mock import Mock
 
 from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
-from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
 from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness
+from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
 from llm_quest_benchmark.harnesses.minimal import MinimalHarness
 from llm_quest_benchmark.harnesses.planner import PlannerHarness
 from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness
 from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness
 
-
 HARNESS_SPECS = {
     "minimal": (MinimalHarness, "stub.jinja", DefaultMemory),
     "reasoning_recent": (ReasoningRecentHarness, "reasoning.jinja", DefaultMemory),
@@ -221,8 +220,7 @@ def test_tool_compact_scratchpad_read_write_and_reset():
 
     assert harness.scratchpad("read") == "(empty)"
     assert (
-        harness.scratchpad("write_replace", " Board: W B _ ; failed door 2 ")
-        == "updated: Board: W B _ ; failed door 2"
+        harness.scratchpad("write_replace", " Board: W B _ ; failed door 2 ") == "updated: Board: W B _ ; failed door 2"
     )
     assert harness.scratchpad("read") == "Board: W B _ ; failed door 2"
 

From 930edda4ebe1cb796ae55eb2398667d7aea3d063 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <korikov.kirill@gmail.com>
Date: Mon, 11 May 2026 17:57:13 +0400
Subject: [PATCH 12/24] fix: address PR review feedback

---
 .../benchmarks/exp4_compaction_no_memo.yaml   |  2 +-
 configs/benchmarks/exp4_memo_cot.yaml         |  2 +-
 configs/benchmarks/exp4_memo_extended.yaml    |  2 +-
 configs/benchmarks/exp4_memo_structured.yaml  |  2 +-
 llm_quest_benchmark/core/leaderboard.py       | 51 +++++++++++++-
 llm_quest_benchmark/executors/benchmark.py    | 29 +++-----
 llm_quest_benchmark/harnesses/base.py         | 15 ++---
 llm_quest_benchmark/harnesses/factory.py      | 13 +++-
 llm_quest_benchmark/harnesses/memo.py         | 36 ++++++++++
 .../tests/agents/test_llm_agent.py            |  7 ++
 .../tests/harnesses/test_factory.py           |  7 ++
 .../tests/harnesses/test_harnesses.py         | 43 +++++++++++-
 .../tests/integration/test_benchmark.py       | 18 ++---
 .../tests/test_benchmark_with_directory.py    |  8 +--
 llm_quest_benchmark/tests/test_leaderboard.py | 67 +++++++++++++++++++
 15 files changed, 250 insertions(+), 52 deletions(-)

diff --git a/configs/benchmarks/exp4_compaction_no_memo.yaml b/configs/benchmarks/exp4_compaction_no_memo.yaml
index 896dd60..4ab63e6 100644
--- a/configs/benchmarks/exp4_compaction_no_memo.yaml
+++ b/configs/benchmarks/exp4_compaction_no_memo.yaml
@@ -24,7 +24,7 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    harness: memo_compact
+    harness: compaction_no_memo
     temperature: 0.4
     runs: 2
     compaction_interval: 50
diff --git a/configs/benchmarks/exp4_memo_cot.yaml b/configs/benchmarks/exp4_memo_cot.yaml
index 9bfe382..320da54 100644
--- a/configs/benchmarks/exp4_memo_cot.yaml
+++ b/configs/benchmarks/exp4_memo_cot.yaml
@@ -24,7 +24,7 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    harness: memo_compact
+    harness: memo_cot
     temperature: 0.4
     runs: 2
     compaction_interval: 50
diff --git a/configs/benchmarks/exp4_memo_extended.yaml b/configs/benchmarks/exp4_memo_extended.yaml
index 25e5620..a5d6613 100644
--- a/configs/benchmarks/exp4_memo_extended.yaml
+++ b/configs/benchmarks/exp4_memo_extended.yaml
@@ -24,7 +24,7 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    harness: memo_compact
+    harness: memo_extended
     temperature: 0.4
     runs: 2
     compaction_interval: 50
diff --git a/configs/benchmarks/exp4_memo_structured.yaml b/configs/benchmarks/exp4_memo_structured.yaml
index 96e5daf..f70ab81 100644
--- a/configs/benchmarks/exp4_memo_structured.yaml
+++ b/configs/benchmarks/exp4_memo_structured.yaml
@@ -24,7 +24,7 @@ quests:
   - quests/sr_2_1_2121_eng/Sortirovka1_eng.qm
 agents:
   - model: "openrouter:google/gemini-3-flash-preview"
-    harness: memo_compact
+    harness: memo_structured
     temperature: 0.4
     runs: 2
     compaction_interval: 50
diff --git a/llm_quest_benchmark/core/leaderboard.py b/llm_quest_benchmark/core/leaderboard.py
index 032ad48..dc0a67b 100644
--- a/llm_quest_benchmark/core/leaderboard.py
+++ b/llm_quest_benchmark/core/leaderboard.py
@@ -28,9 +28,6 @@
     "stub": ("minimal_prompt", TAXONOMY_MODES["minimal_prompt"]),
     "strategic": ("short_context_reasoning", TAXONOMY_MODES["short_context_reasoning"]),
     "stateful_compact": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]),
-    "memo_cot": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]),
-    "memo_extended": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]),
-    "memo_structured": ("compact_memory_memo", TAXONOMY_MODES["compact_memory_memo"]),
     "light_hints": ("prompt_hints", TAXONOMY_MODES["prompt_hints"]),
     "stateful_compact_hints": ("prompt_hints", TAXONOMY_MODES["prompt_hints"]),
     "planner": ("planner_loop", TAXONOMY_MODES["planner_loop"]),
@@ -38,6 +35,26 @@
     "tool_augmented_hints": ("tools_hints_compact_memory", TAXONOMY_MODES["tools_hints_compact_memory"]),
 }
 
+RETIRED_BENCHMARK_NAMES = {
+    "exp4_compaction_no_memo",
+    "exp4_memo_cot",
+    "exp4_memo_extended",
+    "exp4_memo_structured",
+}
+
+RETIRED_HARNESSES = {
+    "compaction_no_memo",
+    "memo_cot",
+    "memo_extended",
+    "memo_structured",
+}
+
+RETIRED_TEMPLATE_IDS = {
+    "memo_cot",
+    "memo_extended",
+    "memo_structured",
+}
+
 REASONING_STYLE_TEMPLATES = {
     "reasoning",
     "strategic",
@@ -87,6 +104,25 @@ def _mode_from_template(template_name: str, memory_mode: str | None = None) -> t
     return TEMPLATE_TO_MODE.get(template_id, (template_id or "unknown", template_id or "unknown"))
 
 
+def _is_retired_result(
+    source_name: str | None,
+    benchmark_id: str | None,
+    result_row: dict[str, Any],
+    agent_config: dict[str, Any],
+    template_name: str,
+) -> bool:
+    source_names = {str(value) for value in (source_name, benchmark_id) if value}
+    if source_names & RETIRED_BENCHMARK_NAMES:
+        return True
+
+    harness = str(result_row.get("harness") or agent_config.get("harness") or "")
+    if harness in RETIRED_HARNESSES:
+        return True
+
+    template_id = _strip_template_suffix(template_name)
+    return template_id in RETIRED_TEMPLATE_IDS
+
+
 def _agent_config(db_run: dict[str, Any]) -> dict[str, Any]:
     raw_config = db_run.get("agent_config")
     if not isinstance(raw_config, str) or not raw_config:
@@ -298,6 +334,7 @@ def generate_leaderboard(
             continue
 
         benchmark_id = summary.get("benchmark_id")
+        source_name = summary.get("name")
         if benchmark_id:
             benchmark_ids.append(str(benchmark_id))
 
@@ -349,6 +386,14 @@ def generate_leaderboard(
             if template_from_config:
                 template = template_from_config
             memory_mode = config.get("memory_mode")
+            if _is_retired_result(
+                str(source_name) if source_name else None,
+                str(benchmark_id) if benchmark_id else None,
+                result_row,
+                config,
+                template,
+            ):
+                continue
             mode_id, mode_label = _mode_from_template(template, str(memory_mode) if memory_mode is not None else None)
 
             try:
diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py
index 62f5c92..53c430d 100644
--- a/llm_quest_benchmark/executors/benchmark.py
+++ b/llm_quest_benchmark/executors/benchmark.py
@@ -35,25 +35,8 @@
 
 
 def _agent_harness(agent_config) -> str:
-    """Return harness name for new configs, with legacy AgentConfig fallback."""
-    if hasattr(agent_config, "harness"):
-        return agent_config.harness
-
-    template = getattr(agent_config, "action_template", "reasoning.jinja")
-    memory_mode = getattr(agent_config, "memory_mode", "default")
-    template = template.removesuffix(".jinja")
-    legacy_mapping = {
-        ("stub", "default"): "minimal",
-        ("reasoning", "default"): "reasoning_recent",
-        ("reasoning", "full_transcript"): "reasoning_full",
-        ("reasoning", "compaction"): "memo_compact",
-        ("stateful_compact", "compaction"): "memo_compact",
-        ("stateful_compact_hints", "compaction"): "hinted_compact",
-        ("tool_augmented", "compaction"): "tool_compact",
-        ("tool_augmented_hints", "compaction"): "tool_hinted",
-        ("planner", "compaction"): "planner",
-    }
-    return legacy_mapping.get((template, memory_mode), "reasoning_recent")
+    """Return the configured harness name."""
+    return agent_config.harness
 
 
 def _agent_template(agent_config) -> str:
@@ -70,6 +53,10 @@ def _agent_template(agent_config) -> str:
         "tool_compact": "tool_augmented.jinja",
         "tool_hinted": "tool_augmented_hints.jinja",
         "planner": "planner.jinja",
+        "compaction_no_memo": "reasoning.jinja",
+        "memo_cot": "memo_cot.jinja",
+        "memo_extended": "memo_extended.jinja",
+        "memo_structured": "memo_structured.jinja",
     }
     return harness_templates.get(_agent_harness(agent_config), "reasoning.jinja")
 
@@ -86,6 +73,10 @@ def _agent_memory_mode(agent_config) -> str:
         "tool_compact": "compaction",
         "tool_hinted": "compaction",
         "planner": "compaction",
+        "compaction_no_memo": "compaction",
+        "memo_cot": "compaction",
+        "memo_extended": "compaction",
+        "memo_structured": "compaction",
     }
     return harness_memory_modes.get(_agent_harness(agent_config), "default")
 
diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py
index 0501287..440675b 100644
--- a/llm_quest_benchmark/harnesses/base.py
+++ b/llm_quest_benchmark/harnesses/base.py
@@ -303,6 +303,8 @@ def _ensure_llm(self) -> None:
                 system_prompt=self.prompt_renderer.render_system_prompt(),
                 temperature=self.temperature,
             )
+        if self.memory_module is not None and hasattr(self.memory_module, "llm_client"):
+            self.memory_module.llm_client = self.llm
 
     @abstractmethod
     def _get_action_impl(self, observation, choices) -> int:
@@ -433,22 +435,13 @@ def _remember_decision(
             )
 
     def _format_prompt(self, observation, choices, memo=None, context=None) -> str:
-        """Render system and action Jinja templates for the current decision."""
-        system_prompt = self.prompt_renderer.render_system_prompt(
-            observation=observation,
-            choices=choices,
-            memo=memo,
-            context=context,
-        ).strip()
-        action_prompt = self.prompt_renderer.action_template.render(
+        """Render the action Jinja template for the current decision."""
+        return self.prompt_renderer.action_template.render(
             observation=observation,
             choices=[{"text": c.get("text", "")} for c in choices],
             memo=memo,
             context=context,
         ).strip()
-        if system_prompt:
-            return f"{system_prompt}\n\n{action_prompt}".strip()
-        return action_prompt
 
     def _parse_llm_response(self, response, num_choices) -> LLMResponse:
         """Parse an LLM response into a structured response object."""
diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
index b46f5dc..8ea4b84 100644
--- a/llm_quest_benchmark/harnesses/factory.py
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -4,7 +4,14 @@
 from llm_quest_benchmark.agents.human_player import HumanPlayer
 from llm_quest_benchmark.agents.random_agent import RandomAgent
 from llm_quest_benchmark.constants import DEFAULT_MODEL
-from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness
+from llm_quest_benchmark.harnesses.memo import (
+    CompactionNoMemoHarness,
+    HintedCompactHarness,
+    MemoCompactHarness,
+    MemoCotHarness,
+    MemoExtendedHarness,
+    MemoStructuredHarness,
+)
 from llm_quest_benchmark.harnesses.minimal import MinimalHarness
 from llm_quest_benchmark.harnesses.planner import PlannerHarness
 from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness
@@ -19,6 +26,10 @@
     "tool_compact": ToolCompactHarness,
     "tool_hinted": ToolHintedHarness,
     "planner": PlannerHarness,
+    "compaction_no_memo": CompactionNoMemoHarness,
+    "memo_cot": MemoCotHarness,
+    "memo_extended": MemoExtendedHarness,
+    "memo_structured": MemoStructuredHarness,
 }
 
 SPECIAL_HARNESSES = ("human", "random_choice", "random_choice_<seed>")
diff --git a/llm_quest_benchmark/harnesses/memo.py b/llm_quest_benchmark/harnesses/memo.py
index 764f206..eaab06b 100644
--- a/llm_quest_benchmark/harnesses/memo.py
+++ b/llm_quest_benchmark/harnesses/memo.py
@@ -60,3 +60,39 @@ def __init__(
             memory_module=memory_module,
             **kwargs,
         )
+
+
+class CompactionNoMemoHarness(MemoCompactHarness):
+    """Retired Exp 4 ablation: compacted transcript without memo-oriented prompting."""
+
+    harness_name = "compaction_no_memo"
+
+    def __init__(self, *args, action_template: str = "reasoning.jinja", **kwargs):
+        super().__init__(*args, action_template=action_template, **kwargs)
+
+
+class MemoExtendedHarness(MemoCompactHarness):
+    """Retired Exp 4 variant with a larger generic memo field."""
+
+    harness_name = "memo_extended"
+
+    def __init__(self, *args, action_template: str = "memo_extended.jinja", **kwargs):
+        super().__init__(*args, action_template=action_template, **kwargs)
+
+
+class MemoStructuredHarness(MemoCompactHarness):
+    """Retired Exp 4 variant with structured memo prompting."""
+
+    harness_name = "memo_structured"
+
+    def __init__(self, *args, action_template: str = "memo_structured.jinja", **kwargs):
+        super().__init__(*args, action_template=action_template, **kwargs)
+
+
+class MemoCotHarness(MemoCompactHarness):
+    """Retired Exp 4 variant with scratchpad-style memo prompting."""
+
+    harness_name = "memo_cot"
+
+    def __init__(self, *args, action_template: str = "memo_cot.jinja", **kwargs):
+        super().__init__(*args, action_template=action_template, **kwargs)
diff --git a/llm_quest_benchmark/tests/agents/test_llm_agent.py b/llm_quest_benchmark/tests/agents/test_llm_agent.py
index 1f3b99c..280fd0a 100644
--- a/llm_quest_benchmark/tests/agents/test_llm_agent.py
+++ b/llm_quest_benchmark/tests/agents/test_llm_agent.py
@@ -90,6 +90,13 @@ def test_non_gemini_prompt_uses_selected_template():
     assert "IMPORTANT: Please respond with ONLY a single number" in prompt
 
 
+def test_formatted_user_prompt_does_not_duplicate_system_prompt():
+    harness = MinimalHarness(model_name="gpt-5-mini", action_template="stub.jinja")
+    prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
+
+    assert "experienced interactive fiction player" not in prompt
+
+
 def test_template_alias_without_suffix_is_supported():
     harness = MinimalHarness(model_name="gpt-5-mini", action_template="reasoning")
     prompt = harness._format_prompt("state", [{"text": "A"}, {"text": "B"}])
diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py
index 187f4d3..4d72b61 100644
--- a/llm_quest_benchmark/tests/harnesses/test_factory.py
+++ b/llm_quest_benchmark/tests/harnesses/test_factory.py
@@ -68,6 +68,13 @@ def test_harness_config_allows_seeded_random_choice_harness():
     assert config.harness == "random_choice_123"
 
 
+def test_harness_config_allows_retired_exp4_aliases():
+    for harness_name in ("compaction_no_memo", "memo_cot", "memo_extended", "memo_structured"):
+        config = HarnessConfig(harness=harness_name, model="gpt-5-mini")
+
+        assert config.harness == harness_name
+
+
 def test_harness_config_rejects_old_template_key():
     with pytest.raises(ValueError, match="Use harness: key instead of template:"):
         HarnessConfig(model="gpt-5-mini", template="reasoning.jinja")
diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
index 9095a46..3cba73e 100644
--- a/llm_quest_benchmark/tests/harnesses/test_harnesses.py
+++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
@@ -3,7 +3,14 @@
 from unittest.mock import Mock
 
 from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
-from llm_quest_benchmark.harnesses.memo import HintedCompactHarness, MemoCompactHarness
+from llm_quest_benchmark.harnesses.memo import (
+    CompactionNoMemoHarness,
+    HintedCompactHarness,
+    MemoCompactHarness,
+    MemoCotHarness,
+    MemoExtendedHarness,
+    MemoStructuredHarness,
+)
 from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
 from llm_quest_benchmark.harnesses.minimal import MinimalHarness
 from llm_quest_benchmark.harnesses.planner import PlannerHarness
@@ -19,6 +26,10 @@
     "tool_compact": (ToolCompactHarness, "tool_augmented.jinja", CompactionMemory),
     "tool_hinted": (ToolHintedHarness, "tool_augmented_hints.jinja", CompactionMemory),
     "planner": (PlannerHarness, "planner.jinja", CompactionMemory),
+    "compaction_no_memo": (CompactionNoMemoHarness, "reasoning.jinja", CompactionMemory),
+    "memo_cot": (MemoCotHarness, "memo_cot.jinja", CompactionMemory),
+    "memo_extended": (MemoExtendedHarness, "memo_extended.jinja", CompactionMemory),
+    "memo_structured": (MemoStructuredHarness, "memo_structured.jinja", CompactionMemory),
 }
 
 
@@ -65,6 +76,13 @@ def test_planner_harness_configuration():
     assert_harness_configuration("planner")
 
 
+def test_exp4_retired_harness_configuration():
+    assert_harness_configuration("compaction_no_memo")
+    assert_harness_configuration("memo_cot")
+    assert_harness_configuration("memo_extended")
+    assert_harness_configuration("memo_structured")
+
+
 def test_all_registry_harnesses_have_configuration_specs():
     assert set(HARNESS_REGISTRY) == set(HARNESS_SPECS)
 
@@ -101,6 +119,29 @@ def test_memo_compact_mocked_llm_returns_action_and_reuses_memo_context():
     assert "Merchant needs fuel payment" in second_prompt
 
 
+def test_compaction_memory_receives_existing_llm_client():
+    harness = MemoCompactHarness(model_name="gpt-5-mini", compaction_interval=1)
+    mocked_llm = Mock()
+    mocked_llm.get_completion.side_effect = [
+        '{"memo":"Paid fuel merchant","analysis":"pay first","reasoning":"quest clue","result":2}',
+        "Summary: paid the fuel merchant and should keep receipt.",
+    ]
+    mocked_llm.get_last_usage.return_value = {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+        "estimated_cost_usd": 0.0,
+    }
+    harness.llm = mocked_llm
+
+    action = harness.get_action("A merchant offers fuel for a fee.", [{"text": "Leave"}, {"text": "Pay"}])
+
+    assert action == 2
+    assert harness.memory_module.llm_client is mocked_llm
+    assert harness.memory_module._compaction_summary == "Summary: paid the fuel merchant and should keep receipt."
+    assert harness._steps_since_compaction == 0
+
+
 def test_planner_harness_first_turn_generates_plan_then_acts():
     harness = PlannerHarness(model_name="gpt-5-mini")
     mocked_llm = Mock()
diff --git a/llm_quest_benchmark/tests/integration/test_benchmark.py b/llm_quest_benchmark/tests/integration/test_benchmark.py
index ee0704e..c024da4 100644
--- a/llm_quest_benchmark/tests/integration/test_benchmark.py
+++ b/llm_quest_benchmark/tests/integration/test_benchmark.py
@@ -5,11 +5,11 @@
 
 import pytest
 
-from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, SYSTEM_ROLE_TEMPLATE
+from llm_quest_benchmark.constants import SYSTEM_ROLE_TEMPLATE
 from llm_quest_benchmark.environments.state import QuestOutcome
 from llm_quest_benchmark.executors import benchmark as benchmark_module
 from llm_quest_benchmark.executors.benchmark import run_benchmark
-from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
 
 
 def _fake_task_for_parallel_test(task, result_queue):
@@ -58,10 +58,10 @@ def test_benchmark_e2e(caplog, tmp_path):
     config = BenchmarkConfig(
         quests=[str(quest_path)],
         agents=[
-            AgentConfig(
+            HarnessConfig(
                 model="random_choice",  # Use random_choice for testing
+                harness="random_choice",
                 system_template=SYSTEM_ROLE_TEMPLATE,
-                action_template=DEFAULT_TEMPLATE,
                 temperature=0.0,
                 skip_single=True,
             )
@@ -85,7 +85,7 @@ def test_benchmark_e2e(caplog, tmp_path):
         assert result["quest"] == str(quest_path)
         assert result["model"] == "random_choice"
         assert result["temperature"] == 0.0
-        assert result["template"] == DEFAULT_TEMPLATE
+        assert result["template"] == "reasoning.jinja"
         assert result["attempt"] == 1
         assert "agent_id" in result
         assert "outcome" in result
@@ -122,9 +122,9 @@ def test_benchmark_supports_multiple_runs_per_agent(tmp_path):
     config = BenchmarkConfig(
         quests=[str(quest_path)],
         agents=[
-            AgentConfig(
+            HarnessConfig(
                 model="random_choice",
-                action_template="reasoning",
+                harness="random_choice",
                 temperature=0.0,
                 runs=2,
                 skip_single=True,
@@ -154,7 +154,7 @@ def test_benchmark_uses_max_workers(monkeypatch, tmp_path):
 
     config = BenchmarkConfig(
         quests=[str(quest_path)],
-        agents=[AgentConfig(model="random_choice", runs=4)],
+        agents=[HarnessConfig(model="random_choice", harness="random_choice", runs=4)],
         quest_timeout=5,
         max_workers=2,
         output_dir=str(tmp_path),
@@ -187,7 +187,7 @@ def test_benchmark_enforces_child_process_timeout(monkeypatch, tmp_path):
 
     config = BenchmarkConfig(
         quests=[str(quest_path)],
-        agents=[AgentConfig(model="random_choice", runs=1)],
+        agents=[HarnessConfig(model="random_choice", harness="random_choice", runs=1)],
         quest_timeout=1,
         max_workers=1,
         output_dir=str(tmp_path),
diff --git a/llm_quest_benchmark/tests/test_benchmark_with_directory.py b/llm_quest_benchmark/tests/test_benchmark_with_directory.py
index 7661f3d..bfe4e8d 100644
--- a/llm_quest_benchmark/tests/test_benchmark_with_directory.py
+++ b/llm_quest_benchmark/tests/test_benchmark_with_directory.py
@@ -11,7 +11,7 @@
 logger = logging.getLogger(__name__)
 
 from llm_quest_benchmark.executors.benchmark import run_benchmark
-from llm_quest_benchmark.schemas.config import AgentConfig, BenchmarkConfig
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
 
 
 def create_test_config():
@@ -19,7 +19,7 @@ def create_test_config():
     return {
         "name": "Directory Benchmark Test",
         "quests": ["quests/sr_2_1_2121_eng"],
-        "agents": [{"model": "random_choice", "skip_single": True, "temperature": 0.7}],
+        "agents": [{"model": "random_choice", "harness": "random_choice", "skip_single": True, "temperature": 0.7}],
         "quest_timeout": 4,  # Keep runtime below pytest global timeout
         "max_quests": 1,
         "debug": True,
@@ -34,8 +34,8 @@ def test_benchmark_with_directory():
     config_dict = create_test_config()
     logger.info(f"Created test config: {json.dumps(config_dict, indent=2)}")
 
-    # Convert agent dictionaries to AgentConfig objects first
-    config_dict["agents"] = [AgentConfig(**agent_dict) for agent_dict in config_dict["agents"]]
+    # Convert agent dictionaries to HarnessConfig objects first
+    config_dict["agents"] = [HarnessConfig(**agent_dict) for agent_dict in config_dict["agents"]]
     config = BenchmarkConfig(**config_dict)
     logger.info("Config validation passed")
 
diff --git a/llm_quest_benchmark/tests/test_leaderboard.py b/llm_quest_benchmark/tests/test_leaderboard.py
index aa22296..28a3e31 100644
--- a/llm_quest_benchmark/tests/test_leaderboard.py
+++ b/llm_quest_benchmark/tests/test_leaderboard.py
@@ -243,6 +243,73 @@ def test_generate_leaderboard_filters_public_slice(tmp_path, monkeypatch):
     assert {row["model"] for row in leaderboard["results"]} == {"model-a", "model-b", "model-c"}
 
 
+def test_generate_leaderboard_excludes_retired_exp4_variants(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+
+    active_dir = Path("results/benchmarks/active")
+    active_dir.mkdir(parents=True, exist_ok=True)
+    retired_dir = Path("results/benchmarks/retired")
+    retired_dir.mkdir(parents=True, exist_ok=True)
+
+    active_row = {
+        "quest": "quests/Core.qm",
+        "model": "gpt-5-mini",
+        "template": "stateful_compact.jinja",
+        "harness": "memo_compact",
+        "agent_id": "active",
+        "attempt": 1,
+        "outcome": "SUCCESS",
+    }
+    retired_rows = [
+        {
+            "quest": "quests/Core.qm",
+            "model": "gpt-5-mini",
+            "template": "reasoning.jinja",
+            "harness": "compaction_no_memo",
+            "agent_id": "retired-no-memo",
+            "attempt": 1,
+            "outcome": "FAILURE",
+        },
+        {
+            "quest": "quests/Core.qm",
+            "model": "gpt-5-mini",
+            "template": "memo_extended.jinja",
+            "harness": "memo_extended",
+            "agent_id": "retired-extended",
+            "attempt": 1,
+            "outcome": "FAILURE",
+        },
+    ]
+
+    (active_dir / "benchmark_summary.json").write_text(
+        json.dumps({"benchmark_id": "active", "name": "active", "agents": [], "results": [active_row], "db_runs": []}),
+        encoding="utf-8",
+    )
+    (retired_dir / "benchmark_summary.json").write_text(
+        json.dumps(
+            {
+                "benchmark_id": "retired",
+                "name": "exp4_compaction_no_memo",
+                "agents": [],
+                "results": retired_rows,
+                "db_runs": [],
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    leaderboard = generate_leaderboard(
+        [str(active_dir), str(retired_dir)],
+        "site/leaderboard.json",
+        min_runs=0,
+        public_model_ids=None,
+    )
+
+    assert len(leaderboard["results"]) == 1
+    assert leaderboard["results"][0]["mode"] == "compact_memory_memo"
+    assert leaderboard["results"][0]["runs"] == 1
+
+
 def test_generate_leaderboard_matches_db_runs_by_identifiers(tmp_path, monkeypatch):
     monkeypatch.chdir(tmp_path)
 

From 9ed2679266cea2ec5a9efd275b165f2032a8ad70 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 22:10:42 +0400
Subject: [PATCH 13/24] fix: P2 harness model attribution and harness_id
 includes system_template

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 llm_quest_benchmark/executors/benchmark.py    | 12 ++++++++-
 llm_quest_benchmark/schemas/config.py         |  2 +-
 .../tests/harnesses/test_factory.py           |  7 +++++
 .../tests/integration/test_benchmark.py       |  2 +-
 .../tests/test_benchmark_with_directory.py    | 26 ++++++++++++++++---
 5 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py
index 53c430d..82800b3 100644
--- a/llm_quest_benchmark/executors/benchmark.py
+++ b/llm_quest_benchmark/executors/benchmark.py
@@ -39,6 +39,16 @@ def _agent_harness(agent_config) -> str:
     return agent_config.harness
 
 
+def _agent_model(agent_config) -> str:
+    """Return the result model label for the executed harness."""
+    harness = _agent_harness(agent_config)
+    if harness == "human":
+        return "human"
+    if harness.startswith("random_choice"):
+        return "random_policy"
+    return agent_config.model
+
+
 def _agent_template(agent_config) -> str:
     """Return legacy template name for result artifacts."""
     if hasattr(agent_config, "action_template"):
@@ -91,7 +101,7 @@ def _result_entry(
 ) -> dict[str, Any]:
     return {
         "quest": quest,
-        "model": agent_config.model,
+        "model": _agent_model(agent_config),
         "temperature": agent_config.temperature,
         "harness": _agent_harness(agent_config),
         "template": _agent_template(agent_config),
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index 7d5c74c..f32184e 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -123,7 +123,7 @@ def harness_id(self) -> str:
         """Generate a stable harness ID based on configuration values"""
         import hashlib
 
-        config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.compaction_interval}"
+        config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.system_template}_{self.compaction_interval}"
         hash_val = hashlib.md5(config_str.encode()).hexdigest()[:8]
         return f"{self.model}_t{self.temperature}_{self.harness}_{hash_val}"
 
diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py
index 4d72b61..7f6f11c 100644
--- a/llm_quest_benchmark/tests/harnesses/test_factory.py
+++ b/llm_quest_benchmark/tests/harnesses/test_factory.py
@@ -62,6 +62,13 @@ def test_harness_config_stable_harness_id():
     assert config.harness_id == HarnessConfig(harness="memo_compact", model="gpt-5-mini").harness_id
 
 
+def test_harness_config_system_template_affects_harness_id():
+    first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role.jinja")
+    second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role_risk.jinja")
+
+    assert first.harness_id != second.harness_id
+
+
 def test_harness_config_allows_seeded_random_choice_harness():
     config = HarnessConfig(harness="random_choice_123", model="gpt-5-mini")
 
diff --git a/llm_quest_benchmark/tests/integration/test_benchmark.py b/llm_quest_benchmark/tests/integration/test_benchmark.py
index c024da4..1c56d35 100644
--- a/llm_quest_benchmark/tests/integration/test_benchmark.py
+++ b/llm_quest_benchmark/tests/integration/test_benchmark.py
@@ -83,7 +83,7 @@ def test_benchmark_e2e(caplog, tmp_path):
         # Check first result
         result = results[0]
         assert result["quest"] == str(quest_path)
-        assert result["model"] == "random_choice"
+        assert result["model"] == "random_policy"
         assert result["temperature"] == 0.0
         assert result["template"] == "reasoning.jinja"
         assert result["attempt"] == 1
diff --git a/llm_quest_benchmark/tests/test_benchmark_with_directory.py b/llm_quest_benchmark/tests/test_benchmark_with_directory.py
index bfe4e8d..c6dc855 100644
--- a/llm_quest_benchmark/tests/test_benchmark_with_directory.py
+++ b/llm_quest_benchmark/tests/test_benchmark_with_directory.py
@@ -6,13 +6,13 @@
 
 import pytest
 
+from llm_quest_benchmark.executors.benchmark import _result_entry, run_benchmark
+from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
+
 # Configure logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 
-from llm_quest_benchmark.executors.benchmark import run_benchmark
-from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
-
 
 def create_test_config():
     """Create a test benchmark configuration with directory path"""
@@ -27,6 +27,26 @@ def create_test_config():
     }
 
 
+def test_result_entry_logs_random_harness_model_as_random_policy():
+    """Random harness results should not be attributed to the default LLM model."""
+    agent_config = HarnessConfig(harness="random_choice", model="gpt-5-mini")
+
+    result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE")
+
+    assert result["model"] == "random_policy"
+    assert result["harness"] == "random_choice"
+
+
+def test_result_entry_logs_human_harness_model_as_human():
+    """Human harness results should not be attributed to the default LLM model."""
+    agent_config = HarnessConfig(harness="human", model="gpt-5-mini")
+
+    result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE")
+
+    assert result["model"] == "human"
+    assert result["harness"] == "human"
+
+
 @pytest.mark.skipif(not Path("quests/sr_2_1_2121_eng").exists(), reason="Quest files not downloaded")
 def test_benchmark_with_directory():
     """Test running a benchmark with a directory path"""

From 96704c98c14b8c088262b5967b8fbc57d8bba175 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Mon, 11 May 2026 22:44:45 +0400
Subject: [PATCH 14/24] fix: hide legacy AgentConfig public export

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 llm_quest_benchmark/core/runner.py      | 4 ++--
 llm_quest_benchmark/schemas/__init__.py | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llm_quest_benchmark/core/runner.py b/llm_quest_benchmark/core/runner.py
index e5f531a..e2b9ef8 100644
--- a/llm_quest_benchmark/core/runner.py
+++ b/llm_quest_benchmark/core/runner.py
@@ -15,7 +15,7 @@
 from llm_quest_benchmark.core.logging import LogManager, QuestLogger
 from llm_quest_benchmark.environments.qm import QMPlayerEnv as QuestEnvironment
 from llm_quest_benchmark.environments.state import QuestOutcome
-from llm_quest_benchmark.schemas.config import AgentConfig
+from llm_quest_benchmark.schemas.config import HarnessConfig
 from llm_quest_benchmark.schemas.state import AgentState
 
 # Configure logging
@@ -27,7 +27,7 @@ def run_quest_with_timeout(
     quest_path: str,
     agent: QuestPlayer,
     timeout: int = DEFAULT_QUEST_TIMEOUT,
-    agent_config: AgentConfig | None = None,
+    agent_config: HarnessConfig | Any | None = None,
     debug: bool = False,
     callbacks: list[Callable[[str, Any], None]] = None,
 ) -> QuestOutcome | None:
diff --git a/llm_quest_benchmark/schemas/__init__.py b/llm_quest_benchmark/schemas/__init__.py
index 0cb4242..cb0338f 100644
--- a/llm_quest_benchmark/schemas/__init__.py
+++ b/llm_quest_benchmark/schemas/__init__.py
@@ -7,11 +7,10 @@
     "QMBridgeState",
     "BenchmarkConfig",
     "HarnessConfig",
-    "AgentConfig",
 ]
 
 # Import directly from the schema modules using relative imports
 from .bridge import QMBridgeState
-from .config import AgentConfig, BenchmarkConfig, HarnessConfig
+from .config import BenchmarkConfig, HarnessConfig
 from .response import LLMResponse
 from .state import AgentState, QMState

From cce82dc0cf6b37b5d1ede7d7e641be6c0ed2783a Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Tue, 12 May 2026 00:39:44 +0400
Subject: [PATCH 15/24] docs: consolidate harness documentation

---
 .../balanced_gpt5mini_all_modes.yaml          |  20 +-
 configs/kr2_en_benchmark.yaml                 |  40 ----
 docs/ARCHITECTURE.md                          |  25 +++
 docs/EXPERIMENTS_LOG.md                       | 139 +++++++++++++
 docs/EXPERIMENT_AUDIT.md                      | 193 ------------------
 docs/HARNESS_ENGINEERING.md                   |  64 ------
 docs/SPEC.md                                  |  18 ++
 7 files changed, 190 insertions(+), 309 deletions(-)
 delete mode 100644 configs/kr2_en_benchmark.yaml
 delete mode 100644 docs/EXPERIMENT_AUDIT.md
 delete mode 100644 docs/HARNESS_ENGINEERING.md

diff --git a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml
index 4ab3e65..812e94a 100644
--- a/configs/benchmarks/balanced_gpt5mini_all_modes.yaml
+++ b/configs/benchmarks/balanced_gpt5mini_all_modes.yaml
@@ -21,49 +21,45 @@ quests:
 agents:
   # 1. Minimal prompt
   - model: gpt-5-mini
-    template: stub
+    harness: minimal
     temperature: 0.4
     runs: 3
   # 2. Short-context reasoning
   - model: gpt-5-mini
-    template: reasoning
+    harness: reasoning_recent
     temperature: 0.4
     runs: 3
   # 3. Full-history reasoning
   - model: gpt-5-mini
-    template: reasoning
+    harness: reasoning_full
     temperature: 0.4
     runs: 3
-    memory_mode: full_transcript
   # 4. Compact memory / memo
   - model: gpt-5-mini
-    template: stateful_compact
+    harness: memo_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 5. Prompt hints
   - model: gpt-5-mini
-    template: light_hints
+    harness: hinted_compact
     temperature: 0.4
     runs: 3
   # 6. Tools + compact memory
   - model: gpt-5-mini
-    template: tool_augmented
+    harness: tool_compact
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 7. Tools + hints + compact memory
   - model: gpt-5-mini
-    template: tool_augmented_hints
+    harness: tool_hinted
     temperature: 0.4
     runs: 3
-    memory_mode: compaction
     compaction_interval: 50
   # 8. Planner loop
   - model: gpt-5-mini
-    template: planner
+    harness: planner
     temperature: 0.4
     runs: 3
 debug: false
diff --git a/configs/kr2_en_benchmark.yaml b/configs/kr2_en_benchmark.yaml
deleted file mode 100644
index 88a0fe5..0000000
--- a/configs/kr2_en_benchmark.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-# Benchmark configuration for Kr2 English quests
-# Using recommended models with optimized temperature settings
-
-quests:
-  - quests/kr2_en
-
-agents:
-  # OpenAI models
-  - model: gpt-4o
-    harness: reasoning_recent
-    temperature: 0.5
-    skip_single: true
-
-  - model: gpt-4o-mini
-    harness: reasoning_recent
-    temperature: 0.7
-    skip_single: true
-
-  # Anthropic models
-  - model: claude-3-7-sonnet-latest
-    harness: reasoning_recent
-    temperature: 0.5
-    skip_single: true
-
-  - model: claude-3-5-sonnet-latest
-    harness: reasoning_recent
-    temperature: 0.6
-    skip_single: true
-
-# Debug mode enables more detailed logging
-debug: true
-
-# Quest timeout in seconds
-quest_timeout: 120
-
-# Output directory for benchmark results
-output_dir: metrics/kr2_en
-
-# Optional name for this benchmark run
-name: kr2_en_benchmark
\ No newline at end of file
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 998241a..83472b3 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -15,6 +15,26 @@ The runtime loop is:
 4. Apply the choice, log the step, and detect the terminal outcome.
 5. Persist run metrics and run summaries.
 
+## Harness Engineering Framing
+
+This project treats the **agent harness** as the primary experimental object.
+An agent harness is the wrapper around a model that controls what the model
+sees, what state is carried forward, what external tools are available, and how
+a raw completion is converted into a quest action. In this codebase, harnesses
+are not incidental plumbing: they are the independent variable.
+
+This follows the practical question raised by "How Much Heavy Lifting Can an
+Agent Harness Do?" (arXiv:2604.07236): how much performance comes from the
+surrounding scaffold rather than the base model alone? Space Rangers text
+quests are useful because they are long enough to stress memory, planning, and
+state tracking, but concrete enough to score with terminal success/failure
+outcomes.
+
+The closest text-game benchmarks, such as TextQuests and TALE-Suite, usually
+vary models under a mostly fixed evaluation scaffold. LLM Quest Benchmark can
+instead hold the model fixed and vary the harness to ask which prompt, memory,
+tool, and planning choices change behavior.
+
 ## Main Runtime Layers
 
 ### 1. Quest Engine Layer
@@ -107,3 +127,8 @@ and benchmark configuration parsing do not require API keys.
 | Tools + compact memory | `tool_compact` | `tool_augmented.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act |
 | Tools + hints + compact memory | `tool_hinted` | `tool_augmented_hints.jinja` | `CompactionMemory` | calculator, scratchpad, quest history | tool-select-then-act |
 | Planner loop | `planner` | `planner.jinja` | `CompactionMemory` | none | plan-maintain-act |
+
+The harness names above are canonical snake_case identifiers used in YAML
+configs, the CLI, result artifacts, and documentation. Public labels can be
+friendlier, but experiment records should preserve the canonical names so runs
+remain comparable.
diff --git a/docs/EXPERIMENTS_LOG.md b/docs/EXPERIMENTS_LOG.md
index 0d9ce49..6d0d7f9 100644
--- a/docs/EXPERIMENTS_LOG.md
+++ b/docs/EXPERIMENTS_LOG.md
@@ -21,6 +21,145 @@
 
 Record of benchmark experiments, findings, and decisions. Keeps history out of source code.
 
+## Current Coverage Audit (2026-05-11)
+
+Sources reviewed for this audit:
+
+- `docs/EXPERIMENTS_LOG.md`
+- `docs/ARCHITECTURE.md`
+- `configs/benchmarks/*.yaml`
+- `site/leaderboard.json`
+
+This audit uses the post-refactor harness taxonomy: `minimal`,
+`reasoning_recent`, `reasoning_full`, `memo_compact`, `hinted_compact`,
+`tool_compact`, `tool_hinted`, and `planner`.
+
+### Experiment Inventory
+
+| Experiment | Config / source | Harness mapping | Quest scope | Completed runs recorded in log | Audit disposition |
+|---|---|---|---|---:|---|
+| Exp 2: Memory Modes | `memory_full_transcript.yaml`, `memory_compaction.yaml` | `reasoning_full`, `memo_compact` | 14 historical quests including `Prison` | 126 | Unreliable for canonical comparison: loop-breaker bug era. |
+| Exp 3 Arm 1: No Loop Breaker | `exp3_no_loop_breaker.yaml` | `reasoning_full` | 18 quests, excluding `Boat`/`Prison` | 36 | Use only rerun after timeout fix; pre-fix attempt is noisy/incomplete. |
+| Exp 3 Arm 2: Stateful Compact | `exp3_stateful_compact.yaml` | `memo_compact` | 18 quests, excluding `Boat`/`Prison` | 36 | Canonical memo baseline, but only 2 runs/quest. |
+| Exp 4: Compaction No Memo | `exp4_compaction_no_memo.yaml` | retired ablation, not canonical | 18 quests | 36 | Do not aggregate into `memo_compact`. |
+| Exp 4: Memo Extended | `exp4_memo_extended.yaml` | retired `memo_extended` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 4: Memo Structured | `exp4_memo_structured.yaml` | retired `memo_structured` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 4: Memo CoT | `exp4_memo_cot.yaml` | retired `memo_cot` variant | 18 quests | 36 | Non-canonical variant. |
+| Exp 5: Baseline Variance | `exp5_stateful_compact_variance.yaml` | `memo_compact` | 18 quests | 90 | Canonical memo baseline variance study. |
+| Exp 6: Prompt Hints | `exp6_prompt_hints.yaml` | `hinted_compact` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 6: Tools | `exp6_tools.yaml` | `tool_compact` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 6: Tools + Hints | `exp6_tools_hints.yaml` | `tool_hinted` | 18 quests | 54 | Canonical single-model harness comparison. |
+| Exp 7: Multi-Model Comparison | `exp7_*.yaml` | `memo_compact` | 5 winnable quests | 75 | Canonical model sweep for memo harness. |
+| Exp 7b: Model Upgrades | `exp7b_model_upgrades.yaml` | `memo_compact` | 18 quests | 108 | Noisy model-upgrade sweep; high timeout rates for Qwen 3.6 and Haiku 4.5. |
+
+### Harness Coverage Matrix
+
+The table below is computed from `site/leaderboard.json` and counts recorded
+leaderboard runs by harness and quest. `Boat` and `Prison` are retained because
+they still appear in the published leaderboard data, but they are retired from
+the canonical experiment set.
+
+| Harness | Badday | Banket | Boat | Codebox | Depth | Driver | Edelweiss | Election | Foncers | Leonardo | Ministry | Pizza | Prison | Robots | Ski | Total |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| `minimal` | 22 | 22 | 23 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 331 |
+| `reasoning_recent` | 22 | 22 | 28 | 22 | 22 | 24 | 25 | 30 | 25 | 25 | 26 | 22 | 28 | 31 | 31 | 383 |
+| `reasoning_full` | 17 | 17 | 9 | 17 | 17 | 15 | 17 | 17 | 17 | 17 | 16 | 17 | 6 | 14 | 14 | 227 |
+| `memo_compact` | 37 | 39 | 18 | 39 | 39 | 39 | 39 | 37 | 39 | 37 | 39 | 37 | 15 | 39 | 34 | 527 |
+| `hinted_compact` | 4 | 4 | 1 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 54 |
+| `tool_compact` | 3 | 3 | 0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 39 |
+| `tool_hinted` | 3 | 3 | 0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0 | 3 | 3 | 39 |
+| `planner` | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 15 |
+
+Leaderboard scope note: the current public JSON includes 15 quest columns and
+omits several quests from the 18-quest experiment log, such as `Pilot`,
+`Disk`, `Player`, `Shashki`, and `Sortirovka1`. A future leaderboard refresh
+should either add them or explicitly document why the public slice excludes
+them.
+
+### Gap Analysis
+
+All zero-run cells in the published leaderboard matrix are retired quest cells:
+
+- `tool_compact` x `Boat`: 0 runs.
+- `tool_compact` x `Prison`: 0 runs.
+- `tool_hinted` x `Boat`: 0 runs.
+- `tool_hinted` x `Prison`: 0 runs.
+
+Because `Boat` and `Prison` are retired, these do not require new canonical
+runs. They do indicate that the public leaderboard mixes active and retired
+quest scopes.
+
+Cells with fewer than 3 runs:
+
+- `hinted_compact` x `Boat`: 1 run; retired quest.
+- `hinted_compact` x `Prison`: 1 run; retired quest.
+- `planner`: 1 run on every published quest.
+
+Canonical action item: the planner harness has insufficient variance coverage.
+For active quests, it needs at least two additional runs per quest to reach the
+minimum 3-run threshold.
+
+The following harnesses have been run with only one model, even where a cell
+reaches the 3-run minimum (`planner` cells do not): `tool_compact`,
+`tool_hinted`, and `planner`. Their comparison is promising, but not yet
+model-robust.
+
+### Noise And Anomalies
+
+Loop-breaker bug era:
+
+- Exp 2 memory-mode runs are unreliable. The experiment log documents a
+  number-normalization bug in `_normalize_for_signature` and aggressive loop
+  breaker overrides that changed correct model decisions.
+- Exp 3 Arm 1 has a pre-fix/incomplete attempt affected by SDK timeout issues.
+  Only the rerun after the timeout fix should be considered.
+- Any leaderboard entry whose provenance traces to Exp 2 or the Exp 3 pre-fix
+  attempt should be marked non-canonical until regenerated or excluded.
+
+High-timeout model-upgrade runs:
+
+- Exp 7b `Qwen 3.6 Flash`: 17/36 timeouts (47%).
+- Exp 7b `Claude Haiku 4.5`: 19/36 timeouts (53%).
+- Exp 7b `DeepSeek V4 Flash`: 5/36 timeouts (14%), below the 30% timeout
+  threshold but still noisy because success was 0/36.
+
+Retired quests:
+
+- `Boat`: trivial / smoke-test-like quest; removed from canonical experiment
+  configs.
+- `Prison`: loops endlessly; removed from canonical experiment configs.
+
+Retired harness variants:
+
+- `memo_extended`
+- `memo_structured`
+- `memo_cot`
+- `compaction_no_memo` ablation
+
+These variants should not be merged into canonical `memo_compact` results.
+
+### Budget Estimate
+
+Top-priority new runs to close actionable gaps while avoiding retired quests:
+
+| Priority | Harness | Quest(s) | New runs needed | Reason |
+|---:|---|---|---:|---|
+| 1 | `planner` | 13 active published quests (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`) | 26 | Bring 1-run planner cells up to the 3-run minimum on active leaderboard quests. |
+| 2 | `planner` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest so planner effects are not single-model artifacts. |
+| 3 | `tool_compact` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. |
+| 4 | `tool_hinted` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. |
+| 5 | Public leaderboard refresh | `Pilot`, `Disk`, `Player`, `Shashki`, `Sortirovka1` | Scope-dependent | These quests are present in canonical 18-quest configs/logs but absent from the current public leaderboard matrix. Backfill or explicitly exclude them. |
+
+Do not spend new budget on `Boat` or `Prison` unless the goal is only to
+reproduce historical/public rows; both are retired from canonical analysis.
+
+### Leaderboard Integrity
+
+Recommended integrity rule: canonical leaderboard aggregates should require
+non-retired quests, canonical harness names, no loop-breaker bug provenance, at
+least 3 runs per harness x quest cell, and at least two models for claims about
+harness effects rather than model effects.
+
 ## Exp 2: Memory Modes (2026-04-27)
 
 **Config**: `configs/benchmarks/memory_full_transcript.yaml`, `configs/benchmarks/memory_compaction.yaml`
diff --git a/docs/EXPERIMENT_AUDIT.md b/docs/EXPERIMENT_AUDIT.md
deleted file mode 100644
index 3298d0e..0000000
--- a/docs/EXPERIMENT_AUDIT.md
+++ /dev/null
@@ -1,193 +0,0 @@
-# Experiment Audit
-
-Generated: 2026-05-11
-
-Sources reviewed:
-
-- `docs/EXPERIMENTS_LOG.md`
-- `docs/ARCHITECTURE.md`
-- `configs/benchmarks/*.yaml`
-- `site/leaderboard.json`
-
-This audit uses the post-refactor harness taxonomy: `minimal`,
-`reasoning_recent`, `reasoning_full`, `memo_compact`, `hinted_compact`,
-`tool_compact`, `tool_hinted`, and `planner`.
-
-## Experiment Inventory
-
-| Experiment | Config / source | Harness mapping | Quest scope | Completed runs recorded in log | Audit disposition |
-|---|---|---|---|---:|---|
-| Exp 2: Memory Modes | `memory_full_transcript.yaml`, `memory_compaction.yaml` | `reasoning_full`, `memo_compact` | 14 historical quests including `Prison` | 126 | Unreliable for canonical comparison: loop-breaker bug era. |
-| Exp 3 Arm 1: No Loop Breaker | `exp3_no_loop_breaker.yaml` | `reasoning_full` | 18 quests, excluding `Boat`/`Prison` | 36 | Use only rerun after timeout fix; pre-fix attempt is noisy/incomplete. |
-| Exp 3 Arm 2: Stateful Compact | `exp3_stateful_compact.yaml` | `memo_compact` | 18 quests, excluding `Boat`/`Prison` | 36 | Canonical memo baseline, but only 2 runs/quest. |
-| Exp 4: Compaction No Memo | `exp4_compaction_no_memo.yaml` | retired ablation, not canonical | 18 quests | 36 | Do not aggregate into `memo_compact`. |
-| Exp 4: Memo Extended | `exp4_memo_extended.yaml` | retired `memo_extended` variant | 18 quests | 36 | Non-canonical variant. |
-| Exp 4: Memo Structured | `exp4_memo_structured.yaml` | retired `memo_structured` variant | 18 quests | 36 | Non-canonical variant. |
-| Exp 4: Memo CoT | `exp4_memo_cot.yaml` | retired `memo_cot` variant | 18 quests | 36 | Non-canonical variant. |
-| Exp 5: Baseline Variance | `exp5_stateful_compact_variance.yaml` | `memo_compact` | 18 quests | 90 | Canonical memo baseline variance study. |
-| Exp 6: Prompt Hints | `exp6_prompt_hints.yaml` | `hinted_compact` | 18 quests | 54 | Canonical single-model harness comparison. |
-| Exp 6: Tools | `exp6_tools.yaml` | `tool_compact` | 18 quests | 54 | Canonical single-model harness comparison. |
-| Exp 6: Tools + Hints | `exp6_tools_hints.yaml` | `tool_hinted` | 18 quests | 54 | Canonical single-model harness comparison. |
-| Exp 7: Multi-Model Comparison | `exp7_*.yaml` | `memo_compact` | 5 winnable quests | 75 | Canonical model sweep for memo harness. |
-| Exp 7b: Model Upgrades | `exp7b_model_upgrades.yaml` | `memo_compact` | 18 quests | 108 | Noisy model-upgrade sweep; high timeout rates for Qwen 3.6 and Haiku 4.5. |
-
-## 1. Harness Coverage Matrix
-
-The table below is computed from `site/leaderboard.json` and counts recorded
-leaderboard runs by harness and quest. `Boat` and `Prison` are retained in this
-matrix because they still appear in the published leaderboard data, but they
-are retired from the canonical experiment set.
-
-| Harness | Badday | Banket | Boat | Codebox | Depth | Driver | Edelweiss | Election | Foncers | Leonardo | Ministry | Pizza | Prison | Robots | Ski | Total |
-|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-| `minimal` | 22 | 22 | 23 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 331 |
-| `reasoning_recent` | 22 | 22 | 28 | 22 | 22 | 24 | 25 | 30 | 25 | 25 | 26 | 22 | 28 | 31 | 31 | 383 |
-| `reasoning_full` | 17 | 17 | 9 | 17 | 17 | 15 | 17 | 17 | 17 | 17 | 16 | 17 | 6 | 14 | 14 | 227 |
-| `memo_compact` | 37 | 39 | 18 | 39 | 39 | 39 | 39 | 37 | 39 | 37 | 39 | 37 | 15 | 39 | 34 | 527 |
-| `hinted_compact` | 4 | 4 | 1 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 54 |
-| `tool_compact` | 3 | 3 | **0** | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | **0** | 3 | 3 | 39 |
-| `tool_hinted` | 3 | 3 | **0** | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | **0** | 3 | 3 | 39 |
-| `planner` | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 15 |
-
-Leaderboard scope note: the current public JSON includes 15 quest columns and
-does not include several 18-quest experiment-log quests such as `Pilot`,
-`Disk`, `Player`, `Shashki`, and `Sortirovka1`. Those quests appear in the
-benchmark configs and experiment log, so a future leaderboard refresh should
-either add them or explicitly document why the public slice excludes them.
-
-## 2. Gap Analysis
-
-### Zero-run harness × quest cells
-
-All zero-run cells in the published leaderboard matrix are retired quest cells:
-
-- `tool_compact` × `Boat`: 0 runs.
-- `tool_compact` × `Prison`: 0 runs.
-- `tool_hinted` × `Boat`: 0 runs.
-- `tool_hinted` × `Prison`: 0 runs.
-
-Because `Boat` and `Prison` are retired, these do not require new canonical
-runs. They do indicate that the public leaderboard mixes active and retired
-quest scopes.
-
-### Fewer than 3 runs
-
-- `hinted_compact` × `Boat`: 1 run; retired quest.
-- `hinted_compact` × `Prison`: 1 run; retired quest.
-- `planner`: 1 run on every published quest (`Badday`, `Banket`, `Boat`,
-  `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`,
-  `Leonardo`, `Ministry`, `Pizza`, `Prison`, `Robots`, `Ski`).
-
-Canonical action item: the planner harness has insufficient variance coverage.
-For active quests, it needs at least two additional runs per quest to reach the
-minimum 3-run threshold.
-
-### Only 1 model tested
-
-The following harnesses have leaderboard cells where the run count may be at
-least 3, but the model dimension is still only one model. These cells cannot
-separate harness effects from model-specific behavior:
-
-- `tool_compact`: one model on all non-retired published quests
-  (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`,
-  `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`).
-- `tool_hinted`: one model on all non-retired published quests
-  (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`,
-  `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`).
-- `planner`: one model on every published quest and only one run per quest.
-- `hinted_compact` on `Boat` and `Prison`: one model, but both quests are
-  retired.
-
-The stronger public comparison cells are `minimal`, `reasoning_recent`,
-`reasoning_full`, and `memo_compact`, which have multi-model coverage in the
-leaderboard data. However, `reasoning_full` and `memo_compact` still require
-provenance filtering because early memory-mode runs overlap with the loop-
-breaker bug era.
-
-## 3. Noise / Anomaly List
-
-### Loop-breaker bug era
-
-- Exp 2 memory-mode runs are unreliable. The experiment log documents a
-  number-normalization bug in `_normalize_for_signature` and aggressive loop
-  breaker overrides that changed correct model decisions.
-- Exp 3 Arm 1 has a pre-fix/incomplete attempt affected by SDK timeout issues.
-  Only the rerun after the timeout fix should be considered.
-- Any leaderboard entry whose provenance traces to Exp 2 or the Exp 3 pre-fix
-  attempt should be marked non-canonical until regenerated or excluded.
-
-### High timeout runs
-
-- Exp 7b `Qwen 3.6 Flash`: 17/36 timeouts (47%).
-- Exp 7b `Claude Haiku 4.5`: 19/36 timeouts (53%).
-- Exp 7b `DeepSeek V4 Flash`: 5/36 timeouts (14%), below the >30% threshold
-  but still noisy because success was 0/36.
-
-The Qwen 3.6 and Haiku 4.5 rows should be interpreted primarily as timeout /
-verbosity failures, not clean harness-quality signals.
-
-### Retired quests
-
-- `Boat`: trivial / smoke-test-like quest; removed from canonical experiment
-  configs.
-- `Prison`: loops endlessly; removed from canonical experiment configs.
-
-Both still appear in `site/leaderboard.json`, so public summaries should label
-them as retired or remove them from canonical aggregates.
-
-### Retired harness variants
-
-The following Exp 4 arms are not part of the final taxonomy and should not be
-merged into canonical `memo_compact` results:
-
-- `memo_extended`
-- `memo_structured`
-- `memo_cot`
-- `compaction_no_memo` ablation
-
-Current YAML files have been migrated to the `harness:` key, so historical
-variant identity must be preserved from `docs/EXPERIMENTS_LOG.md` and config
-file names rather than inferred only from the post-refactor `harness` field.
-
-## 4. Budget Estimate
-
-Top-priority new runs to close actionable gaps while avoiding retired quests:
-
-| Priority | Harness | Quest(s) | New runs needed | Reason |
-|---:|---|---|---:|---|
-| 1 | `planner` | 13 active published quests (`Badday`, `Banket`, `Codebox`, `Depth`, `Driver`, `Edelweiss`, `Election`, `Foncers`, `Leonardo`, `Ministry`, `Pizza`, `Robots`, `Ski`) | 26 | Bring 1-run planner cells up to the 3-run minimum on active leaderboard quests. |
-| 2 | `planner` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest so planner effects are not single-model artifacts. |
-| 3 | `tool_compact` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. |
-| 4 | `tool_hinted` | Same 13 active published quests | 39 | Add a second model with 3 runs/quest; current cells are all one-model results. |
-| 5 | Public leaderboard refresh | `Pilot`, `Disk`, `Player`, `Shashki`, `Sortirovka1` | Scope-dependent | These quests are present in canonical 18-quest configs/logs but absent from the current public leaderboard matrix. Backfill or explicitly exclude them. |
-
-Do not spend new budget on `Boat` or `Prison` unless the goal is only to
-reproduce historical/public rows; both are retired from canonical analysis.
-
-## 5. Leaderboard Integrity
-
-Findings from `site/leaderboard.json`:
-
-1. The leaderboard uses the eight canonical public modes and does not expose
-   retired harness variants as separate modes. This is good, but it creates a
-   provenance risk if Exp 4 retired variants were ever aggregated under
-   `memo_compact`.
-2. `Boat` and `Prison` remain in the published quest list despite being retired
-   from canonical experiment configs. They should be excluded from aggregate
-   claims or clearly labeled as retired.
-3. `planner` has only one run per quest and one model. It should not be used for
-   reliability claims yet.
-4. `tool_compact` and `tool_hinted` have three runs per active published quest,
-   but only one model. Their harness comparison is promising but not yet
-   model-robust.
-5. Published `reasoning_full` / `memo_compact` rows need run-level provenance
-   checks before canonical use because early memory-mode experiments overlap
-   with the Exp 2 loop-breaker bug era.
-6. Exp 7b model-upgrade entries for `Qwen 3.6 Flash` and `Claude Haiku 4.5`
-   should be annotated as high-timeout data if included in any leaderboard or
-   narrative comparison.
-
-Recommended integrity rule: canonical leaderboard aggregates should require
-non-retired quests, canonical harness names, no loop-breaker bug provenance, at
-least 3 runs per harness × quest cell, and at least two models for claims about
-harness effects rather than model effects.
diff --git a/docs/HARNESS_ENGINEERING.md b/docs/HARNESS_ENGINEERING.md
deleted file mode 100644
index 5666ebc..0000000
--- a/docs/HARNESS_ENGINEERING.md
+++ /dev/null
@@ -1,64 +0,0 @@
-# Harness Engineering
-
-LLM Quest Benchmark treats the **agent harness** as the primary experimental
-object. An agent harness is the wrapper around a model that controls what the
-model sees, what state is carried forward, what external tools are available,
-and how a raw model completion is converted into a quest action. In this
-project, harnesses are not incidental plumbing: they are the independent
-variable.
-
-This framing follows the harness engineering question raised by "How Much Heavy
-Lifting Can an Agent Harness Do?" (arXiv:2604.07236): how much performance comes
-from the surrounding scaffold rather than the base model alone? Space Rangers
-text quests are a useful testbed because they are long enough to stress memory,
-planning, and state tracking, but concrete enough to score with terminal
-success/failure outcomes.
-
-## The Eight Canonical Harnesses
-
-| Harness name | What varies |
-|---|---|
-| `minimal` | Uses the smallest action-selection prompt with recent context only. This is the low-scaffold baseline. |
-| `reasoning_recent` | Adds an explicit reasoning prompt while keeping recent-window memory. |
-| `reasoning_full` | Keeps the reasoning prompt but exposes the full transcript instead of a short recent window. |
-| `memo_compact` | Uses compacted memory plus a constrained 20-word memo to preserve salient state. |
-| `hinted_compact` | Adds mechanics hints to the compact memo harness, without tools. |
-| `tool_compact` | Adds calculator, scratchpad, and quest-history tools to compact memory. |
-| `tool_hinted` | Combines compact memory, tools, and mechanics hints. |
-| `planner` | Uses a plan-maintain-act loop with compact memory instead of a pure react loop. |
-
-The harness names are canonical snake_case identifiers used in YAML configs,
-the CLI, and documentation. Public labels can be friendlier, but experimental
-records should preserve these names so runs remain comparable.
-
-## Difference From TextQuests and TALE-Suite
-
-TextQuests (arXiv:2507.23701) and TALE-Suite are closest in spirit because they
-also evaluate language models on interactive text-game tasks. Their main
-comparison axis is model capability under a mostly fixed evaluation scaffold:
-the harness is treated as test infrastructure, and the model is varied.
-
-LLM Quest Benchmark flips that emphasis. We can hold a model fixed and vary the
-harness to ask which context, memory, tool, and planning choices change
-behavior. That makes the benchmark useful for harness engineering: it can
-separate "the model cannot do the task" from "this wrapper failed to show the
-model the right state, preserve the right facts, or expose the right operation."
-
-## Findings So Far
-
-The strongest pattern so far is that bigger scaffolds are not automatically
-better. A concise 20-word memo produced a sweet spot: it improved over no memo
-and full transcript baselines, while longer or more structured memo variants
-regressed. The likely mechanism is selective pressure: the short memo forces
-the harness to preserve only state that matters for future decisions.
-
-Tools and hints show a synergy effect. Prompt hints alone hurt, and tools alone
-were modest, but tools plus hints improved outcomes because the hints pointed
-the model toward quantities and morally grey quest mechanics while the
-calculator, scratchpad, and history search gave it ways to act on those
-signals.
-
-Verbosity hurts in this environment. Some newer or larger models timed out more
-often because they spent too much of the quest budget generating long step
-responses. For sequential decision tasks, a harness that elicits concise,
-actionable state updates can outperform one that invites broad reasoning.
diff --git a/docs/SPEC.md b/docs/SPEC.md
index cadbb99..44ef498 100644
--- a/docs/SPEC.md
+++ b/docs/SPEC.md
@@ -54,6 +54,24 @@ Use these labels for current public descriptions of benchmark harnesses:
 Older internal experiment labels are historical and should not be presented as
 the current public taxonomy.
 
+## Current Interpretation
+
+The strongest pattern so far is that bigger scaffolds are not automatically
+better. A concise 20-word memo produced a useful sweet spot: it improved over
+no-memo and full-transcript baselines, while longer or more structured memo
+variants regressed. The likely mechanism is selective pressure: the short memo
+forces the harness to preserve only state that matters for future decisions.
+
+Tools and hints showed a synergy effect. Prompt hints alone hurt, and tools
+alone helped modestly, but tools plus hints improved outcomes because the hints
+pointed the model toward quantities and quest mechanics while the calculator,
+scratchpad, and history search gave it ways to act on those signals.
+
+Verbosity is a recurring failure mode. Some newer or larger models timed out
+more often because they spent too much of the quest budget generating long step
+responses. For sequential decision tasks, a harness that elicits concise,
+actionable state updates can outperform one that invites broad reasoning.
+
 ## Implemented Runtime
 
 - Quest execution uses the TypeScript `space-rangers-quest` submodule through

From 2276639d94832abb3176727b7f20d7d30bcb8251 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Tue, 12 May 2026 00:43:01 +0400
Subject: [PATCH 16/24] fix: address harness review feedback

---
 configs/benchmarks/exp7b_model_upgrades.yaml  |  2 +-
 llm_quest_benchmark/agents/agent_factory.py   |  5 +++++
 llm_quest_benchmark/executors/benchmark.py    | 21 ++++++++++++-------
 llm_quest_benchmark/executors/cli/commands.py |  2 --
 llm_quest_benchmark/harnesses/memo.py         |  6 +++++-
 llm_quest_benchmark/harnesses/memory.py       | 10 ++++++++-
 llm_quest_benchmark/harnesses/minimal.py      |  4 ++--
 llm_quest_benchmark/harnesses/planner.py      |  4 ++--
 llm_quest_benchmark/harnesses/tools.py        |  8 ++++---
 llm_quest_benchmark/schemas/config.py         |  5 +++++
 10 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/configs/benchmarks/exp7b_model_upgrades.yaml b/configs/benchmarks/exp7b_model_upgrades.yaml
index 22da91b..80ab53c 100644
--- a/configs/benchmarks/exp7b_model_upgrades.yaml
+++ b/configs/benchmarks/exp7b_model_upgrades.yaml
@@ -29,7 +29,7 @@ agents:
     temperature: 0.4
     runs: 2
     compaction_interval: 50
-  - model: "claude:claude-haiku-4-5-20251001"
+  - model: "anthropic:claude-haiku-4-5-20251001"
     harness: memo_compact
     temperature: 0.4
     runs: 2
diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py
index 6d2ff42..09e2607 100644
--- a/llm_quest_benchmark/agents/agent_factory.py
+++ b/llm_quest_benchmark/agents/agent_factory.py
@@ -48,6 +48,11 @@ def create_agent(
     """
     logger.debug(f"Creating agent for model: {model}")
     resolved_action_template = normalize_template_name(action_template)
+    harness_routed_templates = {"planner.jinja", "tool_augmented.jinja", "tool_augmented_hints.jinja"}
+    if resolved_action_template in harness_routed_templates and memory_mode != "default":
+        raise ValueError(
+            "memory_mode is not supported for planner/tool harness templates; configure memory via harness selection."
+        )
 
     # Human player
     if model == "human":
diff --git a/llm_quest_benchmark/executors/benchmark.py b/llm_quest_benchmark/executors/benchmark.py
index 82800b3..14dacaf 100644
--- a/llm_quest_benchmark/executors/benchmark.py
+++ b/llm_quest_benchmark/executors/benchmark.py
@@ -49,6 +49,11 @@ def _agent_model(agent_config) -> str:
     return agent_config.model
 
 
+def _agent_id(agent_config) -> str:
+    """Return the stable result identifier for legacy and harness configs."""
+    return getattr(agent_config, "harness_id", None) or agent_config.agent_id
+
+
 def _agent_template(agent_config) -> str:
     """Return legacy template name for result artifacts."""
     if hasattr(agent_config, "action_template"):
@@ -106,7 +111,7 @@ def _result_entry(
         "harness": _agent_harness(agent_config),
         "template": _agent_template(agent_config),
         "memory_mode": _agent_memory_mode(agent_config),
-        "agent_id": agent_config.harness_id if hasattr(agent_config, "harness_id") else agent_config.agent_id,
+        "agent_id": _agent_id(agent_config),
         "attempt": attempt,
         "outcome": outcome,
         "reward": reward,
@@ -137,7 +142,7 @@ def _mark_run_timeout(run_id: int | None, quest: str, agent_config, benchmark_id
                 WHERE id = ?
                 """,
                 (
-                    agent_config.agent_id,
+                    _agent_id(agent_config),
                     agent_config_json,
                     benchmark_id,
                     QuestOutcome.TIMEOUT.name,
@@ -160,7 +165,7 @@ def _mark_run_timeout(run_id: int | None, quest: str, agent_config, benchmark_id
                     Path(quest).stem,
                     end_time,
                     end_time,
-                    agent_config.agent_id,
+                    _agent_id(agent_config),
                     agent_config_json,
                     QuestOutcome.TIMEOUT.name,
                     0.0,
@@ -404,7 +409,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
 
                 logger.info(
                     "Queued agent %s quest %s (attempt %s/%s)",
-                    agent_config.agent_id,
+                    _agent_id(agent_config),
                     quest_name,
                     attempt,
                     agent_config.runs,
@@ -436,7 +441,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
             }
             logger.info(
                 "Agent %s running quest %s (attempt %s/%s)",
-                agent_config.agent_id,
+                _agent_id(agent_config),
                 task["quest_name"],
                 task["attempt"],
                 agent_config.runs,
@@ -449,7 +454,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
                     "total_runs": total_runs,
                     "quest": task["quest"],
                     "quest_name": task["quest_name"],
-                    "agent_id": agent_config.agent_id,
+                    "agent_id": _agent_id(agent_config),
                     "model": agent_config.model,
                     "attempt": task["attempt"],
                 },
@@ -484,7 +489,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
                             "total_runs": total_runs,
                             "quest": task["quest"],
                             "quest_name": task["quest_name"],
-                            "agent_id": agent_config.agent_id,
+                            "agent_id": _agent_id(agent_config),
                             "model": agent_config.model,
                             "attempt": task["attempt"],
                             "outcome": result["outcome"],
@@ -529,7 +534,7 @@ def run_benchmark(config: BenchmarkConfig, progress_callback=None) -> list[dict[
                         "total_runs": total_runs,
                         "quest": task["quest"],
                         "quest_name": task["quest_name"],
-                        "agent_id": agent_config.agent_id,
+                        "agent_id": _agent_id(agent_config),
                         "model": agent_config.model,
                         "attempt": task["attempt"],
                         "outcome": QuestOutcome.TIMEOUT.name,
diff --git a/llm_quest_benchmark/executors/cli/commands.py b/llm_quest_benchmark/executors/cli/commands.py
index 4b029bd..d554f70 100644
--- a/llm_quest_benchmark/executors/cli/commands.py
+++ b/llm_quest_benchmark/executors/cli/commands.py
@@ -8,7 +8,6 @@
 from pathlib import Path
 from typing import Any
 
-import click
 from dotenv import load_dotenv
 
 # Initialize quest registry early
@@ -353,7 +352,6 @@ def run(
         "reasoning_recent",
         "--harness",
         help="Harness to use for quest decisions.",
-        click_type=click.Choice(HARNESS_CHOICES),
     ),
     compaction_interval: int = typer.Option(50, help="Advanced override for compaction interval."),
     timeout: int = typer.Option(60, help="Timeout in seconds for run (0 for no timeout)."),
diff --git a/llm_quest_benchmark/harnesses/memo.py b/llm_quest_benchmark/harnesses/memo.py
index eaab06b..63bfb60 100644
--- a/llm_quest_benchmark/harnesses/memo.py
+++ b/llm_quest_benchmark/harnesses/memo.py
@@ -27,7 +27,11 @@ def __init__(
             temperature=temperature,
             skip_single=skip_single,
             debug=debug,
-            memory_module=memory_module or CompactionMemory(compaction_interval=compaction_interval),
+            memory_module=(
+                memory_module
+                if memory_module is not None
+                else CompactionMemory(compaction_interval=compaction_interval)
+            ),
             **kwargs,
         )
         self._memory_mode = "compaction"
diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py
index ab4f72c..22581fa 100644
--- a/llm_quest_benchmark/harnesses/memory.py
+++ b/llm_quest_benchmark/harnesses/memory.py
@@ -251,9 +251,11 @@ def _maybe_compact(self) -> None:
             return
         if self.llm_client is None:
             # No LLM client available for compaction; skip silently
+            self._steps_since_compaction = 0
             return
         transcript_text = self._format_transcript_for_compaction()
         if not transcript_text:
+            self._steps_since_compaction = 0
             return
 
         prompt_parts = ["You are summarizing an agent's progress through a text quest."]
@@ -272,11 +274,17 @@ def _maybe_compact(self) -> None:
             "Write a concise summary in plain text, max 300 words."
         )
 
-        summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip()
+        try:
+            summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip()
+        except Exception:
+            self._steps_since_compaction = 0
+            return
         if summary:
             self._compaction_summary = summary
             self._transcript = []
             self._steps_since_compaction = 0
+        else:
+            self._steps_since_compaction = 0
 
     def _format_transcript_for_compaction(self) -> str:
         recent = (
diff --git a/llm_quest_benchmark/harnesses/minimal.py b/llm_quest_benchmark/harnesses/minimal.py
index 8fa8ba0..8fdd944 100644
--- a/llm_quest_benchmark/harnesses/minimal.py
+++ b/llm_quest_benchmark/harnesses/minimal.py
@@ -39,11 +39,11 @@ def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> i
             state_signature = self._state_signature(observation, choices)
             prompt = self._format_prompt(self._build_contextual_state(observation), choices)
             parsed_response = self._parse_with_retries(prompt, observation, choices)
+            if parsed_response.action < 1 or parsed_response.action > len(choices):
+                parsed_response.action = 1
             self.history.append(parsed_response)
             self._last_response = parsed_response
             self._remember_decision(observation, choices, state_signature, parsed_response)
-            if parsed_response.action < 1 or parsed_response.action > len(choices):
-                parsed_response.action = 1
             return parsed_response.action
         except Exception as exc:
             self.logger.error("Harness error during LLM call: %s", exc)
diff --git a/llm_quest_benchmark/harnesses/planner.py b/llm_quest_benchmark/harnesses/planner.py
index efb77a9..810440c 100644
--- a/llm_quest_benchmark/harnesses/planner.py
+++ b/llm_quest_benchmark/harnesses/planner.py
@@ -164,11 +164,11 @@ def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
             parsed_response.total_tokens = total_usage["total_tokens"]
             parsed_response.estimated_cost_usd = total_usage["estimated_cost_usd"]
 
+            if parsed_response.action < 1 or parsed_response.action > len(choices):
+                parsed_response.action = 1
             self.history.append(parsed_response)
             self._last_response = parsed_response
             self._remember_decision(state, choices, state_signature, parsed_response)
-            if parsed_response.action < 1 or parsed_response.action > len(choices):
-                parsed_response.action = 1
             return parsed_response.action
         except Exception as exc:
             self.logger.error("Planner harness error during LLM call: %s", exc)
diff --git a/llm_quest_benchmark/harnesses/tools.py b/llm_quest_benchmark/harnesses/tools.py
index 5386d6d..63edcd8 100644
--- a/llm_quest_benchmark/harnesses/tools.py
+++ b/llm_quest_benchmark/harnesses/tools.py
@@ -160,12 +160,14 @@ def search(self, query: str) -> str:
         scored.sort(key=lambda item: (item[0], item[1].get("step", 0)), reverse=True)
         best = [entry for score, entry in scored if score > 0][: self.history_window]
         if not best:
-            best = [entry for _, entry in scored[-self.history_window :]]
+            best = [entry for _, entry in scored[: self.history_window]]
 
         lines = []
         for entry in best:
+            choices = entry.get("choices", [])
+            choices_text = choices if isinstance(choices, str) else "; ".join(choices)
             lines.append(
-                f"Step {entry['step']}: obs={entry['observation']} | "
-                f"choices={'; '.join(entry['choices'])} | picked={entry.get('selected_choice', 'n/a')}"
+                f"Step {entry.get('step', '?')}: obs={entry.get('observation', '')} | "
+                f"choices={choices_text} | picked={entry.get('selected_choice', 'n/a')}"
             )
         return "\n".join(lines)
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index f32184e..05053cd 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -111,6 +111,11 @@ def __post_init__(self):
         ):
             valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
             raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {valid}")
+        if self.model not in ("human",) and not is_random_choice_harness(self.model):
+            from llm_quest_benchmark.llm.client import is_supported_model_name
+
+            if not is_supported_model_name(self.model):
+                raise ValueError(f"Invalid model: {self.model}. Supported models: {MODEL_CHOICES}")
         if not (0.0 <= self.temperature <= 2.0):
             raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}")
         if self.runs < 1:

From 42a89b1d8ecadcbfddb6e3e9623914d54f89f203 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Tue, 12 May 2026 12:03:53 +0400
Subject: [PATCH 17/24] fix: preserve legacy agent compatibility

---
 llm_quest_benchmark/agents/agent_factory.py   |  2 +-
 llm_quest_benchmark/agents/strategic_agent.py | 80 ++++++++++++++++++-
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py
index 09e2607..43ad273 100644
--- a/llm_quest_benchmark/agents/agent_factory.py
+++ b/llm_quest_benchmark/agents/agent_factory.py
@@ -49,7 +49,7 @@ def create_agent(
     logger.debug(f"Creating agent for model: {model}")
     resolved_action_template = normalize_template_name(action_template)
     harness_routed_templates = {"planner.jinja", "tool_augmented.jinja", "tool_augmented_hints.jinja"}
-    if resolved_action_template in harness_routed_templates and memory_mode != "default":
+    if resolved_action_template in harness_routed_templates and memory_mode not in ("default", "compaction"):
         raise ValueError(
             "memory_mode is not supported for planner/tool harness templates; configure memory via harness selection."
         )
diff --git a/llm_quest_benchmark/agents/strategic_agent.py b/llm_quest_benchmark/agents/strategic_agent.py
index a4cc4e7..edd656f 100644
--- a/llm_quest_benchmark/agents/strategic_agent.py
+++ b/llm_quest_benchmark/agents/strategic_agent.py
@@ -1,3 +1,79 @@
-"""Deprecated strategic agent module."""
+"""Deprecated compatibility wrapper for strategic agents."""
 
-raise ImportError("strategic_agent is deprecated; use llm_quest_benchmark.harnesses instead")
+import logging
+import warnings
+from typing import Any
+
+from llm_quest_benchmark.agents.base import QuestPlayer
+from llm_quest_benchmark.llm.prompt import PromptRenderer
+
+warnings.warn("strategic_agent is deprecated, use llm_quest_benchmark.harnesses", DeprecationWarning, stacklevel=2)
+
+
+class StrategicAgent(QuestPlayer):
+    """Backward-compatible strategic analysis decorator."""
+
+    def __init__(self, base_agent: QuestPlayer, debug: bool = False, template: str = "advanced.jinja"):
+        super().__init__(skip_single=base_agent.skip_single)
+        self.agent = base_agent
+        self.debug = debug
+        self.history = []
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if self.debug:
+            self.logger.setLevel(logging.DEBUG)
+            handler = logging.StreamHandler()
+            handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
+            self.logger.addHandler(handler)
+
+        self.prompt_renderer = PromptRenderer(None, template=template)
+
+    def _get_action_impl(self, observation: str, choices: list) -> int:
+        if hasattr(self.agent, "llm"):
+            if self.debug:
+                self.logger.debug("\nObservation:\n%s", observation)
+
+            analysis = self.agent.llm(
+                "Analyze this situation and explain your thinking step-by-step instead of choosing an action:\n"
+                + observation
+            )
+
+            if self.debug:
+                self.logger.debug("\nAnalysis:\n%s", analysis)
+
+            self.history.append({"observation": observation, "analysis": analysis})
+            enhanced_context = self.get_enhanced_context(observation, choices)
+            if self.debug:
+                self.logger.debug("\nEnhanced Context:\n%s", enhanced_context)
+
+            return self.agent.get_action(enhanced_context, choices)
+
+        return self.agent.get_action(observation, choices)
+
+    def get_enhanced_context(self, observation: str, choices: list) -> str:
+        context = [f"Turn {len(self.history) + 1}: {entry['analysis']}" for entry in self.history[-3:]]
+        return self.prompt_renderer.render_action_prompt(
+            observation=observation,
+            choices=choices,
+            state_tracker=context,
+        )
+
+    def reset(self) -> None:
+        self.history = []
+        self.agent.reset()
+
+    def on_game_start(self) -> None:
+        if self.debug:
+            self.logger.debug("Starting new game with strategic analysis")
+        self.agent.on_game_start()
+
+    def on_game_end(self, final_state: dict[str, Any]) -> None:
+        self.agent.on_game_end(final_state)
+        if self.debug:
+            self.logger.debug("Final Analysis History:")
+            for entry in self.history:
+                self.logger.debug("\nObservation: %s", entry["observation"])
+                self.logger.debug("Analysis: %s", entry["analysis"])
+
+
+__all__ = ["StrategicAgent"]

From d9a0f224205f98f61f6fe09ec872cc98426bb431 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Tue, 12 May 2026 12:56:55 +0400
Subject: [PATCH 18/24] fix: validate special harness models

---
 configs/default.yaml                          |  2 +-
 configs/kr2_en_test.yaml                      |  2 +-
 configs/test/parallel_agents_test.yaml        |  1 +
 configs/test/test_benchmark.yaml              |  1 +
 llm_quest_benchmark/harnesses/factory.py      |  4 +++-
 llm_quest_benchmark/schemas/config.py         |  7 ++++++-
 .../tests/executors/cli/test_commands.py      |  7 +++++--
 .../tests/harnesses/test_factory.py           | 20 +++++++++++++++++++
 .../tests/integration/test_quest_e2e.py       |  4 ++--
 9 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/configs/default.yaml b/configs/default.yaml
index ff185a5..3159029 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -5,7 +5,7 @@ quests:
 
 agents:
   - model: random_choice
-    harness: reasoning_recent
+    harness: random_choice
     temperature: 0.0
     skip_single: true
 
diff --git a/configs/kr2_en_test.yaml b/configs/kr2_en_test.yaml
index 0addb04..94cfaa3 100644
--- a/configs/kr2_en_test.yaml
+++ b/configs/kr2_en_test.yaml
@@ -5,7 +5,7 @@ quests:
 agents:
   - model: random_choice  # Use random agent for speed and reliability
     temperature: 0.5
-    harness: reasoning_recent
+    harness: random_choice
 quest_timeout: 10  # short timeout for testing
 debug: true
 output_dir: results/benchmarks
diff --git a/configs/test/parallel_agents_test.yaml b/configs/test/parallel_agents_test.yaml
index 0aec1be..873d3ed 100644
--- a/configs/test/parallel_agents_test.yaml
+++ b/configs/test/parallel_agents_test.yaml
@@ -5,6 +5,7 @@ quests:
   - quests/kr_1_ru/Diamond.qm
 agents:
   - model: random_choice
+    harness: random_choice
   - model: gpt-5-mini
     harness: reasoning_recent
 debug: true
diff --git a/configs/test/test_benchmark.yaml b/configs/test/test_benchmark.yaml
index c20c648..b3321d9 100644
--- a/configs/test/test_benchmark.yaml
+++ b/configs/test/test_benchmark.yaml
@@ -3,6 +3,7 @@ quests:
   - quests/Boat.qm
 agents:
   - model: random_choice
+    harness: random_choice
   - model: gemini-2.5-flash
     harness: reasoning_recent
 debug: true
diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
index 8ea4b84..5af7c50 100644
--- a/llm_quest_benchmark/harnesses/factory.py
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -70,9 +70,11 @@ def create_harness(
         raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}")
     is_random_model, seed = _parse_random_choice_seed(model)
     if is_random_model:
-        return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
+        raise ValueError("Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness")
     if model.startswith("random_choice"):
         raise ValueError(f"Unknown random_choice model '{model}'. Valid: {valid}")
+    if model == "human":
+        raise ValueError("Use harness='human' for human runs instead of pairing human model with an LLM harness")
     cls = HARNESS_REGISTRY[harness]
     return cls(
         model_name=model,
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index 05053cd..8008e1b 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -18,7 +18,7 @@
 DEFAULT_BENCHMARK_CONFIG = {
     "quests": ["quests/Boat.qm"],
     "agents": [
-        {"model": "random_choice", "skip_single": True, "temperature": 0.0, "harness": "minimal"},
+        {"model": "random_choice", "skip_single": True, "temperature": 0.0, "harness": "random_choice"},
         {"model": "gpt-5-mini", "skip_single": True, "temperature": 0.4, "harness": "reasoning_recent"},
     ],
     "debug": False,
@@ -43,6 +43,7 @@ def get_default_benchmark_yaml() -> str:
   - quests/Boat.qm
 agents:
   - model: random_choice
+    harness: random_choice
   - model: gpt-5-mini
     harness: reasoning_recent
 debug: true
@@ -111,6 +112,10 @@ def __post_init__(self):
         ):
             valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
             raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {valid}")
+        if self.model == "human" and self.harness != "human":
+            raise ValueError("Use harness: human with model: human")
+        if is_random_choice_harness(self.model) and not is_random_choice_harness(self.harness):
+            raise ValueError("Use harness: random_choice with model: random_choice")
         if self.model not in ("human",) and not is_random_choice_harness(self.model):
             from llm_quest_benchmark.llm.client import is_supported_model_name
 
diff --git a/llm_quest_benchmark/tests/executors/cli/test_commands.py b/llm_quest_benchmark/tests/executors/cli/test_commands.py
index db0daf1..d88fdbf 100644
--- a/llm_quest_benchmark/tests/executors/cli/test_commands.py
+++ b/llm_quest_benchmark/tests/executors/cli/test_commands.py
@@ -20,7 +20,10 @@ def test_version():
 
 def test_run_quest():
     """Test running a quest with random agent"""
-    result = runner.invoke(app, ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--debug"])
+    result = runner.invoke(
+        app,
+        ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--harness", "random_choice", "--debug"],
+    )
     assert result.exit_code in [0, 1, 2]
 
 
@@ -31,7 +34,7 @@ def test_run_quest_invalid_args():
     assert result.exit_code == 2
 
     # Test missing quest file
-    result = runner.invoke(app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice"])
+    result = runner.invoke(app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice", "--harness", "random_choice"])
     assert result.exit_code == 2
 
 
diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py
index 7f6f11c..800a502 100644
--- a/llm_quest_benchmark/tests/harnesses/test_factory.py
+++ b/llm_quest_benchmark/tests/harnesses/test_factory.py
@@ -55,6 +55,16 @@ def test_random_choice_model_does_not_hide_bad_harness():
         create_harness("bad_name", model="random_choice_123")
 
 
+def test_random_choice_model_requires_random_harness():
+    with pytest.raises(ValueError, match="harness='random_choice'"):
+        create_harness("minimal", model="random_choice")
+
+
+def test_human_model_requires_human_harness():
+    with pytest.raises(ValueError, match="harness='human'"):
+        create_harness("minimal", model="human")
+
+
 def test_harness_config_stable_harness_id():
     config = HarnessConfig(harness="memo_compact", model="gpt-5-mini")
 
@@ -75,6 +85,16 @@ def test_harness_config_allows_seeded_random_choice_harness():
     assert config.harness == "random_choice_123"
 
 
+def test_harness_config_rejects_random_model_with_llm_harness():
+    with pytest.raises(ValueError, match="harness: random_choice"):
+        HarnessConfig(harness="minimal", model="random_choice")
+
+
+def test_harness_config_rejects_human_model_with_llm_harness():
+    with pytest.raises(ValueError, match="harness: human"):
+        HarnessConfig(harness="minimal", model="human")
+
+
 def test_harness_config_allows_retired_exp4_aliases():
     for harness_name in ("compaction_no_memo", "memo_cot", "memo_extended", "memo_structured"):
         config = HarnessConfig(harness=harness_name, model="gpt-5-mini")
diff --git a/llm_quest_benchmark/tests/integration/test_quest_e2e.py b/llm_quest_benchmark/tests/integration/test_quest_e2e.py
index a0e376d..86568bb 100644
--- a/llm_quest_benchmark/tests/integration/test_quest_e2e.py
+++ b/llm_quest_benchmark/tests/integration/test_quest_e2e.py
@@ -19,9 +19,9 @@ def test_quest_run_with_llm(caplog):
     """Test that quest runs with LLM agent and reaches a final state"""
     caplog.set_level(logging.DEBUG)  # Show all logs in test output
 
-    # Create LLM harness
+    # Create random harness
     agent = create_harness(
-        harness="minimal",
+        harness="random_choice",
         model="random_choice",  # Use random for testing
         system_template=SYSTEM_ROLE_TEMPLATE,
         temperature=0.0,

From 0a3f0b0081ecdd6f687751851c1f5265dc7effd7 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Tue, 12 May 2026 13:00:20 +0400
Subject: [PATCH 19/24] style: format special harness validation

---
 llm_quest_benchmark/harnesses/factory.py                 | 4 +++-
 llm_quest_benchmark/tests/executors/cli/test_commands.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
index 5af7c50..4c1591f 100644
--- a/llm_quest_benchmark/harnesses/factory.py
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -70,7 +70,9 @@ def create_harness(
         raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}")
     is_random_model, seed = _parse_random_choice_seed(model)
     if is_random_model:
-        raise ValueError("Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness")
+        raise ValueError(
+            "Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness"
+        )
     if model.startswith("random_choice"):
         raise ValueError(f"Unknown random_choice model '{model}'. Valid: {valid}")
     if model == "human":
diff --git a/llm_quest_benchmark/tests/executors/cli/test_commands.py b/llm_quest_benchmark/tests/executors/cli/test_commands.py
index d88fdbf..a3825cd 100644
--- a/llm_quest_benchmark/tests/executors/cli/test_commands.py
+++ b/llm_quest_benchmark/tests/executors/cli/test_commands.py
@@ -34,7 +34,9 @@ def test_run_quest_invalid_args():
     assert result.exit_code == 2
 
     # Test missing quest file
-    result = runner.invoke(app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice", "--harness", "random_choice"])
+    result = runner.invoke(
+        app, ["run", "--quest", "nonexistent.qm", "--model", "random_choice", "--harness", "random_choice"]
+    )
     assert result.exit_code == 2
 
 

From ff73599521e9c96352882e05cb5f7a12ea050117 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Tue, 12 May 2026 13:11:24 +0400
Subject: [PATCH 20/24] fix: preserve legacy memory routing

---
 llm_quest_benchmark/agents/agent_factory.py | 38 ++++++++++++++-------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py
index 43ad273..3d68e92 100644
--- a/llm_quest_benchmark/agents/agent_factory.py
+++ b/llm_quest_benchmark/agents/agent_factory.py
@@ -17,6 +17,18 @@
 logger = logging.getLogger(__name__)
 
 
+def _legacy_memory_module(memory_mode: str, compaction_interval: int):
+    from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
+
+    if memory_mode == "default":
+        return DefaultMemory()
+    if memory_mode == "full_transcript":
+        return FullTranscriptMemory()
+    if memory_mode == "compaction":
+        return CompactionMemory(compaction_interval=compaction_interval)
+    raise ValueError(f"Invalid memory_mode: {memory_mode}")
+
+
 def create_agent(
     model: str = DEFAULT_MODEL,
     system_template: str = SYSTEM_ROLE_TEMPLATE,
@@ -48,11 +60,6 @@ def create_agent(
     """
     logger.debug(f"Creating agent for model: {model}")
     resolved_action_template = normalize_template_name(action_template)
-    harness_routed_templates = {"planner.jinja", "tool_augmented.jinja", "tool_augmented_hints.jinja"}
-    if resolved_action_template in harness_routed_templates and memory_mode not in ("default", "compaction"):
-        raise ValueError(
-            "memory_mode is not supported for planner/tool harness templates; configure memory via harness selection."
-        )
 
     # Human player
     if model == "human":
@@ -69,30 +76,35 @@ def create_agent(
         return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
 
     if resolved_action_template == "planner.jinja":
-        from llm_quest_benchmark.harnesses.factory import create_harness
+        from llm_quest_benchmark.harnesses.planner import PlannerHarness
 
-        return create_harness(
-            harness="planner",
-            model=model,
+        agent = PlannerHarness(
+            model_name=model,
             temperature=temperature,
             skip_single=skip_single,
             debug=debug,
             compaction_interval=compaction_interval,
             system_template=system_template,
+            memory_module=_legacy_memory_module(memory_mode, compaction_interval),
         )
+        agent._memory_mode = memory_mode
+        return agent
 
     if resolved_action_template in ("tool_augmented.jinja", "tool_augmented_hints.jinja"):
-        from llm_quest_benchmark.harnesses.factory import create_harness
+        from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness
 
-        return create_harness(
-            harness="tool_hinted" if resolved_action_template == "tool_augmented_hints.jinja" else "tool_compact",
-            model=model,
+        cls = ToolHintedHarness if resolved_action_template == "tool_augmented_hints.jinja" else ToolCompactHarness
+        agent = cls(
+            model_name=model,
             temperature=temperature,
             skip_single=skip_single,
             debug=debug,
             compaction_interval=compaction_interval,
             system_template=system_template,
+            memory_module=_legacy_memory_module(memory_mode, compaction_interval),
         )
+        agent._memory_mode = memory_mode
+        return agent
 
     # Default to LLM agent
     return LLMAgent(

From 6ae2265874e0a835cdcafdff2c320daf434880dd Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Tue, 12 May 2026 18:17:38 +0400
Subject: [PATCH 21/24] remove legacy agent compatibility

---
 docs/EXPERIMENTS_LOG.md                       |   2 +-
 llm_quest_benchmark/agents/__init__.py        |  22 +---
 llm_quest_benchmark/agents/agent_factory.py   | 119 ------------------
 llm_quest_benchmark/agents/llm_agent.py       | 103 ---------------
 llm_quest_benchmark/agents/planner_agent.py   |   9 --
 llm_quest_benchmark/agents/strategic_agent.py |  79 ------------
 llm_quest_benchmark/agents/tool_agent.py      |   9 --
 llm_quest_benchmark/core/runner.py            |   1 -
 llm_quest_benchmark/harnesses/tool_harness.py |   2 +-
 llm_quest_benchmark/schemas/config.py         |  45 -------
 10 files changed, 7 insertions(+), 384 deletions(-)
 delete mode 100644 llm_quest_benchmark/agents/agent_factory.py
 delete mode 100644 llm_quest_benchmark/agents/llm_agent.py
 delete mode 100644 llm_quest_benchmark/agents/planner_agent.py
 delete mode 100644 llm_quest_benchmark/agents/strategic_agent.py
 delete mode 100644 llm_quest_benchmark/agents/tool_agent.py

diff --git a/docs/EXPERIMENTS_LOG.md b/docs/EXPERIMENTS_LOG.md
index 6d0d7f9..a9ca972 100644
--- a/docs/EXPERIMENTS_LOG.md
+++ b/docs/EXPERIMENTS_LOG.md
@@ -190,7 +190,7 @@ The `_apply_loop_breaker` mechanism was overriding correct LLM decisions. Eviden
 
 ### Decision
 
-- **Disabled loop breaker** entirely in all agent types (llm_agent, planner_agent, tool_agent)
+- **Disabled loop breaker** entirely in all harness types
 - **Removed number normalization** from state signature computation
 - Kept `_state_action_counts` and `_state_signature` (used by safety filter and loop escape)
 - Removed `_apply_loop_breaker` method and `_loop_repetition_threshold` field as dead code
diff --git a/llm_quest_benchmark/agents/__init__.py b/llm_quest_benchmark/agents/__init__.py
index d056964..fdd1aa6 100644
--- a/llm_quest_benchmark/agents/__init__.py
+++ b/llm_quest_benchmark/agents/__init__.py
@@ -1,29 +1,17 @@
-__all__ = ["create_agent", "QuestPlayer", "RandomAgent", "LLMAgent", "PlannerAgent", "ToolAgent"]
+__all__ = ["QuestPlayer", "HumanPlayer", "RandomAgent"]
 
 
 def __getattr__(name):
-    if name == "create_agent":
-        from .agent_factory import create_agent
-
-        return create_agent
     if name == "QuestPlayer":
         from .base import QuestPlayer
 
         return QuestPlayer
+    if name == "HumanPlayer":
+        from .human_player import HumanPlayer
+
+        return HumanPlayer
     if name == "RandomAgent":
         from .random_agent import RandomAgent
 
         return RandomAgent
-    if name == "LLMAgent":
-        from .llm_agent import LLMAgent
-
-        return LLMAgent
-    if name == "PlannerAgent":
-        from .planner_agent import PlannerAgent
-
-        return PlannerAgent
-    if name == "ToolAgent":
-        from .tool_agent import ToolAgent
-
-        return ToolAgent
     raise AttributeError(name)
diff --git a/llm_quest_benchmark/agents/agent_factory.py b/llm_quest_benchmark/agents/agent_factory.py
deleted file mode 100644
index 3d68e92..0000000
--- a/llm_quest_benchmark/agents/agent_factory.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""Factory for creating quest agents"""
-
-import logging
-
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.agents.human_player import HumanPlayer
-from llm_quest_benchmark.agents.llm_agent import LLMAgent
-from llm_quest_benchmark.agents.random_agent import RandomAgent
-from llm_quest_benchmark.constants import (
-    DEFAULT_MODEL,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TEMPLATE,
-    SYSTEM_ROLE_TEMPLATE,
-    normalize_template_name,
-)
-
-logger = logging.getLogger(__name__)
-
-
-def _legacy_memory_module(memory_mode: str, compaction_interval: int):
-    from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
-
-    if memory_mode == "default":
-        return DefaultMemory()
-    if memory_mode == "full_transcript":
-        return FullTranscriptMemory()
-    if memory_mode == "compaction":
-        return CompactionMemory(compaction_interval=compaction_interval)
-    raise ValueError(f"Invalid memory_mode: {memory_mode}")
-
-
-def create_agent(
-    model: str = DEFAULT_MODEL,
-    system_template: str = SYSTEM_ROLE_TEMPLATE,
-    action_template: str = DEFAULT_TEMPLATE,
-    temperature: float = DEFAULT_TEMPERATURE,
-    skip_single: bool = False,
-    debug: bool = False,
-    memory_mode: str = "default",
-    compaction_interval: int = 10,
-) -> QuestPlayer:
-    """Create a quest agent based on model name and parameters.
-
-    Args:
-        model (str): Model identifier. Can be:
-            - LLM model name (e.g. 'gpt-5-mini', 'claude-sonnet-4-5')
-            - 'random_choice' for random testing agent (can include seed e.g. 'random_choice_123')
-            - 'human' for interactive human player
-        debug (bool): Enable debug logging
-        system_template (str): System template for LLM agents
-        action_template (str): Action template for LLM agents
-        temperature (float): Temperature for LLM sampling
-        skip_single (bool): Auto-select single choices
-
-    Returns:
-        QuestPlayer: Appropriate agent instance
-
-    Raises:
-        ValueError: If model type is not recognized
-    """
-    logger.debug(f"Creating agent for model: {model}")
-    resolved_action_template = normalize_template_name(action_template)
-
-    # Human player
-    if model == "human":
-        return HumanPlayer(skip_single=skip_single)
-
-    # Random choice agent
-    if model.startswith("random_choice"):
-        seed = None
-        if "_" in model:
-            try:
-                seed = int(model.split("_")[-1])
-            except ValueError:
-                pass
-        return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
-
-    if resolved_action_template == "planner.jinja":
-        from llm_quest_benchmark.harnesses.planner import PlannerHarness
-
-        agent = PlannerHarness(
-            model_name=model,
-            temperature=temperature,
-            skip_single=skip_single,
-            debug=debug,
-            compaction_interval=compaction_interval,
-            system_template=system_template,
-            memory_module=_legacy_memory_module(memory_mode, compaction_interval),
-        )
-        agent._memory_mode = memory_mode
-        return agent
-
-    if resolved_action_template in ("tool_augmented.jinja", "tool_augmented_hints.jinja"):
-        from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness
-
-        cls = ToolHintedHarness if resolved_action_template == "tool_augmented_hints.jinja" else ToolCompactHarness
-        agent = cls(
-            model_name=model,
-            temperature=temperature,
-            skip_single=skip_single,
-            debug=debug,
-            compaction_interval=compaction_interval,
-            system_template=system_template,
-            memory_module=_legacy_memory_module(memory_mode, compaction_interval),
-        )
-        agent._memory_mode = memory_mode
-        return agent
-
-    # Default to LLM agent
-    return LLMAgent(
-        debug=debug,
-        model_name=model,
-        system_template=system_template,
-        action_template=resolved_action_template,
-        temperature=temperature,
-        skip_single=skip_single,
-        memory_mode=memory_mode,
-        compaction_interval=compaction_interval,
-    )
diff --git a/llm_quest_benchmark/agents/llm_agent.py b/llm_quest_benchmark/agents/llm_agent.py
deleted file mode 100644
index 7b6d352..0000000
--- a/llm_quest_benchmark/agents/llm_agent.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""Deprecated compatibility wrapper for harness-based LLM agents."""
-
-import warnings
-
-from llm_quest_benchmark.constants import (
-    DEFAULT_MODEL,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TEMPLATE,
-    MODEL_CHOICES,
-    SYSTEM_ROLE_TEMPLATE,
-)
-from llm_quest_benchmark.harnesses.base import (
-    RISKY_CHOICE_KEYWORDS,
-    SAFE_CHOICE_KEYWORDS,
-    _is_numeric_raw_reasoning,
-    _parse_json_response,
-    _raw_reasoning_fallback,
-    parse_llm_response,
-)
-from llm_quest_benchmark.harnesses.memory import CompactionMemory, DefaultMemory, FullTranscriptMemory
-from llm_quest_benchmark.harnesses.minimal import MinimalHarness
-
-warnings.warn("llm_agent is deprecated, use llm_quest_benchmark.harnesses", DeprecationWarning, stacklevel=2)
-
-
-class LLMAgent(MinimalHarness):
-    """Backward-compatible LLMAgent facade backed by concrete harness classes."""
-
-    SUPPORTED_MODELS = MODEL_CHOICES
-
-    def __init__(
-        self,
-        model_name: str = DEFAULT_MODEL,
-        system_template: str = SYSTEM_ROLE_TEMPLATE,
-        action_template: str = DEFAULT_TEMPLATE,
-        temperature: float = DEFAULT_TEMPERATURE,
-        skip_single: bool = False,
-        debug: bool = False,
-        memory_mode: str = "default",
-        compaction_interval: int = 10,
-    ):
-        if memory_mode == "default":
-            memory_module = DefaultMemory()
-        elif memory_mode == "full_transcript":
-            memory_module = FullTranscriptMemory()
-        elif memory_mode == "compaction":
-            memory_module = CompactionMemory(compaction_interval=compaction_interval)
-        else:
-            raise ValueError(f"Invalid memory_mode: {memory_mode}")
-
-        super().__init__(
-            model_name=model_name,
-            system_template=system_template,
-            action_template=action_template,
-            temperature=temperature,
-            skip_single=skip_single,
-            debug=debug,
-            memory_module=memory_module,
-        )
-        self.agent_id = f"llm_{self.model_name}"
-        self._memory_mode = memory_mode
-        self._compaction_interval = compaction_interval
-
-    def _remember_observation(self, observation: str) -> None:
-        """Compatibility hook used by legacy tests and callers."""
-        clean = (observation or "").strip()
-        if not clean:
-            return
-        self._observation_history.append(clean)
-        if len(self._observation_history) > 20:
-            self._observation_history = self._observation_history[-20:]
-        if self.memory_module is not None:
-            self.memory_module.update({"observation": clean, "step": self._step_count + 1})
-
-    def _build_contextual_state(self, state: str) -> str:
-        """Build context while honoring legacy direct history mutation."""
-        if isinstance(self.memory_module, DefaultMemory):
-            self.memory_module._observations = list(self._observation_history)
-            self.memory_module._decisions = list(self._decision_history)
-        return super()._build_contextual_state(state)
-
-    def _apply_safety_filter(self, action_or_choices, choices_or_action) -> int:
-        """Accept both legacy (action, choices) and harness (choices, action) argument order."""
-        if isinstance(action_or_choices, list):
-            return super()._apply_safety_filter(action_or_choices, choices_or_action)
-        return super()._apply_safety_filter(choices_or_action, action_or_choices)
-
-    def __str__(self) -> str:
-        return (
-            f"LLMAgent(model={self.model_name}, system_template={self.system_template}, "
-            f"action_template={self.action_template}, temperature={self.temperature})"
-        )
-
-
-__all__ = [
-    "LLMAgent",
-    "parse_llm_response",
-    "_parse_json_response",
-    "_raw_reasoning_fallback",
-    "_is_numeric_raw_reasoning",
-    "RISKY_CHOICE_KEYWORDS",
-    "SAFE_CHOICE_KEYWORDS",
-]
diff --git a/llm_quest_benchmark/agents/planner_agent.py b/llm_quest_benchmark/agents/planner_agent.py
deleted file mode 100644
index cd20e0d..0000000
--- a/llm_quest_benchmark/agents/planner_agent.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""Deprecated compatibility wrapper for the planner harness."""
-
-import warnings
-
-from llm_quest_benchmark.harnesses.planner import PlannerHarness as PlannerAgent
-
-warnings.warn("planner_agent is deprecated, use harnesses.planner", DeprecationWarning, stacklevel=2)
-
-__all__ = ["PlannerAgent"]
diff --git a/llm_quest_benchmark/agents/strategic_agent.py b/llm_quest_benchmark/agents/strategic_agent.py
deleted file mode 100644
index edd656f..0000000
--- a/llm_quest_benchmark/agents/strategic_agent.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""Deprecated compatibility wrapper for strategic agents."""
-
-import logging
-import warnings
-from typing import Any
-
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.llm.prompt import PromptRenderer
-
-warnings.warn("strategic_agent is deprecated, use llm_quest_benchmark.harnesses", DeprecationWarning, stacklevel=2)
-
-
-class StrategicAgent(QuestPlayer):
-    """Backward-compatible strategic analysis decorator."""
-
-    def __init__(self, base_agent: QuestPlayer, debug: bool = False, template: str = "advanced.jinja"):
-        super().__init__(skip_single=base_agent.skip_single)
-        self.agent = base_agent
-        self.debug = debug
-        self.history = []
-
-        self.logger = logging.getLogger(self.__class__.__name__)
-        if self.debug:
-            self.logger.setLevel(logging.DEBUG)
-            handler = logging.StreamHandler()
-            handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
-            self.logger.addHandler(handler)
-
-        self.prompt_renderer = PromptRenderer(None, template=template)
-
-    def _get_action_impl(self, observation: str, choices: list) -> int:
-        if hasattr(self.agent, "llm"):
-            if self.debug:
-                self.logger.debug("\nObservation:\n%s", observation)
-
-            analysis = self.agent.llm(
-                "Analyze this situation and explain your thinking step-by-step instead of choosing an action:\n"
-                + observation
-            )
-
-            if self.debug:
-                self.logger.debug("\nAnalysis:\n%s", analysis)
-
-            self.history.append({"observation": observation, "analysis": analysis})
-            enhanced_context = self.get_enhanced_context(observation, choices)
-            if self.debug:
-                self.logger.debug("\nEnhanced Context:\n%s", enhanced_context)
-
-            return self.agent.get_action(enhanced_context, choices)
-
-        return self.agent.get_action(observation, choices)
-
-    def get_enhanced_context(self, observation: str, choices: list) -> str:
-        context = [f"Turn {len(self.history) + 1}: {entry['analysis']}" for entry in self.history[-3:]]
-        return self.prompt_renderer.render_action_prompt(
-            observation=observation,
-            choices=choices,
-            state_tracker=context,
-        )
-
-    def reset(self) -> None:
-        self.history = []
-        self.agent.reset()
-
-    def on_game_start(self) -> None:
-        if self.debug:
-            self.logger.debug("Starting new game with strategic analysis")
-        self.agent.on_game_start()
-
-    def on_game_end(self, final_state: dict[str, Any]) -> None:
-        self.agent.on_game_end(final_state)
-        if self.debug:
-            self.logger.debug("Final Analysis History:")
-            for entry in self.history:
-                self.logger.debug("\nObservation: %s", entry["observation"])
-                self.logger.debug("Analysis: %s", entry["analysis"])
-
-
-__all__ = ["StrategicAgent"]
diff --git a/llm_quest_benchmark/agents/tool_agent.py b/llm_quest_benchmark/agents/tool_agent.py
deleted file mode 100644
index 659a747..0000000
--- a/llm_quest_benchmark/agents/tool_agent.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""Deprecated compatibility wrapper for the tool harness."""
-
-import warnings
-
-from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness as ToolAgent
-
-warnings.warn("tool_agent is deprecated, use harnesses.tool_harness", DeprecationWarning, stacklevel=2)
-
-__all__ = ["ToolAgent"]
diff --git a/llm_quest_benchmark/core/runner.py b/llm_quest_benchmark/core/runner.py
index e2b9ef8..bebf0c6 100644
--- a/llm_quest_benchmark/core/runner.py
+++ b/llm_quest_benchmark/core/runner.py
@@ -20,7 +20,6 @@
 
 # Configure logging
 logging.getLogger("quest").setLevel(logging.WARNING)
-logging.getLogger("LLMAgent").setLevel(logging.WARNING)
 
 
 def run_quest_with_timeout(
diff --git a/llm_quest_benchmark/harnesses/tool_harness.py b/llm_quest_benchmark/harnesses/tool_harness.py
index a398bfe..a4f09dd 100644
--- a/llm_quest_benchmark/harnesses/tool_harness.py
+++ b/llm_quest_benchmark/harnesses/tool_harness.py
@@ -217,7 +217,7 @@ def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
                 action=1,
                 is_default=True,
                 parse_mode="error_default",
-                reasoning=f"tool_agent_error: {exc}",
+                reasoning=f"tool_harness_error: {exc}",
             )
             self.history.append(default_response)
             self._last_response = default_response
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index 8008e1b..505416b 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -8,7 +8,6 @@
 from llm_quest_benchmark.constants import (
     DEFAULT_MODEL,
     DEFAULT_TEMPERATURE,
-    DEFAULT_TEMPLATE,
     MODEL_CHOICES,
     SYSTEM_ROLE_TEMPLATE,
     normalize_template_name,
@@ -143,50 +142,6 @@ def agent_id(self) -> str:
         return self.harness_id
 
 
-@dataclass
-class AgentConfig:
-    """Legacy configuration for a single agent in benchmark"""
-
-    model: str = DEFAULT_MODEL
-    system_template: str = SYSTEM_ROLE_TEMPLATE
-    action_template: str = DEFAULT_TEMPLATE
-    temperature: float = DEFAULT_TEMPERATURE
-    runs: int = 1
-    skip_single: bool = False
-    debug: bool = False
-    benchmark_id: str | None = None
-    memory_mode: str = "default"
-    compaction_interval: int = 10
-
-    def __post_init__(self):
-        self.system_template = normalize_template_name(self.system_template)
-        self.action_template = normalize_template_name(self.action_template)
-        if self.model not in ("random_choice", "human"):
-            # Keep parser compatibility for legacy names while UI remains clean.
-            from llm_quest_benchmark.llm.client import is_supported_model_name
-
-            if not is_supported_model_name(self.model):
-                raise ValueError(f"Invalid model: {self.model}. Supported models: {MODEL_CHOICES}")
-        if not (0.0 <= self.temperature <= 2.0):
-            raise ValueError(f"Temperature must be between 0.0 and 2.0, got {self.temperature}")
-        if self.runs < 1:
-            raise ValueError(f"runs must be >= 1, got {self.runs}")
-        if self.memory_mode not in ("default", "full_transcript", "compaction"):
-            raise ValueError(f"Invalid memory_mode: {self.memory_mode}")
-        if self.memory_mode == "compaction" and self.compaction_interval < 1:
-            raise ValueError(f"compaction_interval must be >= 1, got {self.compaction_interval}")
-
-    @property
-    def agent_id(self) -> str:
-        """Generate a unique agent ID based on configuration values"""
-        import hashlib
-
-        interval_tag = f"_ci{self.compaction_interval}" if self.memory_mode == "compaction" else ""
-        config_str = f"{self.model}_{self.temperature}_{self.system_template}_{self.action_template}_{self.memory_mode}{interval_tag}"
-        hash_val = hashlib.md5(config_str.encode()).hexdigest()[:8]
-        return f"{self.model}_t{self.temperature}_{hash_val}"
-
-
 @dataclass
 class BenchmarkConfig:
     """Configuration for benchmark run"""

From 131ecad7b021b243b061d38b6a8bdfb39cdd1f44 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Wed, 13 May 2026 19:28:32 +0400
Subject: [PATCH 22/24] simplify harness template surface

---
 README.md                                     |  3 +-
 docs/ARCHITECTURE.md                          |  2 ++
 llm_quest_benchmark/harnesses/factory.py      |  6 +++-
 .../prompt_templates/consequence_scan.jinja   | 18 ----------
 .../consequence_scan_subgoal.jinja            | 19 ----------
 .../prompt_templates/light_hints.jinja        | 18 ----------
 .../loop_aware_reasoning.jinja                | 19 ----------
 .../prompt_templates/objective_guard.jinja    | 18 ----------
 .../prompt_templates/strategic.jinja          | 32 -----------------
 .../system_role_completion.jinja              | 11 ------
 .../prompt_templates/system_role_risk.jinja   | 16 ---------
 llm_quest_benchmark/schemas/config.py         | 19 +++++++++-
 .../tests/harnesses/test_factory.py           | 35 +++++++++++++++++--
 .../tests/test_benchmark_with_directory.py    |  4 +--
 14 files changed, 61 insertions(+), 159 deletions(-)
 delete mode 100644 llm_quest_benchmark/prompt_templates/consequence_scan.jinja
 delete mode 100644 llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja
 delete mode 100644 llm_quest_benchmark/prompt_templates/light_hints.jinja
 delete mode 100644 llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja
 delete mode 100644 llm_quest_benchmark/prompt_templates/objective_guard.jinja
 delete mode 100644 llm_quest_benchmark/prompt_templates/strategic.jinja
 delete mode 100644 llm_quest_benchmark/prompt_templates/system_role_completion.jinja
 delete mode 100644 llm_quest_benchmark/prompt_templates/system_role_risk.jinja

diff --git a/README.md b/README.md
index 3ff854c..d99a55b 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,8 @@ Provider-specific keys in `.env`:
 
 ## Project Structure
 
-- `llm_quest_benchmark/agents/` - Agent implementations (LLM, planner, tool-augmented)
+- `llm_quest_benchmark/harnesses/` - LLM harness implementations for prompt, memory, tools, and planning experiments
+- `llm_quest_benchmark/agents/` - Non-LLM player primitives (`human`, `random_choice`)
 - `llm_quest_benchmark/prompt_templates/` - Jinja2 prompt templates for the public context-scaffold taxonomy
 - `llm_quest_benchmark/executors/` - CLI, benchmark orchestration, TS bridge
 - `configs/benchmarks/` - YAML benchmark configurations
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 83472b3..bf474bc 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -99,6 +99,8 @@ and benchmark configuration parsing do not require API keys.
     harness memory.
   - `stateful_compact.jinja`: Compact memory / 20-word memo prompt.
   - `stateful_compact_hints.jinja`: Compact memo prompt with mechanics hints.
+  - `memo_cot.jinja`, `memo_extended.jinja`, `memo_structured.jinja`:
+    retained Exp 4 memo variants.
   - `planner.jinja`: Planner loop prompt.
   - `tool_augmented.jinja`, `tool_augmented_hints.jinja`: Tool prompts with
     compact memory, optionally with hints.
diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
index 4c1591f..8d22462 100644
--- a/llm_quest_benchmark/harnesses/factory.py
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -60,7 +60,12 @@ def create_harness(
 ) -> QuestPlayer:
     valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
     is_random_harness, seed = _parse_random_choice_seed(harness)
+    is_random_model, _ = _parse_random_choice_seed(model)
     if is_random_harness:
+        if is_random_model and model != "random_choice":
+            raise ValueError("Encode random seeds in harness, for example harness='random_choice_123'")
+        if model not in (DEFAULT_MODEL, "random_choice"):
+            raise ValueError("Use model='random_choice' with random_choice harnesses")
         return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
     if harness.startswith("random_choice"):
         raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}")
@@ -68,7 +73,6 @@ def create_harness(
         return HumanPlayer(skip_single=skip_single)
     if harness not in HARNESS_REGISTRY:
         raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}")
-    is_random_model, seed = _parse_random_choice_seed(model)
     if is_random_model:
         raise ValueError(
             "Use harness='random_choice' for random policy runs instead of pairing random_choice model with an LLM harness"
diff --git a/llm_quest_benchmark/prompt_templates/consequence_scan.jinja b/llm_quest_benchmark/prompt_templates/consequence_scan.jinja
deleted file mode 100644
index 55ce54b..0000000
--- a/llm_quest_benchmark/prompt_templates/consequence_scan.jinja
+++ /dev/null
@@ -1,18 +0,0 @@
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Mission objective: complete the quest successfully.
-
-Decision method:
-1. For each action, estimate immediate consequence in 5 words max.
-2. Prefer actions that preserve progress and gather information.
-3. Avoid options that abandon, surrender, or end the mission early.
-4. If uncertain, choose the lowest-risk progress action.
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja b/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja
deleted file mode 100644
index 7fd4236..0000000
--- a/llm_quest_benchmark/prompt_templates/consequence_scan_subgoal.jinja
+++ /dev/null
@@ -1,19 +0,0 @@
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Mission objective: complete the quest successfully.
-
-Decision method:
-1. Use any provided memo from prior turns to stay consistent.
-2. For each action, estimate immediate consequence in 5 words max.
-3. Prefer actions that preserve progress and gather information.
-4. Avoid options that abandon, surrender, or end the mission early.
-5. If uncertain, choose the lowest-risk progress action.
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","memo":"<max 12 words next objective>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/light_hints.jinja b/llm_quest_benchmark/prompt_templates/light_hints.jinja
deleted file mode 100644
index eb3ab60..0000000
--- a/llm_quest_benchmark/prompt_templates/light_hints.jinja
+++ /dev/null
@@ -1,18 +0,0 @@
-General hints for this type of quest:
-- Read the scene literally. Win/loss constraints are usually stated directly in the text.
-- Preparation, study, negotiation, and reconnaissance are often safer than direct combat or bravado.
-- Prefer actions that gather clues or unlock safer options before committing to irreversible moves.
-- Avoid choices that abandon the mission, surrender, or waste scarce time/resources for no gain.
-- If a scene repeats, the last branch did not help - try a different action.
-- Prioritize the core objective over optional heroic detours.
-
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja b/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja
deleted file mode 100644
index 38a9343..0000000
--- a/llm_quest_benchmark/prompt_templates/loop_aware_reasoning.jinja
+++ /dev/null
@@ -1,19 +0,0 @@
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Mission objective: complete the quest successfully.
-
-Decision policy:
-1. Prefer actions that preserve progress and avoid premature failure.
-2. Use Status/context hints (stats, resources, relationships) to reduce obvious risks.
-3. If this scene appears repeated, avoid repeating the same action that did not progress.
-4. When uncertain, choose the safest reversible action that keeps the mission alive.
-5. Do not surrender/quit unless it is clearly required for success.
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/objective_guard.jinja b/llm_quest_benchmark/prompt_templates/objective_guard.jinja
deleted file mode 100644
index b80d482..0000000
--- a/llm_quest_benchmark/prompt_templates/objective_guard.jinja
+++ /dev/null
@@ -1,18 +0,0 @@
-Current story state:
-{{ observation }}
-
-Available actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Choose the action that best supports mission completion.
-
-Guardrails:
-1. Keep the run alive unless ending is clearly successful.
-2. Penalize actions that look like quitting, escaping, or self-sabotage.
-3. Prefer actions that unlock clues, credentials, access, or progression gates.
-4. Resolve ambiguity by selecting the most reversible safe option.
-
-Return ONLY valid JSON (no markdown/code fences), exactly:
-{"analysis":"<max 25 words>","reasoning":"<max 25 words>","result":<action_number>}
diff --git a/llm_quest_benchmark/prompt_templates/strategic.jinja b/llm_quest_benchmark/prompt_templates/strategic.jinja
deleted file mode 100644
index 1668c41..0000000
--- a/llm_quest_benchmark/prompt_templates/strategic.jinja
+++ /dev/null
@@ -1,32 +0,0 @@
-{# Tier 3: Strategic agent prompt #}
-{# Contextual State Tracker #}
-{% if state_tracker %}
-Historical context:
-{% for entry in state_tracker %}
-- {{ entry }}
-{% endfor %}
-{% endif %}
-
-Current Situation:
-{{ observation }}
-
-Available Actions:
-{% for choice in choices %}
-{{ loop.index }}. {{ choice.text }}
-{% endfor %}
-
-Analysis Framework:
-1. Immediate Context [<50 words]
-2. Plausible Hypotheses [2-3 possibilities]
-3. Action Impact Forecast [short/long-term]
-4. Confidence Estimate [High/Medium/Low]
-
-Response format:
-```json
-{
-    "hypotheses": ["...", "..."],
-    "reasoning": "<concise reason for selected action>",
-    "choice": <number>,
-    "confidence": "<level>"
-}
-```
diff --git a/llm_quest_benchmark/prompt_templates/system_role_completion.jinja b/llm_quest_benchmark/prompt_templates/system_role_completion.jinja
deleted file mode 100644
index 918a695..0000000
--- a/llm_quest_benchmark/prompt_templates/system_role_completion.jinja
+++ /dev/null
@@ -1,11 +0,0 @@
-You are a mission-completion specialist for interactive fiction quests.
-
-Core behavior:
-1. Infer the current objective from narrative clues.
-2. Prioritize actions that maintain progress and optionality.
-3. Avoid premature terminal outcomes unless success is explicit.
-4. Prefer evidence-based choices over stylistic roleplay.
-
-When the state is ambiguous:
-- choose the safest action that still advances the mission.
-- avoid speculative high-risk branches without support in the text.
diff --git a/llm_quest_benchmark/prompt_templates/system_role_risk.jinja b/llm_quest_benchmark/prompt_templates/system_role_risk.jinja
deleted file mode 100644
index ea19c36..0000000
--- a/llm_quest_benchmark/prompt_templates/system_role_risk.jinja
+++ /dev/null
@@ -1,16 +0,0 @@
-{# Enhanced system role for interactive fiction #}
-You are an experienced interactive fiction player. Your capabilities include:
-
-1. Dynamic Goal Recognition: Infer objectives from narrative context
-2. Clue Chaining: Connect information across scenes
-3. Consequence Forecasting: Predict 2-3 steps ahead for each action
-4. Narrative Consistency: Maintain character/story logic
-
-Follow these principles:
-- Treat each choice as part of an unfolding mystery
-- Track objects/characters/relationships as state components
-- Consider both practical and thematic implications
-- Admit uncertainty when clues are ambiguous
-- Flag potential contradictions in story logic
-
-Any bad move can fail the quest, so prefer robust low-risk progress over flashy but uncertain options.
diff --git a/llm_quest_benchmark/schemas/config.py b/llm_quest_benchmark/schemas/config.py
index 505416b..5cd93b2 100644
--- a/llm_quest_benchmark/schemas/config.py
+++ b/llm_quest_benchmark/schemas/config.py
@@ -26,6 +26,18 @@
     "name": "Default Benchmark",
 }
 
+COMPACTION_HARNESSES = {
+    "memo_compact",
+    "hinted_compact",
+    "tool_compact",
+    "tool_hinted",
+    "planner",
+    "compaction_no_memo",
+    "memo_cot",
+    "memo_extended",
+    "memo_structured",
+}
+
 
 def get_default_benchmark_yaml() -> str:
     """Get the default benchmark configuration from default.yaml file"""
@@ -111,8 +123,12 @@ def __post_init__(self):
         ):
             valid = [*sorted(HARNESS_REGISTRY), *SPECIAL_HARNESSES]
             raise ValueError(f"Invalid harness: {self.harness}. Supported harnesses: {valid}")
+        if self.harness == "human" and self.model != "human":
+            raise ValueError("Use model: human with harness: human")
         if self.model == "human" and self.harness != "human":
             raise ValueError("Use harness: human with model: human")
+        if is_random_choice_harness(self.harness) and self.model != "random_choice":
+            raise ValueError("Use model: random_choice with random_choice harnesses")
         if is_random_choice_harness(self.model) and not is_random_choice_harness(self.harness):
             raise ValueError("Use harness: random_choice with model: random_choice")
         if self.model not in ("human",) and not is_random_choice_harness(self.model):
@@ -132,7 +148,8 @@ def harness_id(self) -> str:
         """Generate a stable harness ID based on configuration values"""
         import hashlib
 
-        config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.system_template}_{self.compaction_interval}"
+        interval_tag = f"_ci{self.compaction_interval}" if self.harness in COMPACTION_HARNESSES else ""
+        config_str = f"{self.model}_{self.temperature}_{self.harness}_{self.system_template}{interval_tag}"
         hash_val = hashlib.md5(config_str.encode()).hexdigest()[:8]
         return f"{self.model}_t{self.temperature}_{self.harness}_{hash_val}"
 
diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py
index 800a502..31c3ae2 100644
--- a/llm_quest_benchmark/tests/harnesses/test_factory.py
+++ b/llm_quest_benchmark/tests/harnesses/test_factory.py
@@ -34,7 +34,7 @@ def test_create_random_choice_harness():
 
 
 def test_create_seeded_random_choice_harness():
-    harness = create_harness("random_choice_123")
+    harness = create_harness("random_choice_123", model="random_choice")
 
     assert isinstance(harness, RandomAgent)
     assert harness.agent_id == "random_123"
@@ -60,6 +60,11 @@ def test_random_choice_model_requires_random_harness():
         create_harness("minimal", model="random_choice")
 
 
+def test_seeded_random_model_is_rejected():
+    with pytest.raises(ValueError, match="Encode random seeds in harness"):
+        create_harness("random_choice", model="random_choice_123")
+
+
 def test_human_model_requires_human_harness():
     with pytest.raises(ValueError, match="harness='human'"):
         create_harness("minimal", model="human")
@@ -74,17 +79,41 @@ def test_harness_config_stable_harness_id():
 
 def test_harness_config_system_template_affects_harness_id():
     first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role.jinja")
-    second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="system_role_risk.jinja")
+    second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", system_template="custom_system_role.jinja")
+
+    assert first.harness_id != second.harness_id
+
+
+def test_non_compaction_harness_id_ignores_compaction_interval():
+    first = HarnessConfig(harness="reasoning_recent", model="gpt-5-mini", compaction_interval=10)
+    second = HarnessConfig(harness="reasoning_recent", model="gpt-5-mini", compaction_interval=99)
+
+    assert first.harness_id == second.harness_id
+
+
+def test_compaction_harness_id_includes_compaction_interval():
+    first = HarnessConfig(harness="memo_compact", model="gpt-5-mini", compaction_interval=10)
+    second = HarnessConfig(harness="memo_compact", model="gpt-5-mini", compaction_interval=99)
 
     assert first.harness_id != second.harness_id
 
 
 def test_harness_config_allows_seeded_random_choice_harness():
-    config = HarnessConfig(harness="random_choice_123", model="gpt-5-mini")
+    config = HarnessConfig(harness="random_choice_123", model="random_choice")
 
     assert config.harness == "random_choice_123"
 
 
+def test_harness_config_rejects_llm_model_with_random_harness():
+    with pytest.raises(ValueError, match="model: random_choice"):
+        HarnessConfig(harness="random_choice", model="gpt-5-mini")
+
+
+def test_harness_config_rejects_llm_model_with_human_harness():
+    with pytest.raises(ValueError, match="model: human"):
+        HarnessConfig(harness="human", model="gpt-5-mini")
+
+
 def test_harness_config_rejects_random_model_with_llm_harness():
     with pytest.raises(ValueError, match="harness: random_choice"):
         HarnessConfig(harness="minimal", model="random_choice")
diff --git a/llm_quest_benchmark/tests/test_benchmark_with_directory.py b/llm_quest_benchmark/tests/test_benchmark_with_directory.py
index c6dc855..87b2221 100644
--- a/llm_quest_benchmark/tests/test_benchmark_with_directory.py
+++ b/llm_quest_benchmark/tests/test_benchmark_with_directory.py
@@ -29,7 +29,7 @@ def create_test_config():
 
 def test_result_entry_logs_random_harness_model_as_random_policy():
     """Random harness results should not be attributed to the default LLM model."""
-    agent_config = HarnessConfig(harness="random_choice", model="gpt-5-mini")
+    agent_config = HarnessConfig(harness="random_choice", model="random_choice")
 
     result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE")
 
@@ -39,7 +39,7 @@ def test_result_entry_logs_random_harness_model_as_random_policy():
 
 def test_result_entry_logs_human_harness_model_as_human():
     """Human harness results should not be attributed to the default LLM model."""
-    agent_config = HarnessConfig(harness="human", model="gpt-5-mini")
+    agent_config = HarnessConfig(harness="human", model="human")
 
     result = _result_entry("quests/Boat.qm", agent_config, 1, "FAILURE")
 

From 4f3983457cd9107a3f72a0264c4f9905da63ff19 Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Wed, 13 May 2026 20:01:42 +0400
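Subject: [PATCH 23/24] rename non-llm agents to players

Move llm_quest_benchmark/agents to llm_quest_benchmark/players, rename
RandomAgent to RandomPlayer, and update imports, docs, and tests.

This patch also carries changes beyond the rename:
- harnesses/base.py: reset() is no longer abstract, and the private
  _quest_briefing/_transcript/_steps_since_compaction proxies are removed.
- harnesses/memory.py: MemoryModule gains quest_briefing, transcript, and
  steps_since_compaction properties plus shared set_quest_briefing() and
  _briefing_block(); _maybe_compact() no longer resets the step counter
  when no LLM client is attached.
- harnesses/tool_harness.py: the safety filter is skipped when the action
  came from _final_choice().
- harnesses/minimal.py: drop the unused compaction_interval parameter and
  the redundant reset() override.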
---
 README.md                                     |   4 +-
 docs/ARCHITECTURE.md                          |   4 +-
 llm_quest_benchmark/core/runner.py            |   2 +-
 llm_quest_benchmark/harnesses/base.py         |  30 +---
 llm_quest_benchmark/harnesses/factory.py      |   8 +-
 llm_quest_benchmark/harnesses/memory.py       | 129 +++++++++++-------
 llm_quest_benchmark/harnesses/minimal.py      |   5 -
 llm_quest_benchmark/harnesses/tool_harness.py |  12 +-
 llm_quest_benchmark/harnesses/tools.py        |   2 +-
 .../{agents => players}/__init__.py           |  10 +-
 .../{agents => players}/base.py               |   6 +-
 .../human_player.py => players/human.py}      |   4 +-
 .../random_agent.py => players/random.py}     |  18 +--
 llm_quest_benchmark/renderers/factory.py      |   6 +-
 llm_quest_benchmark/renderers/progress.py     |  10 +-
 .../tests/agents/test_mode_agents.py          |   5 -
 .../tests/executors/cli/test_commands.py      |   2 +-
 .../{agents => harnesses}/test_anthropic.py   |   0
 .../test_base.py}                             |   0
 .../tests/harnesses/test_factory.py           |   8 +-
 .../tests/harnesses/test_harnesses.py         |  14 +-
 .../tests/integration/test_quest_e2e.py       |  10 +-
 .../{agents => players}/test_human_player.py  |   2 +-
 llm_quest_benchmark/tests/test_database.py    |   4 +-
 24 files changed, 145 insertions(+), 150 deletions(-)
 rename llm_quest_benchmark/{agents => players}/__init__.py (50%)
 rename llm_quest_benchmark/{agents => players}/base.py (92%)
 rename llm_quest_benchmark/{agents/human_player.py => players/human.py} (91%)
 rename llm_quest_benchmark/{agents/random_agent.py => players/random.py} (75%)
 delete mode 100644 llm_quest_benchmark/tests/agents/test_mode_agents.py
 rename llm_quest_benchmark/tests/{agents => harnesses}/test_anthropic.py (100%)
 rename llm_quest_benchmark/tests/{agents/test_llm_agent.py => harnesses/test_base.py} (100%)
 rename llm_quest_benchmark/tests/{agents => players}/test_human_player.py (95%)

diff --git a/README.md b/README.md
index d99a55b..013fb57 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ uv run llm-quest benchmark --config configs/benchmarks/memory_full_transcript.ya
 uv run llm-quest benchmark-report --benchmark-id <id> --output report.md
 
 # Analyze a single run
-uv run llm-quest analyze-run --run-summary results/<agent>/<quest>/run_<id>/run_summary.json
+uv run llm-quest analyze-run --run-summary results/<harness>/<quest>/run_<id>/run_summary.json
 
 # Play as human in terminal
 uv run llm-quest play --quest quests/Boat.qm
@@ -108,7 +108,7 @@ Provider-specific keys in `.env`:
 ## Project Structure
 
 - `llm_quest_benchmark/harnesses/` - LLM harness implementations for prompt, memory, tools, and planning experiments
-- `llm_quest_benchmark/agents/` - Non-LLM player primitives (`human`, `random_choice`)
+- `llm_quest_benchmark/players/` - Non-LLM player primitives (`human`, `random_choice`)
 - `llm_quest_benchmark/prompt_templates/` - Jinja2 prompt templates for the public context-scaffold taxonomy
 - `llm_quest_benchmark/executors/` - CLI, benchmark orchestration, TS bridge
 - `configs/benchmarks/` - YAML benchmark configurations
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index bf474bc..2588ee2 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -61,8 +61,8 @@ planning choices change behavior.
   history helpers used by tool harnesses.
 - `llm_quest_benchmark/harnesses/factory.py`: `create_harness()` and the
   canonical harness registry.
-- `llm_quest_benchmark/agents/human_player.py`,
-  `llm_quest_benchmark/agents/random_agent.py`: Non-LLM `QuestPlayer`
+- `llm_quest_benchmark/players/human.py`,
+  `llm_quest_benchmark/players/random.py`: Non-LLM `QuestPlayer`
   implementations preserved for interactive and random baselines.
 
 Harness construction lazily initializes provider clients, so template rendering
diff --git a/llm_quest_benchmark/core/runner.py b/llm_quest_benchmark/core/runner.py
index bebf0c6..d86c07b 100644
--- a/llm_quest_benchmark/core/runner.py
+++ b/llm_quest_benchmark/core/runner.py
@@ -10,11 +10,11 @@
 from copy import deepcopy
 from typing import Any
 
-from llm_quest_benchmark.agents.base import QuestPlayer
 from llm_quest_benchmark.constants import DEFAULT_QUEST_TIMEOUT
 from llm_quest_benchmark.core.logging import LogManager, QuestLogger
 from llm_quest_benchmark.environments.qm import QMPlayerEnv as QuestEnvironment
 from llm_quest_benchmark.environments.state import QuestOutcome
+from llm_quest_benchmark.players.base import QuestPlayer
 from llm_quest_benchmark.schemas.config import HarnessConfig
 from llm_quest_benchmark.schemas.state import AgentState
 
diff --git a/llm_quest_benchmark/harnesses/base.py b/llm_quest_benchmark/harnesses/base.py
index 440675b..fd8864b 100644
--- a/llm_quest_benchmark/harnesses/base.py
+++ b/llm_quest_benchmark/harnesses/base.py
@@ -9,10 +9,10 @@
 
 from json_repair import repair_json
 
-from llm_quest_benchmark.agents.base import QuestPlayer
 from llm_quest_benchmark.constants import DEFAULT_TEMPLATE, normalize_template_name
 from llm_quest_benchmark.llm.client import get_llm_client, parse_model_name
 from llm_quest_benchmark.llm.prompt import PromptRenderer
+from llm_quest_benchmark.players.base import QuestPlayer
 from llm_quest_benchmark.schemas.response import LLMResponse
 
 RISKY_CHOICE_KEYWORDS = (
@@ -311,7 +311,6 @@ def _get_action_impl(self, observation, choices) -> int:
         """Return the selected 1-based action number."""
         pass
 
-    @abstractmethod
     def reset(self) -> None:
         """Reset harness state between episodes."""
         super().reset()
@@ -343,33 +342,6 @@ def on_game_end(self, final_state: dict[str, Any]) -> None:
     def get_last_response(self) -> LLMResponse | None:
         return self._last_response
 
-    @property
-    def _quest_briefing(self) -> str | None:
-        return getattr(self.memory_module, "_quest_briefing", None)
-
-    @_quest_briefing.setter
-    def _quest_briefing(self, value: str | None) -> None:
-        if self.memory_module is not None:
-            self.memory_module._quest_briefing = value
-
-    @property
-    def _transcript(self) -> list[dict[str, Any]]:
-        return getattr(self.memory_module, "_transcript", [])
-
-    @_transcript.setter
-    def _transcript(self, value: list[dict[str, Any]]) -> None:
-        if self.memory_module is not None:
-            self.memory_module._transcript = value
-
-    @property
-    def _steps_since_compaction(self) -> int:
-        return getattr(self.memory_module, "_steps_since_compaction", 0)
-
-    @_steps_since_compaction.setter
-    def _steps_since_compaction(self, value: int) -> None:
-        if self.memory_module is not None:
-            self.memory_module._steps_since_compaction = value
-
     def _build_contextual_state(self, state: str) -> str:
         if self.memory_module is None:
             return state
diff --git a/llm_quest_benchmark/harnesses/factory.py b/llm_quest_benchmark/harnesses/factory.py
index 8d22462..87e2d77 100644
--- a/llm_quest_benchmark/harnesses/factory.py
+++ b/llm_quest_benchmark/harnesses/factory.py
@@ -1,8 +1,5 @@
 """Factory for creating harness-based quest players."""
 
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.agents.human_player import HumanPlayer
-from llm_quest_benchmark.agents.random_agent import RandomAgent
 from llm_quest_benchmark.constants import DEFAULT_MODEL
 from llm_quest_benchmark.harnesses.memo import (
     CompactionNoMemoHarness,
@@ -16,6 +13,9 @@
 from llm_quest_benchmark.harnesses.planner import PlannerHarness
 from llm_quest_benchmark.harnesses.reasoning import ReasoningFullTranscriptHarness, ReasoningRecentHarness
 from llm_quest_benchmark.harnesses.tool_harness import ToolCompactHarness, ToolHintedHarness
+from llm_quest_benchmark.players.base import QuestPlayer
+from llm_quest_benchmark.players.human import HumanPlayer
+from llm_quest_benchmark.players.random import RandomPlayer
 
 HARNESS_REGISTRY = {
     "minimal": MinimalHarness,
@@ -66,7 +66,7 @@ def create_harness(
             raise ValueError("Encode random seeds in harness, for example harness='random_choice_123'")
         if model not in (DEFAULT_MODEL, "random_choice"):
             raise ValueError("Use model='random_choice' with random_choice harnesses")
-        return RandomAgent(seed=seed, debug=debug, skip_single=skip_single)
+        return RandomPlayer(seed=seed, debug=debug, skip_single=skip_single)
     if harness.startswith("random_choice"):
         raise ValueError(f"Unknown harness '{harness}'. Valid: {valid}")
     if harness == "human":
diff --git a/llm_quest_benchmark/harnesses/memory.py b/llm_quest_benchmark/harnesses/memory.py
index 22581fa..45ba5e5 100644
--- a/llm_quest_benchmark/harnesses/memory.py
+++ b/llm_quest_benchmark/harnesses/memory.py
@@ -1,8 +1,11 @@
-"""Memory modules for harness-based quest agents."""
+"""Memory modules for harness-based quest players."""
 
+import logging
 from abc import ABC, abstractmethod
 from typing import Any
 
+logger = logging.getLogger(__name__)
+
 
 class MemoryModule(ABC):
     @abstractmethod
@@ -17,12 +20,44 @@ def update(self, step_data: dict) -> None:
     def reset(self) -> None:
         pass
 
+    @property
+    def quest_briefing(self) -> str | None:
+        return None
+
+    @property
+    def transcript(self) -> list[dict[str, Any]]:
+        return []
+
+    @transcript.setter
+    def transcript(self, value: list[dict[str, Any]]) -> None:
+        raise TypeError(f"{self.__class__.__name__} does not support transcript assignment")
+
+    @property
+    def steps_since_compaction(self) -> int:
+        return 0
+
+    @steps_since_compaction.setter
+    def steps_since_compaction(self, value: int) -> None:
+        raise TypeError(f"{self.__class__.__name__} does not support compaction counters")
+
     def set_quest_briefing(self, briefing: str) -> None:
-        pass
+        clean = (briefing or "").strip()
+        if hasattr(self, "_quest_briefing"):
+            self._quest_briefing = clean or None
+
+    def _briefing_block(self, current_state: str) -> str | None:
+        briefing = self.quest_briefing
+        if not briefing:
+            return None
+        if current_state.strip() == briefing:
+            return None
+        if len(briefing) > 800:
+            briefing = briefing[:800] + "..."
+        return f"Quest briefing (your mission):\n{briefing}"
 
 
 class DefaultMemory(MemoryModule):
-    """Recent N observations window (no compaction)."""
+    """Recent N observations window without compaction."""
 
     def __init__(self, context_window: int = 3, context_chars: int = 220, decision_window: int = 5):
         self.context_window = context_window
@@ -32,9 +67,9 @@ def __init__(self, context_window: int = 3, context_chars: int = 220, decision_w
         self._observations: list[str] = []
         self._decisions: list[dict[str, Any]] = []
 
-    def set_quest_briefing(self, briefing: str) -> None:
-        clean = (briefing or "").strip()
-        self._quest_briefing = clean or None
+    @property
+    def quest_briefing(self) -> str | None:
+        return self._quest_briefing
 
     def get_context(self, step: int) -> str:
         blocks: list[str] = []
@@ -106,16 +141,6 @@ def reset(self) -> None:
         self._observations = []
         self._decisions = []
 
-    def _briefing_block(self, current_state: str) -> str | None:
-        if not self._quest_briefing:
-            return None
-        if current_state.strip() == self._quest_briefing:
-            return None
-        briefing = self._quest_briefing
-        if len(briefing) > 800:
-            briefing = briefing[:800] + "..."
-        return f"Quest briefing (your mission):\n{briefing}"
-
 
 class FullTranscriptMemory(MemoryModule):
     """Unbounded full transcript in context."""
@@ -124,9 +149,17 @@ def __init__(self):
         self._quest_briefing: str | None = None
         self._transcript: list[dict[str, Any]] = []
 
-    def set_quest_briefing(self, briefing: str) -> None:
-        clean = (briefing or "").strip()
-        self._quest_briefing = clean or None
+    @property
+    def quest_briefing(self) -> str | None:
+        return self._quest_briefing
+
+    @property
+    def transcript(self) -> list[dict[str, Any]]:
+        return self._transcript
+
+    @transcript.setter
+    def transcript(self, value: list[dict[str, Any]]) -> None:
+        self._transcript = value
 
     def get_context(self, step: int) -> str:
         blocks: list[str] = []
@@ -170,19 +203,9 @@ def reset(self) -> None:
         self._quest_briefing = None
         self._transcript = []
 
-    def _briefing_block(self, current_state: str) -> str | None:
-        if not self._quest_briefing:
-            return None
-        if current_state.strip() == self._quest_briefing:
-            return None
-        briefing = self._quest_briefing
-        if len(briefing) > 800:
-            briefing = briefing[:800] + "..."
-        return f"Quest briefing (your mission):\n{briefing}"
-
 
 class CompactionMemory(MemoryModule):
-    """Periodic LLM summarization + 20-word memo field."""
+    """Periodic LLM summarization plus 20-word memo field."""
 
     def __init__(self, compaction_interval: int = 50, llm_client=None):
         self.compaction_interval = compaction_interval
@@ -192,9 +215,25 @@ def __init__(self, compaction_interval: int = 50, llm_client=None):
         self._compaction_summary: str | None = None
         self._steps_since_compaction = 0
 
-    def set_quest_briefing(self, briefing: str) -> None:
-        clean = (briefing or "").strip()
-        self._quest_briefing = clean or None
+    @property
+    def quest_briefing(self) -> str | None:
+        return self._quest_briefing
+
+    @property
+    def transcript(self) -> list[dict[str, Any]]:
+        return self._transcript
+
+    @transcript.setter
+    def transcript(self, value: list[dict[str, Any]]) -> None:
+        self._transcript = value
+
+    @property
+    def steps_since_compaction(self) -> int:
+        return self._steps_since_compaction
+
+    @steps_since_compaction.setter
+    def steps_since_compaction(self, value: int) -> None:
+        self._steps_since_compaction = value
 
     def get_context(self, step: int) -> str:
         blocks: list[str] = []
@@ -250,15 +289,14 @@ def _maybe_compact(self) -> None:
         if self._steps_since_compaction < self.compaction_interval:
             return
         if self.llm_client is None:
-            # No LLM client available for compaction; skip silently
-            self._steps_since_compaction = 0
+            logger.debug("Skipping compaction because no LLM client is attached")
             return
         transcript_text = self._format_transcript_for_compaction()
         if not transcript_text:
             self._steps_since_compaction = 0
             return
 
-        prompt_parts = ["You are summarizing an agent's progress through a text quest."]
+        prompt_parts = ["You are summarizing a quest player's progress through a text quest."]
         if self._quest_briefing:
             prompt_parts.append(f"\nQUEST BRIEFING (the original mission):\n{self._quest_briefing}")
         if self._compaction_summary:
@@ -266,7 +304,7 @@ def _maybe_compact(self) -> None:
         prompt_parts.append(f"\nTRANSCRIPT OF LAST {self._steps_since_compaction} STEPS:\n{transcript_text}")
         prompt_parts.append(
             "\nSummarize the agent's progress. Include:\n"
-            "- Current objective (what the agent should do next)\n"
+            "- Current objective (what the player should do next)\n"
             "- Progress so far (what has been accomplished)\n"
             "- Key facts (NPCs, items, locations, deadlines discovered)\n"
             "- Failed approaches (actions/paths that didn't work)\n"
@@ -276,15 +314,14 @@ def _maybe_compact(self) -> None:
 
         try:
             summary = (self.llm_client.get_completion("\n".join(prompt_parts)) or "").strip()
-        except Exception:
+        except Exception as exc:
+            logger.debug("Skipping compaction because summarization failed: %s", exc)
             self._steps_since_compaction = 0
             return
         if summary:
             self._compaction_summary = summary
             self._transcript = []
-            self._steps_since_compaction = 0
-        else:
-            self._steps_since_compaction = 0
+        self._steps_since_compaction = 0
 
     def _format_transcript_for_compaction(self) -> str:
         recent = (
@@ -311,16 +348,6 @@ def _format_transcript_for_compaction(self) -> str:
             lines.append(line)
         return "\n\n".join(lines)
 
-    def _briefing_block(self, current_state: str) -> str | None:
-        if not self._quest_briefing:
-            return None
-        if current_state.strip() == self._quest_briefing:
-            return None
-        briefing = self._quest_briefing
-        if len(briefing) > 800:
-            briefing = briefing[:800] + "..."
-        return f"Quest briefing (your mission):\n{briefing}"
-
     @staticmethod
     def _twenty_word_memo(memo: str) -> str:
         return " ".join(memo.split()[:20])
diff --git a/llm_quest_benchmark/harnesses/minimal.py b/llm_quest_benchmark/harnesses/minimal.py
index 8fdd944..462d128 100644
--- a/llm_quest_benchmark/harnesses/minimal.py
+++ b/llm_quest_benchmark/harnesses/minimal.py
@@ -20,10 +20,8 @@ def __init__(
         skip_single: bool = False,
         debug: bool = False,
         memory_module=None,
-        compaction_interval: int = 50,
         **_,
     ):
-        del compaction_interval
         super().__init__(
             model_name=model_name,
             system_template=system_template,
@@ -56,6 +54,3 @@ def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> i
             self.history.append(default_response)
             self._last_response = default_response
             return 1
-
-    def reset(self) -> None:
-        super().reset()
diff --git a/llm_quest_benchmark/harnesses/tool_harness.py b/llm_quest_benchmark/harnesses/tool_harness.py
index a4f09dd..0acc699 100644
--- a/llm_quest_benchmark/harnesses/tool_harness.py
+++ b/llm_quest_benchmark/harnesses/tool_harness.py
@@ -184,20 +184,24 @@ def _get_action_impl(self, state: str, choices: list[dict[str, str]]) -> int:
             tool_calls = self._extract_tool_calls(selection_response)
             parsed_response = self._parse_llm_response(selection_response, len(choices))
             tool_results: list[str] = []
+            final_choice_used = False
 
             total_usage = self._normalize_usage(selection_usage)
             if tool_calls:
                 tool_results = self._execute_tool_calls(tool_calls)
                 parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=tool_results)
                 total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
+                final_choice_used = True
             elif parsed_response.is_default:
                 parsed_response, final_usage = self._final_choice(contextual_state, choices, tool_results=[])
                 total_usage = self._normalize_usage(self._merge_usage(total_usage, final_usage))
+                final_choice_used = True
 
-            action_before_policy = parsed_response.action
-            parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
-            if parsed_response.action != action_before_policy and not parsed_response.reasoning:
-                parsed_response.reasoning = "policy_safety_override"
+            if not final_choice_used:
+                action_before_policy = parsed_response.action
+                parsed_response.action = self._apply_safety_filter(choices, parsed_response.action)
+                if parsed_response.action != action_before_policy and not parsed_response.reasoning:
+                    parsed_response.reasoning = "policy_safety_override"
 
             parsed_response.prompt_tokens = total_usage["prompt_tokens"]
             parsed_response.completion_tokens = total_usage["completion_tokens"]
diff --git a/llm_quest_benchmark/harnesses/tools.py b/llm_quest_benchmark/harnesses/tools.py
index 63edcd8..9978c58 100644
--- a/llm_quest_benchmark/harnesses/tools.py
+++ b/llm_quest_benchmark/harnesses/tools.py
@@ -1,4 +1,4 @@
-"""Reusable tools for harness-based quest agents."""
+"""Reusable tools for harness-based quest players."""
 
 import ast
 import re
diff --git a/llm_quest_benchmark/agents/__init__.py b/llm_quest_benchmark/players/__init__.py
similarity index 50%
rename from llm_quest_benchmark/agents/__init__.py
rename to llm_quest_benchmark/players/__init__.py
index fdd1aa6..aa71d5b 100644
--- a/llm_quest_benchmark/agents/__init__.py
+++ b/llm_quest_benchmark/players/__init__.py
@@ -1,4 +1,4 @@
-__all__ = ["QuestPlayer", "HumanPlayer", "RandomAgent"]
+__all__ = ["QuestPlayer", "HumanPlayer", "RandomPlayer"]
 
 
 def __getattr__(name):
@@ -7,11 +7,11 @@ def __getattr__(name):
 
         return QuestPlayer
     if name == "HumanPlayer":
-        from .human_player import HumanPlayer
+        from .human import HumanPlayer
 
         return HumanPlayer
-    if name == "RandomAgent":
-        from .random_agent import RandomAgent
+    if name == "RandomPlayer":
+        from .random import RandomPlayer
 
-        return RandomAgent
+        return RandomPlayer
     raise AttributeError(name)
diff --git a/llm_quest_benchmark/agents/base.py b/llm_quest_benchmark/players/base.py
similarity index 92%
rename from llm_quest_benchmark/agents/base.py
rename to llm_quest_benchmark/players/base.py
index eed7609..9e53750 100644
--- a/llm_quest_benchmark/agents/base.py
+++ b/llm_quest_benchmark/players/base.py
@@ -1,4 +1,4 @@
-"""Base classes for quest players (both human and LLM)"""
+"""Base class for quest players and harnesses."""
 
 from abc import ABC, abstractmethod
 from typing import Any
@@ -13,7 +13,7 @@ def __init__(self, skip_single: bool = False):
         """Initialize player with skip_single option"""
         self.skip_single = skip_single
         self._last_response: LLMResponse = None
-        self.agent_id = "base_agent"  # Default agent ID
+        self.agent_id = "base_player"
 
     def get_action(self, observation: str, choices: list) -> int:
         """Get action number from observation and choices
@@ -55,7 +55,7 @@ def _get_action_impl(self, observation: str, choices: list) -> int:
         pass
 
     def get_last_response(self) -> LLMResponse:
-        """Get the last response from the agent"""
+        """Get the last response from the player or harness."""
         return self._last_response
 
     @abstractmethod
diff --git a/llm_quest_benchmark/agents/human_player.py b/llm_quest_benchmark/players/human.py
similarity index 91%
rename from llm_quest_benchmark/agents/human_player.py
rename to llm_quest_benchmark/players/human.py
index 721c43d..b5d74f4 100644
--- a/llm_quest_benchmark/agents/human_player.py
+++ b/llm_quest_benchmark/players/human.py
@@ -3,7 +3,7 @@
 import logging
 from typing import Any
 
-from llm_quest_benchmark.agents.base import QuestPlayer
+from llm_quest_benchmark.players.base import QuestPlayer
 
 
 class HumanPlayer(QuestPlayer):
@@ -15,7 +15,7 @@ def __init__(self, skip_single: bool = False, debug: bool = False):
         self.logger = logging.getLogger(__name__)
         if debug:
             self.logger.setLevel(logging.DEBUG)
-        # Set agent_id for database records
+        # Keep the persisted identifier stable for existing result artifacts.
         self.agent_id = "human"
 
     def _get_action_impl(self, observation: str, choices: list) -> int:
diff --git a/llm_quest_benchmark/agents/random_agent.py b/llm_quest_benchmark/players/random.py
similarity index 75%
rename from llm_quest_benchmark/agents/random_agent.py
rename to llm_quest_benchmark/players/random.py
index e428353..a8fea29 100644
--- a/llm_quest_benchmark/agents/random_agent.py
+++ b/llm_quest_benchmark/players/random.py
@@ -1,17 +1,19 @@
-"""Random agent for testing quests"""
+"""Random player for testing quests"""
 
 import logging
 import random
 
-from llm_quest_benchmark.agents.base import QuestPlayer
+from llm_quest_benchmark.players.base import QuestPlayer
 
 
-class RandomAgent(QuestPlayer):
-    """Agent that randomly selects from available choices.
-    Used for testing quests and finding edge cases."""
+class RandomPlayer(QuestPlayer):
+    """Player that randomly selects from available choices.
+
+    Used for testing quests and finding edge cases.
+    """
 
     def __init__(self, seed: int = None, debug: bool = False, skip_single: bool = False):
-        """Initialize random agent.
+        """Initialize random player.
 
         Args:
             seed (int, optional): Random seed for reproducibility. Defaults to None.
@@ -24,7 +26,7 @@ def __init__(self, seed: int = None, debug: bool = False, skip_single: bool = Fa
         if debug:
             self.logger.setLevel(logging.DEBUG)
         self.rng = random.Random(seed)
-        # Set agent_id for database records
+        # Keep the persisted identifier stable for existing result artifacts.
         self.agent_id = f"random_{seed}" if seed is not None else "random"
 
     def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> int:
@@ -43,5 +45,5 @@ def _get_action_impl(self, observation: str, choices: list[dict[str, str]]) -> i
         return self.rng.randint(1, len(choices))
 
     def reset(self) -> None:
-        """Reset agent state - nothing to reset for random agent"""
+        """Reset player state; nothing to reset for random choice."""
         pass
diff --git a/llm_quest_benchmark/renderers/factory.py b/llm_quest_benchmark/renderers/factory.py
index 8b18218..0a8f3e5 100644
--- a/llm_quest_benchmark/renderers/factory.py
+++ b/llm_quest_benchmark/renderers/factory.py
@@ -1,7 +1,7 @@
 """Factory for creating appropriate renderers based on agent type and mode"""
 
-from llm_quest_benchmark.agents.base import QuestPlayer
-from llm_quest_benchmark.agents.human_player import HumanPlayer
+from llm_quest_benchmark.players.base import QuestPlayer
+from llm_quest_benchmark.players.human import HumanPlayer
 from llm_quest_benchmark.renderers.base import BaseRenderer
 from llm_quest_benchmark.renderers.null import NoRenderer
 from llm_quest_benchmark.renderers.progress import ProgressRenderer
@@ -25,7 +25,7 @@ def create_renderer(
     The factory follows these rules:
     1. In debug mode, always use NoRenderer
     2. For human players, use RichRenderer
-    3. For automated agents (LLM, Random):
+    3. For automated players (LLM, Random):
        - In benchmark mode (total_quests provided), use ProgressRenderer
        - Otherwise, use NoRenderer
     """
diff --git a/llm_quest_benchmark/renderers/progress.py b/llm_quest_benchmark/renderers/progress.py
index 9d2cde9..a5097d2 100644
--- a/llm_quest_benchmark/renderers/progress.py
+++ b/llm_quest_benchmark/renderers/progress.py
@@ -45,23 +45,23 @@ def __init__(self, total_quests: int, total_runs: int):
         self.console.print("\n[bold cyan]Benchmark Progress[/]")
 
     def render_game_state(self, state: dict[str, Any]) -> None:
-        """No game state rendering needed for automated agents"""
+        """Intentionally a no-op: automated players need no game state rendering."""
         pass
 
     def render_title(self) -> None:
-        """No title rendering needed for automated agents"""
+        """Intentionally a no-op: automated players need no title rendering."""
         pass
 
     def render_quest_text(self, text: str) -> None:
-        """No quest text rendering needed for automated agents"""
+        """Intentionally a no-op: automated players need no quest text rendering."""
         pass
 
     def render_choices(self, choices: list) -> None:
-        """No choices rendering needed for automated agents"""
+        """Intentionally a no-op: automated players need no choices rendering."""
         pass
 
     def render_parameters(self, params: list) -> None:
-        """No parameters rendering needed for automated agents"""
+        """Intentionally a no-op: automated players need no parameters rendering."""
         pass
 
     def render_error(self, message: str) -> None:
diff --git a/llm_quest_benchmark/tests/agents/test_mode_agents.py b/llm_quest_benchmark/tests/agents/test_mode_agents.py
deleted file mode 100644
index a41a11a..0000000
--- a/llm_quest_benchmark/tests/agents/test_mode_agents.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""Legacy agent-mode tests retired.
-
-Planner/tool/memo behavior now lives in
-``llm_quest_benchmark.tests.harnesses.test_harnesses``.
-"""
diff --git a/llm_quest_benchmark/tests/executors/cli/test_commands.py b/llm_quest_benchmark/tests/executors/cli/test_commands.py
index a3825cd..1bd972e 100644
--- a/llm_quest_benchmark/tests/executors/cli/test_commands.py
+++ b/llm_quest_benchmark/tests/executors/cli/test_commands.py
@@ -19,7 +19,7 @@ def test_version():
 
 
 def test_run_quest():
-    """Test running a quest with random agent"""
+    """Test running a quest with random player"""
     result = runner.invoke(
         app,
         ["run", "--quest", str(DEFAULT_QUEST), "--model", "random_choice", "--harness", "random_choice", "--debug"],
diff --git a/llm_quest_benchmark/tests/agents/test_anthropic.py b/llm_quest_benchmark/tests/harnesses/test_anthropic.py
similarity index 100%
rename from llm_quest_benchmark/tests/agents/test_anthropic.py
rename to llm_quest_benchmark/tests/harnesses/test_anthropic.py
diff --git a/llm_quest_benchmark/tests/agents/test_llm_agent.py b/llm_quest_benchmark/tests/harnesses/test_base.py
similarity index 100%
rename from llm_quest_benchmark/tests/agents/test_llm_agent.py
rename to llm_quest_benchmark/tests/harnesses/test_base.py
diff --git a/llm_quest_benchmark/tests/harnesses/test_factory.py b/llm_quest_benchmark/tests/harnesses/test_factory.py
index 31c3ae2..49062fe 100644
--- a/llm_quest_benchmark/tests/harnesses/test_factory.py
+++ b/llm_quest_benchmark/tests/harnesses/test_factory.py
@@ -1,10 +1,10 @@
 import pytest
 
-from llm_quest_benchmark.agents.human_player import HumanPlayer
-from llm_quest_benchmark.agents.random_agent import RandomAgent
 from llm_quest_benchmark.harnesses.factory import HARNESS_REGISTRY, create_harness
 from llm_quest_benchmark.harnesses.memo import MemoCompactHarness
 from llm_quest_benchmark.harnesses.minimal import MinimalHarness
+from llm_quest_benchmark.players.human import HumanPlayer
+from llm_quest_benchmark.players.random import RandomPlayer
 from llm_quest_benchmark.schemas.config import BenchmarkConfig, HarnessConfig
 
 
@@ -30,13 +30,13 @@ def test_create_human_harness():
 def test_create_random_choice_harness():
     harness = create_harness("random_choice")
 
-    assert isinstance(harness, RandomAgent)
+    assert isinstance(harness, RandomPlayer)
 
 
 def test_create_seeded_random_choice_harness():
     harness = create_harness("random_choice_123", model="random_choice")
 
-    assert isinstance(harness, RandomAgent)
+    assert isinstance(harness, RandomPlayer)
     assert harness.agent_id == "random_123"
 
 
diff --git a/llm_quest_benchmark/tests/harnesses/test_harnesses.py b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
index 3cba73e..efa03bb 100644
--- a/llm_quest_benchmark/tests/harnesses/test_harnesses.py
+++ b/llm_quest_benchmark/tests/harnesses/test_harnesses.py
@@ -139,7 +139,7 @@ def test_compaction_memory_receives_existing_llm_client():
     assert action == 2
     assert harness.memory_module.llm_client is mocked_llm
     assert harness.memory_module._compaction_summary == "Summary: paid the fuel merchant and should keep receipt."
-    assert harness._steps_since_compaction == 0
+    assert harness.memory_module.steps_since_compaction == 0
 
 
 def test_planner_harness_first_turn_generates_plan_then_acts():
@@ -186,8 +186,8 @@ def test_planner_harness_reuses_plan_when_state_is_stable():
 
 def test_planner_harness_uses_contextual_memory_state():
     harness = PlannerHarness(model_name="gpt-5-mini", compaction_interval=50)
-    harness._quest_briefing = "Original mission: win the election."
-    harness._transcript = [
+    harness.memory_module.set_quest_briefing("Original mission: win the election.")
+    harness.memory_module.transcript = [
         {
             "step": 1,
             "observation": "You learned Maloqs value strength.",
@@ -196,7 +196,7 @@ def test_planner_harness_uses_contextual_memory_state():
             "action": 1,
         }
     ]
-    harness._steps_since_compaction = 1
+    harness.memory_module.steps_since_compaction = 1
     mocked_llm = Mock()
     mocked_llm.get_completion.side_effect = [
         "Use the remembered cultural clue.",
@@ -323,8 +323,8 @@ def test_tool_compact_harness_can_use_scratchpad_tool_call():
 
 def test_tool_compact_harness_uses_contextual_memory_state():
     harness = ToolCompactHarness(model_name="gpt-5-mini", compaction_interval=50)
-    harness._quest_briefing = "Original mission: pass pilot certification."
-    harness._transcript = [
+    harness.memory_module.set_quest_briefing("Original mission: pass pilot certification.")
+    harness.memory_module.transcript = [
         {
             "step": 1,
             "observation": "Hogger is greedy.",
@@ -333,7 +333,7 @@ def test_tool_compact_harness_uses_contextual_memory_state():
             "action": 1,
         }
     ]
-    harness._steps_since_compaction = 1
+    harness.memory_module.steps_since_compaction = 1
     mocked_llm = Mock()
     mocked_llm.get_completion.return_value = (
         '{"memo":"Hogger is greedy","analysis":"no tools needed","tool_calls":[],"result":1}'
diff --git a/llm_quest_benchmark/tests/integration/test_quest_e2e.py b/llm_quest_benchmark/tests/integration/test_quest_e2e.py
index 86568bb..3d02d1a 100644
--- a/llm_quest_benchmark/tests/integration/test_quest_e2e.py
+++ b/llm_quest_benchmark/tests/integration/test_quest_e2e.py
@@ -63,13 +63,13 @@ def mock_callback(event: str, data: Any) -> None:
 
 @pytest.mark.e2e
 @pytest.mark.timeout(TIMEOUT)
-def test_random_agent_on_test_quest(caplog):
-    """Test that random agent can complete a test quest"""
+def test_random_player_on_test_quest(caplog):
+    """Test that random player can complete a test quest"""
     caplog.set_level(logging.DEBUG)  # Show all logs in test output
 
-    # Create random agent
+    # Create random player
     agent = create_harness("random_choice", skip_single=True, debug=True)
-    assert agent is not None, "Failed to create random agent"
+    assert agent is not None, "Failed to create random player"
 
     # Mock callback for testing
     def mock_callback(event: str, data: Any) -> None:
@@ -80,7 +80,7 @@ def mock_callback(event: str, data: Any) -> None:
         elif event == "error":
             caplog.error(f"Error: {data}")
 
-    # Run quest with random agent
+    # Run quest with random player
     try:
         outcome = run_quest_with_timeout(
             quest_path=str(DEFAULT_QUEST),
diff --git a/llm_quest_benchmark/tests/agents/test_human_player.py b/llm_quest_benchmark/tests/players/test_human_player.py
similarity index 95%
rename from llm_quest_benchmark/tests/agents/test_human_player.py
rename to llm_quest_benchmark/tests/players/test_human_player.py
index 8334ebd..7108f78 100644
--- a/llm_quest_benchmark/tests/agents/test_human_player.py
+++ b/llm_quest_benchmark/tests/players/test_human_player.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from llm_quest_benchmark.agents.human_player import HumanPlayer
+from llm_quest_benchmark.players.human import HumanPlayer
 
 
 def test_human_player_initialization():
diff --git a/llm_quest_benchmark/tests/test_database.py b/llm_quest_benchmark/tests/test_database.py
index a04f53d..d00c6b1 100644
--- a/llm_quest_benchmark/tests/test_database.py
+++ b/llm_quest_benchmark/tests/test_database.py
@@ -249,8 +249,8 @@ def test_run_summary_export_tracks_repetition_rate(tmp_path, monkeypatch, quest_
     assert exported["metrics"]["bad_decision_rate"] == 0.0
 
 
-def test_random_agent_does_not_export_json(tmp_path, monkeypatch, quest_logger):
-    """Random agent runs should not create result artifacts in results/."""
+def test_random_player_does_not_export_json(tmp_path, monkeypatch, quest_logger):
+    """Random player runs should not create result artifacts in results/."""
     monkeypatch.setattr(logging_module, "RESULTS_DIR", tmp_path)
 
     quest_logger.agent = "random_choice"

From 1fe3930742e2f15d21aa38cf71d42bad08fcdb6c Mon Sep 17 00:00:00 2001
From: Kirill Korikov <11762090+yourconscience@users.noreply.github.com>
Date: Wed, 13 May 2026 20:03:17 +0400
Subject: [PATCH 24/24] fix harness leaderboard memory mode

---
 llm_quest_benchmark/core/leaderboard.py       |  2 +-
 llm_quest_benchmark/tests/test_leaderboard.py | 41 +++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/llm_quest_benchmark/core/leaderboard.py b/llm_quest_benchmark/core/leaderboard.py
index dc0a67b..078648e 100644
--- a/llm_quest_benchmark/core/leaderboard.py
+++ b/llm_quest_benchmark/core/leaderboard.py
@@ -385,7 +385,7 @@ def generate_leaderboard(
             template_from_config = str(config.get("action_template") or "")
             if template_from_config:
                 template = template_from_config
-            memory_mode = config.get("memory_mode")
+            memory_mode = config.get("memory_mode") or result_row.get("memory_mode")
             if _is_retired_result(
                 str(source_name) if source_name else None,
                 str(benchmark_id) if benchmark_id else None,
diff --git a/llm_quest_benchmark/tests/test_leaderboard.py b/llm_quest_benchmark/tests/test_leaderboard.py
index 28a3e31..46407cf 100644
--- a/llm_quest_benchmark/tests/test_leaderboard.py
+++ b/llm_quest_benchmark/tests/test_leaderboard.py
@@ -377,3 +377,44 @@ def test_generate_leaderboard_matches_db_runs_by_identifiers(tmp_path, monkeypat
     rows = {(row["quest"], row["mode"]): row for row in leaderboard["results"]}
     assert rows[("Alpha", "compact_memory_memo")]["avg_steps"] == 10.0
     assert rows[("Beta", "full_history_reasoning")]["avg_steps"] == 20.0
+
+
+def test_generate_leaderboard_uses_result_row_memory_mode_without_db_config(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)  # relative results/ and site/ paths below resolve under tmp_path
+    benchmark_dir = Path("results/benchmarks/bench_result_memory_mode")
+    benchmark_dir.mkdir(parents=True, exist_ok=True)
+    results = [  # exported result row: the only place this fixture sets "memory_mode"
+        {
+            "quest": "quests/Beta.qm",
+            "model": "gpt-5-mini",
+            "template": "reasoning.jinja",
+            "memory_mode": "full_transcript",
+            "agent_id": "harness_gpt-5-mini",
+            "outcome": "SUCCESS",
+        }
+    ]
+    db_runs = [  # DB run for the same quest/agent_id; its agent_config has no "memory_mode"
+        {
+            "id": 20,
+            "quest_file": "quests/Beta.qm",
+            "quest_name": "Beta",
+            "agent_id": "harness_gpt-5-mini",
+            "agent_config": json.dumps({"model": "gpt-5-mini", "harness": "reasoning_full"}),
+            "outcome": "SUCCESS",
+        }
+    ]
+    (benchmark_dir / "benchmark_summary.json").write_text(
+        json.dumps(
+            {"benchmark_id": "bench_result_memory_mode", "harnesses": [], "results": results, "db_runs": db_runs}
+        ),
+        encoding="utf-8",
+    )
+
+    leaderboard = generate_leaderboard(
+        [str(benchmark_dir)],
+        "site/leaderboard.json",
+        min_runs=0,
+        public_model_ids=None,
+    )
+
+    assert leaderboard["results"][0]["mode"] == "full_history_reasoning"  # presumably set via the row's memory_mode