From 1fb6a9e8151abe4feea39cb550a3b78ee0873a14 Mon Sep 17 00:00:00 2001 From: Aaron Landy Date: Wed, 6 May 2026 17:24:27 -0700 Subject: [PATCH 1/4] =?UTF-8?q?Add=20Among=20Them=20SDK=20=E2=80=94=20Phas?= =?UTF-8?q?e=200/1=20with=20cogames=20packaging=20+=20dev=20loop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A Cursor-SDK-style Python harness for authoring Among Them policy bots. Wraps `evidencebot_v2` via in-process FFI as the default scripted policy and exposes an `instructions=` parameter plus module-swap kwargs (`voter=`, `chatter=`, `reporter=`, ...) for LLM-augmented cognition. What's in this iteration - `among_them/sdk/` — full Python package (~3.5k LOC) - `Agent.create()` / `LiveGame` / `LocalSDKPolicy` / `SDKPolicy` (cogames `MultiAgentPolicy` entrypoint composing `EvidenceBotV2NimPolicy`) - `_DirectiveOverrideEngine` shared between local + tournament code paths - `Directives` Pydantic schema parsed from natural-language instructions via the existing `cognition/llm.py` provider (deterministic keyword fallback when no API key is set — required for cogames Docker validator) - `among_them_sdk.package` CLI for emitting `cogames upload` bundles - 15 runnable examples (hello, instructions, personas, custom voter/reporter, LLM chatter, mixed modules, A/B test, win-rate loop, transcript logger, debug directives, provider switch, tournament, eight_player_game, variant_arena head-to-head) - 5 test files / 25+ tests passing under `uv run pytest` - `among_them/sdk/docs/` — python-guide, tournament-submission, local-iteration-guide - `among_them/players/sdk/DESIGN.md` — architecture + phased roadmap - `among_them/server.nim` — drop a duplicate `liveProgressMaxTick` proc that blocked compilation Tournament submission status Validated end-to-end via `cogames upload --season among-them` after two fixes: - `SDKPolicy` now resolves `evidencebot_v2_policy` at runtime by walking up to find `among_them/players/` (cogames only puts 
the entry-point package's dir on `sys.path`) - Bundle now includes `among_them/votereader.nim` (recently added Nim dep that `evidencebot_v2.nim` imports) Known follow-ups (in flight in parallel iterations) - LLM integration design doc (`docs/llm-integration.md`) - Cross-game opponent modeling (`opponents/` subpackage) - `--persona` shortcut on `eight_player_game.py` - Voter/Chatter advisory surfacing in result block Co-authored-by: Cursor --- among_them/players/sdk/DESIGN.md | 772 +++++++++++++++++ among_them/sdk/README.md | 183 ++++ among_them/sdk/docs/local-iteration-guide.md | 518 ++++++++++++ among_them/sdk/docs/python-guide.md | 493 +++++++++++ among_them/sdk/docs/tournament-submission.md | 220 +++++ among_them/sdk/examples/_arena_common.py | 248 ++++++ among_them/sdk/examples/_variant_worker.py | 193 +++++ .../sdk/examples/ab_test_instructions.py | 113 +++ among_them/sdk/examples/custom_reporter.py | 74 ++ among_them/sdk/examples/custom_voter.py | 19 + among_them/sdk/examples/debug_directives.py | 62 ++ among_them/sdk/examples/eight_player_game.py | 751 +++++++++++++++++ among_them/sdk/examples/hello.py | 7 + among_them/sdk/examples/instructions.py | 23 + among_them/sdk/examples/llm_chatter.py | 19 + among_them/sdk/examples/mixed_modules.py | 96 +++ among_them/sdk/examples/personas.py | 81 ++ among_them/sdk/examples/provider_switch.py | 62 ++ among_them/sdk/examples/tournament.py | 24 + among_them/sdk/examples/transcript_logger.py | 83 ++ among_them/sdk/examples/variant_arena.py | 796 ++++++++++++++++++ among_them/sdk/examples/win_rate_loop.py | 97 +++ among_them/sdk/pyproject.toml | 51 ++ among_them/sdk/src/among_them_sdk/__init__.py | 128 +++ among_them/sdk/src/among_them_sdk/_cyborg.py | 102 +++ among_them/sdk/src/among_them_sdk/agent.py | 385 +++++++++ .../sdk/src/among_them_sdk/cogames_config.py | 241 ++++++ .../src/among_them_sdk/cognition/__init__.py | 16 + .../among_them_sdk/cognition/instructions.py | 218 +++++ 
.../sdk/src/among_them_sdk/cognition/llm.py | 209 +++++ .../sdk/src/among_them_sdk/cognition/tools.py | 140 +++ among_them/sdk/src/among_them_sdk/config.py | 91 ++ .../sdk/src/among_them_sdk/extensions.py | 62 ++ among_them/sdk/src/among_them_sdk/ffi.py | 278 ++++++ among_them/sdk/src/among_them_sdk/hooks.py | 40 + .../sdk/src/among_them_sdk/live_game.py | 520 ++++++++++++ .../src/among_them_sdk/modules/__init__.py | 39 + .../sdk/src/among_them_sdk/modules/chatter.py | 102 +++ .../sdk/src/among_them_sdk/modules/memory.py | 102 +++ .../src/among_them_sdk/modules/navigator.py | 42 + .../src/among_them_sdk/modules/perception.py | 62 ++ .../src/among_them_sdk/modules/reporter.py | 48 ++ .../sdk/src/among_them_sdk/modules/voter.py | 125 +++ among_them/sdk/src/among_them_sdk/package.py | 386 +++++++++ .../sdk/src/among_them_sdk/policy/__init__.py | 27 + .../sdk/src/among_them_sdk/policy/cogames.py | 550 ++++++++++++ .../among_them_sdk/policy/evidencebot_v2.py | 216 +++++ among_them/sdk/src/among_them_sdk/runner.py | 50 ++ among_them/sdk/src/among_them_sdk/runtime.py | 157 ++++ among_them/sdk/src/among_them_sdk/tracing.py | 64 ++ among_them/sdk/src/among_them_sdk/wire.py | 125 +++ among_them/sdk/tests/__init__.py | 0 among_them/sdk/tests/test_agent_default.py | 38 + .../sdk/tests/test_cogames_packaging.py | 244 ++++++ among_them/sdk/tests/test_ffi_load.py | 31 + among_them/sdk/tests/test_instructions.py | 58 ++ among_them/sdk/tests/test_module_override.py | 72 ++ among_them/sdk/uv.lock | 653 ++++++++++++++ among_them/server.nim | 20 - 59 files changed, 10606 insertions(+), 20 deletions(-) create mode 100644 among_them/players/sdk/DESIGN.md create mode 100644 among_them/sdk/README.md create mode 100644 among_them/sdk/docs/local-iteration-guide.md create mode 100644 among_them/sdk/docs/python-guide.md create mode 100644 among_them/sdk/docs/tournament-submission.md create mode 100644 among_them/sdk/examples/_arena_common.py create mode 100644 
among_them/sdk/examples/_variant_worker.py create mode 100644 among_them/sdk/examples/ab_test_instructions.py create mode 100644 among_them/sdk/examples/custom_reporter.py create mode 100644 among_them/sdk/examples/custom_voter.py create mode 100644 among_them/sdk/examples/debug_directives.py create mode 100644 among_them/sdk/examples/eight_player_game.py create mode 100644 among_them/sdk/examples/hello.py create mode 100644 among_them/sdk/examples/instructions.py create mode 100644 among_them/sdk/examples/llm_chatter.py create mode 100644 among_them/sdk/examples/mixed_modules.py create mode 100644 among_them/sdk/examples/personas.py create mode 100644 among_them/sdk/examples/provider_switch.py create mode 100644 among_them/sdk/examples/tournament.py create mode 100644 among_them/sdk/examples/transcript_logger.py create mode 100644 among_them/sdk/examples/variant_arena.py create mode 100644 among_them/sdk/examples/win_rate_loop.py create mode 100644 among_them/sdk/pyproject.toml create mode 100644 among_them/sdk/src/among_them_sdk/__init__.py create mode 100644 among_them/sdk/src/among_them_sdk/_cyborg.py create mode 100644 among_them/sdk/src/among_them_sdk/agent.py create mode 100644 among_them/sdk/src/among_them_sdk/cogames_config.py create mode 100644 among_them/sdk/src/among_them_sdk/cognition/__init__.py create mode 100644 among_them/sdk/src/among_them_sdk/cognition/instructions.py create mode 100644 among_them/sdk/src/among_them_sdk/cognition/llm.py create mode 100644 among_them/sdk/src/among_them_sdk/cognition/tools.py create mode 100644 among_them/sdk/src/among_them_sdk/config.py create mode 100644 among_them/sdk/src/among_them_sdk/extensions.py create mode 100644 among_them/sdk/src/among_them_sdk/ffi.py create mode 100644 among_them/sdk/src/among_them_sdk/hooks.py create mode 100644 among_them/sdk/src/among_them_sdk/live_game.py create mode 100644 among_them/sdk/src/among_them_sdk/modules/__init__.py create mode 100644 
among_them/sdk/src/among_them_sdk/modules/chatter.py create mode 100644 among_them/sdk/src/among_them_sdk/modules/memory.py create mode 100644 among_them/sdk/src/among_them_sdk/modules/navigator.py create mode 100644 among_them/sdk/src/among_them_sdk/modules/perception.py create mode 100644 among_them/sdk/src/among_them_sdk/modules/reporter.py create mode 100644 among_them/sdk/src/among_them_sdk/modules/voter.py create mode 100644 among_them/sdk/src/among_them_sdk/package.py create mode 100644 among_them/sdk/src/among_them_sdk/policy/__init__.py create mode 100644 among_them/sdk/src/among_them_sdk/policy/cogames.py create mode 100644 among_them/sdk/src/among_them_sdk/policy/evidencebot_v2.py create mode 100644 among_them/sdk/src/among_them_sdk/runner.py create mode 100644 among_them/sdk/src/among_them_sdk/runtime.py create mode 100644 among_them/sdk/src/among_them_sdk/tracing.py create mode 100644 among_them/sdk/src/among_them_sdk/wire.py create mode 100644 among_them/sdk/tests/__init__.py create mode 100644 among_them/sdk/tests/test_agent_default.py create mode 100644 among_them/sdk/tests/test_cogames_packaging.py create mode 100644 among_them/sdk/tests/test_ffi_load.py create mode 100644 among_them/sdk/tests/test_instructions.py create mode 100644 among_them/sdk/tests/test_module_override.py create mode 100644 among_them/sdk/uv.lock diff --git a/among_them/players/sdk/DESIGN.md b/among_them/players/sdk/DESIGN.md new file mode 100644 index 00000000..2f398974 --- /dev/null +++ b/among_them/players/sdk/DESIGN.md @@ -0,0 +1,772 @@ +# among-them-sdk — Python SDK Design + +> A Cursor‑SDK‑style developer experience for authoring **Among Them** policy +> bots in Python: pure scripted, pure LLM, or any mix of the two — same API, +> same harness, same observability. + +--- + +## 1. 
Executive summary + +Today, writing a competitive Among Them bot means either (a) writing Nim and +shipping a recompiled `.dylib` for the CoGames pipeline +(`among_them/players/build_evidencebot_v2.py:28-51`), or (b) re‑implementing the +WebSocket protocol and perception loop in Python under `bot-policies/sidecar/` +(`among_them/bot-policies/sidecar/bot.py:37-53`). Neither path lets a developer +"just write the brain" — both force them to own the protocol, the localization +math, the bitmask actuator, and a custom build/launch story. + +**among-them-sdk** is a Python package (`pip install among-them-sdk`) that +ships a competitive scripted policy out of the box and lets authors **swap any +cognitive module for a Python function or an LLM call without touching the +perception/actuation pipeline**. It borrows naming and DX directly from the +Cursor TypeScript SDK (`Agent.create`, `agent.send`, `run.stream`, +`hooks.json`, `skills/`, subagents) and the OpenAI Agents SDK (`Runner`, +`tool` decorator, lifecycle hooks, tracing). + +**Success criteria for DX** + +1. **5‑line hello world** that runs a competitive bot in local sim with zero + config beyond `pip install among-them-sdk`. +2. **One‑line LLM mix‑in**: `voter=LLMVoter(LLM("gpt-5.5"))` swaps voting only; + everything else stays scripted. +3. **No Nim required** for pure‑Python authors; **Nim policy reuse** + available via an FFI runtime when authors want the optimized core. +4. **One config knob to pick the runtime**: in‑process local sim, subprocess + tournament harness, or remote `games_server` connection. +5. **Tracing that "just works"**: every tick, decision, and LLM call is + observable in Langfuse and on disk via structlog. + +**Five‑line hello world** + +```python +from among_them import Agent + +agent = Agent.create() # default = evidencebot_v2-equivalent +agent.run_local(n_games=10, render=False) # in-process sim, no LLM, no API keys +``` + +--- + +## 2. 
External research summary + +I surveyed five agent SDKs and codified the patterns we should adopt. + +**Cursor TypeScript SDK** (`@cursor/sdk`) — `Agent.create({ apiKey, model, +local|cloud })` → `agent.send(prompt)` → `run.stream()`. Runtime is a single +field swap (`local: { cwd }` vs `cloud: { repos, autoCreatePR }`). Skills, +hooks, MCP, and subagents are all filesystem‑driven (`.cursor/skills/`, +`.cursor/hooks.json`, `.cursor/mcp.json`). DX gut feel: opinionated, minimal +ceremony, runtime swap is the killer feature. + +**Anthropic Claude Agent SDK** (`claude-agent-sdk`) — top‑level `query(prompt, +options=ClaudeAgentOptions(...))` async generator; subagents declared inline as +`AgentDefinition`s; hooks are typed callbacks (`PreToolUse`, `PostToolUse`, +`SessionStart`, …) registered via `HookMatcher`. Skills loaded from +`.claude/skills/*/SKILL.md`. DX gut feel: heavy on filesystem conventions, +strong hook taxonomy, weak ergonomic story for stateful long‑running agents. + +**Vercel AI SDK** — `new ToolLoopAgent({ model, tools, stopWhen })` with the +loop, context, and stop conditions handled internally; `tool({ description, +inputSchema, execute })` is the canonical tool factory; `prepareStep` +intercepts every loop iteration. Provider model strings are AI Gateway +addresses (e.g. `"openai/gpt-5.5"`). DX gut feel: best‑in‑class tool loop, +provider unification. + +**OpenAI Agents SDK (Python)** — `Agent(name, instructions, tools, handoffs, +model)` + `Runner.run_sync(agent, prompt)`; tools are `@function_tool` +decorators, automatic Pydantic schema; sessions are first‑class; built‑in +tracing; handoffs are an explicit primitive. DX gut feel: smallest primitive +set, "Python‑first" — what a Python‑native game SDK should imitate. + +**LangGraph + Pydantic AI** — LangGraph is graph/state‑machine flavored +(`StateGraph`, nodes, edges) — too heavyweight for a tick loop. 
Pydantic AI +gives typed agents, dependency injection (`deps_type`), and `@agent.tool` +decorators — worth borrowing the typed‑deps idea so cognitive modules can be +constructor‑injected. + +**What to steal** + +- `Agent.create(...)` factory + `agent.send(...)` + `run.stream()` from Cursor + SDK — primary external surface. +- `Runner.run_sync(agent, ...)` and lifecycle hooks (`AgentHooks`, + `RunHooks`) from OpenAI Agents SDK — Python‑idiomatic batch orchestration. +- `ToolLoopAgent`/`stopWhen`/`prepareStep` semantics from Vercel AI SDK for + the LLM‑driven decision loops (voting, chat). +- `tool()` factory + Pydantic schemas (Vercel/OpenAI) for any LLM tool we + expose. +- Filesystem conventions (`.among-them/hooks.json`, `among_them/skills/`, + `among-them.toml`) from Cursor + Claude Agent SDK. +- `AgentDefinition` for declaring subagents inline (Claude Agent SDK). +- AI Gateway model strings (Vercel) so model choice is a single string, not a + provider import. +- Pydantic AI's `deps_type` idea for typed dependency injection of cognitive + modules. + +**What NOT to steal** + +- LangGraph's explicit graph DSL — wrong abstraction for a tick loop. +- Claude Agent SDK's permission‑prompt flow — irrelevant for a game policy. +- Cursor SDK's "cloud VM with PR" runtime — we don't need PRs; cloud means + "submitted to `games_server`." +- OpenAI Agents SDK's `handoffs` as a top‑level primitive — overkill; we model + this with subagents instead. +- Heavy tracing UIs as the only observability story — we'll be Langfuse‑first + but offer a zero‑dependency structlog default. + +**Naming we adopt**: `Agent.create`, `agent.send`, `run.stream`, `Runner`, +`@tool`, `hooks.json`, `skills/`, `subagents`, `local`/`remote` runtime keys. + +--- + +## 3. Current state + pain points + +A policy author today walks into a thicket. 
The cliff notes from a thorough +read of the existing code: + +- **Two parallel author paths exist.** The CoGames/tournament path runs Nim + via ctypes (`among_them/players/evidencebot_v2_policy.py:56-122`, ABI version + pinned at `EVIDENCEBOT_V2_ABI_VERSION = 1` in + `among_them/players/build_evidencebot_v2.py:25`). The "smart bot" path runs + Python that re‑implements the wire protocol + (`among_them/bot-policies/sidecar/bot.py:37-53`). Neither path lets a Python + author drop into the modular Nim pipeline directly. +- **The Nim pipeline is already modular** — `decideNextMaskCore` orchestrates + perception → localization → tasks/motion/evidence → policy + (`among_them/players/modulabot/bot.nim:355-501`); imposter/crewmate ladders + live in `policy_imp.nim` / `policy_crew.nim` + (`among_them/players/modulabot/policy_imp.nim:1-67`, + `among_them/players/modulabot/policy_crew.nim:35-50`). A Python SDK should + mirror this pipeline 1:1. +- **LLM seams already exist, but only in Nim.** `mod_talks` adds an + `LlmDispatcher` for non‑blocking subprocess/HTTP completion + (`among_them/players/mod_talks/llm_dispatch.nim:46-119`) and an + `LlmVotingState` machine (`among_them/players/mod_talks/llm.nim:64-75`), + guarded by `when defined(modTalksLlm)` (`among_them/players/mod_talks/llm.nim:16-22`). + Python can and should reuse the **structured JSON context** they emit + (`among_them/players/mod_talks/LLM_VOTING.md:54-73`). +- **The Python sidecar already has a clean cognitive split**: `Trigger` → + `WorkingMemory` → `Narrator` (context builder) → `Advisor` (LLM) → + `Directive` actuator (`among_them/bot-policies/sidecar/triggers.py:69-99`, + `among_them/bot-policies/sidecar/memory.py:60-73`, + `among_them/bot-policies/sidecar/advisor.py:14-61`). This is the right + template for the SDK's cognitive layering — just generalized and packaged. 
+- **Server‑side launch is via Docker manifests** — + `coplayer_manifest.json` scanned per game + (`games_server/games_server.nim:543-553`), Docker containers spun up by + `startWaitingBots` (`games_server/games_server.nim:1695-1742`), capped at 16 + players (`games_server/games_server.nim:14`). The SDK must produce a manifest + + container/script entry point that `games_server` can launch unmodified. +- **Pain points authors hit today**: localization CPU + (`among_them/players/how_to_make_a_bot.md:119-141`), interstitial detection + (`among_them/players/how_to_make_a_bot.md:166-179`), task completion timing + (`among_them/players/how_to_make_a_bot.md:292-307`), ABI mismatches forcing + rebuilds (`among_them/players/evidencebot_v2_policy.py:173-199`), duplicated + protocol constants between Python and Nim (`among_them/bot-policies/sidecar/bot.py:37-44` + vs `common/protocol.nim:4-25`), and SSL `-d:ssl` requirement for HTTPS in + Nim LLM provider (`among_them/players/mod_talks/llm_provider.nim:49-59`). + +The SDK's job is to absorb every one of these pain points into the default +configuration, so authors only write what they want to change. + +--- + +## 4. Proposed Python SDK API + +### 4.1 Top‑level surface + +```python +from among_them import Agent, Runner, tool, hooks +from among_them.modules import Perception, Memory, Voter, Navigator, Chatter +from among_them.providers import LLM, AIGateway +from among_them.runtimes import LocalSim, Subprocess, RemoteServer +``` + +Three objects matter: + +- **`Agent`** — the policy. Stateful across ticks of a single game. Created + via `Agent.create(...)`. Composes cognitive modules. +- **`Runner`** — orchestration. Picks a runtime, runs N games (sequential or + parallel), collects results, drives tracing. Borrowed from OpenAI Agents SDK. +- **`Module`** — the constructor‑injectable unit of cognition. `Perception`, + `Memory`, `Voter`, `Navigator`, `Chatter`, `Reporter` are the canonical six. 
+ +### 4.2 `Agent.create()` shape + +```python +@dataclass +class AgentConfig: + role_hint: Literal["auto", "crewmate", "imposter"] = "auto" + perception: Perception = ScriptedPerception() # localization, sprites + memory: Memory = WorkingMemory() # tiered memory + diff log + voter: Voter = ScriptedVoter() # default = evidence ladder + navigator: Navigator = ScriptedNavigator() # path/motion masks + chatter: Chatter = SilentChatter() # default: emit nothing + reporter: Reporter = ScriptedReporter() # body-report heuristic + hooks: AgentHooks = AgentHooks() + skills_dir: Path | None = Path("among_them/skills") + trace: Tracer = StructlogTracer() + +class Agent: + @classmethod + def create(cls, **overrides) -> "Agent": ... + async def send(self, observation: Frame) -> Decision: ... + async def connect(self, runtime: Runtime) -> "Run": ... +``` + +The defaults are **the entire evidencebot_v2 policy ported to Python** — no +LLM, no API key, competitive at submission time. Every override is a one‑line +swap. + +### 4.3 Runtime abstraction + +Borrowing Cursor SDK's `local | cloud` split. Three runtimes, one type: + +```python +class Runtime(Protocol): + async def stream_observations(self) -> AsyncIterator[Frame]: ... + async def submit_action(self, mask: ActionMask) -> None: ... + +LocalSim(seed=42, n_players=8, role_assignment="auto") # in-process +Subprocess(binary="evidencebot", config_dir=...) # for tournaments +RemoteServer(url="wss://games.softmax.dev/player", token=...) # live games +``` + +```python +agent = Agent.create() +run = await agent.connect(LocalSim()) # or RemoteServer(...) +async for event in run.stream(): + print(event) # Tick | MeetingStart | Vote | Kill | GameOver +``` + +This means **the same Agent runs in unit tests, tournaments, and the live +server** without code changes. `LocalSim` reuses the `MultiAgentPolicy` ABI +already defined in `among_them/players/evidencebot_v2_policy.py:99-117`. 
+ +### 4.4 Modular cognition + +Each module is an `abc.ABC` with one obvious method. Replacement is a +constructor kwarg: + +```python +class Voter(ABC): + async def vote(self, ctx: VotingContext) -> Vote: ... + +class Navigator(ABC): + def step(self, state: BotState) -> ActionMask: ... + +class Perception(ABC): + def perceive(self, frame: Frame, state: BotState) -> Percept: ... + +# ... Memory, Chatter, Reporter analogous +``` + +This is the Pydantic‑AI / OpenAI Agents SDK pattern (typed deps as +constructor args), specialized to our six modules. The pipeline that consumes +them mirrors `decideNextMaskCore` +(`among_them/players/modulabot/bot.nim:355-501`): + +``` +Frame ─▶ Perception ─▶ Memory ─▶ (Reporter | Voter | Chatter | Navigator) ─▶ ActionMask +``` + +### 4.5 LLM mix‑in (the headline feature) + +```python +from among_them.providers import LLM + +agent = Agent.create( + voter=LLMVoter(LLM("gpt-5.5"), prompt="among_them/skills/voting.md"), + chatter=LLMChatter(LLM("anthropic/claude-opus-4.7"), tone="suspicious"), +) +``` + +`LLMVoter` and `LLMChatter` are concrete `Voter`/`Chatter` subclasses that +internally run a **`ToolLoop`** (Vercel AI SDK pattern). The agent stays +scripted everywhere else; only voting and chat go through an LLM. + +### 4.6 Tool‑loop pattern + +For LLM‑driven decisions we ship a thin `ToolLoop`: + +```python +@tool +def accuse(player_id: str, reason: str) -> Vote: + """Vote to eject a player. 
`reason` will be posted in chat.""" + return Vote(target=player_id, reason=reason) + +@tool +def skip() -> Vote: + """Skip voting this round.""" + return Vote.SKIP + +class LLMVoter(Voter): + def __init__(self, llm: LLM, tools: list = (accuse, skip)): + self._loop = ToolLoop(llm=llm, tools=tools, stop_when=stop_on_vote) + + async def vote(self, ctx: VotingContext) -> Vote: + return await self._loop.run(prompt=ctx.to_prompt()) +``` + +`ToolLoop.run` returns when any registered tool's return type matches +`stop_when` — exactly the `stopWhen` semantics from Vercel AI SDK. Tools are +declared with the `@tool` decorator (Pydantic schema auto‑generated); this +matches the OpenAI Agents SDK `@function_tool` and Vercel AI SDK `tool()` we +researched. + +### 4.7 Provider abstraction + +One unified `LLM` class, AI‑Gateway‑style model strings: + +```python +LLM("gpt-5.5") # OpenAI direct +LLM("anthropic/claude-opus-4.7") # AI Gateway routed +LLM("bedrock/anthropic.claude-3-5-sonnet") +LLM("local/llama3:70b") # via Ollama / vLLM +``` + +Internally we wrap `openai`, `anthropic`, and `httpx`. Default routing is via +**Vercel AI Gateway** when `AI_GATEWAY_API_KEY` is set — directly informed by +the `mod_talks` `LlmDispatcher` design +(`among_them/players/mod_talks/llm_dispatch.nim:46-119`) which already +multiplexes provider kinds. Output is always typed (`pydantic.BaseModel` with +JSON‑mode forcing for structured fields). + +### 4.8 Extension model — **entry points (decision)** + +I picked **Python entry points** over decorator registries or a `Module` +plugin protocol. Justification: + +1. `pip install among-them-evilbot` should drop a new agent profile into + `among-them list-profiles` without import side effects. +2. Tournament submission already uses Docker manifests + (`games_server/games_server.nim:543-553`); pip‑installable, entry‑point‑ + declared profiles are the Python equivalent and play cleanly with the + tournament packager. +3. 
Decorators force authors to import a registry module; entry points don't. + +```toml +# pyproject.toml of a third-party bot +[project.entry-points."among_them.profiles"] +evilbot = "evilbot.profile:EvilBotProfile" + +[project.entry-points."among_them.modules.voter"] +hothead = "evilbot.voter:HotheadVoter" +``` + +Then: + +```python +agent = Agent.create(profile="evilbot") +agent = Agent.create(voter="hothead") # by entry-point name +``` + +A `Module` ABC subclass is still the implementation contract — entry points +just publish them. + +### 4.9 Hooks + +A typed callback table on `AgentHooks`, plus a filesystem fallback at +`.among-them/hooks.json` (Cursor‑style). The events match the cognitive +pipeline plus protocol events: + +```python +class AgentHooks: + pre_tick: Callable[[Frame, BotState], None] | None = None + post_tick: Callable[[Decision, BotState], None] | None = None + on_vote: Callable[[Vote, VotingContext], None] | None = None + on_kill: Callable[[KillEvent], None] | None = None + on_meeting: Callable[[MeetingEvent], None] | None = None + on_message: Callable[[ChatMessage], None] | None = None + on_llm_call: Callable[[LLMCall], None] | None = None +``` + +Hooks can also be registered as entry points +(`among_them.hooks.pre_tick = "mybot.hooks:my_pre_tick"`), so observability and +analytics packages compose cleanly. + +### 4.10 Skills directory + +Mirroring Cursor's `.cursor/skills/` and Claude Agent SDK's +`.claude/skills/*/SKILL.md`. We adopt **`among_them/skills/*.md`** with +front‑matter metadata. The SDK auto‑injects matching skills into the LLM +prompt when their **front‑matter triggers** match the current event: + +```markdown +--- +name: voting-strategy +trigger: on_vote +applies_to: [crewmate] +--- +When the body location overlaps with someone's last reported position by ≤3 +tiles within ≤5 ticks, vote them out. Otherwise, skip. 
+``` + +This is identical in spirit to the existing strategy markdown +(`among_them/players/evidencebot_strategy.md`) but loaded automatically. + +### 4.11 Subagents + +Cursor‑style: a parent agent spawns a focused child reasoner. We use this for +"should I report this body?" and "draft an accusation": + +```python +reporter_subagent = Subagent( + name="report-decider", + model=LLM("gpt-5.5"), + prompt="Decide whether to report a body given the evidence list.", +) + +agent = Agent.create( + reporter=LLMReporter(subagent=reporter_subagent), +) +``` + +Subagents share the parent's `Tracer` and `Memory` snapshot but have isolated +LLM context. This is a thin wrapper around `ToolLoop` + a forked memory +slice — directly inspired by Claude Agent SDK's `AgentDefinition`. + +### 4.12 Tracing / observability + +Two backends, one API: + +- **Default**: `structlog` JSONL on disk — zero dependency, works in CI. +- **Opt‑in**: **Langfuse**, configured via `LANGFUSE_*` env vars or + `among-them.toml`. Every `Tick`, `Decision`, `LLMCall`, and `Vote` becomes a + Langfuse span; LLM calls auto‑attach prompt/completion. We integrate via + the [`langfuse`](https://langfuse.com) Python SDK. +- **Bridge**: emit OpenTelemetry traces too, so anyone with an OTel collector + gets data without Langfuse. + +The `Tracer` interface and existing per‑frame trace points +(`among_them/players/modulabot/bot.nim:507-516`) are reused — the Python +tracer wraps them when the runtime is `Subprocess` to a Nim binary. + +### 4.13 Config + secrets + +Three layers, in increasing precedence: + +1. `among-them.toml` at the repo root (committable defaults). +2. Environment variables (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, + `AI_GATEWAY_API_KEY`, `AMONG_THEM_PROFILE`, `LANGFUSE_PUBLIC_KEY`). +3. Constructor kwargs to `Agent.create(...)`. 
+ +`among-them.toml` example: + +```toml +[agent] +profile = "evidencebot_v2" + +[agent.voter] +type = "llm" +model = "openai/gpt-5.5" + +[runtime] +default = "local-sim" + +[runtime.remote] +url = "wss://games.softmax.dev/player" + +[tracing] +backend = "langfuse" +sampling = 0.2 +``` + +Secrets never appear in `among-them.toml`; the loader actively rejects keys +matching `*_API_KEY`. + +--- + +## 5. Six progressive code samples + +### (a) Default bot in local sim — 5 lines, zero config + +```python +from among_them import Agent + +agent = Agent.create() +agent.run_local(n_games=10) +``` + +### (b) Default bot + OpenAI brain on chat only + +```python +from among_them import Agent +from among_them.modules import LLMChatter +from among_them.providers import LLM + +agent = Agent.create( + chatter=LLMChatter(LLM("openai/gpt-5.5"), tone="paranoid") +) +agent.run_local(n_games=10) +``` + +### (c) Custom voting heuristic — pure Python function + +```python +from among_them import Agent, Voter, Vote + +class GrudgeVoter(Voter): + """Vote whoever was nearest the most recent body.""" + async def vote(self, ctx): + nearest = min(ctx.suspects, key=lambda s: s.distance_to_body) + return Vote(target=nearest.id, reason=f"You were 2 tiles from {ctx.body.victim}.") + +agent = Agent.create(voter=GrudgeVoter()) +agent.run_local(n_games=20) +``` + +### (d) Full LLM imposter policy — tool loop + +```python +from among_them import Agent, tool, ToolLoop, LLMVoter, LLMChatter +from among_them.providers import LLM +from among_them.modules import LLMNavigator + +@tool +def go_to(room: str) -> "Move": + """Move to a named room.""" + return Move(room=room) + +@tool +def kill(player_id: str) -> "Kill": + """Kill a specific player. 
Only callable when alone with them.""" + return Kill(target=player_id) + +@tool +def fake_task(task_id: str) -> "FakeTask": + """Pretend to do a task at this location.""" + return FakeTask(task_id=task_id) + +llm = LLM("anthropic/claude-opus-4.7") + +agent = Agent.create( + role_hint="imposter", + navigator=LLMNavigator(ToolLoop(llm=llm, tools=[go_to, kill, fake_task])), + voter=LLMVoter(llm), + chatter=LLMChatter(llm, tone="defensive"), +) +agent.run_local(n_games=50) +``` + +### (e) User‑defined extension via `pip install` + entry point + +In `evilbot/pyproject.toml`: + +```toml +[project] +name = "among-them-evilbot" +dependencies = ["among-them-sdk>=0.4"] + +[project.entry-points."among_them.profiles"] +evilbot = "evilbot.profile:EvilBotProfile" +``` + +In `evilbot/profile.py`: + +```python +from among_them import AgentProfile +from .voter import HotheadVoter +from .chatter import GaslightChatter + +class EvilBotProfile(AgentProfile): + name = "evilbot" + voter = HotheadVoter() + chatter = GaslightChatter(model="openai/gpt-5.5") +``` + +End user, after `pip install among-them-evilbot`: + +```python +from among_them import Agent +agent = Agent.create(profile="evilbot") +agent.run_local() +``` + +### (f) Tournament — N parallel agents against `games_server` + +```python +import asyncio +from among_them import Agent, Runner, RemoteServer + +profiles = ["default", "evilbot", "grudge_voter", "llm_imposter"] +agents = [Agent.create(profile=p) for p in profiles] + +runner = Runner( + agents=agents, + runtime=RemoteServer(url="wss://games.softmax.dev/player"), + parallelism=4, + n_games_per_agent=25, +) +asyncio.run(runner.run()) +print(runner.leaderboard()) # win-rate, kills/game, eject-correctness +``` + +--- + +## 6. 
Packaging + +**Layout** (monorepo location: `among_them/players/sdk/` for the design, +`packages/among-them-sdk/` for the published package — eventually pulled out +into its own repo): + +``` +packages/among-them-sdk/ +├── pyproject.toml +├── src/among_them/ +│ ├── __init__.py # re-export Agent, Runner, tool, hooks +│ ├── agent.py +│ ├── runner.py +│ ├── tool.py +│ ├── hooks.py +│ ├── modules/ # Perception, Memory, Voter, Navigator, Chatter, Reporter +│ ├── providers/ # LLM, AIGateway, OpenAI, Anthropic, Bedrock, Local +│ ├── runtimes/ # LocalSim, Subprocess, RemoteServer +│ ├── skills/ # bundled default skill markdown +│ ├── tracing.py # structlog + Langfuse + OTel bridges +│ └── ffi/ # ctypes wrapper around modulabot/evidencebot_v2 .so +└── tests/ +``` + +**`pyproject.toml`** (the shape, not full): + +```toml +[project] +name = "among-them-sdk" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "httpx>=0.27", + "pydantic>=2.7", + "anyio>=4.4", + "structlog>=24", + "websockets>=12", + "tomli>=2", +] + +[project.optional-dependencies] +openai = ["openai>=1.40"] +anthropic = ["anthropic>=0.30"] +bedrock = ["boto3>=1.34"] +langfuse = ["langfuse>=2.40"] +viz = ["rich>=13"] + +[project.scripts] +among-them = "among_them.cli:main" + +[project.entry-points."among_them.profiles"] +default = "among_them.profiles:DefaultProfile" +evidencebot_v2 = "among_them.profiles:EvidenceBotV2Profile" +``` + +**Python**: 3.11 minimum (we want `tomllib`, generic syntax, `asyncio.TaskGroup`). +**No mandatory ML deps** — `numpy` only when the FFI runtime is selected. + +--- + +## 7. Open questions + +1. **In‑process Nim FFI vs subprocess for the default scripted policy?** + *Recommendation:* in‑process via ctypes + `evidencebot_v2.dylib` + (`among_them/players/evidencebot_v2_policy.py:99-117`) for performance, with + a pure‑Python fallback that we keep parity‑tested. The pure‑Python fallback + is non‑negotiable for `pip install` UX without a Nim toolchain. + +2. 
**Async vs sync top‑level API?** *Recommendation:* async‑first (matches + Claude Agent SDK and the WebSocket runtime), with `Agent.run_local_sync()` + sugar for scripts and notebooks. + +3. **Pin‑down LLM tool‑loop semantics: turn‑based vs streaming?** + *Recommendation:* turn‑based by default for voting (low latency budget, + often <1 sec), streaming for chat where the user perceives the typing. + +4. **Skill auto‑loading: prompt prefix vs RAG?** *Recommendation:* prefix for + the first 1–2 skill markdowns matched by event, fall back to RAG (with a + bundled Sentence Transformers backend) when more than 8 skills are + registered. Keep the API identical. + +5. **Where do Nim‑side LLM calls (`mod_talks`) fit?** *Recommendation:* drop + them. Long‑term, all LLM cognition runs Python‑side (the SDK is the source + of truth). The Nim core stays pure scripted; if Nim needs an LLM result, it + reads it from a Python‑written shared‑memory channel — extending the + `LlmDispatcher` FFI seam already at + `among_them/players/mod_talks/llm_dispatch.nim:46-82`. + +6. **AI Gateway hard requirement, or optional?** *Recommendation:* optional + but default‑on when `AI_GATEWAY_API_KEY` is present; we don't want pip + users to need a Vercel account to run hello‑world. + +7. **License + repo location.** *Recommendation:* MIT, eventually a separate + repo (`among-them-sdk`) for clean external contributions. For Phase 0–2 + live in this monorepo under `packages/among-them-sdk/`. + +--- + +## 8. Phased rollout + +**Phase 0 — Scaffold (1 week).** *DoD:* `pip install -e .` works; `Agent`, +`Runner`, `LocalSim`, `tool`, `hooks` exist as typed stubs; default profile +returns no‑op masks; smoke test in CI. 
+ +**Phase 1 — Scripted policy parity (3 weeks).** *DoD:* a pure‑Python port of +`evidencebot_v2`'s perception, voting, and navigation passes a parity test +against the Nim FFI on 1000 fixed seeds; Subprocess runtime can launch a +compiled Nim binary and stream its decisions; `among_them/skills/` shipping +2–3 default skills. + +**Phase 2 — LLM mix‑ins (2 weeks).** *DoD:* `LLMVoter` and `LLMChatter` ship; +`LLM("openai/...")` and `LLM("anthropic/...")` work; AI Gateway routing works +when env var present; `ToolLoop` battle‑tested on the imposter sample (d). +Langfuse tracing is enabled by default when keys are set. + +**Phase 3 — Extension model (1 week).** *DoD:* third‑party `pip install +among-them-evilbot` profile loads via entry point; `among-them list-profiles` +CLI; `among-them.toml` config layering works; skill auto‑loading hits +front‑matter triggers. + +**Phase 4 — Cloud + tournament (3 weeks).** *DoD:* `RemoteServer` runtime +talks to live `games_server` over WebSocket and survives a full tournament; +`Runner` parallelism with `RemoteServer` confirmed; SDK emits a +`coplayer_manifest.json` (`games_server/games_server.nim:543-553`) so that +`startWaitingBots` (`games_server/games_server.nim:1695-1742`) launches +SDK‑authored bots in containers without Nim‑specific scaffolding; +end‑to‑end tournament demo with 4 SDK profiles + 4 legacy Nim bots. + +After Phase 4 the SDK is the recommended path for all new Among Them bots and +the legacy `bot-policies/sidecar/` tree can be archived. + +--- + +## 9. Implementation status (Phase 0 + Phase 1) + +**Implemented at** `among_them/sdk/` (sibling to this design doc, package +name `among_them_sdk`). Core surface — `Agent`, `Runner`, `LocalSim`, +module ABCs, the FFI loader, the cognition layer, and the natural-language +`instructions=` API — is shipping. The default policy is `evidencebot_v2` +loaded via FFI; there is **no pure-Python fallback** in this milestone. 
+ +What deviated from this design: + +- **No async API yet.** Phase 0/1 ships a sync `Agent.run(rounds=N)` that + satisfies the 5-line hello world; async + `connect(runtime)` arrives + with Phase 4 streaming. +- **Module overrides run *above* the FFI**, not inside it. The Nim shared + library exposes only `abi_version`, `new_policy`, and `step_batch`, so + the SDK cannot literally replace `decideVotingMask` inside the bot. + Instead, the runtime calls user-supplied modules at meeting / report / + chat events while the FFI handles every-tick navigation. See the + architectural note at the top of + `among_them_sdk.policy.evidencebot_v2`. +- **Cyborg framework is bridged via `sys.path`**, not a path-installable + dependency — cyborg has no `pyproject.toml`. The SDK reuses cyborg's + `Directive`/`Command`/`CommandKind` types when the path is reachable + and falls back to local equivalents otherwise. +- **Skill auto-loading and the AgentDefinition subagent shape are + deferred to Phase 3.** +- **Langfuse + OTel emission are deferred to Phase 4.** The default + structlog tracer is wired up; `tracing.enable_langfuse()` raises + `NotImplementedError` for now. + +Required tests (`test_ffi_load.py`, `test_agent_default.py`) plus +`test_instructions.py` and `test_module_override.py` all pass under +`uv run pytest tests/`. See `among_them/sdk/README.md` for quickstart. diff --git a/among_them/sdk/README.md b/among_them/sdk/README.md new file mode 100644 index 00000000..4bfe3342 --- /dev/null +++ b/among_them/sdk/README.md @@ -0,0 +1,183 @@ +# among-them-sdk + +A Python SDK for authoring [Among Them](../README.md) policy bots. Wraps the +production scripted policy (`evidencebot_v2`) via FFI and exposes +module-level overrides plus a natural-language **instructions** API. + +> **Status:** Phase 0 + Phase 1 of the +> [DESIGN.md spec](../players/sdk/DESIGN.md). 
Pure-Python fallback, +> `RemoteServer` runtime, skill loaders, and Langfuse integration are +> intentionally out of scope for this milestone. + +## Install + +```bash +cd among_them/sdk +uv sync # creates a .venv and installs the package + dev deps +# OR: +pip install -e ".[test]" +``` + +### FFI requirement (no pure-Python fallback) + +The default policy is the Nim-built `evidencebot_v2` shared library. The SDK +will auto-build it the first time it loads, but you must have: + +* **Nim 2.2.4** on `PATH` (`nim --version`). The build script can install + it via `nimby` if it's missing — see + [`build_evidencebot_v2.py`](../players/build_evidencebot_v2.py). +* A C toolchain (clang / gcc / msvc) reachable to Nim. +* The full monorepo checked out — the FFI loader walks up to + `among_them/players/` from the SDK source. Set + `AMONG_THEM_PLAYERS_DIR=/path/to/among_them/players` to override. + +If the toolchain is missing, every entry point that touches the FFI raises +`among_them_sdk.ffi.FFIError` with a clear message naming the missing dep. + +### Optional: Cyborg framework + +The SDK opportunistically reuses primitives from +[`cyborg-policy-framework`](/Users/aaln/experiments/softmax/policies/policies/cyborg-policy-framework) +when it's checked out at the default path (or `CYBORG_FRAMEWORK_PATH` is +set). Cyborg has no `pyproject.toml`, so we add it to `sys.path` lazily and +fall back to local equivalents if it isn't reachable. See +[`_cyborg.py`](src/among_them_sdk/_cyborg.py) for the contract. + +## Hello world + +```python +from among_them_sdk import Agent + +agent = Agent.create() # evidencebot_v2 via FFI, LocalSim +result = agent.run(rounds=1) +print(result.summary) +``` + +That's it. No API keys. No config. The first run builds the .dylib. + +## Instructions — the headline feature + +```python +from among_them_sdk import Agent + +agent = Agent.create( + instructions=( + "Report bodies aggressively. Trust no one after meeting 2. 
" + "Vote with the majority unless you have direct evidence." + ), + cognitive={"suspicion_threshold": 0.6, "report_eagerness": "high"}, +) + +print(agent.directives.model_dump_json(indent=2)) +``` + +The string is parsed into a typed `Directives` Pydantic model at agent +creation time. If `OPENAI_API_KEY` (or `ANTHROPIC_API_KEY`, +`AI_GATEWAY_API_KEY`) is set, the SDK calls a small LLM to translate +freeform text into structured directives. Otherwise it falls back to a +deterministic regex/keyword parser. Either way you get the same Pydantic +type — and the scripted modules consult `agent.directives` while making +decisions. + +## Module overrides + +```python +from among_them_sdk import Agent, LLMVoter + +agent = Agent.create(voter=LLMVoter(model="gpt-5.5")) # voting only +``` + +```python +from among_them_sdk import Agent, Vote, Voter, VotingContext + +class GrudgeVoter(Voter): + def vote(self, ctx: VotingContext) -> Vote: + top = max(ctx.suspects, key=lambda s: s.score) + return Vote(target=top.player_id, reason=f"grudge ({top.score:.2f})") + +agent = Agent.create(voter=GrudgeVoter()) +``` + +Slots: `perception`, `memory`, `voter`, `navigator`, `chatter`, `reporter`. +Replace one or all of them — everything else stays scripted. + +## Architectural note (read before extending) + +The Nim FFI exposes only `abi_version`, `new_policy`, `step_batch`. Per +tick: pixel frames in, action *indices* out. The .so does not surface its +internal voting / reporting / chat decisions, so module overrides cannot +literally replace the bot's voting function inside Nim. Instead the SDK +runs `evidencebot_v2` as the default low-level action producer; the +runtime layer surfaces explicit voting / reporting / chat events to your +modules. When you pass `voter=LLMVoter()`, the runtime calls that voter at +meeting time while the FFI continues to handle every-tick navigation. + +This is honest about the FFI surface. 
Future work (Phase 2+) will extend +the Nim exports so we can intercept inside the .so. + +## Tournament submission + +Ship your SDK policy to the Among Them leaderboard via cogames using +`SDKPolicy` + a bundled JSON config: + +```bash +cd among_them/sdk +python -m among_them_sdk.package \ + --from-agent examples/personas.py:_build_aggressive \ + --policy-name "$USER-sdk-aggressive" +``` + +The packaging CLI writes `among_them_sdk_config.json` next to the +policy module and prints the exact `cogames upload` command to run. +Full happy path + Phase 2 caveats: [`docs/tournament-submission.md`](docs/tournament-submission.md). + +## Going further + +For a deeper, hands-on walkthrough — module overrides, hooks, runtimes, +provider routing, troubleshooting, and copy-pasteable recipes — see +[`docs/python-guide.md`](docs/python-guide.md). For the dev loop +(edit → run an 8-player local game vs `nottoodumb` → debug → iterate), +see [`docs/local-iteration-guide.md`](docs/local-iteration-guide.md). + +## Examples + +* [`examples/hello.py`](examples/hello.py) — 5-line default +* [`examples/instructions.py`](examples/instructions.py) — directives API +* [`examples/custom_voter.py`](examples/custom_voter.py) — Python override +* [`examples/llm_chatter.py`](examples/llm_chatter.py) — LLM mix-in +* [`examples/tournament.py`](examples/tournament.py) — parallel agents + +## Tests + +```bash +uv run pytest tests/test_ffi_load.py tests/test_agent_default.py -v +``` + +Both must pass on a machine with a working Nim toolchain. The other tests +(`test_instructions.py`, `test_module_override.py`) run hermetically. 
+ +## Layout + +``` +among_them/sdk/ +├── pyproject.toml +├── src/among_them_sdk/ +│ ├── __init__.py # public surface re-exports +│ ├── agent.py # Agent.create, send, run, stream +│ ├── runner.py # parallel fan-out +│ ├── runtime.py # LocalSim / Subprocess / RemoteServer (stub) +│ ├── ffi.py # ctypes wrapper + auto-build +│ ├── _cyborg.py # cyborg framework bridge +│ ├── policy/evidencebot_v2.py +│ ├── modules/ # Voter, Chatter, Reporter, Navigator, Memory, Perception +│ ├── cognition/ # Directives, LLM, ToolLoop, @tool +│ ├── hooks.py +│ ├── config.py +│ ├── extensions.py +│ └── tracing.py +├── examples/ +└── tests/ +``` + +See [`../players/sdk/DESIGN.md`](../players/sdk/DESIGN.md) for the full +design rationale and Phase 2+ roadmap. diff --git a/among_them/sdk/docs/local-iteration-guide.md b/among_them/sdk/docs/local-iteration-guide.md new file mode 100644 index 00000000..894b356d --- /dev/null +++ b/among_them/sdk/docs/local-iteration-guide.md @@ -0,0 +1,518 @@ +# Among Them SDK — Local Iteration & Testing Guide + +Last updated: May 6, 2026 + +## 1. What this guide covers + +The dev loop: edit your SDK module or directives, run a real 8-player Among +Them game on your laptop against `nottoodumb` opponents, read the output, +debug, repeat. Pair it with [`README.md`](../README.md) (pitch + 5-line +hello), [`docs/python-guide.md`](python-guide.md) (API reference), and +[`docs/tournament-submission.md`](tournament-submission.md) (cogames +upload path). + +## 2. The standing local-game setup + +Every local game in this guide is **1 SDK-controlled player + 7 +`nottoodumb` opponents**, hard-wired in +[`among_them/sdk/examples/eight_player_game.py`](../examples/eight_player_game.py) +(see the loop `for i in range(1, 8): ... nottoodumb` and the headline +config `minPlayers=8`). 
`nottoodumb` is the right default opponent because +it's a real Nim policy bot with the same shape as tournament opponents +— its image is published as `ghcr.io/treeform/bitworld-nottoodumb:latest` +([`coplayer_manifest.json`](../../players/nottoodumb/coplayer_manifest.json)) +and it's part of the cogames among-them pool. So the same opponent you +beat (or lose to) locally is what you'll see on the leaderboard. + +The example does not currently take an `--opponent` flag — there's +nothing to override, the default *is* nottoodumb. Don't go looking for +one. + +## 3. One-time prerequisites + +Verify each step before continuing. + +**Python 3.11+ and uv.** + +```bash +python3 --version # >= 3.11 +uv --version # any recent +``` + +**Nim toolchain.** The build helpers install Nim 2.2.4 via +[`nimby`](https://github.com/treeform/nimby) on first run. To pre-install: + +```bash +uv run --project /Users/aaln/experiments/softmax/bitworld/among_them/sdk \ + python /Users/aaln/experiments/softmax/bitworld/among_them/players/build_evidencebot_v2.py +nim --version # should print 2.2.4 +``` + +That one command does three things: installs Nim 2.2.4 if missing, +syncs `nimby.lock` Nim deps, and produces +`among_them/players/libevidencebot_v2.dylib` (the FFI .dylib the SDK's +default policy loads). The matching `.abi` stamp lives next to it. + +**Build the `nottoodumb` binary.** There is **no** dedicated +`build_nottoodumb*.py` helper — the deleted one was replaced with an +in-place `nim c` invocation that +[`eight_player_game.py:ensure_native_binary`](../examples/eight_player_game.py) +runs for you on first launch. The compile flags it uses are exactly: + +```bash +cd /Users/aaln/experiments/softmax/bitworld +nim c -d:release -d:ssl -d:botHeadless \ + among_them/players/nottoodumb/nottoodumb.nim +``` + +The repo's [`config.nims`](../../../config.nims) sets `--outdir:./out` and +`--nimcache:./nimcache`, so the binary lands at +`bitworld/out/nottoodumb`. 
The same call also handles +`among_them/among_them.nim` → `bitworld/out/among_them` (the local game +server). If `nim c` fails with "package X not found", run the +`build_evidencebot_v2.py` step above first — it owns the `nimby sync`. + +**First-time SDK install.** + +```bash +unset VIRTUAL_ENV +cd /Users/aaln/experiments/softmax/bitworld/among_them/sdk +uv sync +``` + +`unset VIRTUAL_ENV` is mandatory if your shell has any other venv +active — `uv` will silently install into the wrong project otherwise. + +**Verify with the test suite.** + +```bash +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk \ + pytest -q +``` + +Recent runs are 25 passed + 1 skipped (the `SDKPolicy` constructor test +skips when `mettagrid` isn't installed locally — see +[`tests/test_cogames_packaging.py`](../tests/test_cogames_packaging.py)). + +## 4. The 60-second dev loop + +The cycle: + +1. Edit a module file — your custom `Voter` class, an entry in + [`examples/personas.py`](../examples/personas.py), or your own copy of + `eight_player_game.py`. +2. Run `uv run python examples/eight_player_game.py` (with whatever + `--instructions` / `--module` flags you're tuning). +3. Read the printed final block: scores, override-engine stats, + resolved `Directives`. +4. Tweak. Re-run. Repeat. + +One concrete cycle: + +```bash +cd /Users/aaln/experiments/softmax/bitworld/among_them/sdk +uv run python examples/eight_player_game.py \ + --instructions "Be aggressive about reporting bodies." +# Look at the final RESULT block: +# overrides: reports_passed=12 reports_suppressed=0 +# Now suppress the same with the opposite directive: +uv run python examples/eight_player_game.py \ + --instructions "Never report bodies." +# Expected: +# overrides: reports_passed=0 reports_suppressed=12 +``` + +If those two stat lines look identical you've got a bug — the +`_DirectiveOverrideEngine` should swing on `report_eagerness`. 
Confirm +the parse in isolation with the directive-debug recipe in §7. + +## 5. Running an 8-player local game + +Canonical command: + +```bash +unset VIRTUAL_ENV +cd /Users/aaln/experiments/softmax/bitworld/among_them/sdk +uv run python examples/eight_player_game.py +``` + +Flags actually exposed by [`parse_args` in +`eight_player_game.py`](../examples/eight_player_game.py): + +| Flag | Meaning | +| --- | --- | +| `--instructions ""` | Natural-language directives. Deterministic regex parse unless `--use-llm`. | +| `--cognitive key=value` | Repeatable. Same shape as `Agent.create(cognitive={...})`. | +| `--module slot=type[:k=v,...]` | Repeatable. e.g. `--module voter=scripted:threshold=0.7`. | +| `--bundle-config ` | Path to a hand-written `among_them_sdk_config.json`. Wins over the three above. | +| `--rounds-max N` | Number of full games (server `maxGames`). Default 1. | +| `--seed N` | RNG seed for the SDK agent. Default 42. | +| `--server-port N` | Bind the server here. `0` = pick a free port. Default 0. | +| `--imposter-count N` | Default 2. | +| `--tasks-per-player N` | Default 6. | +| `--vote-timer-ticks N` | Voting duration in ticks @ 24fps. Default 360 = 15s. | +| `--max-ticks N` | SDK disconnects after this many frames. Default 8000 (~5.5 min). | +| `--game-timeout N` | Wall-clock seconds before the harness gives up. Default 600. | +| `--use-llm` | Allow the SDK to LLM-parse `--instructions`. Default off. | +| `--log-root ` | Where per-process `.log` files go. Default `/logs/eight_player_game`. | + +Three worked invocations: + +(a) Default — bare command, ships SDK defaults: + +```bash +uv run python examples/eight_player_game.py +``` + +(b) With instructions: + +```bash +uv run python examples/eight_player_game.py \ + --instructions "Be aggressive about reporting bodies" +``` + +(c) With a bundled persona config. There's no `--persona NAME` flag. 
+The packager-friendly path is to package the persona into a JSON config +first, then load it with `--bundle-config`: + +```bash +cd /Users/aaln/experiments/softmax/bitworld/among_them/sdk +uv run python -m among_them_sdk.package \ + --from-agent examples/personas.py:_build_paranoid \ + --policy-name dev-paranoid \ + --out /tmp/paranoid_config.json +uv run python examples/eight_player_game.py \ + --bundle-config /tmp/paranoid_config.json +``` + +The packager prints `[package] resolved directives:` plus the JSON it +wrote — that's the same shape `--bundle-config` consumes. See +[`docs/tournament-submission.md`](tournament-submission.md) for the full +packaging path. + +## 6. What you'll see + +A successful run prints, in order: + +1. **Build banner** — `[build] evidencebot_v2 lib OK: …`, + `[build] among_them -> …/out/among_them`, + `[build] nottoodumb -> …/out/nottoodumb` (or just an OK line if cached). +2. **Setup line** — + `[setup] logs -> /Users/.../logs/eight_player_game/`. +3. **Server boot** — + `[server] launching on 127.0.0.1: (config={'minPlayers': 8, ...})` + then `[server] OK — listening on 127.0.0.1: (PID )`. +4. **Seven nottoodumb connections** — + `[player 1/7] nottoodumb1 (PID ) -> ws://127.0.0.1:` … 7×. +5. **SDK policy load** — + `[sdk] policy=LocalSDKPolicy (directives=susp=…, report=…, chat=…, vote=…, modules=…)` + then `[sdk] connecting -> ws://127.0.0.1:/player?name=sdkbot`. +6. **Per-30s status pings** — + `[status] server up; bots alive=7/7; sdk frames so far=`. +7. **Final RESULT block** — a `player / role / kills / tasks / reward / + win` table from `scores.json`, then an `SDK agent` section with + `summary`, pretty-printed `directives`, the override-engine line + (`overrides: reports_passed=… reports_suppressed=…`), and frame / + mask / top-action counts. Final three lines point at the + `logs:`, `scores:`, and `replay:` paths. 
+ +The `overrides` line is the headline metric: it tells you whether your +`Reporter` / `Voter` / `Chatter` overrides actually fired (see +[`policy/cogames.py:_OverrideStats`](../src/among_them_sdk/policy/cogames.py)). + +Per-process logs live in the printed `logs:` directory: + +* `server.log` — the local Among Them server's stdout/stderr. +* `player_1_nottoodumb1.log` … `player_7_nottoodumb7.log`. +* `sdk.log` — your SDK player's pre-amble (instructions, resolved + directives, bundle config) plus a `# done:` or `# error:` trailer. +* `scores.json` — `names`, `scores`, `win`, `tasks`, `kills` per slot + (the same JSON `fetch_results_json` reads). +* `replay.bitreplay` — full replay; open via the `replay_viewer` Nim + binary if you build it. + +## 7. Iterating faster + +**K parallel games.** +[`win_rate_loop.py`](../examples/win_rate_loop.py) and +[`ab_test_instructions.py`](../examples/ab_test_instructions.py) both +run against **`LocalSim`** — synthetic frame driver, no real game, no +win/loss signal. Useful for sanity-checking directive parsing, **not** +for "did we win". For real win-rate, wrap `eight_player_game.py`: + +```bash +for i in 1 2 3 4 5; do + uv run python examples/eight_player_game.py \ + --instructions "$VARIANT" --seed $((100+i)) \ + --log-root /tmp/loop > /tmp/loop/$i.out +done +# aggregate from each /tmp/loop//scores.json +``` + +For real-game A/B, run the loop twice with different `--instructions` +and diff the per-game `scores.json` files. + +**Quick directive sanity-check (no game).** + +```bash +uv run python examples/debug_directives.py "be paranoid" +``` + +This calls the same `parse_instructions` the agent uses and prints the +resolved Directives JSON. Use it before every game to confirm your +phrasing actually hit the regex/LLM rules you expected. + +## 8. 
Writing your own module against nottoodumb + +A custom `Voter` (the same shape applies to `Reporter` and `Chatter`): + +```python +from among_them_sdk import Vote, Voter, VotingContext + +class GrudgeVoter(Voter): + def vote(self, ctx: VotingContext) -> Vote: + if not ctx.suspects: + return Vote.skip("no suspects") + top = max(ctx.suspects, key=lambda s: s.score) + if top.score < 0.4: + return Vote.skip(f"low conf {top.score:.2f}") + return Vote(target=top.player_id, reason=f"grudge {top.score:.2f}") +``` + +For the **LiveGame** path (full `Agent` shape, fires on synthesized +meetings), wire it the same way `examples/custom_voter.py` does: + +```python +from among_them_sdk import Agent, LiveGame +agent = Agent.create(voter=GrudgeVoter(), use_llm_for_instructions=False) +result, transcript = LiveGame(host="127.0.0.1", port=).run_agent(agent) +``` + +For the **`LocalSDKPolicy`** path (the override engine +`eight_player_game.py` actually runs against nottoodumb), pack the +voter into a `CogamesBundleConfig` and either pass it via +`--bundle-config` or build it inline. Caveat from +[`policy/cogames.py`](../src/among_them_sdk/policy/cogames.py): the Nim +FFI surface is action-indices-out only, so on the cogames code path +**`Voter` and `Chatter` overrides don't fire — only `Reporter` does** +(it gates report-flavoured action indices). They still show up in +`engine.stats.voter_advisories` for inspection but don't change the +game. To actually drive votes locally, use `Agent.create(...).run( +runtime=LiveGame(...))` (the `Agent` path). + +To run one game with your custom module against 7 nottoodumb, simplest +path: drop the class into a file your example can import, call it from +a 10-line wrapper that mirrors `eight_player_game.py`'s server + +nottoodumb spawning, and join with `LiveGame.run_agent(agent)`. + +## 9. 
Inspecting + debugging + +**Resolved directives.** + +```python +print(agent.directives.model_dump_json(indent=2)) +# or for SDKPolicy: +print(json.dumps(sdk_policy.directives.model_dump(), indent=2, default=str)) +``` + +The 8-player example dumps these to `sdk.log` automatically. + +**`RunResult` shape.** From +[`runtime.py`](../src/among_them_sdk/runtime.py): `ticks`, `actions`, +`meetings`, `votes`, `reports`, `chat_messages`, `summary`, `raw`. For +`LiveGame.run_local_sdk_policy` the per-action `actions` list is empty +(use the transcript histogram); `votes` and `reports` are also empty +because the FFI doesn't surface them — see the architectural note at the +top of `policy/cogames.py`. For `LiveGame.run_agent(agent)` (the +`Agent`-driven path with synthetic meetings) `votes` / `reports` / +`chat_messages` are populated. + +**Structured logs.** [`tracing.py`](../src/among_them_sdk/tracing.py) +emits structlog JSONL on stdout. Crank the level: + +```python +import logging +logging.getLogger("among_them_sdk").setLevel(logging.DEBUG) +logging.getLogger("among_them_sdk.live_game").setLevel(logging.DEBUG) +``` + +That second one is the LiveGame frame loop — connect/close, frames +received, mask sends. + +**Per-player log tails.** + +```bash +tail -f /Users/aaln/experiments/softmax/bitworld/logs/eight_player_game//server.log +tail -f /Users/aaln/experiments/softmax/bitworld/logs/eight_player_game//sdk.log +``` + +`sdk.log` carries your `# instructions:`, `# directives:`, and +`# bundle config:` headers up front — useful when an old config sneaks +into a run. + +**Validate the bundle config without running a game.** + +```bash +uv run python -m among_them_sdk.package \ + --instructions "your tuning string" \ + --cognitive suspicion_threshold=0.7 \ + --out /tmp/dev_config.json +cat /tmp/dev_config.json +``` + +The packager prints the resolved Directives and writes the bundle JSON +in the same shape `SDKPolicy` will load. 
If a hand-written config +parses here, it'll parse inside the cogames Docker too. + +**Spotting Nim FFI silent out-of-range actions.** The risk in +[`policy/cogames.py`](../src/among_them_sdk/policy/cogames.py) (`Phase 2 +gap`): an out-of-range index from the `.dylib` becomes `None` from +`BITWORLD_ACTION_NAMES[idx]` and is silently skipped by +`_DirectiveOverrideEngine.apply_per_tick`. To catch it, watch the +`top actions (idx, count)` line in the final block — every index there +should map to a name in +[`policy/evidencebot_v2.py:BITWORLD_ACTION_NAMES`](../src/among_them_sdk/policy/evidencebot_v2.py). +Anything out of range is the FFI emitting garbage; rebuild the .dylib +(see §12). + +**Debugger.** Plain `breakpoint()` inside your `Voter` / +`Reporter` works because `LiveGame` runs on the calling thread (sdk +runner is a Python thread the example spawns; pdb is fine inside it). +Don't break inside a frame handler that holds the FFI handle for +more than a few seconds — the websocket is ping-disabled but the server can +still time you out from the game side. + +## 10.
Testing changes + +```bash +cd /Users/aaln/experiments/softmax/bitworld/among_them/sdk +uv run pytest -q # full suite (25 pass + 1 skip) +uv run pytest tests/test_module_override.py -v # custom Voter / Reporter tests +uv run pytest tests/test_cogames_packaging.py -v # bundle config + override engine +uv run ruff check src/ # lint +``` + +Add a test for your module by following the +[`test_module_override.py`](../tests/test_module_override.py) shape — it +uses `LocalSim`, not `LiveGame`, so it's hermetic and fast: + +```python +from among_them_sdk import Agent, LocalSim, Vote, Voter, VotingContext + +class StickyVoter(Voter): + def vote(self, ctx: VotingContext) -> Vote: + return Vote(target="P00", reason="sticky") + +def test_sticky_voter_replaces_default(): + agent = Agent.create(voter=StickyVoter(), use_llm_for_instructions=False) + result = agent.run(rounds=1, runtime=LocalSim(ticks_per_round=12, meeting_every=4, seed=1)) + assert all(v.target == "P00" for v in result.votes) +``` + +For real-game smoke tests, point a pytest fixture at +`LiveGame(host="127.0.0.1", port=)` after spawning the server + +nottoodumb the same way the example does. Re-running the cogames +packaging tests catches regressions in your bundle config schema. + +## 11. From local to tournament + +When the directive + module mix wins (or at least doesn't actively +lose) locally: + +```bash +cd /Users/aaln/experiments/softmax/bitworld/among_them/sdk +uv run python -m among_them_sdk.package \ + --instructions "" \ + --cognitive suspicion_threshold=0.65 \ + --policy-name "$USER-sdk-tuned" +``` + +Then run the printed `cogames upload --dry-run` line from +[`docs/tournament-submission.md`](tournament-submission.md) §3 to +validate inside Docker. 
The local nottoodumb you've been beating is +also one of the tournament opponents (its image is in the +among-them pool — see +[`coplayer_manifest.json`](../../players/nottoodumb/coplayer_manifest.json)), +so a stable local edge usually carries — but cogames mixes in other +opponents too, so don't read 1-game wins as a leaderboard guarantee. + +## 12. Common iteration pitfalls + +* **`uv` synced the wrong project.** Symptom: `ModuleNotFoundError: + among_them_sdk` after a clean install. Fix: `unset VIRTUAL_ENV`, then + `uv sync` from `among_them/sdk` *or* pass + `uv --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk sync`. + Don't run `uv sync` from the repo root unless you mean to sync the + repo-root project (which doesn't include the SDK). +* **`OSError: cannot load libevidencebot_v2`.** The .dylib is missing + or stale. Rebuild: + `python /Users/aaln/experiments/softmax/bitworld/among_them/players/build_evidencebot_v2.py`. + Check `among_them/players/libevidencebot_v2.dylib.abi` — it should + contain `1`. +* **`nottoodumb binary not found`** or the example dies with + `nim c ... failed`. Either Nim isn't 2.2.4 (run + `build_evidencebot_v2.py` once, it installs Nim via nimby), or the + `nimby.lock` deps aren't synced (same fix). Manual rebuild: + `nim c -d:release -d:ssl -d:botHeadless among_them/players/nottoodumb/nottoodumb.nim` + from the repo root. +* **Port already in use.** Use `--server-port N` to pin one. Stale + `among_them` server processes also linger after Ctrl+C in some + shells — `pkill -f out/among_them` clears them. +* **Nim cache stale after editing `evidencebot_v2.nim`.** + `config.nims` puts the cache at + `/Users/aaln/experiments/softmax/bitworld/nimcache/`. Blow it away + (`rm -rf nimcache/`) and re-run `build_evidencebot_v2.py`. +* **`overrides: reports_passed=0 reports_suppressed=0`.** The Reporter + override never fired. 
Either (a) your directive thresholds didn't + flip the parsed `report_eagerness` — confirm with `debug_directives.py + ""`, or (b) the inner Nim bot didn't emit a `report_*` + action this game (rare; bump `--rounds-max 3`). +* **Used `examples/hello.py` for substantive iteration.** `hello.py` + uses `LocalSim` — it doesn't run a real game, doesn't connect to + nottoodumb, and has no win/loss. For substantive iteration always + use `eight_player_game.py`. + +## 13. Cheat sheet + +```bash +# 0. one-time, from anywhere +unset VIRTUAL_ENV +python /Users/aaln/experiments/softmax/bitworld/among_them/players/build_evidencebot_v2.py +uv sync --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk + +# 1. directive sanity-check (no game) +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk \ + python examples/debug_directives.py "be aggressive about reporting" + +# 2. one real 8-player game (1 SDK + 7 nottoodumb), defaults +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk \ + python examples/eight_player_game.py + +# 3. same with your tuning string +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk \ + python examples/eight_player_game.py \ + --instructions "Trust nobody. Report bodies aggressively." + +# 4. with a bundled persona config +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk \ + python -m among_them_sdk.package \ + --from-agent examples/personas.py:_build_paranoid \ + --out /tmp/cfg.json --policy-name dev +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk \ + python examples/eight_player_game.py --bundle-config /tmp/cfg.json + +# 5. tail the SDK log mid-game +tail -f /Users/aaln/experiments/softmax/bitworld/logs/eight_player_game/$(ls -t /Users/aaln/experiments/softmax/bitworld/logs/eight_player_game | head -1)/sdk.log + +# 6. test +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk pytest -q + +# 7. 
lint +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk ruff check src/ + +# 8. when ready, package for cogames +uv run --directory /Users/aaln/experiments/softmax/bitworld/among_them/sdk \ + python -m among_them_sdk.package \ + --instructions "<your tuned instructions>" \ + --policy-name "$USER-sdk-tuned" +``` diff --git a/among_them/sdk/docs/python-guide.md b/among_them/sdk/docs/python-guide.md new file mode 100644 index 00000000..b6e42189 --- /dev/null +++ b/among_them/sdk/docs/python-guide.md @@ -0,0 +1,493 @@ +# Among Them SDK — Python Usage Guide + +## 1. What this guide covers + +This is the deeper walkthrough for the Among Them SDK at `among_them/sdk/`. By the end you should be able to install the SDK, run the default `evidencebot_v2` agent, steer it with natural-language instructions, swap any of its six cognitive modules, choose a runtime, register hooks, and ship your own profile. The [README](../README.md) is the elevator pitch and 5-line hello world; this guide assumes you've skimmed it. Every example is verified against the actual Phase 0 + Phase 1 API in `src/among_them_sdk/`; design-doc features that haven't shipped yet are flagged inline. + +## 2. Prerequisites & install + +- **Python ≥ 3.11** (per `pyproject.toml`). The SDK uses `tomllib` (stdlib since 3.11) and PEP 604 union syntax (`X | None`). +- **Nim 2.2.4** on `PATH` plus a C toolchain (clang/gcc/msvc). Nim is the only mandatory native dep — there is no pure-Python fallback in this milestone (DESIGN.md §9). +- **Among Them monorepo** checked out. The FFI loader (`src/among_them_sdk/ffi.py`) walks up to `among_them/players/` from the SDK source. Override with `AMONG_THEM_PLAYERS_DIR=/abs/path`. 
+ +```bash +cd among_them/sdk +uv sync # creates .venv, installs runtime + dev deps +# or: +pip install -e '.[test]' # editable install with pytest extras +``` + +On first load the SDK shells out to `among_them/players/build_evidencebot_v2.py` to produce `libevidencebot_v2.{dylib,so,dll}` next to the build script (picked up via `ffi.library_path()`). To pre-build: + +```bash +python among_them/players/build_evidencebot_v2.py +``` + +Optional extras declared in `pyproject.toml`: `[openai]` (`openai>=1.40`), `[anthropic]` (`anthropic>=0.30`), `[test]` (pytest), and `[all]` (both LLM SDKs). Install with `pip install -e '.[openai,anthropic]'` or `uv sync --extra all`. + +## 3. Your first bot (60 seconds) + +A slightly chattier `examples/hello.py`: + +```python +from among_them_sdk import Agent, LocalSim + +agent = Agent.create() # FFI + scripted modules +sim = LocalSim(ticks_per_round=60, meeting_every=20) # explicit runtime +result = agent.run(rounds=1, runtime=sim) + +print(result.summary) +print('actions seen:', set(result.actions)) +print('meetings/votes/chats:', + result.meetings, len(result.votes), len(result.chat_messages)) +print('directives:', result.raw['directives']) +``` + +What happens: + +1. `Agent.create()` resolves config (env + `among-them.toml`), parses an empty `instructions=` into default `Directives`, instantiates the six scripted modules, and loads the FFI singleton (auto-building the `.dylib` if needed). +2. `LocalSim` synthesises frames and fires meetings/bodies on a fixed cadence so your modules actually get called. +3. `agent.run` walks the tick loop, calls `policy.step_with_hooks` per frame, and returns a `RunResult` (see `runtime.py`) with `ticks`, `actions`, `meetings`, `votes`, `reports`, `chat_messages`, `summary`, and a `raw` dict containing `policy_summary`, `directives`, and `cyborg` status. + +## 4. Anatomy of an Agent + +```mermaid +flowchart LR + R[Runtime<br/>LocalSim / Subprocess] -->|frame| P[Perception] + P -->|Percept| M[Memory] + M -->|VotingContext / ChatContext / ReportContext| Reasoning + subgraph Reasoning + V[Voter] + C[Chatter] + Rp[Reporter] + N[Navigator] + end + P -->|raw pixels| FFI[FFI step_batch<br/>evidencebot_v2.dylib] + FFI -->|action index| N + N -->|override or pass-through| Cmd[Action / Vote / Report / Chat] + V --> Cmd + C --> Cmd + Rp --> Cmd + Cmd --> R +``` + +**Per stage** (cross-reference `src/among_them_sdk/agent.py`): + +- **Runtime** produces a `Frame` per tick. `LocalSim._make_frame` synthesises a `(1, 1, 128, 128)` uint8 buffer; `Subprocess` is a smoke-test stub today (see §7). +- **Perception** turns the frame into a `Percept`. The default `ScriptedPerception` is a passthrough — Nim's localizer is intentionally not re-implemented in Python. +- **Memory** maintains the suspicion table behind `VotingContext`. `ScriptedMemory` is a flat dict of `SuspicionEntry`; the FFI keeps its own richer table internally that the SDK cannot read. +- **Reasoning modules** fire at meeting/report/chat time. `Voter.vote(ctx) -> Vote`, `Reporter.should_report(ctx) -> bool`, `Chatter.speak(ctx) -> str | None`, `Navigator.step(ctx) -> int | None` (return `None` to keep the FFI action). +- **FFI step_batch** is the action floor: pixels in → action index out, every tick. Module overrides run *above* the FFI (DESIGN.md §9). + +Override any stage by passing `perception=`, `memory=`, `voter=`, `navigator=`, `chatter=`, or `reporter=` to `Agent.create`. + +## 5. Steering with `instructions=` + +Three flavours: + +```python +from among_them_sdk import Agent + +aggressive = Agent.create( + instructions=( + 'Report bodies aggressively. Trust no one after meeting 2. ' + 'Vote with the majority unless you have direct evidence.' + ), + use_llm_for_instructions=False, # hermetic for examples +) + +paranoid = Agent.create( + instructions='Be paranoid. Avoid the central room. Skip votes without evidence.', + use_llm_for_instructions=False, +) + +social = Agent.create( + instructions='Be friendly. Trust everyone. 
Only report if you saw the kill.', + use_llm_for_instructions=False, +) +``` + +Each string is parsed at `Agent.create` time into a `Directives` Pydantic model (`cognition/instructions.py`). When an API key is present and `use_llm_for_instructions=True`, the SDK calls a small `gpt-5.5` translation prompt that returns strict JSON. Otherwise — including any LLM failure — it falls back to `parse_instructions_keyword`, a deterministic regex parser. Both paths return the same model. + +The `aggressive` example above parses (under the keyword path) to: + +```json +{ + "suspicion_threshold": 0.8, + "report_eagerness": "high", + "kill_eagerness": "normal", + "chat_tone": "neutral", + "voting_style": "majority", + "trust_horizon_meetings": 2, + "avoid_central_room": false, + "follow_majority": true, + "raw": "Report bodies aggressively. Trust no one after meeting 2. ...", + "notes": ["matched: \\b(report|reporting)[^.]*\\b(aggressiv...", ...] +} +``` + +Inspect at runtime with: + +```python +print(aggressive.directives.model_dump_json(indent=2)) +``` + +If you want determinism without an LLM round-trip but with structured input, pass `cognitive=` directly. It overrides the parsed directives field-by-field via `Directives.merged_with`: + +```python +agent = Agent.create( + cognitive={ + 'suspicion_threshold': 0.7, + 'report_eagerness': 'high', + 'voting_style': 'majority', + 'follow_majority': True, + 'chat_tone': 'suspicious', + }, +) +``` + +Valid keys live in `Directives`: `suspicion_threshold`, `report_eagerness`, `kill_eagerness`, `chat_tone`, `voting_style`, `trust_horizon_meetings`, `avoid_central_room`, `follow_majority`. + +## 6. 
Swapping cognitive modules + +| Slot | Default (scripted) | LLM-backed variant | Source | +| --- | --- | --- | --- | +| `voter` | `ScriptedVoter` | `LLMVoter` | `modules/voter.py` | +| `chatter` | `ScriptedChatter` (also `SilentChatter`) | `LLMChatter` | `modules/chatter.py` | +| `reporter` | `ScriptedReporter` | *not exposed yet — tracked in DESIGN.md §9 (Phase 2)* | `modules/reporter.py` | +| `navigator` | `ScriptedNavigator` | *not exposed yet — extend `Navigator` directly* | `modules/navigator.py` | +| `perception` | `ScriptedPerception` | *not exposed yet — extend `Perception` directly* | `modules/perception.py` | +| `memory` | `ScriptedMemory` | *not exposed yet — extend `Memory` directly* | `modules/memory.py` | + +### (a) Drop in `LLMVoter`, leave the rest scripted + +```python +from among_them_sdk import Agent, LLMVoter + +agent = Agent.create(voter=LLMVoter(model='gpt-5.5')) +result = agent.run(rounds=1) +``` + +When to reach for it: LLM-quality vote justifications, cheap navigation. Cost: one chat completion per meeting (~6 per 5-min game). Latency: the meeting tick blocks on the LLM call. `LLMVoter` falls back to `ScriptedVoter` automatically if the key is missing or the call raises. + +### (b) Custom Python voter + +Implement the `Voter` ABC from `modules/voter.py` (one method, `vote(ctx) -> Vote`): + +```python +from among_them_sdk import Agent, Vote, Voter, VotingContext + +class GrudgeVoter(Voter): + def vote(self, ctx: VotingContext) -> Vote: + if not ctx.suspects: + return Vote.skip('no suspects yet') + top = max(ctx.suspects, key=lambda s: s.score) + if top.score < 0.3: + return Vote.skip(f'low confidence ({top.score:.2f})') + return Vote(target=top.player_id, reason=f'grudge ({top.score:.2f})') + +agent = Agent.create(voter=GrudgeVoter(), use_llm_for_instructions=False) +``` + +When to reach for it: deterministic, zero-LLM, unit-testable. Same shape applies to `Chatter`, `Reporter`, and `Navigator` — one method, one return value. 
+ +### (c) Mixing two modules + +```python +from among_them_sdk import Agent, LLMChatter, Reporter +from among_them_sdk.modules.reporter import ReportContext + +class CautiousReporter(Reporter): + def should_report(self, ctx: ReportContext) -> bool: + return ctx.distance_to_body is not None and ctx.distance_to_body <= 2 + +agent = Agent.create( + chatter=LLMChatter(model='gpt-5.5', tone='suspicious'), + reporter=CautiousReporter(), +) +``` + +Typical real-world shape: LLM chat for personality, scripted/custom reporter for safety, default voter and navigator for cost. Modules are independent — no shared state to wire. + +## 7. Choosing a runtime + +```python +from among_them_sdk import Agent, LocalSim, Subprocess + +agent = Agent.create() +result = agent.run(rounds=2, runtime=LocalSim(ticks_per_round=120, seed=7)) +``` + +- **`LocalSim` (default).** In-process, fast, deterministic via `seed`. Knobs: `ticks_per_round`, `meeting_every`, `report_every`, `n_players`, `noisy_frames`. Use it for unit tests, A/B comparisons, and bulk trials. +- **`Subprocess`.** Today only exposes `run_default_subprocess()`, which shells out to `build_evidencebot_v2.py` as a toolchain smoke test. Full streaming game runs arrive with Phase 4. +- **`RemoteServer`.** Construction raises `NotImplementedError`. Don't pick it; track Phase 4 in DESIGN.md §8. + +If you omit `runtime=`, `agent.run` builds a default `LocalSim()` for you. + +## 8. Hooks + +`AgentHooks` (`hooks.py`) is a dataclass of optional callables. Each is invoked from the runtime tick loop and any exception they raise is logged + swallowed. + +| Hook | Signature | Fired by `agent.run` today? 
| +| --- | --- | --- | +| `pre_tick` | `(ctx: dict)` | yes | +| `post_tick` | `(ctx: dict, action: int)` | yes | +| `on_meeting` | `(ctx: dict)` | yes (twice — once on entry, once before vote) | +| `on_vote` | `(ctx: dict)` | yes | +| `on_message` | `(ctx: dict)` | yes (only when chatter emits text) | +| `on_kill` | `(ctx: dict)` | declared, **not fired yet** — Phase 2 will route kill events through it | +| `on_llm_call` | `(ctx: dict)` | declared, **not fired yet** — modules call LLMs directly today | +| `custom[name]` | `(*args, **kwargs)` | only when you call `agent.hooks.call('name', ...)` yourself | + +Worked example — log every vote to stdout: + +```python +from among_them_sdk import Agent, AgentHooks + +def log_vote(ctx): + print(f'[meeting {ctx["meeting"]}] -> {ctx["target"]!r} ({ctx["reason"]})') + +agent = Agent.create( + hooks=AgentHooks(on_vote=log_vote), + use_llm_for_instructions=False, +) +agent.run(rounds=1) +``` + +## 9. LLM providers & secrets + +`among_them_sdk.cognition.llm.LLM` parses model strings like an AI Gateway: + +- `'gpt-5.5'` or `'gpt-4o-mini'` → OpenAI (uses `OPENAI_API_KEY`). +- `'openai/gpt-5.5'` → explicit OpenAI. +- `'anthropic/claude-3-5-sonnet'` → Anthropic (uses `ANTHROPIC_API_KEY`). +- `'gateway/openai/gpt-5.5'` → Vercel AI Gateway (uses `AI_GATEWAY_API_KEY` and optional `AI_GATEWAY_BASE_URL`, defaults to `https://ai-gateway.vercel.sh/v1`). + +Switch provider per module: + +```python +from among_them_sdk import Agent, LLMChatter, LLMVoter + +agent = Agent.create( + voter=LLMVoter(model='anthropic/claude-3-5-sonnet'), + chatter=LLMChatter(model='gateway/openai/gpt-5.5', tone='friendly'), +) +``` + +`LLM(...)` raises `LLMUnavailableError` if the matching key isn't set; `LLMVoter` / `LLMChatter` catch it during `__init__` and fall back to their scripted counterparts. `cognition.llm.safe_llm(model)` is the LLM-or-None helper for your own modules. + +`among-them.toml` (loaded from CWD) layers config above env and below kwargs. 
Keys recognised by `config.py`: + +```toml +[agent] +profile = 'evidencebot_v2' + +[runtime] +default = 'local-sim' + +[tracing] +backend = 'structlog' +``` + +The loader also reads env vars prefixed `AMONG_THEM_` (e.g. `AMONG_THEM_PROFILE`) and **rejects** TOML keys ending in `_API_KEY` to discourage committing secrets — keep keys in env. + +## 10. Tracing & debugging + +The default backend is `structlog` JSONL on stdout (see `tracing.py`). Every `Agent.create` and tick emits an event: + +```python +import logging, structlog +logging.basicConfig(level=logging.INFO) +structlog.contextvars.clear_contextvars() + +from among_them_sdk import Agent +agent = Agent.create(use_llm_for_instructions=False) +agent.run(rounds=1) +# {"event": "agent.created", "profile": "evidencebot_v2", ...} +# {"event": "agent.vote", "meeting": 1, "target": "P03", ...} +# {"event": "agent.run.complete", "ticks": 60, ...} +``` + +Inspecting after a run completes: + +```python +result = agent.run(rounds=1) +print(result.summary) # one-line digest +print(result.actions[:8]) # raw action indices +print([(v.target, v.reason) for v in result.votes]) +print(result.raw['policy_summary']) # FFI handle, ABI, lib path, tick count +print(result.raw['directives']) # parsed Directives dump +print(result.raw['cyborg']) # cyborg framework availability +``` + +Note: there is **no** `result.events` field — log events are emitted via structlog, not collected on the result. If you need a per-event transcript, register hooks (see §8) and accumulate them yourself, or filter the structlog JSONL. + +Dump the parsed directives with `agent.directives.model_dump_json(indent=2)`. 
+ +Confirm the FFI loaded: + +```python +from among_them_sdk import ffi +print('available:', ffi.is_available()) +print('library:', ffi.library_path()) +print('abi:', ffi.EVIDENCEBOT_V2_ABI_VERSION) +lib = ffi.load_library() # forces a full load + ABI handshake +print('loaded:', lib.path, 'abi', lib.abi_version) +``` + +For the cyborg bridge specifically: + +```python +from among_them_sdk import _cyborg +print(_cyborg.status()) +# {'available': True/False, 'root': '...', 'imported': {'Command': ..., ...}} +``` + +`tracing.enable_langfuse(...)` exists but raises `NotImplementedError` — Langfuse + OTel emission are deferred to Phase 4. + +## 11. Extensions: shipping your own profile/module + +`extensions.py` discovers third-party packages via `importlib.metadata.entry_points`. The supported groups are: + +- `among_them.profiles` — full agent profiles. Built-in entries `default` and `evidencebot_v2` are registered by the SDK's own `pyproject.toml`. +- `among_them.modules.voter` +- `among_them.modules.chatter` +- `among_them.modules.reporter` +- `among_them.modules.navigator` + +(Memory and Perception don't have a discovery group yet — pass them directly to `Agent.create` if you want to override them from a third-party package.) 
+ +Minimal `pyproject.toml` for an external package called `among-them-spicy-bots`: + +```toml +[project] +name = 'among-them-spicy-bots' +version = '0.1.0' +dependencies = ['among-them-sdk>=0.1'] + +[project.entry-points.'among_them.profiles'] +spicy = 'spicy_bots.profile:SpicyProfile' + +[project.entry-points.'among_them.modules.voter'] +spicy_voter = 'spicy_bots.voters:SpicyVoter' +``` + +Discovery from the SDK side (lazy, only imports on demand): + +```python +from among_them_sdk.extensions import list_modules, list_profiles, load_profile + +print(list_profiles()) # {'default': '...', 'spicy': 'spicy_bots.profile:SpicyProfile', ...} +print(list_modules('voter')) # {'spicy_voter': 'spicy_bots.voters:SpicyVoter'} +profile = load_profile('spicy') # imports + instantiates +``` + +Profiles should expose a `build(num_agents=1) -> EvidenceBotV2Policy`-compatible policy (see `DefaultProfile` in `among_them/sdk/src/among_them_sdk/policy/evidencebot_v2.py`). + +## 12. Recipes + +### Run 100 quick games and tally vote-rate + +(`LocalSim` has no win/loss signal yet — Phase 4. We use vote-rate as a proxy.) + +```python +from among_them_sdk import Agent, LocalSim + +agent = Agent.create(use_llm_for_instructions=False) +votes_cast = skips = 0 +for i in range(100): + sim = LocalSim(ticks_per_round=60, meeting_every=20, seed=i) + result = agent.run(rounds=1, runtime=sim) + for v in result.votes: + if v.target is None: skips += 1 + else: votes_cast += 1 + +print(f'votes={votes_cast} skips={skips} rate={votes_cast / (votes_cast + skips):.2%}') +``` + +### Tournament: spawn 4 variants and compete in parallel + +See `examples/tournament.py`. Short form: + +```python +from among_them_sdk import Agent, Runner + +agents = [ + Agent.create(seed=1, use_llm_for_instructions=False), + Agent.create(seed=2, instructions='Be aggressive about reporting. Trust nobody.', + use_llm_for_instructions=False), + Agent.create(seed=3, instructions='Vote with the majority. 
Avoid the central room.', + use_llm_for_instructions=False), + Agent.create(seed=4, cognitive={'suspicion_threshold': 0.8}, + use_llm_for_instructions=False), +] + +runner = Runner(agents=agents, rounds=1, parallelism=2) +for row in runner.leaderboard(): + print(row) +``` + +`Runner.parallelism > 1` uses a thread pool — fine for the FFI (releases the GIL) and any I/O-bound LLM calls. + +### A/B test two instruction strings + +```python +from statistics import mean +from among_them_sdk import Agent, LocalSim + +def trial(instructions: str, n: int = 25) -> float: + rates = [] + for i in range(n): + agent = Agent.create(instructions=instructions, seed=i, use_llm_for_instructions=False) + result = agent.run(rounds=1, runtime=LocalSim(ticks_per_round=60, meeting_every=20, seed=i)) + targets = [v for v in result.votes if v.target is not None] + rates.append(len(targets) / max(1, len(result.votes))) + return mean(rates) + +a = trial('Vote on evidence only.') +b = trial('Vote with the majority always.') +print(f'evidence={a:.2%} majority={b:.2%}') +``` + +### Save a transcript per game to disk + +```python +import json, pathlib +from among_them_sdk import Agent, AgentHooks + +events = [] +hooks = AgentHooks( + on_vote=lambda ctx: events.append({'kind': 'vote', **ctx}), + on_message=lambda ctx: events.append({'kind': 'chat', **ctx}), + on_meeting=lambda ctx: events.append({'kind': 'meeting', **ctx}), +) + +agent = Agent.create(hooks=hooks, use_llm_for_instructions=False) +result = agent.run(rounds=1) + +out = { + 'summary': result.summary, + 'directives': result.raw['directives'], + 'events': events, +} +pathlib.Path('transcript.json').write_text(json.dumps(out, indent=2, default=str)) +``` + +## 13. Troubleshooting + +- **`OSError: cannot load libevidencebot_v2.dylib`** — the artefact is missing or stale. Run `python among_them/players/build_evidencebot_v2.py`. 
Expected path: `among_them/players/libevidencebot_v2.{dylib,so,dll}`; override the search root with `AMONG_THEM_PLAYERS_DIR`. See `ffi.library_path()`. +- **`FFIError: build_evidencebot_v2.py succeeded but … was not produced`** — the build ran but emitted a different filename. Confirm `nim --version` reports 2.2.4 and that the build script finished without warnings. +- **`Cyborg framework not found`** — set `CYBORG_FRAMEWORK_PATH=/path/to/cyborg-policy-framework`. The SDK still works without it; `_cyborg.is_available()` returns `False`. +- **Directives silently use the keyword parser** — `parse_instructions_with_llm` catches `LLMUnavailableError` and logs at INFO. Set `OPENAI_API_KEY` (or pass `use_llm_for_instructions=False` to make the fallback explicit). Or pass `cognitive={...}` directly to bypass the LLM round-trip. +- **`NotImplementedError: RemoteServer is Phase 4`** — use `LocalSim`. DESIGN.md §8 tracks the cloud roadmap. +- **`uv sync` fails on Python 3.10 or earlier** — bump to 3.11+. The SDK uses `tomllib` (added in 3.11) and PEP 604 union syntax. +- **Ruff complains about quote style** — `pyproject.toml` sets `quote-style = 'double'` for the formatter. Run `ruff format` and accept its choice when contributing back. + +## 14. Where to go next + +- [`README.md`](../README.md) — elevator pitch, install, hello world. +- [`tournament-submission.md`](tournament-submission.md) — how to ship an SDK policy to the cogames leaderboard via `SDKPolicy` + the bundled-config flow. +- [`players/sdk/DESIGN.md`](../../players/sdk/DESIGN.md) — full architecture and Phase 2+ roadmap. +- `examples/` — copy-pasteable scripts for every section above. `eight_player_game.py` runs `LocalSDKPolicy` against a real local server and exercises the same override engine the tournament uses. 
+ +**Phase 2 preview** (DESIGN.md §8): a richer Nim FFI so the SDK can intercept *inside* the bot, a real `LocalSim` game loop so agents can play each other in-process, and an async-first top-level API (`async def run`, `agent.connect(runtime)`, `async for event in run.stream()`). Phase 3 adds skill auto-loading and TOML profile composition; Phase 4 adds `RemoteServer`, Langfuse tracing, and tournament `Runner` against the live games server. diff --git a/among_them/sdk/docs/tournament-submission.md b/among_them/sdk/docs/tournament-submission.md new file mode 100644 index 00000000..960597c4 --- /dev/null +++ b/among_them/sdk/docs/tournament-submission.md @@ -0,0 +1,220 @@ +# Submitting an SDK policy to the Among Them tournament + +Last updated: May 6, 2026 + +This guide is the SDK-flavoured companion to +[`among_them/players/SUBMIT_TO_TOURNAMENT.md`](../../players/SUBMIT_TO_TOURNAMENT.md). +Read that first for the cogames basics; this doc only covers what's +different when you ship an `Agent.create(...)`-style policy through the +SDK instead of editing `evidencebot_v2_policy.py` directly. + +## What gets uploaded + +Cogames builds your bundle in an Alpine Docker container during +validation. The container: + +* Has Nim 2.2.4 + a C toolchain (the build script auto-installs Nim). +* **Has no API keys.** No OpenAI, no Anthropic, no AI Gateway. +* **Has no outbound network.** Anything that hits a remote host fails. +* Imports your policy via the class path you pass to `cogames upload -p`. +* Calls `__init__(policy_env_info, device='cpu')` per game (no kwargs). +* Calls `step_batch(raw_observations, raw_actions)` per tick. + +The last two points are why `Agent.create(instructions="...")` can't drive +the tournament directly — there's no constructor seam to pass instructions through. The +SDK ships a different entrypoint for the tournament: +`among_them_sdk.policy.cogames.SDKPolicy`. 
+ +## How `SDKPolicy` works + +`SDKPolicy` is a `MultiAgentPolicy` subclass that **composes** +`EvidenceBotV2NimPolicy` rather than replacing it. Per tick: + +1. Pass observations to `EvidenceBotV2NimPolicy.step_batch` — the inner + Nim policy decides every action exactly as it would in a vanilla + `evidencebot_v2` submission. +2. Apply SDK directives + module overrides to the resulting action + indices (see `_DirectiveOverrideEngine` in + `src/among_them_sdk/policy/cogames.py`). + +Step 2 is where SDK semantics show up in the tournament. Concretely: + +| SDK feature | Lands at upload time? | +|------------------------------|-----------------------| +| Pre-resolved `Directives` | **Yes** — bundled JSON. | +| `--instructions "..."` (deterministic regex parse) | Yes. | +| `--instructions "..."` (LLM-resolved) | Yes, *if* the LLM ran at packaging time and the resolved Directives shipped in the bundle. The validator never calls an LLM. | +| `cognitive={...}` overrides | Yes, via the bundle JSON. | +| Custom `Voter` / `Reporter` / `Chatter` Python classes | Yes, **only if** their source ships in the upload bundle. The bundle config's `modules` table resolves to the class instance at construct time. | +| `LLMVoter` / `LLMChatter` | **No.** No keys, no network. Stays as scripted fallback. | +| `LiveGame` runtime hooks | **No.** Cogames runs `step_batch`, not `Agent.run`. | +| Per-tick `pre_tick` / `post_tick` hooks | **No.** No Agent in scope. | +| Memory introspection (`agent.memory.suspects`) | **No.** No Agent. | + +## Architectural caveat (read before relying on overrides) + +The Nim FFI surface is **action-indices-out only**. It does not surface +the bot's internal voting / reporting / chat / kill decisions — only +"what action mask did this tick emit". 
So the override engine works at +the action-index level: it can suppress a `report_*` action it sees the +inner Nim policy emit, and it can advise a `Voter` decision the bot is +about to take, but it **cannot inject** a vote or report the inner Nim +policy didn't already decide to take. This is the same gap documented +in `src/among_them_sdk/policy/evidencebot_v2.py` and is tracked as a +Phase 2 Nim FFI extension in +[`among_them/players/sdk/DESIGN.md`](../../players/sdk/DESIGN.md) §8. + +In practice that means a `Reporter` override is degraded to a *gate* +("don't report things the Nim bot wants to report") and not a *trigger* +("report things the Nim bot wouldn't"). + +## The full happy path + +### 1. Build your policy locally with `Agent.create` + +```python +from among_them_sdk import Agent, ScriptedChatter + +agent = Agent.create( + instructions=( + "Report bodies aggressively when you have direct evidence. " + "Vote on evidence only — never follow the majority. Trust no one " + "after meeting 2." + ), + cognitive={"suspicion_threshold": 0.65, "report_eagerness": "high"}, + chatter=ScriptedChatter(tone="suspicious"), +) +``` + +Iterate locally with `LiveGame` (see +[`examples/eight_player_game.py`](../examples/eight_player_game.py)) — +that example runs `LocalSDKPolicy`, which uses the **same override +engine** as `SDKPolicy`, so what you see locally is what the tournament +runs. + +### 2. Package the bundle + +The `among_them_sdk.package` CLI extracts your already-resolved +`Directives` + module specs from the agent and writes them to a JSON +file next to the cogames policy module: + +```bash +cd among_them/sdk + +# Option A — package directly from a script that builds an Agent +python -m among_them_sdk.package \ + --from-agent examples/personas.py:_build_aggressive \ + --policy-name "$USER-sdk-aggressive" + +# Option B — inline (for hand-written configs) +python -m among_them_sdk.package \ + --instructions "Trust nobody. Report bodies aggressively." 
\ + --cognitive suspicion_threshold=0.65 \ + --module voter=scripted:threshold=0.65 \ + --module chatter=scripted:tone=suspicious \ + --policy-name "$USER-sdk-paranoid" +``` + +The packager: + +1. Validates the schema of your directives + module specs. +2. Writes `among_them_sdk_config.json` next to + `src/among_them_sdk/policy/cogames.py` (cogames flattens this into + the bundle root next to `cogames.py` at upload time). +3. Prints the exact `cogames upload` command with every `-f` flag set. + +### 3. Run the printed `cogames upload` command + +The full bundle list (from `SUBMIT_TO_TOURNAMENT.md` plus the SDK): + +```bash +cogames upload \ + -p class=among_them_sdk.policy.cogames.SDKPolicy \ + -f among_them/players/evidencebot_v2_policy.py \ + -f among_them/players/build_evidencebot_v2.py \ + -f among_them/players/evidencebot_v2.nim \ + -f among_them/players/evidencebot_v2 \ + -f among_them/sim.nim \ + -f common \ + -f src/bitworld \ + -f nimby.lock \ + -f among_them/sdk/src/among_them_sdk \ + -f among_them/sdk/pyproject.toml \ + -n "$USER-sdk-aggressive" \ + --season among-them +``` + +Add `--dry-run` to validate the bundle in Docker without uploading. Add +`--skip-validation` only if Docker is broken on your machine and you +want to push anyway. + +### 4. Confirm the validator finds your config + +The validator's stdout shows `SDKPolicy loaded config from <path>` +when the JSON file landed at the right place. If you see +`no among_them_sdk_config.json found near …; using defaults` instead, +double-check that `-f among_them/sdk/src/among_them_sdk` was on the +upload line — that directory contains both `cogames.py` and the +generated `among_them_sdk_config.json`. + +## Worked example — `aggressive_imposter` from `personas.py` + +```python +# examples/personas.py — already in the repo +from among_them_sdk import Agent, SilentChatter + +def _build_aggressive() -> Agent: + return Agent.create( + instructions=( + "Kill aggressively. Never report bodies. 
Skip votes unless " + "you must blame someone." + ), + cognitive={"kill_eagerness": "high", "report_eagerness": "low"}, + chatter=SilentChatter(), + use_llm_for_instructions=False, + ) +``` + +```bash +cd among_them/sdk +python -m among_them_sdk.package \ + --from-agent examples/personas.py:_build_aggressive \ + --policy-name "$USER-sdk-aggressive-imposter" +``` + +The CLI prints the resolved directives and the upload command. Run the +upload command from the **repo root** (`bitworld/`) so the relative +`-f` paths resolve. The validator runs the bundle, the SDK overrides +suppress every report the Nim bot would have emitted, and your policy +lands on the leaderboard. + +## Things to sanity-check first + +1. `python -m among_them_sdk.package --from-agent